In [2]:
from bs4 import BeautifulSoup as soup  # HTML data structure
from urllib.request import urlopen as uReq  # Web client
import pandas as pd

<h2> Building a Dictionary of Team Abbreviations and Full Names </h2>

In [3]:
# Lookup table: team abbreviation -> full team name.
# The rest of the notebook indexes this dict by abbreviation (see the manual
# entries below), so the first cell of each scraped row is treated as the
# abbreviation and the second as the full name.
team_names_dict = dict()

page_url = "https://en.wikipedia.org/wiki/Wikipedia:WikiProject_National_Basketball_Association/National_Basketball_Association_team_abbreviations"

# Download and parse the page. The `with` block closes the connection even if
# reading or parsing raises.
with uReq(page_url) as uClient:
    page_soup = soup(uClient.read(), "html.parser")

team_containers = page_soup.findAll("tr")

# Fill the dictionary from the table rows; the first row is skipped
# (presumably the header row -- confirm against the page layout).
for row in team_containers[1:]:

    names = row.findAll("td")

    # Rows without two data cells (e.g. header-only rows or rows from another
    # table on the page) carry no abbreviation/name pair; skip them instead of
    # failing with an IndexError.
    if len(names) < 2:
        continue

    abr = names[0].text.replace("\n","")
    full_name = names[1].text.replace("\n","")

    team_names_dict[abr] = full_name

# Older / relocated franchises have to be inserted manually
team_names_dict['SEA'] = 'Seattle SuperSonics'
team_names_dict['PHO'] =  'Phoenix Suns'
team_names_dict['NJN'] = 'New Jersey Nets'
team_names_dict['KCK']= 'Kansas City Kings'
team_names_dict['WSB']= 'Washington Bullets'
team_names_dict['SDC']='San Diego Clippers'
team_names_dict['CHH']='Charlotte Hornets'
team_names_dict['NOH']='New Orleans Hornets'
team_names_dict['BRK']='Brooklyn Nets'
# Overrides so the names match the spelling used as keys of the ESPN standings
# (NOTE(review): presumably ESPN lists the team as "LA Clippers" -- confirm).
team_names_dict['LAC']= 'LA Clippers'
team_names_dict['CHO']= 'Charlotte Hornets'

<h2> Getting Advanced Stats for each Season </h2> 
<p> We already have the counting stats from past seasons from a previous project </p>

In [None]:
# Scrape the per-season "advanced" player table from Basketball Reference and
# write one CSV per season to csv_files/players_advanced_stats/.
Advanced_df = pd.DataFrame()

for year in range(2020,2021):

    # URL of the advanced-stats page for this season
    page_url = "https://www.basketball-reference.com/leagues/NBA_"+str(year)+"_advanced.html"

    # Download and parse the page; the connection is closed even on failure.
    with uReq(page_url) as uClient:
        page_soup_advanced_stats  = soup(uClient.read(), "html.parser")

    # Header cells of the stats table
    player_table = page_soup_advanced_stats.findAll("table", {"id": "advanced_stats"})
    headers = player_table[0].find_all("thead")[0].find_all("tr")[0].find_all("th")

    # The first header is dropped because only <td> cells are collected from
    # the body rows below (presumably the leading column is rendered as <th>
    # in each row -- confirm against the page markup).
    columns = [col.text for col in headers[1:]]

    players = player_table[0].findAll("tbody")[0].find_all("tr")

    num_of_fake_rows = 0

    # Collect every row first and build the DataFrame once: DataFrame.append
    # was removed in pandas 2.0 and appending row by row is quadratic.
    player_rows = []

    for player in players:

        # Text of every data cell in this row
        stat_arr = [stat.text for stat in player.find_all("td")]

        # Some rows only repeat the header and contain no <td> cells; skip them
        if not stat_arr:
            num_of_fake_rows = num_of_fake_rows + 1
            continue

        # Align the row with the header: pad short rows with missing values
        # and ignore surplus cells, so every row has exactly len(columns) items.
        stat_arr = (stat_arr + [None] * len(columns))[:len(columns)]
        player_rows.append(stat_arr)

    # One row per player, columns in header order
    Players_df = pd.DataFrame(player_rows, columns = columns)

    # Export the season to a csv file (the target directory must already exist)
    out_filename = "csv_files/players_advanced_stats/"+str(year)+"_Players_Advanced.csv"
    Players_df.to_csv(path_or_buf = out_filename, index=False)
    

<h2> Now we are going to build our Past MVP Dataframes </h2> 

In [None]:
# Build one table of player seasons (1980-2019) combining counting stats,
# advanced stats, all-star selection, MVP award share and team standing.
Main_df = pd.DataFrame()

# One frame per season; they are concatenated once after the loop because
# DataFrame.append was removed in pandas 2.0.
yearly_frames = []

for year in range(1980,2020):

    ################# FIRST WE ARE GETTING TEAM STANDINGS #################

    # League-wide standings page for this season
    page_url = "https://www.espn.com/nba/standings/_/season/"+str(year)+"/group/league"
    with uReq(page_url) as uClient:
        page_soup_team  = soup(uClient.read(), "html.parser")

    # Map team name -> 1-based position of its row in the standings table
    team_table = page_soup_team.find("tbody",{"class":"Table__TBODY"}).findAll("tr")

    standings = dict()
    for i,row in enumerate(team_table,start=1):
        team_name = row.find("span",{"class":"hide-mobile"}).text
        standings[team_name] = i

    ################# NOW WE STORE WHO HAD MVP VOTES THIS SEASON #################

    # Awards page for this season
    page_url = "https://www.basketball-reference.com/awards/awards_"+str(year)+".html"
    with uReq(page_url) as uClient:
        page_soup_stats  = soup(uClient.read(), "html.parser")

    # Table with the MVP voting data
    mvp_table = page_soup_stats.findAll("table", {"id": "mvp"})

    # Parallel lists: award share and player name of every row
    player_votes = mvp_table[0].find_all("td", {"data-stat":"award_share"})
    player_names = mvp_table[0].find_all("td", {"data-stat":"player"})

    # Player name -> award share (kept as the text scraped from the page)
    MVP_Votes = {name.text: votes.text for name, votes in zip(player_names, player_votes)}

    ############# TODO: store who was All-NBA this season (not implemented) ###########

    ################ Now we check for if they were an all-star ###############

    page_url = "https://basketball.realgm.com/nba/allstar/game/rosters/"+str(year)
    with uReq(page_url) as uClient:
        all_star_soup = soup(uClient.read(), "html.parser")

    allstars = all_star_soup.findAll("td", {"data-th":"Player"})

    # Names of every player listed on the all-star rosters page
    allstar_arr = [a.text for a in allstars]

    ################ Now we build the data_frame ##############

    # Load the per-season CSVs produced earlier
    counting_stats_df = pd.read_csv('csv_files/player_counting_stats/player_stats_'+str(year)+'.csv')
    advanced_stats_df = pd.read_csv("csv_files/players_advanced_stats/"+str(year)+"_Players_Advanced.csv")

    # Rename columns so they match the counting-stats naming
    advanced_stats_df = advanced_stats_df.rename(columns={"Player":"NAME","Tm":"TEAM","MP":"MPT"})

    # Drop redundant columns; `columns=` is used because the positional axis
    # argument of DataFrame.drop is no longer accepted by pandas 2.0.
    advanced_stats_df = advanced_stats_df.drop(columns=['G','Pos'])
    # Drop columns that are empty for every row
    advanced_stats_df = advanced_stats_df.dropna(how='all', axis=1)

    # Merge the two dataframes on the player name, keep the first row per
    # (player, team) pair and keep only the counting-stats TEAM column.
    result = pd.merge(counting_stats_df,advanced_stats_df,on=['NAME'])
    result = result.drop_duplicates(subset=['NAME','TEAM_x'], keep="first")
    result = result.drop(['TEAM_y'],axis=1).rename(columns={"TEAM_x":"TEAM"})
    # Fresh 0..n-1 index: the 'TOT' handling below relies on label i+2
    result = result.reset_index(drop=True)

    output_columns = list(result.columns) + ['ALLSTAR', 'SHARE', 'TEAM_STANDING']

    # Rows that get a team standing are collected here and turned into a
    # DataFrame once, instead of appending to a DataFrame row by row.
    kept_rows = []

    for i,player in result.iterrows():

        record = player.to_dict()
        clean_name = player['NAME'].replace('*','')

        ################# Adding ALLSTAR Column ##################
        record['ALLSTAR'] = 1 if clean_name in allstar_arr else 0

        ##############  Adding MVP Share Column ###########
        record['SHARE'] = MVP_Votes.get(clean_name, 0.000)

        ################ Adding Team standing column ############

        team_abrv = player["TEAM"].replace(' ','')

        # Unknown abbreviation: no standing can be looked up, skip the row
        if team_abrv not in team_names_dict and team_abrv !=  'TOT' :
            continue

        try:
            if team_abrv == 'TOT':
                # 'TOT' marks the combined row of a player who played for
                # several teams. The row two positions below is assumed to be
                # the team the player finished the season with, and that
                # team's standing is used.
                curr_team_player = result.loc[i+2]
                curr_team_id = curr_team_player['TEAM'].replace(' ','')
                team_name = team_names_dict[curr_team_id]
            else:
                team_name = team_names_dict[team_abrv]

            record['TEAM_STANDING'] = standings[team_name]

        except Exception:
            # Best effort: the standings have no entry for this team this year
            # (or the row two below does not exist). `Exception` instead of a
            # bare `except` so KeyboardInterrupt still stops the cell.
            continue

        kept_rows.append(record)

    Past_MVP_df = pd.DataFrame(kept_rows, columns = output_columns)

    print(year)
    yearly_frames.append(Past_MVP_df)

# Stack all seasons into one frame with a fresh index
if yearly_frames:
    Main_df = pd.concat(yearly_frames, ignore_index=True)


In [None]:
# Write the combined historical player table to disk without the row index.
out_filename = "csv_files/Past_Player_Data.csv"
Main_df.to_csv(out_filename, index=False)

<h2> Now we are going to get the data for current players </h2> 

In [None]:
# Build the table of current-season players: counting stats, advanced stats,
# all-star selection and team standing.
Current_df = pd.DataFrame()

year = 2020

################# FIRST WE ARE GETTING TEAM STANDINGS #################

# League-wide standings page for this season
page_url = "https://www.espn.com/nba/standings/_/season/"+str(year)+"/group/league"
with uReq(page_url) as uClient:
    page_soup_team  = soup(uClient.read(), "html.parser")

# Map team name -> 1-based position of its row in the standings table
team_table = page_soup_team.find("tbody",{"class":"Table__TBODY"}).findAll("tr")

standings = dict()
for i,row in enumerate(team_table,start=1):
    team_name = row.find("span",{"class":"hide-mobile"}).text
    standings[team_name] = i

################ Now we check for if they were an all-star ###############

page_url = "https://basketball.realgm.com/nba/allstar/game/rosters/"+str(year)
with uReq(page_url) as uClient:
    all_star_soup = soup(uClient.read(), "html.parser")

allstars = all_star_soup.findAll("td", {"data-th":"Player"})

# Names of every player listed on the all-star rosters page
allstar_arr = [a.text for a in allstars]

# Load the per-season CSVs produced earlier
counting_stats_df = pd.read_csv('csv_files/player_counting_stats/player_stats_'+str(year)+'.csv')
advanced_stats_df = pd.read_csv("csv_files/players_advanced_stats/"+str(year)+"_Players_Advanced.csv")

# Rename columns so they match the counting-stats naming
advanced_stats_df = advanced_stats_df.rename(columns={"Player":"NAME","Tm":"TEAM","MP":"MPT"})

# Drop redundant columns; `columns=` is used because the positional axis
# argument of DataFrame.drop is no longer accepted by pandas 2.0.
advanced_stats_df = advanced_stats_df.drop(columns=['G','Pos'])
# Drop columns that are empty for every row
advanced_stats_df = advanced_stats_df.dropna(how='all', axis=1)

# Merge the two dataframes on the player name, keep the first row per
# (player, team) pair and keep only the counting-stats TEAM column.
result = pd.merge(counting_stats_df,advanced_stats_df,on=['NAME'])
result = result.drop_duplicates(subset=['NAME','TEAM_x'], keep="first")
result = result.drop(['TEAM_y'],axis=1).rename(columns={"TEAM_x":"TEAM"})
# Fresh 0..n-1 index: the 'TOT' handling below relies on label i+2
result = result.reset_index(drop=True)

output_columns = list(result.columns) + ['ALLSTAR', 'TEAM_STANDING']

# Rows that get a team standing are collected here and turned into a DataFrame
# once; DataFrame.append was removed in pandas 2.0.
kept_rows = []

for i,player in result.iterrows():

    record = player.to_dict()

    ################# Adding ALLSTAR Column ##################
    record['ALLSTAR'] = 1 if player['NAME'].replace('*','') in allstar_arr else 0

    ################ Adding Team standing column ############

    team_abrv = player["TEAM"].replace(' ','')

    # Unknown abbreviation: no standing can be looked up, skip the row
    if team_abrv not in team_names_dict and team_abrv !=  'TOT' :
        continue

    try:
        if team_abrv == 'TOT':
            # 'TOT' marks the combined row of a player who played for several
            # teams. The row two positions below is assumed to be the team the
            # player is on now, and that team's standing is used.
            curr_team_player = result.loc[i+2]
            curr_team_id = curr_team_player['TEAM'].replace(' ','')
            team_name = team_names_dict[curr_team_id]
        else:
            team_name = team_names_dict[team_abrv]

        record['TEAM_STANDING'] = standings[team_name]

    except Exception:
        # Best effort: the standings have no entry for this team this year
        # (or the row two below does not exist). `Exception` instead of a bare
        # `except` so KeyboardInterrupt still stops the cell.
        continue

    kept_rows.append(record)

Current_df = pd.DataFrame(kept_rows, columns = output_columns)

Current_df

In [None]:
# Write the current-season player table to disk without the row index.
out_filename = "csv_files/Current_Players.csv"
Current_df.to_csv(out_filename, index=False)