In [1]:
from bs4 import BeautifulSoup as soup  # HTML data structure
from urllib.request import urlopen as uReq  # Web client
import pandas as pd

<h2> Building a Dictionary of Team Abbreviations and Full Names </h2>

In [2]:
# Lookup table used by later cells to turn a team abbreviation (as it appears
# in the player stats files) into the full team name used as a key in the
# standings dictionary.
team_names_dict = dict()

page_url = "https://en.wikipedia.org/wiki/Wikipedia:WikiProject_National_Basketball_Association/National_Basketball_Association_team_abbreviations"

# Download and parse the page. The context manager closes the connection even
# when reading or parsing raises.
with uReq(page_url) as uClient:
    page_soup = soup(uClient.read(), "html.parser")

team_containers = page_soup.findAll("tr")

# The first <tr> is skipped (treated as the header row). The first cell of each
# remaining row becomes the key and the second cell the value.
# NOTE(review): the manual entries below and the later lookups key this dict by
# abbreviation, so the first column is presumably the abbreviation and the
# second the full team name -- confirm against the live page layout.
for row in team_containers[1:]:

    names = row.findAll("td")

    # Rows without two data cells (e.g. header-only rows) cannot produce a
    # key/value pair; skip them instead of failing with an IndexError.
    if len(names) < 2:
        continue

    abbreviation = names[0].text.replace("\n","")
    team_full_name = names[1].text.replace("\n","")

    team_names_dict[abbreviation] = team_full_name

# Abbreviations that have to be inserted by hand (older / relocated teams).
team_names_dict['SEA'] = 'Seattle SuperSonics'
team_names_dict['PHO'] =  'Phoenix Suns'
team_names_dict['NJN'] = 'New Jersey Nets'
team_names_dict['KCK']= 'Kansas City Kings'
team_names_dict['WSB']= 'Washington Bullets'
team_names_dict['SDC']='San Diego Clippers'
team_names_dict['CHH']='Charlotte Hornets'
team_names_dict['NOH']='New Orleans Hornets'
team_names_dict['BRK']='Brooklyn Nets'
# Overrides for names that must match a different spelling downstream
# (presumably the spelling used on ESPN's standings page -- TODO confirm).
team_names_dict['LAC']= 'LA Clippers'
team_names_dict['CHO']= 'Charlotte Hornets'

<h2> Getting All-NBA Data for each season </h2>

In [3]:
page_url = "https://www.landofbasketball.com/awards/all_nba_teams_year.htm"

# Download and parse the page. The context manager closes the connection even
# when reading or parsing raises.
with uReq(page_url) as uClient:
    All_NBA_Soup = soup(uClient.read(), "html.parser")

# One <div> per selected player, in page order.
all_players = All_NBA_Soup.findAll("div",{"class":"indice-item margen-r2"})


# Maps a season year to the list of player names selected that season.
# `year` starts one above the first season because it is decremented before
# the first player is stored (so the first group on the page is stored as 2019).
# NOTE(review): this assumes the page lists seasons newest-first and that its
# first season is 2019 -- confirm against the live page.
year = 2020
all_nba_rosters = dict()

for i,div in enumerate(all_players):

    # A new season starts every 15 players while year > 1988, and every 10
    # players afterwards. The 15-player seasons (2019 down to 1989) occupy
    # indices 0..464, so the 10-player seasons start at 465, 475, ...,
    # which is exactly where (i + 5) % 10 == 0.
    starts_new_season = (
        (year > 1988 and i % 15 == 0)
        or (year <= 1988 and (i + 5) % 10 == 0)
    )
    if starts_new_season:
        year -= 1
        all_nba_rosters[year] = []

    # Store the player's name with the '\r\n ' layout sequences removed.
    all_nba_rosters[year].append(div.text.replace('\r\n ',''))

    


<h2> Getting Advanced Stats for each Season </h2> 
<p> We already have the counting stats from past seasons from a previous project </p>

In [None]:
Advanced_df = pd.DataFrame()

# Scrape the advanced-stats table for each season in the range and write one
# CSV per season. Widen the range to (re)generate the files for other seasons.
for year in range(2020,2021):

    # URL of the season's advanced-stats page
    page_url = "https://www.basketball-reference.com/leagues/NBA_"+str(year)+"_advanced.html"

    # Download and parse the page. The context manager closes the connection
    # even when reading or parsing raises.
    with uReq(page_url) as uClient:
        page_soup_advanced_stats  = soup(uClient.read(), "html.parser")

    # Column names come from the first header row; the first <th> is skipped so
    # the names line up with the <td> cells of each player row.
    player_table = page_soup_advanced_stats.findAll("table", {"id": "advanced_stats"})
    headers = player_table[0].find_all("thead")[0].find_all("tr")[0].find_all("th")
    columns = [col.text for col in headers[1:]]

    players = player_table[0].findAll("tbody")[0].find_all("tr")

    # Count of body rows that carry no <td> cells (kept for inspection only).
    num_of_fake_rows = 0

    # Rows are collected first and the frame is built once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call.
    player_records = []

    for player in players:

        # text of every stat cell in this row
        stat_arr = [stat.text for stat in player.find_all("td")]

        # some rows are just for showing headers again and this skips over them
        if not stat_arr:
            num_of_fake_rows = num_of_fake_rows + 1
            continue

        # pair each stat with its column name (extra values on either side are
        # ignored by zip)
        player_records.append(dict(zip(columns, stat_arr)))

    # Repeated header labels would collapse to a single key in each record
    # anyway, so the frame gets one column per distinct label, in first-seen order.
    # NOTE(review): if the page repeats a header label, the CSV now has one such
    # column rather than one per repetition -- confirm nothing downstream relies
    # on the repeated columns.
    Players_df = pd.DataFrame(player_records, columns=list(dict.fromkeys(columns)))

    # exporting dataframe to a csv file (the target directory must already exist)
    out_filename = "csv_files/players_advanced_stats/"+str(year)+"_Players_Advanced.csv"
    Players_df.to_csv(path_or_buf = out_filename, index=False)
    

<h2> Now we are going to build our Past MVP Dataframes </h2> 

In [6]:
Main_df = pd.DataFrame()

# One frame per season is collected here and concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
yearly_frames = []

for year in range(1980,2020):

    ################# FIRST WE ARE GETTING TEAM STANDINGS #################

    page_url = "https://www.espn.com/nba/standings/_/season/"+str(year)+"/group/league"
    # The context manager closes the connection even when reading fails.
    with uReq(page_url) as uClient:
        page_soup_team  = soup(uClient.read(), "html.parser")

    # Maps team name -> 1-based position of the team's row in the first
    # standings table body on the page.
    team_table = page_soup_team.find("tbody",{"class":"Table__TBODY"}).findAll("tr")

    standings = dict()
    for rank,row in enumerate(team_table,start=1):
        team_name = row.find("span",{"class":"hide-mobile"}).text
        standings[team_name] = rank


    ################ Now we check for if they were an all-star ###############

    page_url = "https://basketball.realgm.com/nba/allstar/game/rosters/"+str(year)
    with uReq(page_url) as uClient:
        all_star_soup = soup(uClient.read(), "html.parser")

    allstars = all_star_soup.findAll("td", {"data-th":"Player"})
    allstar_arr = [a.text for a in allstars]


    ################ Now we build the data_frame ##############

    # loading dataframes
    counting_stats_df = pd.read_csv('csv_files/player_counting_stats/player_stats_'+str(year)+'.csv')
    advanced_stats_df = pd.read_csv("csv_files/players_advanced_stats/"+str(year)+"_Players_Advanced.csv")

    # Align column names with the counting-stats file, drop the columns that
    # duplicate counting stats, and drop columns that are entirely empty.
    # `columns=` is used because the positional axis argument of drop() is not
    # accepted by pandas 2.0+.
    advanced_stats_df = (
        advanced_stats_df
        .rename(columns={"Player":"NAME","Tm":"TEAM","MP":"MPT"})
        .drop(columns=['G','Pos'])
        .dropna(how='all', axis=1)
    )

    # Merge on player name, then keep the first match per (name, counting-stats
    # team) pair and keep the counting-stats team column as TEAM.
    # NOTE(review): merging on NAME alone pairs every counting row of a player
    # with every advanced row of the same name; keep="first" then decides which
    # advanced row survives. Confirm this is the intended pairing for players
    # with several rows in one season.
    result = pd.merge(counting_stats_df,advanced_stats_df,on=['NAME'])
    result = result.drop_duplicates(subset=['NAME','TEAM_x'], keep="first")
    result = result.drop(['TEAM_y'],axis=1).rename(columns={"TEAM_x":"TEAM"})
    result = result.reset_index(drop=True)

    output_columns = list(result.columns) + ['ALLSTAR','ALL_NBA','TEAM_STANDING']

    all_nba_players_arr = all_nba_rosters[year]
    player_records = []

    for i,player in result.iterrows():

        # Names are compared with any '*' removed.
        clean_name = player['NAME'].replace('*','')

        ################ Team standing ############

        team_abrv = player["TEAM"].replace(' ','')

        # Rows whose team abbreviation is unknown are left out of the output.
        if team_abrv not in team_names_dict and team_abrv !=  'TOT' :
            continue

        try:
            if team_abrv == 'TOT':
                # 'TOT' is the combined row of a player who has several team
                # rows this season. The row two positions below is taken as the
                # player's current team (a heuristic: it is "likely" that row).
                curr_team_player = result.loc[i+2]
                curr_team_id = curr_team_player['TEAM'].replace(' ','')
                team_name = team_names_dict[curr_team_id]
            else:
                team_name = team_names_dict[team_abrv]

            team_standing = standings[team_name]

        except (KeyError, AttributeError):
            # Best effort: skip the player when the look-ahead row does not
            # exist, its TEAM value is not a string, the abbreviation is
            # unknown, or the team is missing from this season's standings.
            # Only these lookup failures are swallowed, so Ctrl-C and genuine
            # programming errors still surface.
            continue

        record = player.to_dict()
        record['ALLSTAR'] = 1 if clean_name in allstar_arr else 0
        record['ALL_NBA'] = 1 if clean_name in all_nba_players_arr else 0
        record['TEAM_STANDING'] = team_standing
        player_records.append(record)

    ALL_NBA_DF = pd.DataFrame(player_records, columns=output_columns)

    print(year)
    yearly_frames.append(ALL_NBA_DF)

# Single concatenation of all seasons (guarded for an empty season range).
if yearly_frames:
    Main_df = pd.concat(yearly_frames, ignore_index=True)


1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019


In [7]:
# Write the combined historical frame to disk without the row index.
# (The bare `Main_df` expression that used to open this cell was removed: only
# the last expression of a cell is displayed, so it had no effect.)
out_filename = "csv_files/Past_Player_Data.csv"
Main_df.to_csv(out_filename, index=False)

<h2> Now we are going to get the data for current players </h2> 

In [8]:
Current_df = pd.DataFrame()

year = 2020

################# FIRST WE ARE GETTING TEAM STANDINGS #################

page_url = "https://www.espn.com/nba/standings/_/season/"+str(year)+"/group/league"
# The context manager closes the connection even when reading fails.
with uReq(page_url) as uClient:
    page_soup_team  = soup(uClient.read(), "html.parser")

# Maps team name -> 1-based position of the team's row in the first standings
# table body on the page.
team_table = page_soup_team.find("tbody",{"class":"Table__TBODY"}).findAll("tr")

standings = dict()
for rank,row in enumerate(team_table,start=1):
    team_name = row.find("span",{"class":"hide-mobile"}).text
    standings[team_name] = rank

################ Now we check for if they were an all-star ###############

page_url = "https://basketball.realgm.com/nba/allstar/game/rosters/"+str(year)
with uReq(page_url) as uClient:
    all_star_soup = soup(uClient.read(), "html.parser")

allstars = all_star_soup.findAll("td", {"data-th":"Player"})
allstar_arr = [a.text for a in allstars]


# loading dataframes
counting_stats_df = pd.read_csv('csv_files/player_counting_stats/player_stats_'+str(year)+'.csv')
advanced_stats_df = pd.read_csv("csv_files/players_advanced_stats/"+str(year)+"_Players_Advanced.csv")

# Align column names with the counting-stats file, drop the columns that
# duplicate counting stats, and drop columns that are entirely empty.
# `columns=` is used because the positional axis argument of drop() is not
# accepted by pandas 2.0+.
advanced_stats_df = (
    advanced_stats_df
    .rename(columns={"Player":"NAME","Tm":"TEAM","MP":"MPT"})
    .drop(columns=['G','Pos'])
    .dropna(how='all', axis=1)
)

# Merge on player name, then keep the first match per (name, counting-stats
# team) pair and keep the counting-stats team column as TEAM.
# NOTE(review): merging on NAME alone pairs every counting row of a player with
# every advanced row of the same name; keep="first" then decides which advanced
# row survives. Confirm this is the intended pairing for players with several
# rows in one season.
result = pd.merge(counting_stats_df,advanced_stats_df,on=['NAME'])
result = result.drop_duplicates(subset=['NAME','TEAM_x'], keep="first")
result = result.drop(['TEAM_y'],axis=1).rename(columns={"TEAM_x":"TEAM"})
result = result.reset_index(drop=True)

output_columns = list(result.columns) + ['ALLSTAR','TEAM_STANDING']

# Rows are collected first and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
player_records = []

for i,player in result.iterrows():

    # Names are compared with any '*' removed.
    clean_name = player['NAME'].replace('*','')

    ################ Team standing ############

    team_abrv = player["TEAM"].replace(' ','')

    # Rows whose team abbreviation is unknown are left out of the output.
    if team_abrv not in team_names_dict and team_abrv !=  'TOT' :
        continue

    try:
        if team_abrv == 'TOT':
            # 'TOT' is the combined row of a player who has several team rows
            # this season. The row two positions below is taken as the player's
            # current team (a heuristic: it is "likely" that row).
            curr_team_player = result.loc[i+2]
            curr_team_id = curr_team_player['TEAM'].replace(' ','')
            team_name = team_names_dict[curr_team_id]
        else:
            team_name = team_names_dict[team_abrv]

        team_standing = standings[team_name]

    except (KeyError, AttributeError):
        # Best effort: skip the player when the look-ahead row does not exist,
        # its TEAM value is not a string, the abbreviation is unknown, or the
        # team is missing from the standings. Only these lookup failures are
        # swallowed, so Ctrl-C and genuine programming errors still surface.
        continue

    record = player.to_dict()
    record['ALLSTAR'] = 1 if clean_name in allstar_arr else 0
    record['TEAM_STANDING'] = team_standing
    player_records.append(record)

Current_df = pd.DataFrame(player_records, columns=output_columns)

Current_df

Unnamed: 0,NAME,TEAM,POS,AGE,G,GS,MP,FG,FGA,FG%,...,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,ALLSTAR,TEAM_STANDING
0,Steven Adams,OKC,C,26,58,58,27.0,4.5,7.6,.591,...,3.7,2.5,6.2,0.190,2.1,1.1,3.2,2.0,0,9
1,Bam Adebayo,MIA,PF,22,65,65,34.4,6.3,11.1,.567,...,4.6,3.6,8.1,0.175,1.6,2.1,3.6,3.2,1,8
2,LaMarcus Aldridge,SAS,C,34,53,53,33.1,7.4,15.0,.493,...,3.1,1.4,4.4,0.122,1.8,-0.5,1.3,1.5,0,20
3,Nickeil Alexander-Walker,NOP,SG,21,41,0,12.2,1.9,5.5,.339,...,-0.7,0.4,-0.4,-0.034,-3.5,-1.4,-4.9,-0.4,0,18
4,Grayson Allen,MEM,SG,24,30,0,16.6,2.6,5.9,.449,...,0.5,0.3,0.7,0.070,-1.0,-1.5,-2.6,-0.1,0,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,Thaddeus Young,CHI,PF,31,64,16,24.9,4.2,9.4,.448,...,-0.2,2.1,1.9,0.058,-1.4,0.2,-1.1,0.4,0,24
620,Trae Young,ATL,PG,21,60,60,35.3,9.1,20.8,.437,...,5.4,0.5,5.9,0.134,6.3,-2.3,4.0,3.2,1,27
621,Cody Zeller,CHO,C,27,58,39,23.1,4.3,8.3,.524,...,2.4,1.2,3.6,0.129,0.2,-0.8,-0.6,0.5,0,23
622,Ante Žižić,CLE,C,23,22,0,10.0,1.9,3.3,.569,...,0.3,0.2,0.5,0.106,-1.7,-1.5,-3.2,-0.1,0,29


In [9]:
# Write the current-season frame to disk without the row index.
out_filename = "csv_files/Current_Players.csv"
Current_df.to_csv(out_filename, index=False)