In [34]:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import pandas as pd

## Build a Dictionary of Team Abbreviations, and Full Names

In [35]:
team_names_dict = dict()

page_url = "https://en.wikipedia.org/wiki/Wikipedia:WikiProject_National_Basketball_Association/National_Basketball_Association_team_abbreviations"

# download the webpage into an object
uClient = uReq(page_url)
page_soup = soup(uClient.read(), 'html.parser')
uClient.close()

team_containers = page_soup.findAll("tr")

# set the dictionary of team names, with keys as team
for row in team_containers[1:]:
    
    names = row.findAll("td")
    
    full_name = names[0].text.replace("\n","")
    abr = names[1].text.replace("\n", "")
    
    team_names_dict[full_name] = abr

# Need to manually insert older team
team_names_dict['SEA'] = 'Seattle SuperSonics'
team_names_dict['PHO'] = 'Phoenix Suns'
team_names_dict['NJN'] = 'New Jersey Nets'
team_names_dict['KCK'] = 'Kansas City Kings'
team_names_dict['WSB'] = 'Washington Bullets'
team_names_dict['SDC'] = 'San Diego Clippers'
team_names_dict['CHH'] = 'Charlotte Hornets'
team_names_dict['NOH'] = 'New Orleans Hornets'
team_names_dict['BRK'] = 'Brookyln Nets'

team_names_dict['LAC'] = 'LA Clippers'
team_names_dict['CHO'] = 'Charlotte Hornets'

## Getting All-NBA Data for each season

In [36]:
page_url = "https://www.landofbasketball.com/awards/all_nba_teams_year.htm"
uClient = uReq(page_url)
All_NBA_Soup = soup(uClient.read(), 'html.parser')
uClient.close()

# gets all of the players divs
all_players = All_NBA_Soup.findAll("div", {"class":"indice-item margen-r2"})

year = 2020
all_nba_rosters = dict()


for i, div in enumerate(all_players, 0):
    
    # 15 players per team until 1988
    if year > 1988 and i % 5 == 0:
        year -= 1
        all_nba_rosters[year] = []
    # then its 10 palyes per team by convention
    elif year <= 1988 and (i+5) % 10 == 0:
        year -= 1
        all_nba_rosters[year] = []
    # building a dictionary of all the players
    all_nba_rosters[year].append(div.text.replace('\r\n ',''))

## Getting Advanced Stats for Each season

In [37]:
Advanced_df = pd.DataFrame()

for year in range(1980, 2021):
    
    # enter the URL to scrape from
    page_url = "https://www.basketball-reference.com/leagues/NBA_"+str(year)+"_advanced.html"
    
    # download the webpage as an object
    uClient = uReq(page_url)
    
    # parse html itno a soup data structure
    page_soup_advanced_stats = soup(uClient.read(), 'html.parser')
    uClient.close()
    
    # get headers
    player_table = page_soup_advanced_stats.findAll("table", {'id':'advanced_stats'})
    headers = player_table[0].find_all("thead")[0].find_all('tr')[0].find_all('th')
    
    columns = []
    for col in headers[1:]:
        columns.append(col.text)
    
    # setting up histoical players df
    Players_df = pd.DataFrame(columns=columns)
    
    players = player_table[0].find_all('tbody')[0].find_all('tr')
    
    num_of_fake_rows = 0
    
    for i, player in enumerate(players):
        
        # get list of stats
        stats = player.find_all('td')
        
        # store stats in ana ay
        stat_arr = []
        for stat in stats:
            
            stat_arr.append(stat.text)
        
        if not stat_arr:
            num_of_fake_rows = num_of_fake_rows +1
            continue
        
        # stores all stats when detemining MVP
        player_df = dict(zip(Players_df.columns, stat_arr))
        
        # getting the stats in the DF
        Players_df = Players_df.append(player_df, ignore_index=True)
        
        # expoting DF to a csv file
        out_filename = str(year)+'Players_Advanced.csv'
        Players_df.to_csv(path_or_buf= out_filename, index=False)

## Now going to build the past MVP dataframes

In [38]:
Main_df = pd.DataFrame()

for year in range(1980, 2020):
    
    ######### First need to get team standings ######
    
    # Now get the stats for team record
    page_url = "https://www.espn.com/nba/standings/_/season/"+str(year)+"/group/league"
    uClient = uReq(page_url)
    page_soup_team = soup(uClient.read(), 'html.parser')
    uClient.close()
    
    # make a dictionary of team standings
    team_table = page_soup_team.find("tbody",{"class":"Table__TBODY"}).findAll("tr")
    
    standings = dict()
    for i, row in enumerate(team_table, start=1):
        team_name = row.find('span', {'class':'hide-mobile'}).text
        standings[team_name] = i
    
    
    ####### check for all start status ######
    page_url = "https://basketball.realgm.com/nba/allstar/game/rosters/"+str(year)
    uClient = uReq(page_url)
    all_star_soup = soup(uClient.read(), 'html.parser')
    uClient.close()
    
    allstars = all_star_soup.findAll("td", {'data-th':'Player'})
    
    allstar_arr = []
    for a in allstars:
        all_star_arr.append(a.text)
        
    
    ##### Now build the dataframe ######
    
    # loading DF
    counting_stats_df = pd.read_csv('All_NBA_analysis/player_stats_'+str(year)+'.csv')
    advanced_stats_df = pd.read_csv('All_NBA_analysis/'+str(year)+"Players_Advanced.csv")
    
    # fixing names so rows match
    advanced_stats_df = advanced_stats_df.rename(columns={"Player":"NAME","Tm":"TEAM","MP":"MPT"})
    
    # drop redundant rows
    advanced_stats_df = advanced_stats_df.drop(['G', 'Pos'], 1)
    advanced_stats_df.dropna(how='all', axis=1, inplace=True)
    
    # merge the 2 dataframes
    result = pd.merge(counting_stats_df, advanced_stats_df, on=['NAME'])
    result.drop_duplicates(subset=['NAME', 'TEAM_x'], keep='first', inplace=True)
    result = result.drop(['TEAM_y'], axis=1).rename(columns={"TEAM_x":"TEAM"})
    
    
    result.reset_index(drop=True, inplace=True)
    
    ALL_NBA_df = pd.DataFrame(columns = result.columns)
    
    ALL_NBA_df['ALLSTAR'] = None
    ALL_NBA_df['ALL_NBA'] = None
    ALL_NBA_df['TEAM_STANDING'] = None
    
    all_nba_players_arr = all_nba_rosters[year]
    for i, player in result.iterrows():
        
        # Adding ALLstar column
        if player['NAME'].replace('*', '') in allstar_arr:
            player['ALLSTAR'] =1
        else:
            player['ALLSTAR'] = 0
        
        
        # adding all-NBA column
        if player['NAME'].replace('*','') in all_nba_players_arr:
            player['ALL_NBA'] = 1
        else:
            player['ALL_NBA'] = 0
        
        # adding team standing column ######
        team_abrv = player['TEAM'].replace(' ', '')
        
        if team_abrv not in team_names_dict and team_abrv != 'TOT':
            continue
            
        try:
            if team_abrv == 'TOT':
                
                curr_team_player = result.loc[i+2]
                
                curr_team_id = curr_team_player['TEAM'].replace(' ', '')
                
                team_name = team_names_dictp[curr_team_id]
                player['TEAM_STANDING'] = standings[team_name]
            else:
                team_name = team_names_dict[team_abrv]
                player['TEAM_STANDING'] = standings[team_name]
        except:
            continue
        
        ALL_NBA_df = ALL_NBA_df.append(player, ignore_index=True)
    print(year)
    Main_df = Main_df.append(ALL_NBA_df, ignore_index=True)
    
                                                                              

  advanced_stats_df = advanced_stats_df.drop(['G', 'Pos'], 1)


1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019


In [39]:
Main_df

out_filename = 'Past_Player_Data.csv'
Main_df.to_csv(path_or_buf=out_filename, index=False)

## Get data fo current players

In [40]:
current_df = pd.DataFrame()

year = 2020

###### first get team standings ######

#Now we get the stats for team record 
page_url = "https://www.espn.com/nba/standings/_/season/"+str(year)+"/group/league"
uClient = uReq(page_url)
page_soup_team  = soup(uClient.read(), "html.parser")
uClient.close()

# make a dictioanry of team standings
team_table = page_soup_team.find("tbody",{"class":"Table__TBODY"}).findAll("tr")

standings = dict()
for i,row in enumerate(team_table,start=1):
    team_name = row.find("span",{"class":"hide-mobile"}).text
    standings[team_name] = i   

#### now check for all star status ####

page_url = "https://basketball.realgm.com/nba/allstar/game/rosters/"+str(year)
uClient = uReq(page_url)
all_star_soup = soup(uClient.read(), "html.parser")
uClient.close()

allstars = all_star_soup.findAll("td", {"data-th":"Player"})

allstar_arr = []
for a in allstars:
    allstar_arr.append(a.text)

# laoding dataframes
counting_stats_df = pd.read_csv('All_NBA_analysis/player_stats_'+str(year)+'.csv')
advanced_stats_df = pd.read_csv('All_NBA_analysis/'+str(year)+"Players_Advanced.csv")

# fixing names so rows match
advanced_stats_df = advanced_stats_df.rename(columns={"Player":"NAME","Tm":"TEAM","MP":"MPT"})

# drop redundant rows
advanced_stats_df = advanced_stats_df.drop(['G','Pos'], 1)
advanced_stats_df.dropna(how='all', axis=1, inplace =True)


#merging the two dataframes
result = pd.merge(counting_stats_df,advanced_stats_df,on=['NAME'])
result.drop_duplicates(subset=['NAME','TEAM_x'], keep="first", inplace=True)
result =result.drop(['TEAM_y'],axis=1).rename(columns={"TEAM_x":"TEAM"})
result.reset_index(drop=True, inplace=True)

current_df = pd.DataFrame(columns = result.columns)
current_df['ALLSTAR'] = None
current_df['TEAM_STANDING'] = None

for i, player in result.iterrows():
    
    if player['NAME'].replace('*', '') in allstar_arr:
        player['ALLSTAR'] = 1
    else: 
        player['ALLSTAR'] = 0
        
    ### adding team standing column #######
    
    team_abrv = player['TEAM'].replace(' ', '')
    if team_abrv not in team_names_dict and team_abrv != 'TOT' :
        continjue
        
    try:
        if team_abrv == 'TOT':
            
            curr_team_player = result.loc[i+2]
            
            curr_team_id = curr_team_player['TEAM'].replace(' ', '')
            
            team_name = team_names_dict[curr_team_id]
            
            player['TEAM_STANDING'] = standings[team_name]
        else:
            team_name = team_names_dict[team_abrv]
            player['TEAM_STANDING'] = standings[team_name]
    except:
        continue
    
    current_df = current_df.append(player, ignore_index=True)

current_df
                                        

  advanced_stats_df = advanced_stats_df.drop(['G','Pos'], 1)


Unnamed: 0,NAME,TEAM,POS,AGE,G,GS,MP,FG,FGA,FG%,...,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,ALLSTAR,TEAM_STANDING
0,Steven Adams,OKC,C,26,58,58,27.0,4.5,7.6,.591,...,3.8,2.7,6.5,0.185,1.9,1.1,2.9,2.1,0,9
1,Bam Adebayo,MIA,PF,22,65,65,34.4,6.3,11.1,.567,...,4.6,3.9,8.5,0.168,1.4,2.0,3.4,3.3,1,11
2,LaMarcus Aldridge,SAS,C,34,53,53,33.1,7.4,15.0,.493,...,3.0,1.4,4.5,0.122,1.8,-0.5,1.4,1.5,0,19
3,Nickeil Alexander-Walker,NOP,SG,21,41,0,12.2,1.9,5.5,.339,...,-0.7,0.4,-0.2,-0.020,-3.2,-1.4,-4.6,-0.4,0,21
4,Grayson Allen,MEM,SG,24,30,0,16.6,2.6,5.9,.449,...,0.8,0.4,1.2,0.082,-0.1,-1.2,-1.3,0.1,0,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598,Thaddeus Young,CHI,PF,31,64,16,24.9,4.2,9.4,.448,...,-0.2,2.1,1.9,0.058,-1.4,0.2,-1.2,0.3,0,24
599,Trae Young,ATL,PG,21,60,60,35.3,9.1,20.8,.437,...,5.3,0.6,5.9,0.133,6.2,-2.3,3.9,3.1,1,27
600,Cody Zeller,CHO,C,27,58,39,23.1,4.3,8.3,.524,...,2.3,1.3,3.6,0.129,0.2,-0.8,-0.6,0.5,0,22
601,Ante Žižić,CLE,C,23,22,0,10.0,1.9,3.3,.569,...,0.3,0.2,0.5,0.106,-1.7,-1.5,-3.2,-0.1,0,29


In [41]:
# export DF to csv file
out_filename = 'Current_players.csv'
current_df.to_csv(path_or_buf=out_filename, index=False)