In [1]:
%matplotlib inline
import unicodedata
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import lxml
import requests
from bs4 import BeautifulSoup

## Utility functions

In [105]:
def strip_accents_and_periods(text):
    '''Return `text` with accents and periods removed, for name matching.

    The text is NFD-decomposed, so an accented letter becomes a base letter
    followed by combining marks. Encoding to ASCII with errors ignored then
    drops those marks (and any other non-ASCII character). Finally every
    '.' is removed.
    '''
    try:
        # Python 2 only: promote a byte string to unicode first.
        text = unicode(text, 'utf-8')
    except NameError:
        # Python 3: `unicode` does not exist; str is already unicode.
        pass

    decomposed = unicodedata.normalize('NFD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode("utf-8")
    return str(ascii_text).replace('.','')
        
        
def lowercasestrip(string):
    '''Lowercase a player name and remove spaces/punctuation.

    Removes every '-', '_', ' ', '.' and "'" and lowercases what is left, so
    differently punctuated spellings of a name collapse to the same key
    (e.g. "De'Aaron Fox" -> 'deaaronfox').

    Non-string input (NaN, None, numbers, ...) is returned unchanged, so the
    function can be applied to columns that contain missing values.
    '''
    if not isinstance(string, str):
        # Best effort: leave missing / non-text values exactly as they came in.
        return string
    return string.translate(str.maketrans('', '', "-_ .'")).lower()

def dollarstoint(string):
    '''Convert hoopshype salary info to an integer number of dollars.

    The value is converted to text, '$' and ',' are removed, and the rest is
    parsed as an int (e.g. '$41,358,814' -> 41358814). Values that are already
    ints pass through unchanged.

    Returns np.nan when the text is not an integer (NaN, None, '', ...).
    '''
    try:
        return int(str(string).replace('$', '').replace(',', ''))
    except ValueError:
        # int() could not parse the remaining text.
        return np.nan

def heightinfeet(height):
    '''Convert bball-ref height info ('feet-inches') to decimal feet.

    '6-6' -> 6.5. Only the first two '-'-separated fields are used.
    Returns np.nan for anything that is not a 'feet-inches' string
    (NaN, None, a string without '-', non-numeric fields).
    '''
    try:
        feet, inches = height.split('-')[:2]
        return float(feet) + float(inches)/12
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a str; ValueError: fewer than two
        # fields or a field that is not a number.
        return np.nan

def update_row_with_dict(df,d,idx):
    '''Write the key/value pairs of `d` into row `idx` of `df`, in place.

    Each key of `d` is used as a column label and its value is stored at
    (idx, key) through `df.loc`. Nothing is returned.
    '''
    for column, value in d.items():
        df.loc[idx, column] = value

## Scrape past year stats 
* Scraping individual player stats from basketball-reference.com:

In [3]:
def _season_stats(soup, table_id, season, features):
    '''Return {feature: value} for `season` from the page table with id `table_id`.

    A missing table, season or feature yields np.nan for that feature.
    '''
    table = soup.find(id=table_id)
    if table is None:
        return {feature: np.nan for feature in features}

    # Wrap in StringIO: passing literal HTML to read_html is deprecated.
    stats = pd.read_html(StringIO(str(table)))[0]
    # A season label can occur on several rows; keep only the first one
    # (presumably the combined row for multi-team seasons -- verify).
    stats = stats.drop_duplicates(subset=['Season']).set_index('Season')

    values = {}
    for feature in features:
        try:
            values[feature] = stats.loc[season, feature]
        except KeyError:  # season or column not present in this table
            values[feature] = np.nan
    return values


def scrape_player(playerurl, fa_year, timeout=30):
    '''Get a player's season stats from basketball-reference.com.

    Parameters
    ----------
    playerurl : str
        URL of the player's basketball-reference.com page.
    fa_year : int
        Free-agency year. Stats are taken from the season ending in that
        year, e.g. fa_year=2016 reads the '2015-16' row.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    dict
        'weight' and 'height' as scraped text, followed by the per-game and
        advanced stats listed below. Anything that cannot be found is np.nan.

    Raises
    ------
    requests.RequestException
        On timeout, connection failure or a non-2xx HTTP status.
    '''
    # Season label as it appears in the 'Season' column; zero-pad the second
    # part so that e.g. 2005 gives '2004-05' and 2000 gives '1999-00'.
    prev_season = '{}-{:02d}'.format(fa_year - 1, fa_year % 100)

    playerresponse = requests.get(playerurl, timeout=timeout)
    playerresponse.raise_for_status()
    playersoup = BeautifulSoup(playerresponse.text, "lxml")

    d = {}

    # Get weight (text of the tag with the 'lb' unit removed)
    weight_tag = playersoup.find('span', {'itemprop': 'weight'})
    d['weight'] = np.nan if weight_tag is None else weight_tag.getText().replace('lb', '')

    # Get height (kept as scraped text, e.g. '6-10')
    height_tag = playersoup.find('span', {'itemprop': 'height'})
    d['height'] = np.nan if height_tag is None else height_tag.getText()

    # Get basic stats
    basic_features = ['Age', 'G', 'GS', 'MP', 'FG%', '3P', '3P%', '3PA', 'FT', 'FT%', 'FTA',
                      'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PTS']
    d.update(_season_stats(playersoup, 'per_game', prev_season, basic_features))

    # Get advanced stats
    d.update(_season_stats(playersoup, 'advanced', prev_season, ['USG%', 'TS%']))

    return d

#### Example player:  Kevin Durant

In [4]:
# fayear is the free-agency year: scrape_player returns the stats of the
# season ending in that year (here the '2015-16' row).
playerurl = 'https://www.basketball-reference.com/players/d/duranke01.html'
fayear = 2016
d = scrape_player(playerurl, fayear)
print(d)

{'weight': '240', 'height': '6-10', 'Age': 27.0, 'G': '72', 'GS': '72', 'MP': '35.8', 'FG%': '.505', '3P': '2.6', '3P%': '.387', '3PA': '6.7', 'FT': '6.2', 'FT%': '.898', 'FTA': '6.9', 'ORB': '0.6', 'TRB': '8.2', 'AST': '5.0', 'STL': '1.0', 'BLK': '1.2', 'TOV': '3.5', 'PTS': '28.2', 'USG%': 30.6, 'TS%': 0.634}


* Scrape all free agents in a given year:

In [5]:
def scrape_fa_year(fa_year):
    '''Find all free agents in a given year and scrape their previous season stats.

    Parameters
    ----------
    fa_year : int
        Free-agency year passed to the basketball-reference free-agent page.

    Returns
    -------
    pandas.DataFrame
        One row per free agent, indexed by 'player' (the name run through
        `lowercasestrip`). Columns: 'pos', 'name', the keys returned by
        `scrape_player`, 'year' (= fa_year - 1) and 'name-year'.

    Raises
    ------
    requests.HTTPError
        If the free-agent page returns a non-2xx status.
    ValueError
        If the page contains no table.

    Scraping is best effort per player: a row that cannot be parsed is
    skipped, and a player whose page cannot be scraped keeps only
    'pos' and 'name'.
    '''
    FA_url = 'https://www.basketball-reference.com/friv/free_agents.cgi?year=' + str(fa_year)

    response = requests.get(FA_url)
    response.raise_for_status()
    FAsoup = BeautifulSoup(response.text, "lxml")

    table = FAsoup.find('table')
    if table is None:
        raise ValueError('No free-agent table found at ' + FA_url)

    df = pd.DataFrame()
    # The first <tr> is the column header row; every other row is a candidate.
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) < 2:
            continue  # no name/position cells in this row

        try:
            raw_name = cells[0].getText()

            # Row label: name plus the year stored in the 'year' column.
            name_year = (raw_name + ' ' + str(fa_year-1)).replace(' ','_')  #prev year
            name_year = strip_accents_and_periods(name_year)

            name = strip_accents_and_periods(raw_name.replace(' ','-'))

            # Record name/position first so the player is kept even if the
            # stats scrape below fails.
            update_row_with_dict(df, {'pos': cells[1].getText(), 'name': name}, name_year)

            playerurl = 'https://www.basketball-reference.com' + cells[0].find('a')['href']
            playerdict = scrape_player(playerurl, fa_year)
            update_row_with_dict(df, playerdict, name_year)
        except Exception:
            # Deliberate best effort: one bad player must not abort the run.
            # `Exception` (not a bare except) keeps Ctrl-C working.
            continue

    df['year'] = fa_year-1

    df['name-year']= df.index
    df['player'] = df['name'].apply(lowercasestrip)
    df = df.set_index('player', drop=True)

    return df

* Scrape previous-season stats for the last 5 free-agent classes (2017–2021)

In [6]:
# This takes 1-2 minutes per year
# Naming: dfYYYY holds the free agents of YYYY+1, because scrape_fa_year
# stores fa_year-1 in the 'year' column (the year their stats season started).

df2016 = scrape_fa_year(2017)
df2017 = scrape_fa_year(2018)
df2018 = scrape_fa_year(2019)
df2019 = scrape_fa_year(2020)
df2020 = scrape_fa_year(2021)

df2020.head(5)

Unnamed: 0_level_0,pos,name,weight,height,Age,G,GS,MP,FG%,3P,...,TRB,AST,STL,BLK,TOV,PTS,USG%,TS%,year,name-year
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chrispaul,G,Chris-Paul,175,6-0,35.0,70.0,70.0,31.4,0.499,1.5,...,4.5,8.9,1.4,0.3,2.2,16.4,22.6,0.599,2020,Chris_Paul_2020
kawhileonard,F,Kawhi-Leonard,225,6-7,29.0,52.0,52.0,34.1,0.512,1.9,...,6.5,5.2,1.6,0.4,2.0,24.8,28.6,0.622,2020,Kawhi_Leonard_2020
enesfreedom,C,Enes-Freedom,250,6-10,28.0,72.0,35.0,24.4,0.604,0.0,...,11.0,1.2,0.5,0.7,1.1,11.2,17.4,0.636,2020,Enes_Freedom_2020
demarderozan,G-F,DeMar-DeRozan,220,6-6,31.0,61.0,61.0,33.7,0.495,0.3,...,4.2,6.9,0.9,0.2,2.0,21.6,26.1,0.591,2020,DeMar_DeRozan_2020
montrezlharrell,F-C,Montrezl-Harrell,240,6-7,27.0,69.0,1.0,22.9,0.622,0.0,...,6.2,1.1,0.7,0.7,1.1,13.5,21.7,0.65,2020,Montrezl_Harrell_2020


## Scrape past year salaries

In [7]:
def get_salaries(salary_url):
    '''Returns a dataframe with players and salary info for a given year.
    Works with hoopshype.com urls.

    Parameters
    ----------
    salary_url : str
        URL of a hoopshype.com player-salaries page.

    Returns
    -------
    pandas.DataFrame
        Columns 'Name' and 'Salary' (both scraped text), indexed by 'Player':
        the name with accents/periods stripped and then passed through
        `lowercasestrip`, i.e. the same key the stats frames are indexed by.

    Raises
    ------
    requests.HTTPError
        If the page returns a non-2xx status.
    ValueError
        If the page contains no table.
    '''
    response = requests.get(salary_url)
    response.raise_for_status()
    salarysoup = BeautifulSoup(response.text, "lxml")

    table = salarysoup.find('table')
    if table is None:
        raise ValueError('No salary table found at ' + salary_url)

    # One list of stripped cell texts per table row.
    rows = [[cell.text.strip() for cell in tr.find_all('td')]
            for tr in table.find_all('tr')]

    # Drop the first row (header) and keep cells 1 and 2 (name and salary).
    # .copy() so the column added below is not written to a slice.
    df = pd.DataFrame(rows).iloc[1:, [1, 2]].copy()
    df.columns = ['Name', 'Salary']

    def player_key(name):
        # Strip accents like the stats side does, so accented names can join.
        # Short rows leave None here; pass those through.
        if isinstance(name, str):
            name = strip_accents_and_periods(name)
        return lowercasestrip(name)

    df['Player'] = df['Name'].apply(player_key)
    df = df.set_index('Player', drop=True)

    return df

In [8]:
# dfsalYYYY = salaries from the hoopshype page for the season starting in YYYY.
# NOTE(review): dfsal2021 uses the undated URL, which presumably serves the
# current season at scrape time -- re-running later may fetch a different season.
dfsal2016 = get_salaries('https://hoopshype.com/salaries/players/2016-2017/')
dfsal2017 = get_salaries('https://hoopshype.com/salaries/players/2017-2018/')
dfsal2018 = get_salaries('https://hoopshype.com/salaries/players/2018-2019/')
dfsal2019 = get_salaries('https://hoopshype.com/salaries/players/2019-2020/')
dfsal2020 = get_salaries('https://hoopshype.com/salaries/players/2020-2021/')
dfsal2021 = get_salaries('https://hoopshype.com/salaries/players/')

dfsal2020.head(5)

Unnamed: 0_level_0,Name,Salary
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
stephencurry,Stephen Curry,"$43,006,362"
chrispaul,Chris Paul,"$41,358,814"
russellwestbrook,Russell Westbrook,"$41,358,814"
jamesharden,James Harden,"$41,254,920"
johnwall,John Wall,"$41,254,920"


## Combine stats data and salary data

In [9]:
#Minimum salary for players with 1+ years of experience
# Keyed by the value of the 'year' column in the stats frames.
minsalary = {2016:     875000,  2017:   1313000, 
             2018:    1339000,  2019:   1446000,   
             2020:    1446000,  2021:   1489000, }

# Salary cap, keyed the same way as `minsalary`.
salarycap = {2016:   94143000,  2017:  99093000, 
             2018:  101869000,  2019: 109140000,   
             2020:  109140000,  2021: 112414000, }    


def _salary_lookup(df_sal, column, players):
    '''Return df_sal[column] aligned to `players`, NaN where there is no unique match.'''
    if column not in df_sal.columns:
        return np.full(len(players), np.nan)
    # A key that occurs more than once is ambiguous: treat it as missing
    # rather than guessing which row belongs to the player.
    unambiguous = df_sal.loc[~df_sal.index.duplicated(keep=False), column]
    return unambiguous.reindex(players).to_numpy()


def df_statsandsals(df_stats, df_prevsal, df_currentsal):
    '''Create a new dataframe with columns for current and previous salary info.

    Parameters
    ----------
    df_stats : pandas.DataFrame
        Stats frame indexed by the normalized player key, with a 'year' column.
    df_prevsal : pandas.DataFrame
        Salary frame (same index key, 'Salary' column) that fills 'Current_Sal'.
    df_currentsal : pandas.DataFrame
        Salary frame (same index key, 'Name' and 'Salary' columns) that fills
        'Player' and 'Next_Sal'.

    Returns
    -------
    pandas.DataFrame
        A copy of `df_stats` (the input is not modified) with these columns added:
        'Player', 'Current_Sal', 'Next_Sal'  -- looked up by index key;
        'min_sal_curr', 'sal_cap_curr'       -- minsalary/salarycap at `year`;
        'min_sal_next', 'sal_cap_next'       -- minsalary/salarycap at `year` + 1.
        Any value that cannot be looked up is NaN.
    '''
    dfnew = df_stats.copy()
    players = dfnew.index

    dfnew['Player'] = _salary_lookup(df_currentsal, 'Name', players)
    dfnew['Current_Sal'] = _salary_lookup(df_prevsal, 'Salary', players)
    dfnew['Next_Sal'] = _salary_lookup(df_currentsal, 'Salary', players)

    if 'year' in dfnew.columns:
        year = pd.to_numeric(dfnew['year'], errors='coerce')
    else:
        year = pd.Series(np.nan, index=players)

    # Years missing from the lookup tables map to NaN; cast to float so the
    # columns have the same dtype whether or not anything is missing.
    dfnew['min_sal_curr'] = year.map(minsalary).to_numpy(dtype=float)
    dfnew['min_sal_next'] = (year + 1).map(minsalary).to_numpy(dtype=float)
    dfnew['sal_cap_curr'] = year.map(salarycap).to_numpy(dtype=float)
    dfnew['sal_cap_next'] = (year + 1).map(salarycap).to_numpy(dtype=float)

    return dfnew

* Combine stats and salaries by year:

In [29]:
# Preview the two frames that get joined; both are indexed by the normalized player key.
display(dfsal2016.head())
display(df2016.head())

Unnamed: 0_level_0,Name,Salary
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
lebronjames,LeBron James,"$30,963,450"
alhorford,Al Horford,"$26,540,100"
kevindurant,Kevin Durant,"$26,540,100"
jamesharden,James Harden,"$26,540,100"
mikeconley,Mike Conley,"$26,540,100"


Unnamed: 0_level_0,pos,name,weight,height,Age,G,GS,MP,FG%,3P,...,TRB,AST,STL,BLK,TOV,PTS,USG%,TS%,year,name-year
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
stephencurry,G,Stephen-Curry,185,6-2,28.0,79.0,79.0,33.4,0.468,4.1,...,4.5,6.6,1.8,0.2,3.0,25.3,30.1,0.624,2016,Stephen_Curry_2016
kevindurant,F-G,Kevin-Durant,240,6-10,28.0,62.0,62.0,33.4,0.537,1.9,...,8.3,4.8,1.1,1.6,2.2,25.1,27.8,0.651,2016,Kevin_Durant_2016
gordonhayward,F-G,Gordon-Hayward,225,6-7,26.0,73.0,73.0,34.5,0.471,2.0,...,5.4,3.5,1.0,0.3,1.9,21.9,27.6,0.595,2016,Gordon_Hayward_2016
kylelowry,G,Kyle-Lowry,196,6-0,30.0,60.0,60.0,37.4,0.464,3.2,...,4.8,7.0,1.5,0.3,2.9,22.4,24.9,0.623,2016,Kyle_Lowry_2016
ottoporter,F,Otto-Porter,198,6-8,23.0,80.0,80.0,32.6,0.516,1.9,...,6.4,1.5,1.5,0.5,0.6,13.4,15.1,0.628,2016,Otto_Porter_2016


In [10]:
# Arguments: (stats frame, salary frame that fills Current_Sal,
#             salary frame that fills Player and Next_Sal).
df2016full = df_statsandsals(df2016, dfsal2016, dfsal2017)
df2017full = df_statsandsals(df2017, dfsal2017, dfsal2018)
df2018full = df_statsandsals(df2018, dfsal2018, dfsal2019)
df2019full = df_statsandsals(df2019, dfsal2019, dfsal2020)
df2020full = df_statsandsals(df2020, dfsal2020, dfsal2021)

df2020full.head(4)

Unnamed: 0_level_0,pos,name,weight,height,Age,G,GS,MP,FG%,3P,...,TS%,year,name-year,Player,Current_Sal,Next_Sal,min_sal_curr,min_sal_next,sal_cap_curr,sal_cap_next
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chrispaul,G,Chris-Paul,175,6-0,35.0,70.0,70.0,31.4,0.499,1.5,...,0.599,2020,Chris_Paul_2020,Chris Paul,"$41,358,814","$30,800,000",1446000.0,1489000.0,109140000.0,112414000.0
kawhileonard,F,Kawhi-Leonard,225,6-7,29.0,52.0,52.0,34.1,0.512,1.9,...,0.622,2020,Kawhi_Leonard_2020,Kawhi Leonard,"$34,379,100","$39,344,900",1446000.0,1489000.0,109140000.0,112414000.0
enesfreedom,C,Enes-Freedom,250,6-10,28.0,72.0,35.0,24.4,0.604,0.0,...,0.636,2020,Enes_Freedom_2020,,,,1446000.0,1489000.0,109140000.0,112414000.0
demarderozan,G-F,DeMar-DeRozan,220,6-6,31.0,61.0,61.0,33.7,0.495,0.3,...,0.591,2020,DeMar_DeRozan_2020,DeMar DeRozan,"$27,739,975","$26,000,000",1446000.0,1489000.0,109140000.0,112414000.0


* Combine all years into one dataframe

In [11]:
# Stack the per-year frames and re-index by 'name-year' (name plus year label).
df_raw  = pd.concat([df2016full,df2017full, df2018full, df2019full, df2020full])
df_raw  = df_raw.set_index('name-year')
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
df_raw.to_csv(r'/Users/andrei/Dropbox/Metis/7-Engineering/andrei-eng-project/data/df_raw.csv')

print(df_raw.shape)
# sample() is unseeded, so the rows shown differ between runs.
df_raw.sample(4)

(1305, 32)


Unnamed: 0_level_0,pos,name,weight,height,Age,G,GS,MP,FG%,3P,...,USG%,TS%,year,Player,Current_Sal,Next_Sal,min_sal_curr,min_sal_next,sal_cap_curr,sal_cap_next
name-year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RJ_Hunter_2018,G,RJ-Hunter,185,6-5,25.0,1.0,0.0,26.0,0.462,4.0,...,23.0,0.612,2018,,"$77,250",,1339000.0,1446000.0,101869000.0,109140000.0
Zaza_Pachulia_2018,C,Zaza-Pachulia,270,6-11,34.0,68.0,3.0,12.9,0.44,0.0,...,15.0,0.539,2018,,"$2,393,887",,1339000.0,1446000.0,101869000.0,109140000.0
Frank_Kaminsky_2018,F-C,Frank-Kaminsky,240,7-0,25.0,47.0,0.0,16.1,0.463,1.1,...,22.1,0.587,2018,Frank Kaminsky,"$3,627,842","$4,767,000",1339000.0,1446000.0,101869000.0,109140000.0
JaVale_McGee_2016,C,JaVale-McGee,270,7-0,29.0,77.0,10.0,9.6,0.652,0.0,...,23.8,0.642,2016,JaVale McGee,"$1,403,611","$2,116,955",875000.0,1313000.0,94143000.0,99093000.0


In [12]:
# Column inventory of the combined frame.
df_raw.columns

Index(['pos', 'name', 'weight', 'height', 'Age', 'G', 'GS', 'MP', 'FG%', '3P',
       '3P%', '3PA', 'FT', 'FT%', 'FTA', 'ORB', 'TRB', 'AST', 'STL', 'BLK',
       'TOV', 'PTS', 'USG%', 'TS%', 'year', 'Player', 'Current_Sal',
       'Next_Sal', 'min_sal_curr', 'min_sal_next', 'sal_cap_curr',
       'sal_cap_next'],
      dtype='object')

## Clean and save data

In [None]:
def clean(df_raw, ref_year=2021):
    '''Clean the combined stats/salary frame for modelling.

    Steps, in order:
      1. Drop rows missing any of weight, height, Age, Current_Sal, Next_Sal,
         FG%, USG%, TS% or G.
      2. Coerce the stat columns to numeric (unparseable values become NaN).
      3. Impute missing FG% / FT% with the column median and missing 3P% with 0.
      4. Add 'height_ft' and parse the salary strings to integer dollars.
      5. Keep only rows whose Next_Sal is strictly above min_sal_next.
      6. Rescale both salaries to `ref_year` cap dollars, in millions:
         salary * salarycap[ref_year] / (cap of the salary's own year) / 1e6.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Combined frame as written to df_raw.csv. It is not modified.
    ref_year : int, optional
        Key into `salarycap` whose cap all salaries are rescaled to.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy, with the extra column 'height_ft'.
    '''
    df = df_raw.copy()

    dropcols = ['weight', 'height', 'Age', 'Current_Sal', 'Next_Sal', 'FG%', 'USG%', 'TS%', 'G']
    df = df.dropna(subset=dropcols)

    # Scraped stats can arrive as strings; make them numeric *before* taking
    # medians so the imputation below works on numbers.
    cols = ['PTS', 'AST', 'Age', 'weight', 'G', 'GS', 'MP', '3P', '3PA', 'FT', 'FTA',
            'ORB', 'TRB', 'STL', 'BLK', 'TOV', '3P%', 'FT%', 'FG%']
    for col in cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Assign the result back: fillna(inplace=True) on a selected column is a
    # chained assignment and is not guaranteed to update `df`.
    df['FG%'] = df['FG%'].fillna(df['FG%'].median())  # no FG: assume median
    df['FT%'] = df['FT%'].fillna(df['FT%'].median())  # no FT: assume median
    df['3P%'] = df['3P%'].fillna(0)                   # no 3P: assume the worst!

    df['height_ft'] = df['height'].apply(heightinfeet)

    df['Current_Sal'] = df['Current_Sal'].apply(dollarstoint)
    df['Next_Sal'] = df['Next_Sal'].apply(dollarstoint)

    # Rows with an unparseable Next_Sal (NaN) fail this comparison and are
    # dropped as well. .copy() so the assignments below do not hit a slice.
    df = df[df['Next_Sal'] > df['min_sal_next']].copy()

    cap_ref = salarycap[ref_year]
    df['Current_Sal'] = df['Current_Sal']*cap_ref/df['sal_cap_curr']/1e6
    df['Next_Sal'] = df['Next_Sal']*cap_ref/df['sal_cap_next']/1e6

    return df

df_clean = clean(df_raw)
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
df_clean.to_csv(r'/Users/andrei/Dropbox/Metis/7-Engineering/andrei-eng-project/data/df_clean.csv')



In [23]:
# Row/column count after cleaning, plus a random peek (unseeded: rows differ per run).
print(df_clean.shape)
df_clean.sample(5)

(566, 33)


Unnamed: 0_level_0,pos,name,weight,height,Age,G,GS,MP,FG%,3P,...,TS%,year,Player,Current_Sal,Next_Sal,min_sal_curr,min_sal_next,sal_cap_curr,sal_cap_next,height_ft
name-year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Denzel_Valentine_2020,G,Denzel-Valentine,220,6-4,27.0,62.0,3.0,16.7,0.373,1.3,...,0.479,2020,Denzel Valentine,4.782075,1.93935,1446000.0,1489000.0,109140000.0,112414000.0,6.333333
Deron_Williams_2016,G,Deron-Williams,200,6-3,32.0,64.0,44.0,25.9,0.438,1.3,...,0.541,2016,Deron Williams,17.774458,6.210759,875000.0,1313000.0,94143000.0,99093000.0,6.25
Anthony_Tolliver_2016,F-C,Anthony-Tolliver,240,6-8,31.0,65.0,9.0,22.7,0.442,1.4,...,0.595,2016,Anthony Tolliver,9.552617,6.001131,875000.0,1313000.0,94143000.0,99093000.0,6.666667
Taj_Gibson_2019,F,Taj-Gibson,232,6-9,34.0,62.0,56.0,16.5,0.584,0.1,...,0.61,2019,Taj Gibson,9.269984,3.382189,1446000.0,1446000.0,109140000.0,109140000.0,6.75
Reggie_Bullock_2018,G-F,Reggie-Bullock,205,6-6,27.0,63.0,60.0,29.8,0.412,2.3,...,0.563,2018,Reggie Bullock,2.758788,4.119993,1339000.0,1446000.0,101869000.0,109140000.0,6.5


## Save current year salary data (hoopshype)

In [48]:
# dfsal2021 was already scraped above with get_salaries; this cell only saves and inspects it.
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
dfsal2021.to_csv(r'/Users/andrei/Dropbox/Metis/7-Engineering/andrei-eng-project/data/dfsal2021_hh.csv')

dfsal2021.info()

<class 'pandas.core.frame.DataFrame'>
Index: 545 entries, stephencurry to freddiegillespie
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    545 non-null    object
 1   Salary  545 non-null    object
dtypes: object(2)
memory usage: 28.9+ KB


In [31]:
# Bare expression: renders the frame (pandas elides the middle rows in the display).
dfsal2021

Unnamed: 0_level_0,Name,Salary
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
stephencurry,Stephen Curry,"$45,780,966"
jamesharden,James Harden,"$44,310,840"
johnwall,John Wall,"$44,310,840"
russellwestbrook,Russell Westbrook,"$44,211,146"
lebronjames,LeBron James,"$41,180,544"
...,...,...
mamadidiakite,Mamadi Diakite,"$100,000"
davonreed,Davon Reed,"$93,136"
demetriusjackson,Demetrius Jackson,"$92,857"
justinrobinson,Justin Robinson,"$74,093"


## Scrape current year salary data (bball-ref)

In [119]:
url = 'https://www.basketball-reference.com/contracts/players.html'

def scrape_salaries_bbr(url):
    '''Get the player-contracts table from basketball-reference.com.

    Parameters
    ----------
    url : str
        URL of a page containing an element with id 'player-contracts'.

    Returns
    -------
    pandas.DataFrame
        The first table parsed from that element, with headers exactly as
        pandas.read_html produces them (no renaming or cleaning is done here).

    Raises
    ------
    requests.HTTPError
        If the page returns a non-2xx status.
    ValueError
        If the page has no 'player-contracts' element.
    '''
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    table = soup.find(id='player-contracts')
    if table is None:
        raise ValueError("No 'player-contracts' table found at " + url)

    # Wrap in StringIO: passing literal HTML to read_html is deprecated.
    return pd.read_html(StringIO(str(table)))[0]


dfsal2021_bbr = scrape_salaries_bbr(url)
# Flatten the column index (presumably a two-level header) so columns can be renamed by position.
dfsal2021_bbr.columns = dfsal2021_bbr.columns.to_flat_index()

# NOTE(review): positional selection -- assumes column 1 is the player name and
# column 3 the current-season salary; verify if the site layout changes.
my_dict = {dfsal2021_bbr.columns[3]:'Salary', dfsal2021_bbr.columns[1]:'Name' }
dfsal2021_bbr = dfsal2021_bbr.rename(columns=my_dict)
dfsal2021_bbr = dfsal2021_bbr[['Name','Salary']]

dfsal2021_bbr = dfsal2021_bbr.drop_duplicates()
dfsal2021_bbr = dfsal2021_bbr.dropna()

#Normalize names

dfsal2021_bbr['Player'] = dfsal2021_bbr['Name'].apply(strip_accents_and_periods).apply(lowercasestrip)
dfsal2021_bbr = dfsal2021_bbr.set_index('Player')

# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
dfsal2021_bbr.to_csv(r'/Users/andrei/Dropbox/Metis/7-Engineering/andrei-eng-project/data/dfsal2021_bbr.csv')

dfsal2021_bbr.head()

Unnamed: 0_level_0,Name,Salary
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
stephencurry,Stephen Curry,"$45,780,966"
johnwall,John Wall,"$44,310,840"
russellwestbrook,Russell Westbrook,"$44,211,146"
jamesharden,James Harden,"$43,848,000"
lebronjames,LeBron James,"$41,180,544"


In [129]:
# Spot-check a single player by normalized key.
dfsal2021_bbr.loc['santialdama']

Name      Santi Aldama
Salary      $1,994,520
Name: santialdama, dtype: object