In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re

#Years 1997 through 2017, one entry per page to scrape
year = list(range(1997, 2018))

#Build the advanced-stats page address for every year in the list
all_url = [
    'https://www.basketball-reference.com/leagues/NBA_{}_advanced.html'.format(x)
    for x in year
]

#Create function to scrape URL
def urlScraping(url):
    """Scrape one advanced-stats page and return its player table as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page to download and parse.

    Returns
    -------
    pandas.DataFrame
        One row per table row that has a ``Player`` value. The columns are the
        table headers (without the first header and without whitespace-only
        headers), followed by ``Year`` (the first number found in the first
        ``<span>`` whose text starts with a digit, plus one) and ``player_id``
        (the ``data-append-csv`` attribute of the row, or ``None`` when the
        row carries none). The stat columns listed in ``numeric_cols`` are
        converted with ``pd.to_numeric``.

    Raises
    ------
    ValueError
        If the page contains no ``<tr>`` rows, or no ``<span>`` whose text
        starts with a digit.
    """
    #Close the HTTP response as soon as it is parsed, and name the parser
    #explicitly so the result does not depend on which parsers are installed
    with urlopen(url) as response:
        nba = BeautifulSoup(response, 'html.parser')

    table_rows = nba.findAll('tr')
    if not table_rows:
        raise ValueError('No table rows found at {}'.format(url))

    #Obtain Headers (the first header is skipped: it has no matching <td> in the data rows)
    headers = [th.getText() for th in table_rows[0].findAll('th')][1:]

    #Obtain player data in rows
    rows = table_rows[1:]
    player_stats = [[td.getText() for td in row.findAll('td')] for row in rows]

    #Obtain playerID row by row so every id stays aligned with its own row
    player_id = []
    for row in rows:
        id_cells = row.findAll('td', {'data-append-csv': True})
        player_id.append(id_cells[0]['data-append-csv'] if id_cells else None)

    #Create stats data frame
    st = pd.DataFrame(player_stats, columns=headers)

    #Drop columns whose header is only whitespace
    st = st[[c for c in st.columns if not c.isspace()]]

    #Rows without <td> cells end up with no Player value; remove them and
    #apply the same mask to the ids so both keep the same length
    has_player = st['Player'].notnull()
    st = st[has_player].reset_index(drop=True)
    kept_ids = [pid for pid, keep in zip(player_id, has_player) if keep]

    #Add Year to DataFrame: first <span> text starting with a digit, first number in it, plus one
    span_texts = [span.getText() for span in nba.findAll('span')]
    season_labels = [text for text in span_texts if re.match(r'\d', text)]
    if not season_labels:
        raise ValueError('No season label found at {}'.format(url))
    nba_year = int(re.findall(r'\d+', season_labels[0])[0]) + 1

    st['Year'] = nba_year

    #Add PlayerID to stats
    st['player_id'] = kept_ids

    #Convert objects to floats/int
    numeric_cols = ['G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%',
                    'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS',
                    'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
    st[numeric_cols] = st[numeric_cols].apply(pd.to_numeric)

    return st

In [4]:
#Scrape every page in all_url and save each result to its own CSV file
for url in all_url:
    data = urlScraping(url)
    #The Year value stored under row label 0 becomes part of the file name
    season_label = str(data.loc[0, 'Year'])
    name = 'data/nba_advancedstats_' + season_label + '.csv'
    #to_csv is called with its defaults, so the row index is written as well
    data.to_csv(name)
    

In [6]:
#Read the CSV file of every year from 1997 through 2017
file_name = 'data/nba_advancedstats_{}.csv'
adv_stats_list = [pd.read_csv(file_name.format(x)) for x in range(1997, 2018)]

#Stack the yearly tables into a single DataFrame
adv_stats = pd.concat(adv_stats_list)


In [7]:
#Salary and inflation
#Parse the DATE column as datetimes while loading
#(parse_dates=True would only try to parse the index, leaving DATE as text)
inflation=pd.read_csv('data/CPIAUCNS.csv', parse_dates=['DATE'])
salary=pd.read_csv('data/salaries_1985to2018.csv')

In [8]:
#Turn DATE into a datetime, take its year, keep only YEAR and CPIAUCNS,
#then average CPIAUCNS within each year and move YEAR back into a column
inflation = (
    inflation
    .assign(YEAR=lambda cpi: pd.to_datetime(cpi.DATE).dt.year)
    .loc[:, ['YEAR', 'CPIAUCNS']]
    .groupby('YEAR')
    .mean()
    .reset_index()
)

#Inflation multiplier: CPI of the last row divided by the CPI of each row
inflation['CPIMult'] = inflation['CPIAUCNS'].iloc[-1] / inflation['CPIAUCNS']

In [9]:
#Attach the inflation row whose YEAR equals each salary row's season_end
adj_salary = pd.merge(salary, inflation, left_on='season_end', right_on='YEAR')

#Adjusted salary is the salary times the inflation multiplier, rounded
adj_salary['adj_salary'] = (adj_salary['salary'] * adj_salary['CPIMult']).round()

#Keep only the fields needed later
adj_salary = adj_salary.loc[:, ['player_id', 'adj_salary', 'YEAR', 'season', 'team']]

In [10]:
#Left-join the adjusted salaries onto the advanced stats by player and year
df = adv_stats.merge(
    adj_salary,
    how='left',
    left_on=['player_id', 'Year'],
    right_on=['player_id', 'YEAR'],
)

#Drop rows with no adjusted salary (noted by the author as less than 5% of the data)
df = df.dropna(subset=['adj_salary'])

In [12]:
#Print the merged stats-and-salary table as plain text
print(df)

       Unnamed: 0               Player Pos  Age   Tm   G    MP   PER    TS%  \
0               0   Mahmoud Abdul-Rauf  PG   27  SAC  75  2131  13.2  0.524   
1               1  Shareef Abdur-Rahim  PF   20  VAN  80  2802  17.4  0.518   
2               2       Rafael Addison  SF   32  CHH  41   355  10.6  0.476   
3               3       Cory Alexander  PG   23  SAS  80  1454  14.3  0.528   
4               4         Jerome Allen  SG   24  TOT  76   943   8.7  0.451   
5               5         Jerome Allen  SG   24  IND  51   692   8.7  0.489   
6               6         Jerome Allen  SG   24  DEN  25   251   8.9  0.376   
7               7           Ray Allen*  SG   21  MIL  82  2532  14.6  0.541   
9               9         Ashraf Amaya  PF   25  WSB  31   144   8.8  0.382   
10             10        Greg Anderson   C   32  SAS  82  1659  11.4  0.531   
11             11       Kenny Anderson  PG   26  POR  82  3081  19.5  0.540   
12             12        Nick Anderson  SG   29  ORL