In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re
import requests

In [3]:
# Seasons to scrape, identified by the year that appears in the page URL
year = list(range(1984, 2020))

# One league-wide "advanced" stats page per season
all_url = [
    'https://www.basketball-reference.com/leagues/NBA_{}_advanced.html'.format(season)
    for season in year
]

#Create function to scrape URL 
def urlScraping(url):
    """Scrape one league "advanced" stats page into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page to download (e.g. one entry of ``all_url``).

    Returns
    -------
    pandas.DataFrame
        The page's table rows that have a ``Player`` value, with the
        whitespace-named columns removed, a ``Year`` column, a ``player_id``
        column taken from the ``data-append-csv`` attribute of the cells, and
        the stat columns converted to numeric dtypes.

    Raises
    ------
    IndexError
        If the page has no ``<tr>`` element, or no ``<span>`` whose text
        starts with a digit (used to derive ``Year``).
    ValueError
        Raised by pandas if the number of collected player ids differs from
        the number of retained rows.
    """
    # Close the HTTP response deterministically, and pin the parser so the
    # result does not depend on which optional parsers are installed.
    with urlopen(url) as response:
        nba = BeautifulSoup(response, 'html.parser')

    table_rows = nba.find_all('tr')

    # Column names come from the first row; its first <th> is skipped
    headers = [th.get_text() for th in table_rows[0].find_all('th')][1:]

    # Cell text of every remaining row (rows without <td> cells become all-None rows)
    rows = table_rows[1:]
    player_stats = [[td.get_text() for td in row.find_all('td')] for row in rows]

    # Player ids are stored in the 'data-append-csv' attribute of the cells
    player_id = [
        td['data-append-csv']
        for row in rows
        for td in row.find_all('td', {'data-append-csv': True})
    ]

    # Create stats data frame, discarding columns whose name is only whitespace
    st = pd.DataFrame(player_stats, columns=headers)
    col = [c for c in st.columns if not c.isspace()]
    st = st[col]

    # Keep only rows that carry a player; copy so the column assignments
    # below never target a view of the filtered frame.
    st = st[st['Player'].notnull()].reset_index(drop=True).copy()

    # Year = first integer inside the first <span> whose text starts with a digit
    span_texts = [span.get_text() for span in nba.find_all('span')]
    dated_spans = [text for text in span_texts if re.match(r'\d', text)]
    nba_year = int(re.findall(r'\d+', dated_spans[0])[0])

    st['Year'] = nba_year

    # Add PlayerID to stats
    st['player_id'] = player_id

    # Convert objects to floats/int
    numeric_cols = ['G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%',
                    'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS',
                    'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
    st[numeric_cols] = st[numeric_cols].apply(pd.to_numeric)

    return st

In [4]:
#Loop to create CSV Files
#for url in all_url:
#    data = urlScraping(url)
#    name = 'data/nba_advancedstats_'+str(data['Year'][0])+'.csv'
#    data.to_csv(name)
    

In [5]:
#Load every per-season advanced-stats CSV and stack them into one frame
# NOTE(review): the scraping loop names each file after the frame's 'Year'
# value; confirm those values really span 1984-2019 as assumed here.
file_name = 'data/nba_advancedstats_{}.csv'
adv_stats_list = [pd.read_csv(file_name.format(season)) for season in range(1984, 2020)]

adv_stats = pd.concat(adv_stats_list)

In [6]:
#Build one player-page URL per unique player_id.
#The URL path contains the first character of the id followed by the id itself.

player_id = list(adv_stats['player_id'].unique())
first_letter = [pid[0] for pid in player_id]

# (first letter, player id) pairs, in the same order as player_id
tup = list(zip(first_letter, player_id))

salary_url = [
    'https://www.basketball-reference.com/players/{}/{}.html'.format(letter, pid)
    for letter, pid in tup
]
    
#Create a function
def salScraping(url):
    """Scrape the salary table of one player page into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the player page (e.g. one entry of ``salary_url``).

    Returns
    -------
    pandas.DataFrame
        One row per table row except the last one, with the table's own
        columns (minus the first header), plus ``Season``, ``team_id``,
        ``season_start``, ``season_end`` and ``player_id``.

    Raises
    ------
    AttributeError
        If the page has no element with ``id="all_salaries"``. The driver
        loop in this notebook relies on this type to report "No salary".
    ValueError
        If no player id can be read from the ``og:url`` meta tag, or (from
        pandas) if the team / season lists do not line up with the rows.
    """
    # A timeout keeps one stalled connection from hanging a long scraping run.
    html = requests.get(url, timeout=30)

    # Strip the HTML comment markers so markup inside comments is parsed too
    # (presumably the salary table is delivered inside a comment -- confirm).
    html_doc = html.text.replace('<!--', '').replace('-->', '')
    content = BeautifulSoup(html_doc, 'html.parser')

    sal = content.find(id='all_salaries')
    if sal is None:
        raise AttributeError('No salary table (id="all_salaries") found at ' + str(url))

    # The player id is the file name inside the page's canonical og:url
    og_url = content.find('meta', attrs={'property': 'og:url'})
    pl_id = re.findall(
        r'/www\.basketball-reference\.com/players/[a-z]/(.*)\.html', str(og_url))
    if not pl_id:
        raise ValueError('Could not read a player id from the og:url of ' + str(url))

    table_rows = sal.find_all('tr')

    # Column names come from the first row; its first <th> is skipped
    header = [th.get_text() for th in table_rows[0].find_all('th')][1:]
    rows = table_rows[1:]
    team_cells = sal.find_all('td', {'data-stat': 'team_name'})

    pl_sal = [[td.get_text() for td in row.find_all('td')] for row in rows]

    # Season label is the text of each row's <th> cell(s)
    season = [th.get_text() for row in rows for th in row.find_all('th')]

    # Team id is the path segment after /teams/ in each team link
    teams = [
        team_id
        for cell in team_cells
        for link in cell.find_all('a', {'href': True})
        for team_id in re.findall(r'/teams/(.*)/', link['href'])
    ]

    # Every 4-digit number in a season label is a start year; labels without
    # one contribute nothing. The end year is the start year plus one.
    start_yr = [
        int(found)
        for label in season
        for found in re.findall(r'\d\d\d\d', label)
    ]
    end_yr = [start + 1 for start in start_yr]

    df_sal = pd.DataFrame(pl_sal, columns=header)
    df_sal['Season'] = season

    # The last table row is discarded before the per-season columns are attached
    df_sal = df_sal.drop(df_sal.index[-1])
    df_sal['team_id'] = teams
    df_sal['season_start'] = start_yr
    df_sal['season_end'] = end_yr

    # Assign the id directly. The previous concat + bfill + drop-last-row
    # approach also back-filled unrelated missing cells and upcast the year
    # columns to float.
    df_sal['player_id'] = pl_id[0]

    return df_sal.reset_index(drop=True)

In [7]:
#Loop to create CSV Files
#for url in salary_url:
#    try:
#        data = salScraping(url)
#        name = 'data/nba_salary/nba_salary_'+str(data['player_id'][0])+'.csv'
#        data.to_csv(name)
#    except AttributeError:
#        print ('No salary for: '+str(url))
#    except:
#        print ('Error with: '+str(url))
        

In [10]:
#Load the per-player salary CSVs and stack them into one frame.
#Players without a salary file are reported and skipped.

file_name = 'data/nba_salary/nba_salary_{}.csv'
player_id = list(adv_stats['player_id'].unique())
salary_list = []
for pid in player_id:
    csv_path = file_name.format(pid)
    try:
        player_salary = pd.read_csv(csv_path)
    except FileNotFoundError:
        print('No Salary: ' + str(pid))
    else:
        salary_list.append(player_salary)

salary = pd.concat(salary_list)

No Salary: aleksch01
No Salary: anderjj01
No Salary: blackco01
No Salary: bratzmi01
No Salary: brewero01
No Salary: carteho01
No Salary: collido02
No Salary: coopejo01
No Salary: crissch01
No Salary: durrade01
No Salary: eavesje01
No Salary: edelike01
No Salary: fordph01
No Salary: glennmi01
No Salary: grangst01
No Salary: gravebu01
No Salary: hollili01
No Salary: jacksra01
No Salary: johnsge03
No Salary: jonesea01
No Salary: jonesoz01
No Salary: kelsegr01
No Salary: lagarto01
No Salary: lowesi01
No Salary: mccrasc01
No Salary: mcdowha01
No Salary: michela01
No Salary: nattke01
No Salary: paultbi01
No Salary: popeda01
No Salary: rautile01
No Salary: romarlo01
No Salary: russeca02
No Salary: russewa01
No Salary: sapplwa01
No Salary: schefto01
No Salary: schwejo01
No Salary: sittoch01
No Salary: slubyto01
No Salary: smithro01
No Salary: suttlda01
No Salary: thirdda01
No Salary: towneli01
No Salary: willigu02
No Salary: willira01
No Salary: willisa02
No Salary: wilsomi01
No Salary: wilsoo

In [11]:
#Convert Salary into float
# regex=False makes '$' a literal character on every pandas version instead of
# depending on the version-specific default of the `regex` argument.
salary['Salary'] = (
    salary['Salary']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)

# First run of digits becomes the numeric salary; text without digits yields NaN
salary['clean_Salary'] = (
    salary['Salary'].str.extract(r'(\d+)', expand=False).astype(float)
)

#Remove NaN clean_Salary. The Salary is either not available or '< Minimum'
# Restricting to this column keeps rows whose only missing value is elsewhere.
salary.dropna(subset=['clean_Salary'], inplace=True)

In [28]:
# Attach salary rows to the advanced stats: a left join keeps every stats row
# and matches on player, team and year.
nba_adv_stats_and_salary = adv_stats.merge(
    salary,
    how='left',
    left_on=['player_id', 'Tm', 'Year'],
    right_on=['player_id', 'team_id', 'season_start'],
)

# Players traded mid-season have no salary match for their new-team and total
# rows. Back-filling those gaps was considered but is left disabled:
#nba_adv_stats_and_salary.bfill(inplace=True)

# Remove the two leftover 'Unnamed: 0' columns, one from each side of the merge
nba_adv_stats_and_salary = nba_adv_stats_and_salary.drop(
    columns=['Unnamed: 0_x', 'Unnamed: 0_y']
)

file_name = 'data/nba_adv_stats_and_salary.csv'

nba_adv_stats_and_salary.to_csv(file_name, index=False)

In [29]:
# Load the salary-cap table from Excel. The next cell joins it on its 'Year'
# column; presumably it also supplies the 'Salary Cap' column -- confirm.
cap = pd.read_excel('data/salaryCap.xlsx')

In [30]:
# Join the cap table onto the stats/salary rows by season start year, then
# collapse the duplicated year columns back into a single 'Year'.
salcap_and_salary = (
    nba_adv_stats_and_salary
    .merge(cap, left_on='season_start', right_on='Year')
    .drop(columns='Year_y')
    .rename(columns={'Year_x': 'Year'})
)

# Each salary expressed as a fraction of the matching 'Salary Cap' value
salcap_and_salary['%_of_cap'] = (
    salcap_and_salary['clean_Salary'] / salcap_and_salary['Salary Cap']
)

salcap_and_salary.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9940 entries, 0 to 9939
Data columns (total 38 columns):
Player          9940 non-null object
Pos             9940 non-null object
Age             9940 non-null int64
Tm              9940 non-null object
G               9940 non-null int64
MP              9940 non-null int64
PER             9935 non-null float64
TS%             9907 non-null float64
3PAr            9904 non-null float64
FTr             9904 non-null float64
ORB%            9935 non-null float64
DRB%            9935 non-null float64
TRB%            9935 non-null float64
AST%            9935 non-null float64
STL%            9935 non-null float64
BLK%            9935 non-null float64
TOV%            9914 non-null float64
USG%            9935 non-null float64
OWS             9940 non-null float64
DWS             9940 non-null float64
WS              9940 non-null float64
WS/48           9935 non-null float64
OBPM            9940 non-null float64
DBPM            9940 non-nul

In [31]:
# Save the merged stats/salary/cap table; index=False leaves the row index out of the file.
salcap_and_salary.to_csv('data/salcap_and_salary.csv', index=False)

In [50]:
# Preview the first ten rows of the merged table.
salcap_and_salary.head(10)

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,Team,Lg,Salary,Season,team_id,season_start,season_end,clean_Salary,Salary Cap,%_of_cap
0,Mahmoud Abdul-Rauf,PG,26,DEN,57,2029,18.6,0.535,0.324,0.164,...,Denver Nuggets,NBA,2600000,1995-96,DEN,1995.0,1996.0,2600000.0,23000000,0.113043
1,Rafael Addison,SF,31,CHH,53,516,9.6,0.489,0.055,0.133,...,Charlotte Hornets,NBA,360000,1995-96,CHH,1995.0,1996.0,360000.0,23000000,0.015652
2,Cory Alexander,PG,22,SAS,60,560,8.7,0.506,0.426,0.161,...,San Antonio Spurs,NBA,490000,1995-96,SAS,1995.0,1996.0,490000.0,23000000,0.021304
3,Jerome Allen,SG,23,MIN,41,362,7.8,0.447,0.314,0.343,...,Minnesota Timberwolves,NBA,200000,1995-96,MIN,1995.0,1996.0,200000.0,23000000,0.008696
4,Derrick Alston,C,23,PHI,73,1614,11.8,0.518,0.008,0.289,...,Philadelphia 76ers,NBA,350000,1995-96,PHI,1995.0,1996.0,350000.0,23000000,0.015217
5,John Amaechi,PF,25,CLE,28,357,3.1,0.456,0.0,0.471,...,Cleveland Cavaliers,NBA,200000,1995-96,CLE,1995.0,1996.0,200000.0,23000000,0.008696
6,Ashraf Amaya,PF,24,VAN,54,1104,11.8,0.534,0.004,0.591,...,Vancouver Grizzlies,NBA,225000,1995-96,VAN,1995.0,1996.0,225000.0,23000000,0.009783
7,Greg Anderson,PF,31,SAS,46,344,7.7,0.466,0.021,0.532,...,San Antonio Spurs,NBA,485000,1995-96,SAS,1995.0,1996.0,485000.0,23000000,0.021087
8,Kenny Anderson,PG,25,NJN,31,1042,19.8,0.511,0.261,0.495,...,New Jersey Nets,NBA,3898000,1995-96,NJN,1995.0,1996.0,3898000.0,23000000,0.169478
9,Nick Anderson,SG,28,ORL,77,2717,16.5,0.562,0.476,0.265,...,Orlando Magic,NBA,3200000,1995-96,ORL,1995.0,1996.0,3200000.0,23000000,0.13913


In [49]:
# Mean and standard deviation of the salary-to-cap ratio
pct_of_cap = salcap_and_salary['%_of_cap']
mean = pct_of_cap.mean()
std = pct_of_cap.std()