# NOTE: The web scraper collects data for one season at a time. To scrape a different season, change the year in both requests.get() URLs and update the season-specific salary column key (e.g. '2014/15 Salary').

In [100]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from fuzzywuzzy import process
from unidecode import unidecode

## Use requests pkg to load in raw bytes of HTML

### .decode() changes bytes to str

In [101]:
# Download the NBA_2015 per-game stats page. The timeout keeps the request
# from hanging indefinitely, and raise_for_status() stops on an HTTP error
# instead of parsing an error page as if it contained the stats table.
resp = requests.get('https://www.basketball-reference.com/leagues/NBA_2015_per_game.html', timeout=30)
resp.raise_for_status()
html_data = resp.content.decode()

# Name the parser explicitly so the parse result does not depend on which
# optional parsers are installed (this also avoids GuessedAtParserWarning).
soup = BeautifulSoup(html_data, 'html.parser')

### find_all() selects HTML lines based on tags

In [102]:

# Column names are the <th> texts of the first <tr> on the page. The leading
# 'Rk' header is skipped because its value is arbitrary (it depends on the
# sorting rule).
attribute_headers = [
    th.getText()
    for th in soup.find_all('tr', limit=1)[0].find_all('th')[1:]
]

# Rows with class 'full_table' inside the first <tbody>; each row becomes a
# list of its <td> cell texts.
rows = soup.find_all('tbody', limit=1)[0].find_all('tr', class_='full_table')
player_stats = np.array([
    [cell.getText() for cell in stat_row.find_all('td')]
    for stat_row in rows
])


In [103]:

def _to_numeric_or_original(column):
    """Return *column* converted to a numeric dtype, or unchanged if it cannot be parsed."""
    # Explicit replacement for pd.to_numeric(..., errors='ignore'), which is
    # deprecated in pandas 2.2+: non-numeric columns are left untouched.
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# All scraped cells are strings; convert each column that is fully numeric.
df_NBA_Player_Stats = pd.DataFrame(player_stats, columns=attribute_headers)
df_NBA_Player_Stats = df_NBA_Player_Stats.apply(_to_numeric_or_original)

# Transliterate names to plain ASCII with unidecode.
df_NBA_Player_Stats['Player'] = df_NBA_Player_Stats['Player'].apply(unidecode)

df_NBA_Player_Stats.head()


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Quincy Acy,PF,24,NYK,68,22,18.9,2.2,4.9,0.459,...,0.784,1.2,3.3,4.4,1.0,0.4,0.3,0.9,2.2,5.9
1,Jordan Adams,SG,20,MEM,30,0,8.3,1.2,2.9,0.407,...,0.609,0.3,0.6,0.9,0.5,0.5,0.2,0.5,0.8,3.1
2,Steven Adams,C,21,OKC,70,67,25.3,3.1,5.7,0.544,...,0.502,2.8,4.6,7.5,0.9,0.5,1.2,1.4,3.2,7.7
3,Jeff Adrien,PF,28,MIN,17,0,12.6,1.1,2.6,0.432,...,0.579,1.4,3.2,4.5,0.9,0.2,0.5,0.5,1.8,3.5
4,Arron Afflalo,SG,29,TOT,78,72,32.1,4.8,11.3,0.424,...,0.843,0.3,2.8,3.2,1.7,0.5,0.1,1.5,2.1,13.3


In [104]:
# For salary data: download the 2014-2015 player salary page. The timeout
# keeps the request from hanging indefinitely, and raise_for_status() stops
# on an HTTP error instead of parsing an error page as salary data.
resp = requests.get('https://hoopshype.com/salaries/players/2014-2015/', timeout=30)
resp.raise_for_status()
html_data = resp.content.decode()

# Name the parser explicitly so the parse result does not depend on which
# optional parsers are installed (this also avoids GuessedAtParserWarning).
soup = BeautifulSoup(html_data, 'html.parser')

In [105]:
# Use the 2nd and 3rd <td> cells of the first <tr> as column names
# (whitespace-trimmed), then append ' Salary' to the second name.
attribute_headers = [
    cell.getText().strip()
    for cell in soup.find_all('tr', limit=1)[0].find_all('td', limit=3)[1:]
]
attribute_headers[1] = f'{attribute_headers[1]} Salary'

In [106]:

# All <tr> rows of the first <tbody> on the salary page.
rows = soup.find_all('tbody', limit=1)[0].find_all('tr')
# From each row keep the 2nd and 3rd <td> cells, trimming '$', tabs and
# newlines from both ends and removing thousands separators so the salary
# text can later be parsed as an integer.
player_salaries = np.array([
    [attr.getText().strip('$\t\n').replace(',', '') for attr in row.find_all('td')[1:3]]
    for row in rows
])

### When changing year of data must change key of salaries in cell below

In [107]:
df_salaries = pd.DataFrame(player_salaries, columns=attribute_headers)
# The scraped salary header is season-specific (e.g. '2014/15 Salary'), so
# read it from the frame's second column instead of hard-coding the season.
salary_column = df_salaries.columns[1]
df_salaries['Salary'] = df_salaries[salary_column].apply(int)
# Transliterate names to plain ASCII with unidecode, the same treatment the
# stats frame's 'Player' column gets.
df_salaries['Player'] = df_salaries['Player'].apply(unidecode)


#### Next step is to use FuzzyWuzzy to reconcile player names (e.g. nicknames) that would otherwise fail to match exactly when the two tables are merged on 'Player' ('https://towardsdatascience.com/fuzzywuzzy-basica-and-merging-datasets-on-names-with-different-transcriptions-e2bb6e179fbf')

### Use FuzzyWuzzy method process.extractOne() to merge along names including nicknames and truncated names from different sets

In [108]:
# Minimum FuzzyWuzzy score for a salary-table name to replace a stats-table name.
MATCH_SCORE_THRESHOLD = 86

# For every stats-table player, find the best-scoring name in the salary table.
# Each result is a tuple whose first two items are (matched name, score).
names_extract = df_NBA_Player_Stats['Player'].apply(process.extractOne, args=(df_salaries['Player'],))
for i, best_match in names_extract.items():
    # extractOne() returns None when there are no choices to compare against.
    if best_match is None:
        continue
    if best_match[1] >= MATCH_SCORE_THRESHOLD:
        # Adopt the salary table's spelling so the later merge on 'Player' matches.
        df_NBA_Player_Stats.at[i, 'Player'] = best_match[0]


In [109]:
# Display the stats frame to inspect the 'Player' names after fuzzy matching.
df_NBA_Player_Stats


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Quincy Acy,PF,24,NYK,68,22,18.9,2.2,4.9,0.459,...,0.784,1.2,3.3,4.4,1.0,0.4,0.3,0.9,2.2,5.9
1,Jordan Adams,SG,20,MEM,30,0,8.3,1.2,2.9,0.407,...,0.609,0.3,0.6,0.9,0.5,0.5,0.2,0.5,0.8,3.1
2,Steven Adams,C,21,OKC,70,67,25.3,3.1,5.7,0.544,...,0.502,2.8,4.6,7.5,0.9,0.5,1.2,1.4,3.2,7.7
3,Jeff Adrien,PF,28,MIN,17,0,12.6,1.1,2.6,0.432,...,0.579,1.4,3.2,4.5,0.9,0.2,0.5,0.5,1.8,3.5
4,Arron Afflalo,SG,29,TOT,78,72,32.1,4.8,11.3,0.424,...,0.843,0.3,2.8,3.2,1.7,0.5,0.1,1.5,2.1,13.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
487,James Young,SG,19,BOS,31,0,10.7,1.2,3.3,0.353,...,0.552,0.3,1.1,1.4,0.4,0.3,0.1,0.2,0.7,3.4
488,Nick Young,SG,29,LAL,42,0,23.8,4.1,11.3,0.366,...,0.892,0.4,1.9,2.3,1.0,0.5,0.3,1.0,2.0,13.4
489,Thaddeus Young,PF,26,TOT,76,68,32.0,5.9,12.7,0.466,...,0.655,1.7,3.7,5.4,2.3,1.6,0.3,1.5,2.3,14.1
490,Cody Zeller,C,22,CHO,62,45,24.0,2.8,6.0,0.461,...,0.774,1.6,4.3,5.8,1.6,0.5,0.8,1.0,2.5,7.6


### Save Df to .csv file locally

In [110]:
# Inner-join the stats and salary frames on the player name; rows without a
# matching 'Player' value in both frames are not kept.
df_NBA_complete = df_NBA_Player_Stats.merge(df_salaries, how='inner', on='Player')

# Write the merged table to a local CSV file, omitting the DataFrame index.
df_NBA_complete.to_csv('NBA_Player_Stats3.csv', index=False)