In [181]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from fuzzywuzzy import process
from unidecode import unidecode

## Use the requests package to load the raw bytes of the HTML page

### .decode() changes bytes to str

In [182]:
# Download the NBA_2023 per-game stats page from Basketball Reference.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops the notebook here on an HTTP error (for example a
# rate-limit response) instead of letting an error page be parsed further down.
resp = requests.get('https://www.basketball-reference.com/leagues/NBA_2023_per_game.html', timeout=30)
resp.raise_for_status()
html_data = resp.content.decode()  # bytes -> str (decode() defaults to UTF-8)

# Name the parser explicitly so every machine builds the same tree; without it
# BeautifulSoup picks whichever parser happens to be installed and emits a warning.
soup = BeautifulSoup(html_data, 'html.parser')


### find_all() selects HTML elements by tag name (and, optionally, by attributes such as class)

In [183]:

# The first <tr> on the page supplies the column titles. Its leading 'Rk' cell
# is dropped with [1:] because the rank is arbitrary: it depends on the sort rule.
header_row = soup.find_all('tr', limit=1)[0]
attribute_headers = [cell.getText() for cell in header_row.find_all('th')][1:]

# Keep only the rows tagged 'full_table' inside the first <tbody>; each row
# contributes the text of its <td> cells.
table_body = soup.find_all('tbody', limit=1)[0]
rows = table_body.find_all('tr', class_='full_table')
player_stats = np.array([[cell.getText() for cell in row.find_all('td')] for row in rows])


In [184]:

def _to_numeric_where_possible(column):
    """Return `column` converted to a numeric dtype, or unchanged if a value fails to parse."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


df_NBA_Player_Stats = pd.DataFrame(player_stats, columns=attribute_headers)
# Every scraped cell is a string. Convert the columns that parse cleanly as
# numbers and leave the text columns as they are. This replaces the deprecated
# pd.to_numeric(..., errors='ignore') with the equivalent explicit try/except.
df_NBA_Player_Stats = df_NBA_Player_Stats.apply(_to_numeric_where_possible)
# Transliterate player names to plain ASCII so they can be compared with the
# names from the salary table later on.
df_NBA_Player_Stats['Player'] = df_NBA_Player_Stats['Player'].apply(unidecode)

df_NBA_Player_Stats.tail()


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
511,McKinley Wright IV,PG,24,DAL,20,1,10.3,1.2,2.5,0.469,...,0.636,0.3,1.0,1.3,1.9,0.4,0.2,0.6,0.9,2.9
512,Thaddeus Young,PF,34,TOR,52,9,15.1,2.1,3.8,0.551,...,0.692,1.3,1.8,3.2,1.4,1.0,0.1,0.8,1.7,4.6
513,Trae Young,PG,24,ATL,58,58,35.1,8.4,19.6,0.427,...,0.888,0.8,2.2,3.0,10.1,1.1,0.2,4.1,1.5,26.6
514,Cody Zeller,C,30,MIA,7,0,13.7,2.0,3.1,0.636,...,0.737,1.6,1.1,2.7,0.7,0.1,0.4,0.7,2.4,6.0
515,Ivica Zubac,C,25,LAC,61,61,29.2,4.0,6.5,0.619,...,0.703,3.3,6.8,10.1,1.1,0.4,1.3,1.7,2.9,10.2


In [185]:
# For salary data: download the HoopsHype player salaries page.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops the notebook here on an HTTP error instead of
# letting an error page be parsed by the following cells.
resp = requests.get('https://hoopshype.com/salaries/players/', timeout=30)
resp.raise_for_status()
html_data = resp.content.decode()  # bytes -> str (decode() defaults to UTF-8)

# Name the parser explicitly so every machine builds the same tree; without it
# BeautifulSoup picks whichever parser happens to be installed and emits a warning.
soup = BeautifulSoup(html_data, 'html.parser')

In [186]:
# Column titles come from the first table row: read its first three <td> cells
# and discard the leading one, leaving two titles.
header_cells = soup.find_all('tr', limit=1)[0].find_all('td', limit=3)
attribute_headers = [cell.getText().strip() for cell in header_cells[1:]]
# The second title gets a ' Salary' suffix (the later cell refers to it as
# '2022/23 Salary').
attribute_headers[1] = attribute_headers[1] + ' Salary'

In [187]:

# All <tr> rows of the first <tbody> on the salary page.
rows = soup.find_all('tbody',limit=1)[0].find_all('tr')
# Take the second and third <td> of each row (the same two positions kept for
# attribute_headers) and strip '$', tabs, newlines and thousands separators so
# the salary text can be converted to an integer afterwards.
player_salaries = np.array([[attr.getText().strip('$\t\n').replace(',','') for attr in row.find_all('td')[1:3] ] for row in rows])

In [188]:
df_salaries = pd.DataFrame(player_salaries, columns=attribute_headers)
# The salary column is named after the season label scraped from the page
# ('2022/23 Salary' when this was written). Look it up by position so the cell
# keeps working when the site lists a different season.
salary_column = attribute_headers[1]
df_salaries[salary_column] = df_salaries[salary_column].apply(int)
# Transliterate names to plain ASCII, matching what was done for the stats table.
df_salaries['Player'] = df_salaries['Player'].apply(unidecode)


### Use FuzzyWuzzy to align player names across the two datasets (nicknames, truncated names) so they can be merged

In [189]:
# Minimum similarity score a salary-table name must reach before it replaces
# the spelling used in the stats table.
NAME_MATCH_MIN_SCORE = 86

# For every stats-table name, find the closest name in the salary table.
# Each result is indexable: [0] is the matched name, [1] is its score.
names_extract = df_NBA_Player_Stats['Player'].apply(process.extractOne, args=(df_salaries['Player'],))

# Adopt the salary-table spelling only for confident matches; low-scoring
# names are left untouched. Iterating over the match results (rather than the
# column being rewritten) avoids mutating the Series that is being looped over.
for i, best_match in names_extract.items():
    if best_match[1] >= NAME_MATCH_MIN_SCORE:
        df_NBA_Player_Stats.at[i,'Player'] = best_match[0]
df_NBA_Player_Stats.head()


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Precious Achiuwa,C,23,TOR,42,11,22.5,3.7,7.7,0.48,...,0.694,1.9,4.4,6.3,1.0,0.6,0.6,1.2,2.0,9.7
1,Steven Adams,C,29,MEM,42,42,27.0,3.7,6.3,0.597,...,0.364,5.1,6.5,11.5,2.3,0.9,1.1,1.9,2.3,8.6
2,Bam Adebayo,C,25,MIA,61,61,35.0,8.3,15.5,0.54,...,0.809,2.5,7.1,9.6,3.3,1.2,0.8,2.5,2.8,21.2
3,Ochai Agbaji,SG,22,UTA,43,6,16.6,2.0,4.4,0.444,...,0.724,0.6,1.1,1.7,0.7,0.2,0.1,0.4,1.5,5.4
4,Santiago Aldama,PF,22,MEM,61,18,21.9,3.3,7.0,0.473,...,0.714,1.0,3.6,4.5,1.2,0.6,0.7,0.7,2.0,9.2


In [190]:
# Inner join on the player name: only players whose name appears in both the
# stats table and the salary table are kept in the result.
df_NBA_complete = df_NBA_Player_Stats.merge(df_salaries, how='inner', on='Player')
df_NBA_complete

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,2022/23 Salary
0,Precious Achiuwa,C,23,TOR,42,11,22.5,3.7,7.7,0.480,...,1.9,4.4,6.3,1.0,0.6,0.6,1.2,2.0,9.7,2840160
1,Steven Adams,C,29,MEM,42,42,27.0,3.7,6.3,0.597,...,5.1,6.5,11.5,2.3,0.9,1.1,1.9,2.3,8.6,17926829
2,Bam Adebayo,C,25,MIA,61,61,35.0,8.3,15.5,0.540,...,2.5,7.1,9.6,3.3,1.2,0.8,2.5,2.8,21.2,30351780
3,Ochai Agbaji,SG,22,UTA,43,6,16.6,2.0,4.4,0.444,...,0.6,1.1,1.7,0.7,0.2,0.1,0.4,1.5,5.4,3918360
4,Santiago Aldama,PF,22,MEM,61,18,21.9,3.3,7.0,0.473,...,1.0,3.6,4.5,1.2,0.6,0.7,0.7,2.0,9.2,2094120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508,McKinley Wright,PG,24,DAL,20,1,10.3,1.2,2.5,0.469,...,0.3,1.0,1.3,1.9,0.4,0.2,0.6,0.9,2.9,508891
509,Thaddeus Young,PF,34,TOR,52,9,15.1,2.1,3.8,0.551,...,1.3,1.8,3.2,1.4,1.0,0.1,0.8,1.7,4.6,8000000
510,Trae Young,PG,24,ATL,58,58,35.1,8.4,19.6,0.427,...,0.8,2.2,3.0,10.1,1.1,0.2,4.1,1.5,26.6,37096500
511,Cody Zeller,C,30,MIA,7,0,13.7,2.0,3.1,0.636,...,1.6,1.1,2.7,0.7,0.1,0.4,0.7,2.4,6.0,743922


#### Next step: use FuzzyWuzzy to link the players whose names (e.g. nicknames) still did not match during the merge above (reference: 'https://towardsdatascience.com/fuzzywuzzy-basica-and-merging-datasets-on-names-with-different-transcriptions-e2bb6e179fbf')