In [1]:
'''import required packages'''

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
'''Point at the web page that holds the box-score data.'''

url = 'https://www.basketball-reference.com/boxscores/202110190LAL.html'

# pandas can parse the HTML tables of a web page directly:
# read_html returns them as a list of DataFrames.
tables = pd.read_html(url)

In [3]:
'''Pick out the four tables we need: the basic and the advanced
   stats table for each of the two teams.'''

# NOTE(review): the positions below are hard-coded for this page's table
# order - confirm them before reusing the cell on a different page.
away_basic, away_adv, home_basic, home_adv = (
    tables[position] for position in (0, 7, 8, 15)
)

In [4]:
'''Preview the away team's basic table: .head(5) displays only the
   first five rows of the data table.'''

away_basic.head(5)

Unnamed: 0_level_0,Unnamed: 0_level_0,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats,Basic Box Score Stats
Unnamed: 0_level_1,Starters,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-
0,Stephen Curry,36:11,5,21,0.238,2,8,0.25,9,9,...,1,9,10,10,3,0,4,1,21,4
1,Draymond Green,29:56,2,5,0.4,0,0,,2,4,...,3,5,8,6,1,0,5,3,6,-2
2,Andrew Wiggins,25:44,5,10,0.5,2,5,0.4,0,0,...,1,6,7,1,1,0,2,3,12,-5
3,Jordan Poole,25:04,8,18,0.444,4,11,0.364,0,1,...,0,2,2,3,1,0,2,2,20,2
4,Kevon Looney,15:28,3,5,0.6,0,0,,1,2,...,2,2,4,1,1,1,1,0,7,-8


In [5]:
'''The preview above shows two levels of column headers, and we do not
   need both. Drop the outer level (level 0) from each of the four
   tables so that only the stat names remain as column labels.
   '''
away_basic, away_adv, home_basic, home_adv = (
    frame.droplevel(level=0, axis=1)
    for frame in (away_basic, away_adv, home_basic, home_adv)
)

In [6]:
# Preview the first five rows of the away team's advanced table, now with a single header level.
away_adv.head(5)

Unnamed: 0,Starters,MP,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,BPM
0,Stephen Curry,36:11,0.421,0.286,0.381,0.429,2.7,26.0,14.0,38.6,3.5,0.0,13.8,31.2,94,97,1.8
1,Draymond Green,29:56,0.444,0.4,0.0,0.8,9.8,17.4,13.5,25.5,1.4,0.0,42.5,15.3,84,102,-4.8
2,Andrew Wiggins,25:44,0.6,0.6,0.5,0.0,3.8,24.3,13.7,5.9,1.7,0.0,16.7,18.2,97,100,-1.0
3,Jordan Poole,25:04,0.542,0.556,0.611,0.056,0.0,8.3,4.0,22.4,1.7,0.0,9.8,31.8,95,102,1.9
4,Kevon Looney,15:28,0.595,0.6,0.0,0.4,12.7,13.5,13.1,9.8,2.8,5.9,14.5,17.3,114,96,4.3


In [7]:
'''Join each team's basic and advanced tables into a single table.
   Rows are paired up on the Starters and MP columns, which hold
   matching values in both tables.'''

away_df = away_basic.merge(away_adv, on=['Starters', 'MP'])
home_df = home_basic.merge(home_adv, on=['Starters', 'MP'])

In [8]:
# Preview six rows of the merged away table; the sixth row is the repeated 'Reserves' header row.
away_df.head(6)

Unnamed: 0,Starters,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,BPM
0,Stephen Curry,36:11,5,21,.238,2,8,.250,9,9,...,26.0,14.0,38.6,3.5,0.0,13.8,31.2,94,97,1.8
1,Draymond Green,29:56,2,5,.400,0,0,,2,4,...,17.4,13.5,25.5,1.4,0.0,42.5,15.3,84,102,-4.8
2,Andrew Wiggins,25:44,5,10,.500,2,5,.400,0,0,...,24.3,13.7,5.9,1.7,0.0,16.7,18.2,97,100,-1.0
3,Jordan Poole,25:04,8,18,.444,4,11,.364,0,1,...,8.3,4.0,22.4,1.7,0.0,9.8,31.8,95,102,1.9
4,Kevon Looney,15:28,3,5,.600,0,0,,1,2,...,13.5,13.1,9.8,2.8,5.9,14.5,17.3,114,96,4.3
5,Reserves,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,BPM


In [9]:
'''Two clean-up steps for each team's table: drop the repeated header
   row (the one whose Starters value contains 'Reserves') and give the
   first column the more accurate name Players.'''

# NOTE(review): only rows containing 'Reserves' are removed here. If the
# tables carry other non-player rows (e.g. a totals row), those are still
# present afterwards - confirm against the full table.
away_df = (
    away_df
    .loc[away_df['Starters'].str.contains('Reserves').eq(False)]
    .rename(columns={'Starters': "Players"}, errors="raise")
)

home_df = (
    home_df
    .loc[home_df['Starters'].str.contains('Reserves').eq(False)]
    .rename(columns={'Starters': "Players"}, errors="raise")
)

In [10]:
# Preview again: the 'Reserves' row is gone (index label 5 is skipped) and the first column is now 'Players'.
away_df.head(6)

Unnamed: 0,Players,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,BPM
0,Stephen Curry,36:11,5,21,0.238,2,8,0.25,9,9,...,26.0,14.0,38.6,3.5,0.0,13.8,31.2,94,97,1.8
1,Draymond Green,29:56,2,5,0.4,0,0,,2,4,...,17.4,13.5,25.5,1.4,0.0,42.5,15.3,84,102,-4.8
2,Andrew Wiggins,25:44,5,10,0.5,2,5,0.4,0,0,...,24.3,13.7,5.9,1.7,0.0,16.7,18.2,97,100,-1.0
3,Jordan Poole,25:04,8,18,0.444,4,11,0.364,0,1,...,8.3,4.0,22.4,1.7,0.0,9.8,31.8,95,102,1.9
4,Kevon Looney,15:28,3,5,0.6,0,0,,1,2,...,13.5,13.1,9.8,2.8,5.9,14.5,17.3,114,96,4.3
6,Damion Lee,29:03,4,10,0.4,1,5,0.2,6,6,...,3.6,1.7,9.6,0.0,0.0,0.0,17.0,129,106,-0.7


In [11]:
'''Tag every row with whether the player was on the home or the away
   team. This is something you may wish to analyze at a later date
   (e.g. how does this player perform on the road?)'''

# Each frame gets the same constant label on all of its rows.
for frame, label in ((away_df, 'Away'), (home_df, 'Home')):
    frame['Home-Away'] = label

In [12]:
'''Before we finish we will want to add a column for the team, as players may
   be traded throughout the season. We will also want to record the match
   details (Team A @ Team B) and date. We get this information using
   BeautifulSoup, which lets us parse the HTML of the webpage.'''

# requests applies no timeout by default, so set one to avoid waiting forever.
r = requests.get(url, timeout=30)
# Stop on an HTTP error status instead of silently parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.content, features='lxml')
html = soup.find_all('div', class_ = 'box')

# Without a matching div, h2 and match would never be defined and the cell
# would fail later with a confusing NameError - raise a clear error instead.
if not html:
    raise ValueError(f"no <div class='box'> element found at {url}")

# Only the last matching div is used (the same result the previous loop
# produced by overwriting h2 and match on every iteration).
item = html[-1]
h2 = ', '.join([x.get_text() for x in item.find_all('h2')])  # one comma-separated string
match = [x.get_text() for x in item.find_all('h1')]  # list of <h1> texts; we'll come back to it

print(h2) # below is all the h2 elements from the webpage.

, Line Score, Four Factors, Golden State Warriors Basic and Advanced Stats, Golden State Warriors (Q1), Golden State Warriors (Q2), Golden State Warriors (H1), Golden State Warriors (Q3), Golden State Warriors (Q4), Golden State Warriors (H2), , Los Angeles Lakers Basic and Advanced Stats, Los Angeles Lakers (Q1), Los Angeles Lakers (Q2), Los Angeles Lakers (H1), Los Angeles Lakers (Q3), Los Angeles Lakers (Q4), Los Angeles Lakers (H2), , Team and League Schedules


In [13]:
'''From the headings printed above we want to pick out one entry for each
   team. We can do this by eliminating the rest based on a condition:
   the code below removes every string that does not contain the
   term - Basic and Advanced Stats'''

my_list = list(
    filter(lambda heading: "Basic and Advanced Stats" in heading, h2.split(','))
)

print(my_list)

[' Golden State Warriors Basic and Advanced Stats', ' Los Angeles Lakers Basic and Advanced Stats']


In [14]:
'''Then we can remove that phrase to leave just the team names.
   Note that the spaces surrounding each name are kept, as the printed
   list shows.'''

# A single replace is enough; the same call was previously chained twice.
my_list = [heading.replace("Basic and Advanced Stats", '') for heading in my_list]

print(my_list)

[' Golden State Warriors ', ' Los Angeles Lakers ']


In [15]:
'''Because the away team's tables appear first on the webpage, we can be
   assured that the team that appears first in our list is the away team.
   Therefore we can allocate them as such.'''

# The entries in my_list still carry the spaces that surrounded the removed
# phrase (e.g. ' Golden State Warriors '). Strip them so the Team column
# holds clean names that work with exact comparisons and grouping.
away_team = my_list[0].strip()
home_team = my_list[1].strip()
away_df['Team'] = away_team
home_df['Team'] = home_team

In [16]:
'''Earlier we captured the match details (the text of the page's <h1>
   element) with BeautifulSoup. Print them to see what we have.'''

print(match)

['Golden State Warriors at Los Angeles Lakers Box Score, October 19, 2021']


In [17]:
'''Before we add the match details to our data tables, we should combine
   the home and away data frames, as the match information is required
   for both and we have no more team-specific values to add.'''

# The away rows come first, followed by the home rows.
# NOTE(review): concat keeps each frame's own index labels by default, so
# the same label can appear once per team - confirm that is acceptable.
frames = [away_df, home_df]
df = pd.concat(frames)
df

Unnamed: 0,Players,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,BPM,Home-Away,Team
0,Stephen Curry,36:11,5,21,.238,2,8,.250,9,9,...,38.6,3.5,0.0,13.8,31.2,94,97,1.8,Away,Golden State Warriors
1,Draymond Green,29:56,2,5,.400,0,0,,2,4,...,25.5,1.4,0.0,42.5,15.3,84,102,-4.8,Away,Golden State Warriors
2,Andrew Wiggins,25:44,5,10,.500,2,5,.400,0,0,...,5.9,1.7,0.0,16.7,18.2,97,100,-1.0,Away,Golden State Warriors
3,Jordan Poole,25:04,8,18,.444,4,11,.364,0,1,...,22.4,1.7,0.0,9.8,31.8,95,102,1.9,Away,Golden State Warriors
4,Kevon Looney,15:28,3,5,.600,0,0,,1,2,...,9.8,2.8,5.9,14.5,17.3,114,96,4.3,Away,Golden State Warriors
6,Damion Lee,29:03,4,10,.400,1,5,.200,6,6,...,9.6,0.0,0.0,0.0,17.0,129,106,-0.7,Away,Golden State Warriors
7,Nemanja Bjelica,25:55,6,7,.857,1,1,1.000,2,2,...,24.8,1.6,0.0,11.3,13.3,174,100,13.0,Away,Golden State Warriors
8,Andre Iguodala,22:58,4,7,.571,2,5,.400,2,2,...,12.8,0.0,3.9,0.0,13.4,154,104,6.6,Away,Golden State Warriors
9,Juan Toscano-Anderson,12:23,2,3,.667,1,2,.500,1,2,...,11.7,0.0,0.0,20.5,15.4,114,104,1.4,Away,Golden State Warriors
10,Otto Porter Jr.,11:35,1,3,.333,1,2,.500,2,2,...,0.0,3.7,0.0,20.5,16.4,101,97,-1.9,Away,Golden State Warriors


In [18]:
'''Once the match details are added below, this walkthrough is complete
   and we can save our dataset to Excel. Of course more data can be
   extracted and further edits will be required, but what we have is
   enough to get started with.'''
# match starts out as a list holding the page's <h1> text. Unwrap it only
# while it is still a list, so that re-running this cell does not index into
# the string and reduce it to its first character.
if isinstance(match, list):
    match = match[0]
df['Match'] = match
df
#df.to_excel('nba_dataset.xlsx', index=False)

Unnamed: 0,Players,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,...,STL%,BLK%,TOV%,USG%,ORtg,DRtg,BPM,Home-Away,Team,Match
0,Stephen Curry,36:11,5,21,.238,2,8,.250,9,9,...,3.5,0.0,13.8,31.2,94,97,1.8,Away,Golden State Warriors,Golden State Warriors at Los Angeles Lakers Bo...
1,Draymond Green,29:56,2,5,.400,0,0,,2,4,...,1.4,0.0,42.5,15.3,84,102,-4.8,Away,Golden State Warriors,Golden State Warriors at Los Angeles Lakers Bo...
2,Andrew Wiggins,25:44,5,10,.500,2,5,.400,0,0,...,1.7,0.0,16.7,18.2,97,100,-1.0,Away,Golden State Warriors,Golden State Warriors at Los Angeles Lakers Bo...
3,Jordan Poole,25:04,8,18,.444,4,11,.364,0,1,...,1.7,0.0,9.8,31.8,95,102,1.9,Away,Golden State Warriors,Golden State Warriors at Los Angeles Lakers Bo...
4,Kevon Looney,15:28,3,5,.600,0,0,,1,2,...,2.8,5.9,14.5,17.3,114,96,4.3,Away,Golden State Warriors,Golden State Warriors at Los Angeles Lakers Bo...
6,Damion Lee,29:03,4,10,.400,1,5,.200,6,6,...,0.0,0.0,0.0,17.0,129,106,-0.7,Away,Golden State Warriors,Golden State Warriors at Los Angeles Lakers Bo...
7,Nemanja Bjelica,25:55,6,7,.857,1,1,1.000,2,2,...,1.6,0.0,11.3,13.3,174,100,13.0,Away,Golden State Warriors,Golden State Warriors at Los Angeles Lakers Bo...
8,Andre Iguodala,22:58,4,7,.571,2,5,.400,2,2,...,0.0,3.9,0.0,13.4,154,104,6.6,Away,Golden State Warriors,Golden State Warriors at Los Angeles Lakers Bo...
9,Juan Toscano-Anderson,12:23,2,3,.667,1,2,.500,1,2,...,0.0,0.0,20.5,15.4,114,104,1.4,Away,Golden State Warriors,Golden State Warriors at Los Angeles Lakers Bo...
10,Otto Porter Jr.,11:35,1,3,.333,1,2,.500,2,2,...,3.7,0.0,20.5,16.4,101,97,-1.9,Away,Golden State Warriors,Golden State Warriors at Los Angeles Lakers Bo...
