In [1]:
import pandas as pd
import numpy as np
import requests
import time
import re
pd.options.display.max_columns = 100

### Scrape advanced stats from basketball reference to ultimately pair with the historical o/u data.  When paired, these statistics will be "prior year" statistics.  For example, the stats below for the 2021 Washington Wizards will be merged with the 2022 Wizards row in historical o/u dataframe.  We are trying to predict "over" or "under" for a given season based off a team's statistics from the prior year.  

In [2]:
seasons = ['2021', '2019', '2018','2017', '2016', '2015', '2014', '2013', '2011', '2010', '2009', '2008', '2007', '2006', 
           '2005', '2004', '2003', '2002', '2001', '2000']

lst = []

for season in seasons:
    url = f'https://www.basketball-reference.com/leagues/NBA_{season}.html'
    info = pd.read_html(url)
    if int(season) >= 2016:
        df = pd.DataFrame(info[10])
    else:
        df = pd.DataFrame(info[8])
    df = df.droplevel(0, axis=1)
    df.columns = ['Rk', 'Team', 'Age', 'W', 'L', 'PW', 'PL', 'MOV', 'SOS', 'SRS', 'ORtg',
       'DRtg', 'NRtg', 'Pace', 'FTr', '3PAr', 'TS%', 'Unnamed: 17_level_1',
       'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'Unnamed: 22_level_1', 'opp_eFG%', 'opp_TOV%',
       'opp_DRB%', 'opp_FT/FGA', 'Unnamed: 27_level_1', 'Arena', 'Attend.',
       'Attend./G']
    df.drop(columns = ['Rk', 'Arena', 'Attend.', 'Attend./G'], inplace = True)
    prior_year = str(int(season) - 1)
    regex = re.compile('[^a-zA-Z 0-9]') # remove * at end of team name
    df['Team'] = df['Team'].map(lambda x: regex.sub('', x))
    df['Team_Year'] =  df['Team'] + '_' + prior_year[-2:] # assign stats to prior_year
    df['Year'] =  df['Team_Year'].map(lambda x: x[-2:])
    df = df[:-1] # drop last row which is an averages row
    df.dropna(axis = 1, inplace = True) # drop columns of all NaNs
    print(season, df.shape)
    lst.append(df)

df.tail()

2021 (30, 26)
2019 (30, 26)
2018 (30, 26)
2017 (30, 26)
2016 (30, 26)
2015 (30, 26)
2014 (30, 26)
2013 (30, 26)
2011 (30, 26)
2010 (30, 26)
2009 (30, 26)
2008 (30, 26)
2007 (30, 26)
2006 (30, 26)
2005 (30, 26)
2004 (29, 26)
2003 (29, 26)
2002 (29, 26)
2001 (29, 26)
2000 (29, 26)


Unnamed: 0,Team,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NRtg,Pace,FTr,3PAr,TS%,eFG%,TOV%,ORB%,FT/FGA,opp_eFG%,opp_TOV%,opp_DRB%,opp_FT/FGA,Team_Year,Year
24,Atlanta Hawks,27.6,28.0,54.0,26,56,-5.38,-0.04,-5.41,102.0,107.9,-5.9,91.7,0.292,0.12,0.503,0.46,14.1,31.7,0.217,0.481,11.3,71.0,0.196,Atlanta Hawks_99,99
25,Vancouver Grizzlies,25.3,22.0,60.0,25,57,-5.62,0.52,-5.1,102.3,108.5,-6.2,91.0,0.32,0.139,0.524,0.474,15.8,30.1,0.247,0.502,14.1,70.6,0.231,Vancouver Grizzlies_99,99
26,Golden State Warriors,26.9,19.0,63.0,20,62,-8.27,0.64,-7.63,99.8,108.4,-8.6,95.4,0.301,0.15,0.484,0.444,13.9,32.6,0.21,0.499,14.6,69.1,0.259,Golden State Warriors_99,99
27,Chicago Bulls,26.0,17.0,65.0,15,67,-9.4,0.17,-9.23,94.2,104.6,-10.4,89.4,0.338,0.167,0.49,0.443,18.0,29.9,0.24,0.484,14.5,69.9,0.24,Chicago Bulls_99,99
28,Los Angeles Clippers,24.2,15.0,67.0,13,69,-11.52,0.8,-10.73,97.8,110.1,-12.3,93.9,0.27,0.188,0.499,0.458,14.9,26.1,0.202,0.507,13.1,69.6,0.22,Los Angeles Clippers_99,99


In [3]:
df = pd.concat(lst)
print(df.shape)
df.tail()

(595, 26)


Unnamed: 0,Team,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NRtg,Pace,FTr,3PAr,TS%,eFG%,TOV%,ORB%,FT/FGA,opp_eFG%,opp_TOV%,opp_DRB%,opp_FT/FGA,Team_Year,Year
24,Atlanta Hawks,27.6,28.0,54.0,26,56,-5.38,-0.04,-5.41,102.0,107.9,-5.9,91.7,0.292,0.12,0.503,0.46,14.1,31.7,0.217,0.481,11.3,71.0,0.196,Atlanta Hawks_99,99
25,Vancouver Grizzlies,25.3,22.0,60.0,25,57,-5.62,0.52,-5.1,102.3,108.5,-6.2,91.0,0.32,0.139,0.524,0.474,15.8,30.1,0.247,0.502,14.1,70.6,0.231,Vancouver Grizzlies_99,99
26,Golden State Warriors,26.9,19.0,63.0,20,62,-8.27,0.64,-7.63,99.8,108.4,-8.6,95.4,0.301,0.15,0.484,0.444,13.9,32.6,0.21,0.499,14.6,69.1,0.259,Golden State Warriors_99,99
27,Chicago Bulls,26.0,17.0,65.0,15,67,-9.4,0.17,-9.23,94.2,104.6,-10.4,89.4,0.338,0.167,0.49,0.443,18.0,29.9,0.24,0.484,14.5,69.9,0.24,Chicago Bulls_99,99
28,Los Angeles Clippers,24.2,15.0,67.0,13,69,-11.52,0.8,-10.73,97.8,110.1,-12.3,93.9,0.27,0.188,0.499,0.458,14.9,26.1,0.202,0.507,13.1,69.6,0.22,Los Angeles Clippers_99,99


In [4]:
def change_team_name(name):
    if 'Nets' in name:
        return 'Brooklyn Nets'
    elif 'Grizzlies' in name:
        return 'Memphis Grizzlies'
    elif 'Seattle' in name:
        return 'Oklahoma City Thunder'
    elif 'Bobcats' in name:
        return 'Charlotte Hornets'
    else:
        return name

In [5]:
df['Team'] = df['Team'].map(change_team_name)

In [6]:
df.head()

Unnamed: 0,Team,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NRtg,Pace,FTr,3PAr,TS%,eFG%,TOV%,ORB%,FT/FGA,opp_eFG%,opp_TOV%,opp_DRB%,opp_FT/FGA,Team_Year,Year
0,Utah Jazz,28.5,52.0,20.0,55,17,9.25,-0.29,8.97,117.6,108.3,9.3,98.5,0.244,0.488,0.597,0.563,12.7,24.5,0.195,0.507,10.3,79.3,0.159,Utah Jazz_20,20
1,Los Angeles Clippers,28.8,47.0,25.0,49,23,6.18,-0.16,6.02,117.6,111.2,6.4,96.9,0.222,0.4,0.599,0.564,12.2,22.7,0.186,0.531,11.9,79.1,0.186,Los Angeles Clippers_20,20
2,Phoenix Suns,26.6,51.0,21.0,49,23,5.82,-0.15,5.67,117.2,111.3,5.9,97.2,0.212,0.392,0.597,0.564,11.5,20.8,0.177,0.534,12.4,78.5,0.194,Phoenix Suns_20,20
3,Milwaukee Bucks,28.1,46.0,26.0,48,24,5.89,-0.32,5.57,117.2,111.4,5.8,102.2,0.233,0.404,0.593,0.566,12.0,23.3,0.177,0.536,11.5,79.7,0.157,Milwaukee Bucks_20,20
4,Philadelphia 76ers,27.1,49.0,23.0,48,24,5.58,-0.31,5.28,113.2,107.6,5.6,99.5,0.293,0.347,0.579,0.541,12.8,23.2,0.225,0.521,13.8,78.2,0.2,Philadelphia 76ers_20,20


In [7]:
df['Team'] = df['Team'] + '_' + df['Year']
df.drop(columns = ['Team_Year', 'Year'], inplace = True)
df.head()

Unnamed: 0,Team,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NRtg,Pace,FTr,3PAr,TS%,eFG%,TOV%,ORB%,FT/FGA,opp_eFG%,opp_TOV%,opp_DRB%,opp_FT/FGA
0,Utah Jazz_20,28.5,52.0,20.0,55,17,9.25,-0.29,8.97,117.6,108.3,9.3,98.5,0.244,0.488,0.597,0.563,12.7,24.5,0.195,0.507,10.3,79.3,0.159
1,Los Angeles Clippers_20,28.8,47.0,25.0,49,23,6.18,-0.16,6.02,117.6,111.2,6.4,96.9,0.222,0.4,0.599,0.564,12.2,22.7,0.186,0.531,11.9,79.1,0.186
2,Phoenix Suns_20,26.6,51.0,21.0,49,23,5.82,-0.15,5.67,117.2,111.3,5.9,97.2,0.212,0.392,0.597,0.564,11.5,20.8,0.177,0.534,12.4,78.5,0.194
3,Milwaukee Bucks_20,28.1,46.0,26.0,48,24,5.89,-0.32,5.57,117.2,111.4,5.8,102.2,0.233,0.404,0.593,0.566,12.0,23.3,0.177,0.536,11.5,79.7,0.157
4,Philadelphia 76ers_20,27.1,49.0,23.0,48,24,5.58,-0.31,5.28,113.2,107.6,5.6,99.5,0.293,0.347,0.579,0.541,12.8,23.2,0.225,0.521,13.8,78.2,0.2


In [8]:
df.to_csv('./data/team_stats_00_22.csv', index = False)