In [1]:
import os
import time
from functools import reduce

import numpy as np
import pandas as pd
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.firefox.options import Options

In [3]:
## example headless driver

# `Options` is reached through the already-imported `webdriver` package: the direct
# `from selenium.webdriver.firefox.options import Options` import in the first cell
# is commented out, so the bare name `Options` is undefined on a fresh kernel.
options = webdriver.firefox.options.Options()
options.headless = True

browser = webdriver.Firefox(executable_path="./drivers/geckodriver", options=options)
try:
    browser.get('https://duckduckgo.com')
    print('Title: %s' % browser.title)
finally:
    # Always shut the headless Firefox down, even when the page load fails,
    # so no orphaned browser process is left behind.
    browser.quit()

Title: DuckDuckGo — Privacy, simplified.


In [20]:
## STARTING COLS

# Stat tables to scrape, in scrape order (keys understood by select_stat_type / scrape_table).
stat_types = [
    'traditional',
    'advanced',
    'misc',
    'scoring',
    'usage',
    'defense',
]

# Columns kept in the final per-season frame, listed in ascending string order.
stat_cols = [
    # share-of-team columns
    '%3PA', '%3PM', '%AST', '%BLK', '%BLKA', '%DREB', '%FGA', '%FGA 2PT', '%FGA 3PT',
    '%FGM', '%FTA', '%FTM', '%OREB', '%PF', '%PFD', '%PTS', '%PTS 2PT', '%PTS 2PT MR',
    '%PTS 3PT', '%PTS FBPS', '%PTS FT', '%PTS OFFTO', '%PTS PITP', '%REB', '%STL', '%TOV',
    # symbols and digits
    '+/-',
    '2FGM %AST', '2FGM %UAST', '2ND PTS',
    '3FGM %AST', '3FGM %UAST', '3P%', '3PA', '3PM',
    # A - E
    'AGE', 'AST', 'AST RATIO', 'AST%', 'AST/TO',
    'BLK', 'BLKA',
    'DD2', 'DEF WS', 'DEFRTG', 'DREB', 'DREB%',
    'EFG%',
    # F - N
    'FBPS', 'FG%', 'FGA', 'FGM', 'FGM %AST', 'FGM %UAST', 'FP', 'FT%', 'FTA', 'FTM',
    'GP',
    'L',
    'NETRTG',
    # O - R
    'OFFRTG', 'OPP 2ND PTS', 'OPP FBPS', 'OPP PITP', 'OPP PTS OFF TO', 'OREB', 'OREB%',
    'PACE', 'PF', 'PFD', 'PIE', 'PITP', 'PTS', 'PTS OFF TO',
    'REB', 'REB%',
    # S - W
    'STL',
    'TD3', 'TEAM', 'TO RATIO', 'TOT MIN', 'TOV', 'TS%',
    'USG%',
    'W',
]

In [3]:
## functions to navigate nba.com/stats pages

def sort_by_name(browser):
    """Click the second header cell (``th[2]``) of the stats table twice.

    The header element is looked up again before each click rather than cached.
    NOTE(review): ``th[2]`` is presumably the PLAYER column and the double click
    presumably leaves the table sorted A-Z -- confirm against the live page.

    Returns None.
    """
    header_xpath = '/html/body/main/div[2]/div/div[2]/div/div/nba-stat-table/div[2]/div[1]/table/thead/tr/th[2]'
    for _ in range(2):
        browser.find_element_by_xpath(header_xpath).click()

def select_all_pages(browser):
    """Click the first ``<option>`` of the stats table's page selector.

    NOTE(review): option[1] is presumably the "All" entry that puts every player
    on a single page -- confirm against the live page.

    Returns None.
    """
    first_option_xpath = (
        '/html/body/main/div[2]/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select'
        + '/option[1]'
    )
    browser.find_element_by_xpath(first_option_xpath).click()

def select_per_100(browser):
    """Open the per-mode ``<select>`` and click its third option.

    NOTE(review): option[3] is presumably "Per 100 Possessions" -- confirm against
    the live page.

    Returns None.
    """
    per_mode_select = '/html/body/main/div[2]/div/div[2]/div/div/div[1]/div[3]/div/div/label/select'
    per_100_option = '/html/body/main/div[2]/div/div[2]/div/div/div[1]/div[3]/div/div/label/select/option[3]'

    # open the drop-down first, then pick the entry
    for target in (per_mode_select, per_100_option):
        browser.find_element_by_xpath(target).click()
    
def select_stat_type(browser, stat_type):
    """
    Switch the page to another stat table through the navigation drop-down.

    ``stat_type`` is one of the names below; its number is the position of the
    matching entry in the drop-down list:

    1: traditional
    2: advanced
    3: misc
    4: scoring
    5: usage
    6: opponent
    7: defense

    For tables 1, 3, 6 and 7 the function waits two seconds and then calls
    ``select_per_100``. Raises KeyError for an unknown ``stat_type``.
    """
    menu_order = ['traditional', 'advanced', 'misc', 'scoring', 'usage', 'opponent', 'defense']
    position_of = {name: pos for pos, name in enumerate(menu_order, start=1)}
    n = position_of[stat_type]

    nav_section = '/html/body/main/div[2]/div/div[2]/div/nav-dropdown/nav/section[3]'

    ## open the drop-down via its header link
    browser.find_element_by_xpath(nav_section + '/div/a').click()

    ## pick the requested table
    browser.find_element_by_xpath(nav_section + f'/ul/li[{n}]/a/span').click()

    # NOTE(review): presumably only these tables offer (or reset) the per-mode selector -- confirm.
    if n in (1, 3, 6, 7):
        time.sleep(2)
        select_per_100(browser)

def select_season(browser, season):
    """Pick ``season`` in the SEASON drop-down.

    Seasons 2019 down to 1996 map to options 1 through 24 (2019 -> option[1],
    1996 -> option[24]); any other value raises KeyError.

    Returns None.
    """
    option_for_season = {
        year: position
        for position, year in enumerate(range(2019, 1995, -1), start=1)
    }
    n = option_for_season[season]

    season_select = '/html/body/main/div[2]/div/div[2]/div/div/div[1]/div[1]/div/div/label/select'

    ## open the 'SEASON' drop-down, then click the matching entry
    browser.find_element_by_xpath(season_select).click()
    browser.find_element_by_xpath(f'{season_select}/option[{n}]').click()


def remove_equal_cols(df):
    """Return a copy of ``df`` without columns that repeat an earlier column.

    Columns are compared by position with ``Series.equals``, so the function
    also works when column names are not unique. Of each group of equal
    columns the left-most one is kept, and the surviving columns keep their
    original left-to-right order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to de-duplicate; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the first occurrence of each distinct column.
    """
    n_cols = df.shape[1]
    dup_col_idxs = set()
    for i in range(n_cols - 1):
        if i in dup_col_idxs:
            continue
        reference = df.iloc[:, i]
        for j in range(i + 1, n_cols):
            # columns already marked as duplicates need no second comparison
            if j not in dup_col_idxs and reference.equals(df.iloc[:, j]):
                dup_col_idxs.add(j)

    # Build the kept positions as an ordered list; iterating a set difference
    # gives no ordering guarantee and could shuffle the columns.
    keep_idxs = [idx for idx in range(n_cols) if idx not in dup_col_idxs]
    return df.iloc[:, keep_idxs].copy()

In [6]:
## Get df from raw table

def scrape_table(browser, stat_type: str):
    """
    Scrape the stats table currently shown in ``browser`` into a pandas DataFrame.

    Supported ``stat_type`` values (number = position in the site's table menu):

    1: traditional
    2: advanced
    3: misc
    4: scoring
    5: usage
    7: defense

    'opponent' (6) is not supported: data is missing for seasons before the
    2010s, so those stats are not used.

    The table text is read line by line. The first player row is taken to be
    the first line that starts with 'A' and consists of exactly two
    space-separated words, so the table must already be sorted by player name.
    From there on lines are consumed in pairs: a player name followed by that
    player's space-separated stats.

    Every column name is prefixed with the first two characters of
    ``stat_type`` (e.g. 'adTEAM'), which keeps same-named columns from
    different tables apart.

    When the same name shows up on more than one row, the team abbreviation
    (first stats token) is appended to every such entry, e.g.
    'Marcus Williams (GSW)'. Rows whose final name has four or more
    space-separated words are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per kept player (index = player name), string-valued cells.

    Raises
    ------
    KeyError
        ``stat_type`` is not a known table name.
    ValueError
        ``stat_type`` is 'opponent', or no first player row could be located.
    """
    ## columns for each stat table type...
    table_columns = {
        'traditional': ['TEAM', 'AGE', 'GP', 'W', 'L', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'PF', 'FP', 'DD2', 'TD3', '+/-'],
        'advanced': ['TEAM', 'AGE', 'GP', 'W', 'L', 'MIN', 'OFFRTG', 'DEFRTG', 'NETRTG', 'AST%', 'AST/TO', 'AST RATIO', 'OREB%', 'DREB%', 'REB%', 'TO RATIO', 'EFG%', 'TS%', 'USG%', 'PACE', 'PIE'],
        'misc': ['TEAM', 'AGE', 'GP', 'W', 'L', 'MIN', 'PTS OFF TO', '2ND PTS', 'FBPS', 'PITP', 'OPP PTS OFF TO', 'OPP 2ND PTS', 'OPP FBPS', 'OPP PITP', 'BLK', 'BLKA', 'PF', 'PFD'],
        'scoring': ['TEAM', 'AGE', 'GP', 'W', 'L', 'MIN', '%FGA 2PT', '%FGA 3PT', '%PTS 2PT', '%PTS 2PT MR', '%PTS 3PT', '%PTS FBPS', '%PTS FT', '%PTS OFFTO', '%PTS PITP', '2FGM %AST', '2FGM %UAST', '3FGM %AST', '3FGM %UAST', 'FGM %AST', 'FGM %UAST'],
        # usage: 'MIN' is named 'TOT MIN' because this table reports total minutes,
        # which differs from the per-100-possessions MIN of the other tables
        'usage': ['TEAM', 'AGE', 'GP', 'W', 'L', 'TOT MIN', 'USG%', '%FGM', '%FGA', '%3PM', '%3PA', '%FTM', '%FTA', '%OREB', '%DREB', '%REB', '%AST', '%TOV', '%STL', '%BLK', '%BLKA', '%PF', '%PFD', '%PTS'],
        'defense': ['TEAM', 'AGE', 'GP', 'W', 'L', 'MIN', 'DEF RTG', 'DREB', 'DREB%', '%DREB', 'STL', 'STL%', 'BLK', '%BLK', 'OPP PTS OFF TOV', 'OPP PTS 2ND CHANCE', 'OPP PTS FB', 'OPP PTS PAINT', 'DEF WS'],
    }
    if stat_type == 'opponent':
        # previously this fell through every branch and died on an unbound `cols`
        raise ValueError("stat_type 'opponent' is not supported by scrape_table")
    # unknown names still raise KeyError here
    cols = [stat_type[0:2] + col for col in table_columns[stat_type]]

    ## read in raw table data
    xpath_stats_table = '/html/body/main/div[2]/div/div[2]/div/div/nba-stat-table/div[2]/div[1]'
    table = browser.find_element_by_xpath(xpath_stats_table).text.split('\n')

    ## extract position of first player row
    first_row = next(
        (i for i, line in enumerate(table)
         if line and line[0] == 'A' and len(line.split(' ')) == 2),
        None,
    )
    if first_row is None:
        raise ValueError(
            "could not locate the first player row (expected a two-word name "
            "starting with 'A'); is the table loaded and sorted by player name?"
        )
    body = table[first_row:]

    ## parse body of table now and extract player name index, stats
    player_names, raw_names, all_stats = [], [], []
    # stop at len(body) - 1 so an unpaired trailing line is ignored instead of
    # raising IndexError on body[i + 1]
    for i in range(0, len(body) - 1, 2):
        raw_name = body[i]
        player_stats = body[i + 1].split(' ')
        player = raw_name

        if raw_name in raw_names:
            ## need to add team name to player index when players have multiple rows
            player = f'{raw_name} ({player_stats[0]})'  # e.g. Marcus Williams (GSW)
            ## the earlier same-named entry gets its team too, because row order
            ## differs between tables within a season; comparing raw names means a
            ## third or later same-named row is handled as well
            prev = len(raw_names) - 1 - raw_names[::-1].index(raw_name)
            if player_names[prev] == raw_name:
                player_names[prev] += f' ({all_stats[prev][0]})'

        if len(player.split(' ')) < 4:  # avoid weird missing data rows
            player_names.append(player)
            raw_names.append(raw_name)
            all_stats.append(player_stats)

    return pd.DataFrame(all_stats, index=player_names, columns=cols)

In [8]:
def scrape_combine_season_data(season: int, stat_types: list):
    """Scrape every requested stat table for one season and merge them by player.

    A headless Firefox session is opened on the generic player-stats page. For
    each entry of ``stat_types`` the page is switched to that table, the season
    is selected, the table is sorted by name, all pages are selected, and the
    table is scraped. Fixed sleeps between the steps give the page time to load.

    The per-table frames are then merged pairwise on their index with
    ``pd.merge`` (inner join by default); columns of the right-hand frame whose
    names already exist on the left are dropped before each merge. Finally,
    columns with identical content are removed with ``remove_equal_cols``.

    Parameters
    ----------
    season : int
        Season key understood by ``select_season``.
    stat_types : list
        Table names understood by ``select_stat_type`` / ``scrape_table``.

    Returns
    -------
    pandas.DataFrame
        Combined stats, one row per player present in every scraped table.
    """
    options = webdriver.firefox.options.Options()
    options.headless = True
    browser = webdriver.Firefox(executable_path="./drivers/geckodriver", options=options)

    generic_nba_stats_url = 'https://stats.nba.com/players/traditional/?sort=PTS&dir=-1'

    dfs = []
    try:
        ## set up browser
        browser.get(generic_nba_stats_url)
        time.sleep(4)  # ensure loading...

        for stat_type in stat_types:
            select_stat_type(browser, stat_type); time.sleep(5)
            select_season(browser, season); time.sleep(5)
            sort_by_name(browser); time.sleep(5)
            select_all_pages(browser); time.sleep(5)
            dfs.append(scrape_table(browser, stat_type))
            time.sleep(5)
    finally:
        # Quit even when a navigation or scrape step raises; otherwise every
        # failed run leaves a headless Firefox process behind.
        browser.quit()

    combined_stats = reduce(
        lambda left, right: pd.merge(
            left,
            right[right.columns.difference(left.columns)],
            left_index=True,
            right_index=True,
        ),
        dfs,
    )
    print('Combined shape:', combined_stats.shape)
    combined_stats = remove_equal_cols(combined_stats)
    print('Squeezed Combined shape:', combined_stats.shape)

    return combined_stats

In [9]:
def process_write_season_data(df, season, filepath):
    """Reduce a season frame to ``stat_cols``, tag its index with the season, and save it.

    Parameters
    ----------
    df : pandas.DataFrame
        Combined season stats, indexed by player name. Not modified.
    season : int
        Season key; the last two characters of ``str(season)`` are appended to
        every index label (e.g. 'AJ Price' -> 'AJ Price 09').
    filepath : str
        Destination CSV path.

    Returns
    -------
    pandas.DataFrame
        The reduced, re-indexed frame that was written to ``filepath``.

    Raises
    ------
    KeyError
        Some entry of ``stat_cols`` is absent from ``df`` (with and without the
        two-character table prefix).
    """
    missing = [col for col in stat_cols if col not in df.columns]
    if missing:
        # scrape_table prefixes every column with the first two characters of
        # its stat type (e.g. 'adTEAM'), so the plain names in stat_cols do not
        # match. Strip that prefix; when two tables contribute the same plain
        # name, keep the left-most column.
        # NOTE(review): "left-most wins" is an assumption -- confirm which
        # table's value is wanted for names shared between tables.
        unprefixed = df.copy()
        unprefixed.columns = [str(col)[2:] for col in df.columns]
        df = unprefixed.loc[:, ~unprefixed.columns.duplicated()]

    df = df.loc[:, stat_cols].copy()
    df.index = df.index.astype(str) + ' ' + str(season)[-2:]

    # to_csv opens and closes the file itself; wrapping it in open(filepath, 'w')
    # only held a second, unused handle on the same path.
    df.to_csv(filepath)

    return df

In [10]:
def scrape(seasons, stat_types, stat_dir):
    """Scrape, post-process and save the stats of several seasons.

    For each season the combined table is scraped with
    ``scrape_combine_season_data`` and then passed to
    ``process_write_season_data``, which writes it to
    ``stat_dir + 'stats_<season>.csv'``.

    Parameters
    ----------
    seasons : iterable of int
        Seasons to scrape, in order.
    stat_types : list
        Table names forwarded to ``scrape_combine_season_data``.
    stat_dir : str
        Output location, prepended as-is to the file name, so it should end
        with a path separator (e.g. 'season_data/').

    Returns
    -------
    (pandas.DataFrame, list of pandas.DataFrame)
        All seasons concatenated, and the individual per-season frames.
    """
    dfs = []
    for season in seasons:
        filepath = stat_dir + 'stats_' + str(season) + '.csv'

        # Create the target directory before the multi-minute scrape, so a
        # missing folder cannot make the write fail afterwards.
        out_dir = os.path.dirname(filepath)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        df = scrape_combine_season_data(season, stat_types)
        df = process_write_season_data(df, season, filepath)
        dfs.append(df)

    df_combined = pd.concat(dfs)

    return df_combined, dfs

In [45]:
%%time

# Scrape the 2009 and 2010 seasons; one CSV per season lands in `stat_dir`.
stat_dir = 'season_data/'
stats_09_10, dfs = scrape(
    seasons=[2009, 2010],
    stat_types=stat_types,
    stat_dir=stat_dir,
)

Wall time: 10min 8s


In [46]:
# Preview the first rows of the combined 2009 + 2010 frame.
stats_09_10.head()


Unnamed: 0,%3PA,%3PM,%AST,%BLK,%BLKA,%DREB,%FGA,%FGA 2PT,%FGA 3PT,%FGM,...,REB%,STL,TD3,TEAM,TO RATIO,TOT MIN,TOV,TS%,USG%,W
AJ Price 09,34.3,32.4,27.4,3.2,23.8,13.4,23.2,50.8,49.2,21.3,...,4.9,1.9,0,IND,10.8,865.0,3.3,53.0,21.9,23
Aaron Brooks 09,37.8,41.5,32.7,6.2,21.3,8.8,26.2,60.6,39.4,25.1,...,3.7,1.2,0,HOU,11.0,2919.0,4.0,54.9,25.4,42
Aaron Gray 09,0.0,0.0,16.5,35.5,25.0,30.8,15.4,100.0,0.0,17.2,...,17.6,1.5,0,NOH,13.3,311.0,2.9,55.1,16.0,11
Acie Law 09,18.4,14.7,26.0,4.2,39.4,10.3,19.4,78.7,21.3,18.6,...,3.8,2.3,0,CHI,12.1,234.0,3.6,58.5,21.4,11
Adam Morrison 09,21.2,13.9,16.8,8.0,30.0,13.6,20.1,75.3,24.7,17.7,...,6.5,0.6,0,LAL,8.5,241.0,2.1,41.8,17.8,23


In [48]:
# NOTE(review): this only displays the bound `str.contains` method without calling it;
# looks like leftover exploration -- consider removing the cell.
stats_09_10.index.str.contains

<bound method StringMethods.contains of <pandas.core.strings.StringMethods object at 0x000001879C12E940>>

In [64]:
# Checking for double player entries: their index labels contain ") " because the
# season suffix follows the "(TEAM)" tag. The pattern is a raw string because "\)"
# is not a valid escape sequence in a normal Python string literal.
stats_09_10[stats_09_10.index.str.contains(r"\) ")]

Unnamed: 0,%3PA,%3PM,%AST,%BLK,%BLKA,%DREB,%FGA,%FGA 2PT,%FGA 3PT,%FGM,...,REB%,STL,TD3,TEAM,TO RATIO,TOT MIN,TOV,TS%,USG%,W


In [21]:
%%time

## TESTING: scrape a single season (2009) without post-processing or writing to disk

# Uncomment to limit the run to two tables for a quicker check:
# stat_types = ['traditional', 'advanced']
stats_09 = scrape_combine_season_data(2009, stat_types)

Combined shape: (441, 131)
Squeezed Combined shape: (441, 88)
Wall time: 5min 3s


In [22]:
# Drop the first two characters of every scraped column name, then compare the
# result with stat_cols in both directions.
cols = [name[2:] for name in stats_09.columns]

print(set(stat_cols).difference(cols))  # expected columns that were not scraped
set(cols).difference(stat_cols)  # scraped columns that are not expected


set()


{'MIN'}

In [21]:
# Shape of the single-season frame, followed by a preview of its first rows.
print(stats_09.shape)
stats_09.head()

(441, 87)


Unnamed: 0,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,FG%,...,%PTS FBPS,%PTS FT,%PTS OFFTO,%PTS PITP,2FGM %AST,2FGM %UAST,3FGM %AST,3FGM %UAST,FGM %AST,FGM %UAST
AJ Price,IND,23,56,23,33,48.0,22.8,8.0,19.6,41.0,...,12.7,14.6,19.5,25.4,10.6,89.4,76.7,23.3,37.9,62.1
Aaron Brooks,HOU,25,82,42,40,50.4,27.7,9.9,23.0,43.2,...,14.5,15.3,13.7,29.6,21.0,79.0,71.3,28.7,39.3,60.7
Aaron Gray,NOH,25,32,11,21,50.8,17.0,6.9,13.4,51.2,...,6.7,19.2,9.6,75.0,33.3,66.7,0.0,0.0,33.3,66.7
Acie Law,CHI,25,26,11,15,49.1,23.7,7.4,15.8,46.7,...,22.1,33.6,20.4,49.6,30.0,70.0,60.0,40.0,34.3,65.7
Adam Morrison,LAL,25,31,23,8,50.1,15.4,6.7,17.7,37.6,...,2.7,6.8,21.6,48.6,70.4,29.6,60.0,40.0,68.8,31.3


In [26]:
# Entries of stat_cols that are absent from the scraped frame.
set(stat_cols).difference(stats_09.columns)

{'%STL', 'DEFRTG', 'OPP 2ND PTS', 'OPP FBPS', 'OPP PITP', 'OPP PTS OFF TO'}

In [27]:
# List every scraped column name, one per line, in sorted order.
for col in stats_09.columns.sort_values():
    print(col)

%3PA
%3PM
%AST
%BLK
%BLKA
%DREB
%FGA
%FGA 2PT
%FGA 3PT
%FGM
%FTA
%FTM
%OREB
%PF
%PFD
%PTS
%PTS 2PT
%PTS 2PT MR
%PTS 3PT
%PTS FBPS
%PTS FT
%PTS OFFTO
%PTS PITP
%REB
%TOV
+/-
2FGM %AST
2FGM %UAST
2ND PTS
3FGM %AST
3FGM %UAST
3P%
3PA
3PM
AGE
AST
AST RATIO
AST%
AST/TO
BLK
BLKA
DD2
DEF RTG
DEF WS
DREB
DREB%
EFG%
FBPS
FG%
FGA
FGM
FGM %AST
FGM %UAST
FP
FT%
FTA
FTM
GP
L
MIN
NETRTG
OFFRTG
OPP PTS 2ND CHANCE
OPP PTS FB
OPP PTS OFF TOV
OPP PTS PAINT
OREB
OREB%
PACE
PF
PFD
PIE
PITP
PTS
PTS OFF TO
REB
REB%
STL
STL%
TD3
TEAM
TO RATIO
TOT MIN
TOV
TS%
USG%
W
