In [9]:
from basketball_reference_scraper.teams import get_roster
from basketball_reference_scraper.players import get_stats 
from ratelimit import limits, RateLimitException, sleep_and_retry

In [21]:
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
import time
import os
import numpy as np
import csv
import glob


In [11]:
try:
    # Try the top-level modules first (works when the notebook runs next to them).
    from constants import TEAM_TO_TEAM_ABBR, TEAM_SETS
    from utils import remove_accents
except ImportError:
    # Only a failed import should trigger the fallback to the installed package;
    # any other error (syntax error, KeyboardInterrupt, ...) must stay visible.
    from basketball_reference_scraper.constants import TEAM_TO_TEAM_ABBR, TEAM_SETS
    from basketball_reference_scraper.utils import remove_accents

In [12]:
# At most CALLS call(s) per RATE_LIMIT seconds, i.e. 1 call per 60 s with the
# values below (this is NOT 30 calls per minute).
CALLS = 1
RATE_LIMIT = 60

@sleep_and_retry
@limits(calls=CALLS, period=RATE_LIMIT)
def check_limit():
    ''' Empty function; its decorators apply the call limit, so call it before each request '''
    return

In [13]:
# Show the kernel's working directory; the relative './data/...' paths used by
# the cells below resolve against it.
os.getcwd()

'/Users/amywinecoff/Documents/CITP/Research/Github/ml-teaching/basketball_reference_scraper'

In [14]:
def get_roster(team, season_end_year):
    """Scrape one team's roster table from basketball-reference.com.

    This definition shadows the ``get_roster`` imported from
    ``basketball_reference_scraper.teams``; it calls ``check_limit()`` before
    the HTTP request so that requests are throttled.

    Parameters
    ----------
    team : str
        Team abbreviation exactly as it appears in the site's URL.
    season_end_year : int or str
        Year in which the season ends (used in the URL).

    Returns
    -------
    pandas.DataFrame or None
        Roster with columns NUMBER, PLAYER, POS, HEIGHT, WEIGHT, BIRTH_DATE,
        NATIONALITY, EXPERIENCE, COLLEGE. ``None`` is returned when the page
        could not be parsed; the error and the HTTP status code are printed.

    Raises
    ------
    requests.exceptions.RequestException
        Network errors (including the timeout) are not caught here.
    """
    # Local import keeps this cell self-contained; pandas deprecates passing a
    # literal HTML string to read_html, so the markup is wrapped in StringIO.
    from io import StringIO

    check_limit()
    r = get(
        f'https://www.basketball-reference.com/teams/{team}/{season_end_year}.html',
        timeout=30)  # never hang forever on a stalled connection

    try:
        soup = BeautifulSoup(r.content, 'html.parser')
        table = soup.find('table')
        if table is None:
            raise ValueError(f'no roster table found for {team} {season_end_year}')

        roster = pd.read_html(StringIO(str(table)))[0]
        roster.columns = ['NUMBER', 'PLAYER', 'POS', 'HEIGHT', 'WEIGHT', 'BIRTH_DATE',
                          'NATIONALITY', 'EXPERIENCE', 'COLLEGE']
        # Drop rows without a player name. The explicit copy makes the column
        # assignments below operate on an independent frame rather than a slice.
        roster = roster[roster['PLAYER'].notna()].copy()
        roster['PLAYER'] = roster['PLAYER'].apply(
            lambda name: remove_accents(name, team, season_end_year))
        # Handle rows that have a player name but empty fields.
        roster['BIRTH_DATE'] = roster['BIRTH_DATE'].apply(
            lambda x: pd.to_datetime(x) if pd.notna(x) else pd.NaT)
        roster['NATIONALITY'] = roster['NATIONALITY'].apply(
            lambda x: x.upper() if pd.notna(x) else '')
    except Exception as e:
        print(e)
        print(r.status_code)
        # Never hand back a half-processed frame: callers would persist it.
        return None

    return roster

In [16]:
# Set to True to (re)download team rosters; existing CSV files are kept.
RUN_ROSTER = False

if RUN_ROSTER:
    season_end = 2021
    # NOTE(review): this list has 29 entries and no Charlotte code, and 'NOH',
    # 'BKN' and 'UTH' do not look like the abbreviations basketball-reference
    # uses in its 2021 URLs (presumably NOP / BRK / UTA) - confirm against the
    # site before relying on full coverage. Left unchanged here because the
    # codes are also part of the CSV file names read by a later cell.
    nba_teams = ['ATL','BOS','CHI','CLE','DAL','DEN','DET','GSW','HOU','IND','LAC',
    'LAL','MEM','MIA','MIL','MIN','NOH','NYK','BKN','OKC','ORL','PHI','PHO','POR','SAC',
    'SAS','TOR','UTH','WAS']
    failed_teams = []       # teams whose roster could not be fetched or saved
    players_positions = []  # flat list of [PLAYER, POS, TEAM] rows

    # to_csv does not create missing directories.
    os.makedirs('./data', exist_ok=True)

    for team in nba_teams:
        print(f'Current team: {team}')
        roster_path = f'./data/team_roster_{team}_{season_end}.csv'
        # don't regenerate the file if it is already there
        if os.path.exists(roster_path):
            continue
        try:
            # Use season_end (not a second hard-coded year) so the downloaded
            # season always matches the year in the file name.
            team_roster = get_roster(team=team, season_end_year=season_end)
            if team_roster is None:
                raise ValueError(f'no roster returned for {team}')
            team_roster['TEAM'] = team
            team_roster.to_csv(roster_path, index=False)
            # extend (not append) keeps one row per player, matching the cell
            # that later rebuilds this list from the CSV files.
            players_positions.extend(
                team_roster[['PLAYER', 'POS', 'TEAM']].values.tolist())
            time.sleep(180)

        except Exception as e:
            print(team)
            print(e)
            failed_teams.append(team)


In [6]:
# Flip to True to rebuild ./data/all_players_positions.csv from the per-team roster files.
WRITE_PLAYERS_FILE = False

if WRITE_PLAYERS_FILE:
    season_end = 2021
    nba_teams = ['ATL','BOS','CHI','CLE','DAL','DEN','DET','GSW','HOU','IND','LAC',
        'LAL','MEM','MIA','MIL','MIN','NOH','NYK','BKN','OKC','ORL','PHI','PHO','POR','SAC',
        'SAS','TOR','UTH','WAS']

    # One [PLAYER, POS, TEAM] row per player, gathered across every roster file found.
    players_positions = []
    for team in nba_teams:
        roster_path = f'./data/team_roster_{team}_{season_end}.csv'
        # Teams without a saved roster are skipped.
        if not os.path.exists(roster_path):
            continue
        team_roster = pd.read_csv(roster_path)
        players_positions += team_roster[['PLAYER', 'POS', 'TEAM']].values.tolist()

    players_positions_df = pd.DataFrame(
        players_positions, columns=['player', 'position', 'team'])
    players_positions_df.to_csv('./data/all_players_positions.csv', index=False)

In [17]:
def fetch_player_career(player):
    """Return the per-game regular-season rows for `player`, one row per season played.

    Rows whose "G" (games played) value is not made of digits - e.g. the
    'Did Not Play' rows - are dropped.
    """
    def _has_numeric_games(games_played):
        # A '.' is tolerated so values such as '82.0' still count as numeric.
        return str(games_played).replace('.', '').isdigit()

    season_rows = get_stats(player, stat_type='PER_GAME', playoffs=False,
                            career=False, ask_matches=False)
    played_mask = season_rows['G'].apply(_has_numeric_games)
    return season_rows[played_mask]

def extract_stats(career_stats):
    """Average a player's per-season stats over all seasons.

    Parameters
    ----------
    career_stats : pandas.DataFrame
        One row per season with columns AST, STL, BLK, TOV, PF and PTS whose
        values can be cast to float.

    Returns
    -------
    list
        ``[ast, stl, blk, tov, pf, pts]`` - for each column, the sum of the
        season values divided by the number of rows (an unweighted mean of
        the season rows).

    Raises
    ------
    ValueError
        If `career_stats` has no rows (previously an opaque ZeroDivisionError).
    """
    stat_columns = ('AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS')

    total_seasons = len(career_stats)
    if total_seasons == 0:
        raise ValueError('career_stats has no season rows to average')

    # Same arithmetic for every stat, so compute it once per column.
    return [
        sum(career_stats[column].astype(float)) / total_seasons
        for column in stat_columns
    ]
    

In [19]:
players_positions_df = pd.read_csv('./data/all_players_positions.csv')

PLAYER_STATS_DIR = './data/players'
SECONDS_BETWEEN_PLAYERS = 30  # pause after every attempted download

careers = []
columns = ['ast','stl','blk','tov','pf','pts', 'name', 'position', 'team']

failed_players = []  # players whose stats could not be fetched or saved

# to_csv does not create missing directories; without this every player fails.
os.makedirs(PLAYER_STATS_DIR, exist_ok=True)

for index, row in players_positions_df.iterrows():

    player_name = row['player'].replace(" ",'_')
    print(f'Current player: {player_name}')
    stats_path = f'{PLAYER_STATS_DIR}/{player_name}_stats.csv'
    #don't regenerate the file if it is already there
    if os.path.exists(stats_path):
        continue

    try:
        d = fetch_player_career(row['player'])
        bball_features = extract_stats(d)
        #add the player name, position, and team
        bball_features.extend([row['player'], row['position'], row['team']])

        # A one-row frame built from the list keeps the stats numeric; going
        # through np.array would coerce the mixed list to strings.
        bball_df = pd.DataFrame([bball_features], columns=columns)
        bball_df.to_csv(stats_path, index=False)

    except Exception as e:
        print(row['player'])
        print(e)
        failed_players.append(row['player'])

    finally:
        # Throttle after failures too: a failed attempt may still have hit the
        # site, and skipping the pause would send the next request immediately.
        time.sleep(SECONDS_BETWEEN_PLAYERS)

Current player: Bogdan_Bogdanović
Current player: Clint_Capela
Clint Capela
string index out of range
Current player: John_Collins
Current player: Kris_Dunn
Current player: Bruno_Fernando
Current player: Danilo_Gallinari
Current player: Brandon_Goodwin
Current player: Solomon_Hill
Current player: Kevin_Huerter
Current player: De'Andre_Hunter
Current player: Nathan_Knight
Current player: Skylar_Mays
Current player: Onyeka_Okongwu
Current player: Cam_Reddish
Current player: Rajon_Rondo
Current player: Tony_Snell
Current player: Lou_Williams
Current player: Trae_Young
Current player: Jaylen_Brown
Current player: Carsen_Edwards
Current player: Tacko_Fall
Current player: Evan_Fournier
Current player: Javonte_Green
Current player: Luke_Kornet
Current player: Romeo_Langford
Current player: Aaron_Nesmith
Current player: Semi_Ojeleye
Current player: Jabari_Parker
Current player: Payton_Pritchard
Current player: Marcus_Smart
Current player: Jayson_Tatum
Current player: Jeff_Teague
Current player

Current player: Justise_Winslow
Current player: Precious_Achiuwa
Current player: Bam_Adebayo
Current player: Trevor_Ariza
Current player: Nemanja_Bjelica
Current player: Avery_Bradley
Current player: Jimmy_Butler
Current player: Dewayne_Dedmon
Current player: Goran_Dragić
Current player: Maurice_Harkless
Current player: Udonis_Haslem
Current player: Tyler_Herro
Current player: Andre_Iguodala
Current player: Meyers_Leonard
Current player: Kendrick_Nunn
Current player: KZ_Okpala
Current player: Victor_Oladipo
Current player: Kelly_Olynyk
Current player: Duncan_Robinson
Current player: Chris_Silva
Current player: Max_Strus
Current player: Gabe_Vincent
Current player: Jaylen_Adams
Current player: Giannis_Antetokounmpo
Current player: Thanasis_Antetokounmpo
Current player: D.J._Augustin
D.J. Augustin
string index out of range
Current player: Elijah_Bryant
Elijah Bryant
list index out of range
Current player: Pat_Connaughton
Current player: Torrey_Craig
Current player: Mamadi_Diakite
Current

Current player: Troy_Brown_Jr.
Current player: Thomas_Bryant
Current player: Daniel_Gafford
Current player: Anthony_Gill
Current player: Rui_Hachimura
Current player: Chandler_Hutchison
Current player: Alex_Len
Current player: Robin_Lopez
Current player: Garrison_Mathews
Current player: Raul_Neto
Current player: Anžejs_Pasečņiks
Current player: Jerome_Robinson
Current player: Ish_Smith
Current player: Moritz_Wagner
Current player: Russell_Westbrook
Current player: Cassius_Winston


In [20]:
# Re-check the working directory before aggregating the per-player CSV files.
os.getcwd()

'/Users/amywinecoff/Documents/CITP/Research/Github/ml-teaching/basketball_reference_scraper'

In [23]:
# Combine every per-player '<name>_stats.csv' file into one table.
# The directory is relative (the same './data/players' the scraping cell
# writes to) instead of an absolute user path, and the working directory is no
# longer changed, so earlier cells still work when re-run after this one.
PLAYERS_DIR = os.path.join('.', 'data', 'players')
NBA_COLUMNS = ['ast','stl','blk','tov','pf','pts',
               'name', 'position', 'team']

os.makedirs(PLAYERS_DIR, exist_ok=True)

# Match only the per-player files: the combined 'nba_player_statistics.csv'
# lives in the same directory, and a plain '*.csv' would read it back in on a
# re-run and duplicate every row. sorted() makes the row order deterministic.
csv_files = sorted(glob.glob(os.path.join(PLAYERS_DIR, '*_stats.csv')))

# Read everything first and concatenate once (no quadratic concat-in-a-loop,
# and no loop variable shadowing the imported csv module).
player_frames = [pd.read_csv(path) for path in csv_files]
if player_frames:
    nba = pd.concat(player_frames, axis=0)
else:
    # pd.concat rejects an empty list; keep the expected header instead.
    nba = pd.DataFrame(columns=NBA_COLUMNS)

nba.to_csv(os.path.join(PLAYERS_DIR, 'nba_player_statistics.csv'), index=False)