In [8]:

import os
import sys
import json
import sportsdataverse

import numpy as np
import pandas as pd

from tqdm import tqdm
from sportsdataverse.mbb import espn_mbb_pbp, load_mbb_schedule,espn_mbb_calendar,espn_mbb_schedule

# Groups = [
#50, # reg season
#55, # CBI 
#56 # CIT lol
#and 100] # 100 NCAA
# schedule = espn_mbb_schedule(dates=2019, groups =100)
# schedule['date'] = pd.to_datetime(schedule['date'].copy())
# schedule = schedule.sort_values(by=['date'])

DATA_PATH = '../../data/'


In [2]:


def download_player_boxscore_history():
    
    seasons = list(range(2003,2023))
    # since not all exist, reversing and trying until they fail
    seasons = seasons[::-1]
#     seasons=[2022]
    for season in tqdm(seasons):
        mbb_df = sportsdataverse.mbb.load_mbb_player_boxscore(seasons=[season])
        mbb_df.to_csv(os.path.join(DATA_PATH, f'ESPN/player_boxscores/{season}.csv'), index=False)
        
    return



download_player_boxscore_history()


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:35<00:00,  1.79s/it]


In [3]:

def download_schedules():
    
    seasons = list(range(2003,2023))
    # since not all exist, reversing and trying until they fail
    seasons = seasons[::-1]
    # for update
#     seasons=[2022]
    for season in tqdm(seasons):
        mbb_df = sportsdataverse.mbb.load_mbb_schedule(seasons=[season])
        mbb_df.to_csv(os.path.join(DATA_PATH, f'ESPN/schedules/by_season/{season}.csv'), index=False)
        
    return

download_schedules()


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:14<00:00,  1.38it/s]


In [4]:

# mbb_df = sportsdataverse.mbb.load_mbb_team_boxscore(seasons=range(2002,2023))


def download_team_boxscore_history():
    
    seasons = list(range(2003,2023))
    # since not all exist, reversing and trying until they fail
    seasons = seasons[::-1]
#     seasons=[2022]
    for season in tqdm(seasons):
        mbb_df = sportsdataverse.mbb.load_mbb_team_boxscore(seasons=[season])
        mbb_df.to_csv(os.path.join(DATA_PATH, f'ESPN/team_boxscores/{season}.csv'), index=False)
        
    return

download_team_boxscore_history()



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:12<00:00,  1.63it/s]


In [5]:


def download_pbp_history():
    
    seasons = list(range(2006,2023))
    # since not all exist, reversing and trying until they fail
    seasons = seasons[::-1]
#     seasons=[2022]
    for season in tqdm(seasons):
        mbb_df = sportsdataverse.mbb.load_mbb_pbp(seasons=[season])
        mbb_df.to_csv(os.path.join(DATA_PATH, f'ESPN/pbp/{season}.csv'), index=False)
        
    return



download_pbp_history()



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [09:02<00:00, 31.92s/it]


In [9]:


## finding missing games

def load_season_schedule(season):
    return pd.read_csv(os.path.join(DATA_PATH, f'ESPN/schedules/by_season/{season}.csv'))

master = []
for year in tqdm(list(range(2003, 2023))):
    schedule = load_season_schedule(year)
    master.append(schedule)
    
master = pd.concat(master).reset_index(drop=True)




100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:01<00:00, 18.00it/s]


In [30]:


# get opponent team id
def get_opponent_team_id(data):
    
    opps = data.copy()[['game_id','team_id']].drop_duplicates().reset_index(drop=True)
    opps['team_AorB'] = opps.groupby(['game_id'])['team_id'].rank('dense').astype(int).map({
        1:'A',
        2:'B'
    })
    opps = opps.pivot(index='game_id', columns=['team_AorB'], values='team_id').reset_index()
    opps.columns=['game_id','team_id','opp_id']
    opps2 = opps.copy()
    opps2.columns=['game_id','opp_id','team_id']
    opps = pd.concat([opps, opps2], axis=0).dropna().reset_index(drop=True)
    opps['team_id'] = opps['team_id'].astype(int)
    opps['opp_id'] = opps['opp_id'].astype(int)
    
    return opps


def get_possessions(pbox_data):
    
    opponent_ids = get_opponent_team_id(pbox_data.copy())
    ## estimate number of possessions from box score 
    poss = pbox_data.groupby(['game_id','team_id'])[['fga','to','fta','oreb']].sum().reset_index()
    ## commonly used possession estimate formula
    ## (FGA – OR) + TO + (0.44 * FTA)
    poss['tm_poss'] = (poss['fga'].copy()-poss['oreb'].copy())+poss['to'].copy()+(0.44*poss['fta'].copy())
    poss = poss.drop(columns=['fga','to','fta','oreb'])

    possession_key = opponent_ids.copy().merge(poss, how='left', on=['game_id','team_id'])
    poss = poss.rename(columns={'team_id':'opp_id','tm_poss':'opp_poss'})
    possession_key = possession_key.copy().merge(poss, how='left', on=['game_id','opp_id'])
    possession_key['game_possessions'] = possession_key[['tm_poss','opp_poss']].copy().mean(axis=1)
    possession_key = possession_key.drop(columns=['tm_poss','opp_poss','opp_id'])

    return possession_key.sort_values(by='game_id').reset_index(drop=True)

def add_player_boxscore_features(data):
    
    #pbox
    data['fgm'] = data['fg'].apply(lambda x: x.split('-')[0])
    data['fga'] = data['fg'].apply(lambda x: x.split('-')[-1])
    data['fg3m'] = data['fg3'].apply(lambda x: x.split('-')[0])
    data['fg3a'] = data['fg3'].apply(lambda x: x.split('-')[-1])
    data['ftm'] = data['ft'].apply(lambda x: x.split('-')[0])
    data['fta'] = data['ft'].apply(lambda x: x.split('-')[-1])

    data['fgm']= data['fgm'].replace('',0)
    data['fgm'] = data['fgm'].astype(int)
    data['fga']= data['fga'].replace('',0)
    data['fga'] = data['fga'].astype(int)
    data['ftm']= data['ftm'].replace('',0)
    data['ftm'] = data['ftm'].astype(int)
    data['fta']= data['fta'].replace('',0)
    data['fta'] = data['fta'].astype(int)
    
    data['oreb']= data['oreb'].replace('',0)
    data['oreb'] = data['oreb'].astype(int)
    data['dreb']= data['dreb'].replace('',0)
    data['dreb'] = data['dreb'].astype(int)
    data['reb']= data['reb'].replace('',0)
    data['reb'] = data['reb'].astype(int)

    data['fg3m']= data['fg3m'].replace('',0)
    data['fg3m'] = data['fg3m'].astype(int)
    data['fg3a']= data['fg3a'].replace('',0)
    data['fg3a'] = data['fg3a'].astype(int)

    data['fg2m'] = data['fgm'].copy()-data['fg3m'].copy()
    data['fg2a'] = data['fga'].copy()-data['fg3a'].copy()
    
    possess = get_possessions(data.copy())
    data = data.merge(possess, how='left', on=['game_id','team_id'])

    data['fg%'] = (data['fgm'].copy()/data['fga'].copy()).fillna(0)
    data['fg2%'] = (data['fg2m'].copy()/data['fg2a'].copy()).fillna(0)
    data['fg3%'] = (data['fg3m'].copy()/data['fg3a'].copy()).fillna(0)

    data['eFG%'] = ((data['fgm'].copy()+(data['fg3m'].copy()*0.5))/data['fga'].copy()).fillna(0)
    data['TS%'] = ((data['pts'].copy())/(2*(data['fga'].copy()+(0.44*data['fta'].copy())))).fillna(0)
    # pbox[['fg','fg3m','fga']].dtypes
    data['pts_pm'] = data['pts'].copy()/(data['min'].copy()+1)# 1 added to prevent infinities
    data['reb_pm'] = data['reb'].copy()/(data['min'].copy()+1)
    data['ast_pm'] = data['ast'].copy()/(data['min'].copy()+1)
    data['stl_pm'] = data['stl'].copy()/(data['min'].copy()+1)
    data['blk_pm'] = data['blk'].copy()/(data['min'].copy()+1)
    data['to_pm'] = data['to'].copy()/(data['min'].copy()+1)
    data['pf_pm'] = data['pf'].copy()/(data['min'].copy()+1)
    
    ## could be improved with OT markers
    ## percentage of estimated possessions player took part of
    data['player_possessions'] = data['game_possessions'].copy()*(data['min'].copy()/(40*2)) # times 2 because game possessions = 
    
    data['pts_pp'] = data['pts'].copy()/(data['player_possessions'].copy()+1)
    data['reb_pp'] = data['reb'].copy()/(data['player_possessions'].copy()+1)
    data['ast_pp'] = data['ast'].copy()/(data['player_possessions'].copy()+1)
    data['stl_pp'] = data['stl'].copy()/(data['player_possessions'].copy()+1)
    data['blk_pp'] = data['blk'].copy()/(data['player_possessions'].copy()+1)
    data['to_pp'] = data['to'].copy()/(data['player_possessions'].copy()+1)
    data['pf_pp'] = data['pf'].copy()/(data['player_possessions'].copy()+1)
    
    return data


def clean_player_boxscores(data):
    for stat_col in ['min','pts','oreb','dreb','reb','ast','stl','blk','to','pf']:
        data[stat_col] = data[stat_col].replace('--',0)
        data[stat_col] = data[stat_col].astype(int)

    return data


def load_player_boxscore_season(year):
    return add_player_boxscore_features(\
            clean_player_boxscores(\
            pd.read_csv(os.path.join(DATA_PATH, f'ESPN/player_boxscores/{year}.csv'))))

# get game ids we have
pboxes = []
for year in tqdm(list(range(2003, 2023))):
    pbox = load_player_boxscore_season(year)
    pbox = pbox.copy()[['game_id','season','season_type']].drop_duplicates(subset=['game_id']).sort_values(by=['game_id','season_type']).reset_index(drop=True)
    pboxes.append(pbox)
    
pboxes = pd.concat(pboxes).reset_index(drop=True)






  pbox = load_player_boxscore_season(year)
  pbox = load_player_boxscore_season(year)
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:14<00:00,  1.37it/s]


In [66]:

# have = set(pboxes.loc[pboxes['season_type']==3]['game_id'].unique())
# want = set(master.loc[master['season_type']==3]['game_id'].unique())

# can_hopefully_pull = list(want.difference(have))

# needs encoder because of int64s
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)

for game_id in tqdm(can_hopefully_pull[624+598:]):
    game_pbp = espn_mbb_pbp(game_id=game_id)
    path = os.path.join(DATA_PATH, f'ESPN/pbp/manual_collection/{game_id}.json')
    with open(path, "w") as f:
        json.dump(game_pbp, f, cls=NpEncoder)



 79%|█████████████████████████████████████████████████████████████████████████████████████████▉                        | 606/768 [13:13<03:19,  1.23s/it]

Download error: http://cdn.espn.com/core/mens-college-basketball/playbyplay?gameId=320732382&xhr=1&render=false&userab=18


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 768/768 [16:53<00:00,  1.32s/it]


In [50]:

import random

# for game in 
test = espn_mbb_pbp(game_id=random.choice(can_hopefully_pull))



In [68]:
with open(path, 'r') as f:
    test = json.load(f)


{'gameId': 400949246,
 'plays': [{'shootingPlay': False,
   'sequenceNumber': '101799901',
   'period.displayValue': '1st Half',
   'period.number': 1,
   'homeScore': 0,
   'coordinate.x': 25,
   'coordinate.y': 0,
   'scoringPlay': False,
   'clock.displayValue': '20:00',
   'team.id': '153',
   'type.id': '615',
   'type.text': 'Jumpball',
   'awayScore': 0,
   'id': 400949246101799901,
   'text': 'Jump Ball won by North Carolina',
   'scoreValue': 0,
   'participants.0.athlete.id': None,
   'participants.1.athlete.id': None,
   'season': 2017,
   'seasonType': 3,
   'awayTeamId': 2250,
   'awayTeamName': 'Gonzaga',
   'awayTeamMascot': 'Bulldogs',
   'awayTeamAbbrev': 'GONZ',
   'awayTeamNameAlt': 'Gonzaga',
   'homeTeamId': 153,
   'homeTeamName': 'North Carolina',
   'homeTeamMascot': 'Tar Heels',
   'homeTeamAbbrev': 'UNC',
   'homeTeamNameAlt': 'North Carolina',
   'homeTeamSpread': 1.0,
   'gameSpread': -1.0,
   'homeFavorite': True,
   'gameSpreadAvailable': True,
   'game_id

In [88]:

test_df = pd.json_normalize(test['plays'])
test_df['type.id'] = test_df['type.id'].astype(int)
test_df.loc[test_df['participants.1.athlete.id'].notnull()][['type.text','text']]
# test_df.loc[test_df['type.id'].notnull()][['type.text','text']]
test_df['type.text'].value_counts()
# list(test_df)
# list(test_df)


Defensive Rebound        68
JumpShot                 62
MadeFreeThrow            52
Three Point Jump Shot    46
PersonalFoul             44
Offensive Rebound        27
LayUpShot                20
Lost Ball Turnover       18
Block Shot               13
Dead Ball Rebound        11
Steal                     9
OfficialTVTimeOut         8
ShortTimeOut              5
Jumpball                  3
DunkShot                  3
End Period                1
TipShot                   1
End Game                  1
Name: type.text, dtype: int64

In [77]:

test_df['type.text'].value_counts()


Defensive Rebound        68
JumpShot                 62
MadeFreeThrow            52
Three Point Jump Shot    46
PersonalFoul             44
Offensive Rebound        27
LayUpShot                20
Lost Ball Turnover       18
Block Shot               13
Dead Ball Rebound        11
Steal                     9
OfficialTVTimeOut         8
ShortTimeOut              5
Jumpball                  3
DunkShot                  3
End Period                1
TipShot                   1
End Game                  1
Name: type.text, dtype: int64

In [63]:
os.path.exists(os.path.join(DATA_PATH, f'ESPN/pbp/manual_collection'))

True

In [39]:

from sportsdataverse.mbb import espn_mbb_pbp




{'gameId': 401265031,
 'plays': [{'shootingPlay': False,
   'sequenceNumber': '101799901',
   'period.displayValue': '1st Half',
   'period.number': 1,
   'homeScore': 0,
   'coordinate.x': 25,
   'coordinate.y': 0,
   'scoringPlay': False,
   'clock.displayValue': '20:00',
   'team.id': '52',
   'type.id': '615',
   'type.text': 'Jumpball',
   'awayScore': 0,
   'id': 401265031101799901,
   'text': 'Jump Ball won by Florida State',
   'scoreValue': 0,
   'participants.0.athlete.id': None,
   'participants.1.athlete.id': None,
   'season': 2021,
   'seasonType': 2,
   'awayTeamId': 52,
   'awayTeamName': 'Florida State',
   'awayTeamMascot': 'Seminoles',
   'awayTeamAbbrev': 'FSU',
   'awayTeamNameAlt': 'Florida St',
   'homeTeamId': 228,
   'homeTeamName': 'Clemson',
   'homeTeamMascot': 'Tigers',
   'homeTeamAbbrev': 'CLEM',
   'homeTeamNameAlt': 'Clemson',
   'homeTeamSpread': 1.5,
   'gameSpread': -1.5,
   'homeFavorite': True,
   'gameSpreadAvailable': True,
   'game_id': 40126503