In [2]:
import nba_api.stats.endpoints as nba
import pandas as pd
import numpy as np
import tqdm
import time

# Season labels in the 'YYYY-YY' form, covering 2017-18 through 2023-24.
# Built from the starting calendar year of each season.
SEASONS = ['{}-{:02d}'.format(start_year, (start_year + 1) % 100)
           for start_year in range(2017, 2024)]

In [29]:
# Get the league game log for every season listed in SEASONS and stack the
# per-season frames into one `games` frame.
# The frames are collected in a list and concatenated once at the end, so the
# rows gathered so far are not re-copied on every iteration.
season_logs = []
for season in SEASONS:
    print('Getting games for season: {}'.format(season))
    season_logs.append(nba.leaguegamelog.LeagueGameLog(season=season).get_data_frames()[0])

# pd.concat raises on an empty list; fall back to an empty frame in that case.
games = pd.concat(season_logs) if season_logs else pd.DataFrame()

Getting games for season: 2017-18
Getting games for season: 2018-19
Getting games for season: 2019-20
Getting games for season: 2020-21
Getting games for season: 2021-22
Getting games for season: 2022-23
Getting games for season: 2023-24


In [30]:
print('Total games: {}'.format(len(games)))

# Add a column indicating whether the team was home or away.
# A row is treated as a home row when MATCHUP contains the literal text 'vs.'
# (presumably 'AAA vs. BBB' for the host and 'AAA @ BBB' for the visitor —
# TODO confirm against the game log). regex=False is required so that the '.'
# is matched literally instead of as the regex "any character" wildcard.
games['HOME_TEAM'] = np.where(games['MATCHUP'].str.contains('vs.', regex=False), 1, 0)

# Add a column indicating whether the home team won based on the WL and HOME_TEAM columns:
# either this row is the home team and it won, or this row is the away team and it lost.
# Rows whose WL is neither 'W' nor 'L' end up as 0.
home_row = games['HOME_TEAM'] == 1
team_won = games['WL'] == 'W'
team_lost = games['WL'] == 'L'
games['HOME_TEAM_WON'] = np.where((team_won & home_row) | (team_lost & ~home_row), 1, 0)

# Save games to csv
games.to_csv('../data/games.csv', index=False)

Total games: 15314


In [31]:
# Download advanced box scores for every game in games.csv whose id is not yet
# present in advanced_box_scores.csv, appending each game's rows to that file
# as soon as they are fetched.
games = pd.read_csv('../data/games.csv')
advanced_box_scores = pd.read_csv('../data/advanced_box_scores.csv')
already_fetched = advanced_box_scores['gameId'].unique()
games_to_get = games[~games['GAME_ID'].isin(already_fetched)]['GAME_ID'].unique()
skipped_games = []

# The context manager closes the progress bar even if the loop is interrupted.
with tqdm.tqdm(total=len(games_to_get)) as loading_bar:
    for game_id in games_to_get:
        # Restore the "00" prefix on the id (presumably dropped when the CSV
        # column was parsed as an integer — TODO confirm).
        game_id = "00" + str(game_id)
        game = None
        # Up to 5 attempts, pausing 0.6 s after each one (success or failure).
        for _attempt in range(5):
            try:
                game = nba.boxscoreadvancedv3.BoxScoreAdvancedV3(game_id=game_id).get_data_frames()[0]
                time.sleep(0.6)
                break
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt still stops the loop instead of being counted
            # as a failed attempt.
            except Exception:
                time.sleep(0.6)
        if game is None:
            print('Skipping game: {}'.format(game_id))
            skipped_games.append(game_id)
        else:
            # Append game data to csv (no header: the file already has one)
            game.to_csv('../data/advanced_box_scores.csv', mode='a', header=False, index=False)
        loading_bar.update(1)

100%|██████████| 3/3 [00:03<00:00,  1.00s/it]

In [32]:
# Download traditional box scores for every game in games.csv whose id is not
# yet present in games_details.csv, appending each game's rows to that file.
games = pd.read_csv('../data/games.csv')
traditional_box_scores = pd.read_csv('../data/games_details.csv')
already_fetched = traditional_box_scores['GAME_ID'].unique()
games_to_get = games[~games['GAME_ID'].isin(already_fetched)]['GAME_ID'].unique()
skipped_games = []

# The context manager closes the progress bar even if the loop is interrupted.
with tqdm.tqdm(total=len(games_to_get)) as loading_bar:
    for game_id in games_to_get:
        # Restore the "00" prefix on the id (presumably dropped when the CSV
        # column was parsed as an integer — TODO confirm).
        game_id = "00" + str(game_id)
        game = None
        # Up to 5 attempts, pausing 0.6 s after each failure.
        for _attempt in range(5):
            try:
                game = nba.boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id).get_data_frames()[0]
                break
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt still stops the loop instead of being counted
            # as a failed attempt.
            except Exception:
                time.sleep(0.6)
        if game is None:
            print('Skipping game: {}'.format(game_id))
            skipped_games.append(game_id)
        else:
            # Append game data to csv (no header: the file already has one)
            game.to_csv('../data/games_details.csv', mode='a', header=False, index=False)
        loading_bar.update(1)

  traditional_box_scores = pd.read_csv('../data/games_details.csv')
100%|██████████| 3/3 [00:04<00:00,  1.37s/it]


In [33]:
# Get usage stats
# Fetches usage box scores for every game in games.csv whose id is not yet
# present in usage_stats.csv and appends each game's rows to that file.
games = pd.read_csv('../data/games.csv')
usage_stats = pd.read_csv('../data/usage_stats.csv')
already_fetched = usage_stats['GAME_ID'].unique()
games_to_get = games[~games['GAME_ID'].isin(already_fetched)]['GAME_ID'].unique()
# Initialise the skip list here so the cell does not depend on a list left
# behind by a previously executed cell.
skipped_games = []
for game_id in tqdm.tqdm(games_to_get):
    # Restore the "00" prefix on the id (presumably dropped when the CSV
    # column was parsed as an integer — TODO confirm).
    game_id = "00" + str(game_id)
    game = None
    # Up to 5 attempts, pausing 0.6 s after each failure.
    for _attempt in range(5):
        try:
            game = nba.boxscoreusagev2.BoxScoreUsageV2(game_id=game_id).get_data_frames()[0]
            break
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt still stops the loop.
        except Exception:
            time.sleep(0.6)
    if game is None:
        print('Skipping game: {}'.format(game_id))
        skipped_games.append(game_id)
    else:
        # Append game data to csv (no header: the file already has one)
        game.to_csv('../data/usage_stats.csv', mode='a', header=False, index=False)

100%|██████████| 3/3 [00:03<00:00,  1.19s/it]


In [5]:
# Get defensive box scores
# Fetches defensive box scores for every game in games.csv whose id is not yet
# present in defensive_box_scores.csv, appends each game's rows to that file,
# and records the ids that could not be fetched in games_to_skip.csv.
games = pd.read_csv('../data/games.csv')
defensive_box_scores = pd.read_csv('../data/defensive_box_scores.csv')
already_fetched = defensive_box_scores['gameId'].unique()
games_to_get = games[~games['GAME_ID'].isin(already_fetched)]['GAME_ID'].unique()

# games_to_skip must exist before the loop. Start from the previously saved
# list when there is one, so earlier entries survive the rewrite at the end of
# this cell; otherwise start empty. Ids are kept as strings to preserve the
# leading zeros.
try:
    games_to_skip = pd.read_csv('../data/games_to_skip.csv', dtype={'GAME_ID': str})
except (FileNotFoundError, pd.errors.EmptyDataError):
    games_to_skip = pd.DataFrame({'GAME_ID': pd.Series(dtype=str)})

newly_skipped = []
for game_id in tqdm.tqdm(games_to_get):
    # Restore the "00" prefix on the id (presumably dropped when the CSV
    # column was parsed as an integer — TODO confirm).
    game_id = "00" + str(game_id)
    game = None
    # Up to 5 attempts, pausing 0.6 s after each failure.
    for _attempt in range(5):
        try:
            game = nba.boxscoredefensivev2.BoxScoreDefensiveV2(game_id=game_id).get_data_frames()[0]
            break
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt still stops the loop.
        except Exception:
            time.sleep(0.6)
    if game is None:
        print('Skipping game: {}'.format(game_id))
        newly_skipped.append(game_id)
    else:
        # Append game data to csv (no header: the file already has one)
        game.to_csv('../data/defensive_box_scores.csv', mode='a', header=False, index=False)

# Merge this run's failures into the saved list once, after the loop, and drop
# repeats so that re-running the cell does not record the same game twice.
if newly_skipped:
    games_to_skip = pd.concat([games_to_skip, pd.DataFrame({'GAME_ID': newly_skipped})])
games_to_skip = games_to_skip.drop_duplicates()
games_to_skip.to_csv('../data/games_to_skip.csv', index=False)

  4%|▎         | 1/28 [00:04<01:50,  4.10s/it]

Skipping game: 0021700067


  7%|▋         | 2/28 [00:08<01:46,  4.12s/it]

Skipping game: 0021700088


 11%|█         | 3/28 [00:12<01:42,  4.09s/it]

Skipping game: 0021700230


 14%|█▍        | 4/28 [00:16<01:38,  4.11s/it]

Skipping game: 0021700365


 18%|█▊        | 5/28 [00:20<01:36,  4.18s/it]

Skipping game: 0021700366


 21%|██▏       | 6/28 [00:24<01:31,  4.18s/it]

Skipping game: 0021700376


 25%|██▌       | 7/28 [00:28<01:25,  4.09s/it]

Skipping game: 0021700431


 29%|██▊       | 8/28 [00:32<01:20,  4.02s/it]

Skipping game: 0021700614


 32%|███▏      | 9/28 [00:36<01:15,  3.98s/it]

Skipping game: 0021800014


 36%|███▌      | 10/28 [00:40<01:11,  3.98s/it]

Skipping game: 0021800218


 39%|███▉      | 11/28 [00:44<01:07,  3.97s/it]

Skipping game: 0021800418


 43%|████▎     | 12/28 [00:48<01:03,  3.99s/it]

Skipping game: 0021800429


 46%|████▋     | 13/28 [00:54<01:07,  4.47s/it]

Skipping game: 0021800665


 50%|█████     | 14/28 [00:59<01:08,  4.89s/it]

Skipping game: 0021800678


 54%|█████▎    | 15/28 [01:04<01:00,  4.65s/it]

Skipping game: 0021800911


 57%|█████▋    | 16/28 [01:13<01:12,  6.01s/it]

Skipping game: 0021900369


 61%|██████    | 17/28 [01:17<01:00,  5.47s/it]

Skipping game: 0021900380


 64%|██████▍   | 18/28 [01:21<00:50,  5.05s/it]

Skipping game: 0021900669


 68%|██████▊   | 19/28 [01:26<00:45,  5.04s/it]

Skipping game: 0021900792


 71%|███████▏  | 20/28 [01:30<00:38,  4.82s/it]

Skipping game: 0022100132


 75%|███████▌  | 21/28 [01:35<00:33,  4.75s/it]

Skipping game: 0022301230


 79%|███████▊  | 22/28 [01:40<00:28,  4.76s/it]

Skipping game: 0022300383


 82%|████████▏ | 23/28 [01:44<00:22,  4.53s/it]

Skipping game: 0022300661


 89%|████████▉ | 25/28 [01:48<00:10,  3.45s/it]

Skipping game: 0022300665


 93%|█████████▎| 26/28 [01:52<00:07,  3.63s/it]

Skipping game: 0022300666


 96%|█████████▋| 27/28 [01:56<00:03,  3.79s/it]

Skipping game: 0022300667


100%|██████████| 28/28 [02:00<00:00,  4.31s/it]

Skipping game: 0022300668





In [None]:
# Get hustle box scores
# Requests the hustle box score for every game in games.csv and appends the
# rows to hustle_box_scores.csv.
# NOTE: every game id is requested and rows are appended without a header, so
# re-running this cell adds rows for games that are already in the file. Once
# the CSV exists with a header row, the incremental filter used for the other
# box-score files (skip ids already listed in its 'gameId' column) can be
# applied here as well.
games = pd.read_csv('../data/games.csv')
games_to_get = games['GAME_ID'].unique()
for game_id in tqdm.tqdm(games_to_get):
    # Restore the "00" prefix on the id (presumably dropped when the CSV
    # column was parsed as an integer — TODO confirm).
    game_id = "00" + str(game_id)
    game = None
    # Up to 5 attempts, pausing 0.6 s after each failure.
    for _attempt in range(5):
        try:
            game = nba.boxscorehustlev2.BoxScoreHustleV2(game_id=game_id).get_data_frames()[0]
            break
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt still stops this long-running loop.
        except Exception:
            time.sleep(0.6)
    if game is None:
        print('Skipping game: {}'.format(game_id))
    else:
        # Append game data to csv
        game.to_csv('../data/hustle_box_scores.csv', mode='a', header=False, index=False)


In [11]:
# Display the first data frame returned by the hustle box score endpoint for
# one fixed game id, to inspect its columns.
(
    nba.boxscorehustlev2
    .BoxScoreHustleV2(game_id='0021600001')
    .get_data_frames()[0]
)

Unnamed: 0,gameId,teamId,teamCity,teamName,teamTricode,teamSlug,personId,firstName,familyName,nameI,...,screenAssists,screenAssistPoints,looseBallsRecoveredOffensive,looseBallsRecoveredDefensive,looseBallsRecoveredTotal,offensiveBoxOuts,defensiveBoxOuts,boxOutPlayerTeamRebounds,boxOutPlayerRebounds,boxOuts
0,21600001,1610612752,New York,Knicks,NYK,knicks,2546,Carmelo,Anthony,C. Anthony,...,1,3,0,0,1,0,0,0,0,0
1,21600001,1610612752,New York,Knicks,NYK,knicks,204001,Kristaps,Porzingis,K. Porzingis,...,1,2,0,0,0,0,0,0,0,0
2,21600001,1610612752,New York,Knicks,NYK,knicks,201149,Joakim,Noah,J. Noah,...,3,6,0,0,4,0,0,0,0,0
3,21600001,1610612752,New York,Knicks,NYK,knicks,201584,Courtney,Lee,C. Lee,...,0,0,0,0,2,0,0,0,0,0
4,21600001,1610612752,New York,Knicks,NYK,knicks,201565,Derrick,Rose,D. Rose,...,0,0,0,0,2,0,0,0,0,0
5,21600001,1610612752,New York,Knicks,NYK,knicks,201943,Brandon,Jennings,B. Jennings,...,0,0,0,0,0,0,0,0,0,0
6,21600001,1610612752,New York,Knicks,NYK,knicks,203124,Kyle,O'Quinn,K. O'Quinn,...,2,4,0,0,0,0,0,0,0,0
7,21600001,1610612752,New York,Knicks,NYK,knicks,202498,Lance,Thomas,L. Thomas,...,0,0,0,0,0,0,0,0,0,0
8,21600001,1610612752,New York,Knicks,NYK,knicks,203200,Justin,Holiday,J. Holiday,...,0,0,0,0,1,0,0,0,0,0
9,21600001,1610612752,New York,Knicks,NYK,knicks,1626195,Willy,Hernangomez,W. Hernangomez,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Get scoring box scores
# Requests the scoring box score for every game in games.csv and appends the
# rows to scoring_box_scores.csv.
# NOTE: every game id is requested and rows are appended without a header
# (header=False), so re-running this cell adds rows for games that are already
# in the file. Once the CSV has a header row, the incremental filter used for
# the other box-score files (skip ids already present in the file) can be
# applied here as well.
games = pd.read_csv('../data/games.csv')
games_to_get = games['GAME_ID'].unique()
for game_id in tqdm.tqdm(games_to_get):
    # Restore the "00" prefix on the id (presumably dropped when the CSV
    # column was parsed as an integer — TODO confirm).
    game_id = "00" + str(game_id)
    game = None
    # Up to 5 attempts, pausing 0.6 s after each failure.
    for _attempt in range(5):
        try:
            game = nba.boxscorescoringv2.BoxScoreScoringV2(game_id=game_id).get_data_frames()[0]
            break
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt still stops this long-running loop.
        except Exception:
            time.sleep(0.6)
    if game is None:
        print('Skipping game: {}'.format(game_id))
    else:
        # Append game data to csv
        game.to_csv('../data/scoring_box_scores.csv', mode='a', header=False, index=False)

100%|██████████| 7666/7666 [2:53:27<00:00,  1.36s/it]   


In [None]:
# Add headers to scoring box scores
# The scoring CSV is written by appending rows with header=False, so it may
# have no header line. Reading it with the default settings would consume the
# first data row as the header and lose that row once the columns are renamed.
scoring_path = '../data/scoring_box_scores.csv'

# Fetch one fixed game only to obtain the endpoint's column names.
game = nba.boxscorescoringv2.BoxScoreScoringV2(game_id='0021600001').get_data_frames()[0]
expected_columns = list(game.columns)

# Decide whether the file already starts with a header line by comparing its
# first field with the first expected column name. This keeps the cell safe to
# re-run after the headers have been written.
with open(scoring_path) as csv_file:
    first_field = csv_file.readline().split(',')[0].strip()
has_header = first_field == expected_columns[0]

# header=None keeps the first line as data when no header is present.
scoring_box_scores = pd.read_csv(scoring_path, header=0 if has_header else None)
# Assigning the columns raises if the column count does not match the endpoint's.
scoring_box_scores.columns = expected_columns
scoring_box_scores.to_csv(scoring_path, index=False)

In [4]:
# Display the first data frame returned by the LeagueDashPlayerStats endpoint
# when it is called with its default arguments.
(
    nba
    .LeagueDashPlayerStats()
    .get_data_frames()[0]
)

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK
0,1630639,A.J. Lawson,A.J.,1610612742,DAL,23.0,21,10,11,0.476,...,386,219,141,379,347,130,368,199,25,366
1,1631260,AJ Green,AJ,1610612749,MIL,24.0,28,21,7,0.750,...,405,1,177,421,343,165,377,199,25,367
2,1631100,AJ Griffin,AJ,1610612737,ATL,20.0,17,7,10,0.412,...,405,114,91,464,416,371,436,199,25,426
3,203932,Aaron Gordon,Aaron,1610612743,DEN,28.0,42,28,14,0.667,...,83,457,385,53,102,37,82,42,25,95
4,1628988,Aaron Holiday,Aaron,1610612745,HOU,27.0,43,21,22,0.488,...,349,280,336,245,221,97,250,199,25,239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,1628380,Zach Collins,Zach,1610612759,SAS,26.0,36,7,29,0.194,...,83,365,503,99,148,521,146,116,25,146
530,203897,Zach LaVine,Zach,1610612741,CHI,28.0,25,10,15,0.400,...,248,397,305,141,132,458,172,149,25,161
531,1630192,Zeke Nnaji,Zeke,1610612743,DEN,23.0,36,26,10,0.722,...,123,328,297,182,326,477,314,199,25,319
532,1630533,Ziaire Williams,Ziaire,1610612763,MEM,22.0,44,18,26,0.409,...,237,430,374,166,192,513,216,149,25,207
