In [2]:
import numpy as np
from nba_api.stats.endpoints import leaguegamelog
import time
from datetime import date

def get_all_games(start_year=1946, end_year=2023):
  results = []
  for year in range(start_year, end_year):
    season = '{}-{}'.format(year, str(year + 1)[-2:])
    res = leaguegamelog.LeagueGameLog(counter=10000, season=season)
    res_frame = res.get_data_frames()[0]
    results.append(res_frame)
    time.sleep(1)
  return results
    
seasons = get_all_games(2013, 2023)

In [3]:
seasons[0].loc[0]

SEASON_ID                    22013
TEAM_ID                 1610612741
TEAM_ABBREVIATION              CHI
TEAM_NAME            Chicago Bulls
GAME_ID                 0021300002
GAME_DATE               2013-10-29
MATCHUP                  CHI @ MIA
WL                               L
MIN                            240
FGM                             35
FGA                             83
FG_PCT                       0.422
FG3M                             7
FG3A                            26
FG3_PCT                      0.269
FTM                             18
FTA                             23
FT_PCT                       0.783
OREB                            11
DREB                            30
REB                             41
AST                             23
STL                             11
BLK                              4
TOV                             19
PF                              27
PTS                             95
PLUS_MINUS                     -12
VIDEO_AVAILABLE     

In [124]:
# games retrieved include the stats for both teams as separate rows, so put them in 'home' or 'away' respectively
def map_home_and_away(seasons):
  results_all_season = []
  for season in seasons:
    results_for_season = []
    results_for_season.append(season[season.duplicated(subset='GAME_ID', keep='first')])
    results_for_season.append(season[season.duplicated(subset='GAME_ID', keep='last')])

    results_all_season.append(results_for_season)
    
  return results_all_season

# seasons_home_and_away = map_home_and_away(results)

In [125]:
def preprocess(seasons):
  
  # drop unnecessary columns
  for season in seasons:
    season.drop(columns=['TEAM_ABBREVIATION', 'TEAM_NAME',\
                         'MATCHUP', 'VIDEO_AVAILABLE'], inplace=True, errors='ignore')

  # convert WL to 0 and 1
  # mapped 1 and 0 to themselves because couldn't seem to get
  # na_action to ignore when already converted
  WL = {'L': 0., 'W': 1., 1.:1., 0.:0.}
  for season in seasons:
    season['WL'] = season['WL'].map(WL.get, na_action='ignore')

  # add home and away dimension
  seasons = map_home_and_away(seasons)

  # map season_id to index
  season_index = []
  for home, away in seasons:
    season_index.append(home.iloc[0]['SEASON_ID'])
    home['SEASON_ID'] = len(season_index) - 1
    away['SEASON_ID'] = len(season_index) - 1

  # map game_id to index
  game_index = []
  for home, away in seasons:
    start = len(game_index)
    game_index.extend(home['GAME_ID'].to_list())
    end = len(game_index)
    home['GAME_ID'] = range(start, end) 
    away['GAME_ID'] = range(start, end) 

  # get unique team_id's
  team_id_set = set()
  for home, _ in seasons:
    team_id_set.update(home['TEAM_ID'].unique())

  # map index to team_id
  team_index = list(team_id_set)
  # map team_id to index
  team_id_dict = {team_index[i]: i for i in range(len(team_index))}

  for home, away in seasons:
    home['TEAM_ID'] = home['TEAM_ID'].map(team_id_dict.get)
    away['TEAM_ID'] = away['TEAM_ID'].map(team_id_dict.get)

  columns = list(seasons[0][0].columns)

  dates_index = []
  for home, away in seasons:
    start = len(dates_index)
    dates_index.extend(home['GAME_DATE'].map(date.fromisoformat).to_list())
    end = len(dates_index)
    home['GAME_DATE'] = range(start, end)
    away['GAME_DATE'] = range(start, end)

  # do whatever else with pandas before they are turned into numpy
  # ...
  
  seasons[0][0].info()
  max_row = 0
  for home, _ in seasons:
    max_row = max(home.shape[-2], max_row)
    
  for i, (home, away) in enumerate(seasons):
    home.to_numpy(dtype=np.float32)
    away.to_numpy(dtype=np.float32)
    seasons[i][0] = np.pad(home, ((0, max_row - home.shape[-2]), (0, 0)), 'constant')
    seasons[i][1] = np.pad(away, ((0, max_row - away.shape[-2]), (0, 0)), 'constant')
    print(seasons[i][0][-5:])

  seasons_np = np.array(seasons, dtype=np.float32)
  
  return seasons_np, columns, season_index, game_index, team_index, dates_index

seasons_np, columns, season_index, game_index, team_index, dates_index = preprocess(seasons)

<class 'pandas.core.frame.DataFrame'>
Index: 1230 entries, 1 to 2459
Data columns (total 25 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SEASON_ID   1230 non-null   int64  
 1   TEAM_ID     1230 non-null   int64  
 2   GAME_ID     1230 non-null   int64  
 3   GAME_DATE   1230 non-null   int64  
 4   WL          1230 non-null   float64
 5   MIN         1230 non-null   int64  
 6   FGM         1230 non-null   int64  
 7   FGA         1230 non-null   int64  
 8   FG_PCT      1230 non-null   float64
 9   FG3M        1230 non-null   int64  
 10  FG3A        1230 non-null   int64  
 11  FG3_PCT     1230 non-null   float64
 12  FTM         1230 non-null   int64  
 13  FTA         1230 non-null   int64  
 14  FT_PCT      1230 non-null   float64
 15  OREB        1230 non-null   int64  
 16  DREB        1230 non-null   int64  
 17  REB         1230 non-null   int64  
 18  AST         1230 non-null   int64  
 19  STL         1230 non-null   int6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  home['SEASON_ID'] = len(season_index) - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  away['SEASON_ID'] = len(season_index) - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  home['GAME_ID'] = range(start, end)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

In [83]:
seasons[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2460 entries, 0 to 2459
Data columns (total 25 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SEASON_ID   2460 non-null   object 
 1   TEAM_ID     2460 non-null   int64  
 2   GAME_ID     2460 non-null   object 
 3   GAME_DATE   2460 non-null   object 
 4   WL          2460 non-null   float64
 5   MIN         2460 non-null   int64  
 6   FGM         2460 non-null   int64  
 7   FGA         2460 non-null   int64  
 8   FG_PCT      2460 non-null   float64
 9   FG3M        2460 non-null   int64  
 10  FG3A        2460 non-null   int64  
 11  FG3_PCT     2460 non-null   float64
 12  FTM         2460 non-null   int64  
 13  FTA         2460 non-null   int64  
 14  FT_PCT      2460 non-null   float64
 15  OREB        2460 non-null   int64  
 16  DREB        2460 non-null   int64  
 17  REB         2460 non-null   int64  
 18  AST         2460 non-null   int64  
 19  STL         2460 non-null  

In [115]:
seasons_np.shape

(23, 2, 1230, 25)

In [111]:
seasons_np[3][1][5:8][0]

array([ 3.000e+00,  2.800e+01,  3.695e+03,  3.695e+03,  0.000e+00,
        2.400e+02,  3.500e+01,  8.400e+01,  4.170e-01,  4.000e+00,
        2.200e+01,  1.820e-01,  1.700e+01,  2.200e+01,  7.730e-01,
        8.000e+00,  3.200e+01,  4.000e+01,  1.700e+01,  8.000e+00,
        0.000e+00,  1.400e+01,  2.400e+01,  9.100e+01, -1.800e+01],
      dtype=float32)

In [126]:
columns

['SEASON_ID',
 'TEAM_ID',
 'GAME_ID',
 'GAME_DATE',
 'WL',
 'MIN',
 'FGM',
 'FGA',
 'FG_PCT',
 'FG3M',
 'FG3A',
 'FG3_PCT',
 'FTM',
 'FTA',
 'FT_PCT',
 'OREB',
 'DREB',
 'REB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS',
 'PLUS_MINUS']

In [127]:
season_index

['22013',
 '22014',
 '22015',
 '22016',
 '22017',
 '22018',
 '22019',
 '22020',
 '22021',
 '22022']

In [128]:
game_index[:10]

['0021300003',
 '0021300001',
 '0021300002',
 '0021300007',
 '0021300008',
 '0021300016',
 '0021300009',
 '0021300014',
 '0021300006',
 '0021300010']

In [129]:
team_index

[1610612737,
 1610612738,
 1610612739,
 1610612740,
 1610612741,
 1610612742,
 1610612743,
 1610612744,
 1610612745,
 1610612746,
 1610612747,
 1610612748,
 1610612749,
 1610612750,
 1610612751,
 1610612752,
 1610612753,
 1610612754,
 1610612755,
 1610612756,
 1610612757,
 1610612758,
 1610612759,
 1610612760,
 1610612761,
 1610612762,
 1610612763,
 1610612764,
 1610612765,
 1610612766]

In [130]:
dates_index[:5]

[datetime.date(2013, 10, 29),
 datetime.date(2013, 10, 29),
 datetime.date(2013, 10, 29),
 datetime.date(2013, 10, 30),
 datetime.date(2013, 10, 30)]

In [131]:
seasons_np.shape

(10, 2, 1230, 25)

In [134]:
seasons_np_T = seasons_np.transpose(0, 2, 1, 3)

In [135]:
seasons_np_T.shape

(10, 1230, 2, 25)