1. Extract teams by id and abbreviations
2. Extract game data for each team
3. Split into seasons (format: xxxx = year)
    Note: 4xxxx - playoff, 2 - reg, 1 - preseason, non-summer league games gameID start with 00

In [87]:
from nba_api.stats.endpoints import commonplayerinfo, playercareerstats, leaguegamefinder, commonteamroster
from nba_api.stats.static import players, teams
from os import path
import pandas as pd
import pickle
import numpy as np

In [3]:
""" for saving and loading things to a pickle file. Don't need to add extension to the file name """
def save_obj(obj, filename, dirname='pickle_files', ):
    with open(path.join(dirname, filename + '.pkl'), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(filename, dirname='pickle_files'):
    with open(path.join(dirname, filename + '.pkl'), 'rb') as f:
        return pickle.load(f)

In [4]:
teams_df = pd.DataFrame(teams.get_teams())

In [5]:
team_ids = teams_df.loc[:, ("id","abbreviation")]

In [6]:
team_ids["abbreviation"]

0     ATL
1     BOS
2     CLE
3     NOP
4     CHI
5     DAL
6     DEN
7     GSW
8     HOU
9     LAC
10    LAL
11    MIA
12    MIL
13    MIN
14    BKN
15    NYK
16    ORL
17    IND
18    PHI
19    PHX
20    POR
21    SAC
22    SAS
23    OKC
24    TOR
25    UTA
26    MEM
27    WAS
28    DET
29    CHA
Name: abbreviation, dtype: object

In [7]:
# *** should only have to run the loading cell below ***
# get team info into dataframes
# store dictionary key = team abbreviation, value = df with game data
import time

team_names = {}
for i in range(len(team_ids)): 
    temp_id = team_ids["id"][i]
    temp_team_name = team_ids["abbreviation"][i]
    print('getting team: {0}...'.format(temp_team_name))
    team_names[temp_team_name] = leaguegamefinder.LeagueGameFinder(team_id_nullable=temp_id).get_data_frames()[0]
    time.sleep(1)

getting team: ATL...
getting team: BOS...
getting team: CLE...
getting team: NOP...
getting team: CHI...
getting team: DAL...
getting team: DEN...
getting team: GSW...
getting team: HOU...
getting team: LAC...
getting team: LAL...
getting team: MIA...
getting team: MIL...
getting team: MIN...
getting team: BKN...
getting team: NYK...
getting team: ORL...
getting team: IND...
getting team: PHI...
getting team: PHX...
getting team: POR...
getting team: SAC...
getting team: SAS...
getting team: OKC...
getting team: TOR...
getting team: UTA...
getting team: MEM...
getting team: WAS...
getting team: DET...
getting team: CHA...


In [8]:
""" save dictionary to a file called team_names under pickle_files folder"""
save_obj(team_names, 'team_names')

In [9]:
""" load the team_names dictionary in pickle_files """
team_names = load_obj('team_names')

In [10]:
""" gets the season games given the seasonID and abbreviation of the team """
""" Season is based on the year that the season STARTED in. I.E 2016-2017 has seasonID 2016"""
""" Reg season starts with 2, playoffs 4, pre 1 """
""" 
For some reason summer league and preseason games are sometimes included in reg season IDs.
The function accounts for this by only looking at Game IDs that start with 0021 for reg reason,
0041 for playoffs, 0011 for preseason
"""
def get_season_team(seasonID, team_abr, team_names):
    seasonID = str(seasonID)
    gameID_prefix = seasonID[0]
    return team_names[team_abr].loc[(team_names[team_abr]["SEASON_ID"].str.contains(seasonID)) & \
                                    (team_names[team_abr]['GAME_ID'].str.startswith('00{0}1'.format(gameID_prefix)))]

""" gets the number of rows in the passed df"""
def num_rows(df):
    return len(df.index)

""" returns the number of features of the df"""
def num_features(df):
    return len(df.columns)

""" returns a list of the features from the start index moving forward. Using 9 for now """
def get_features(df, start_idx=9):
    return [feature for feature in df.columns.tolist()[start_idx:]]

""" 
util func: gets last n games given the curr game
in the df with the correct season, year
"""
def last_n_games(df, n, curr_game):
    n_rows = num_rows(df)
    # the first row is the latest game. I.E for reg season it's 82nd game
    
"""
returns avg team stats vector over the last n games given
the current game. Assumes games 0 indexed
"""
def get_team_avgs(season_games, curr_game, n):
    total_games = num_rows(season_games)
    assert curr_game > n, "curr game ({0}) > n ({1})".format(curr_game, n)
    assert total_games > n, "n ({0}) is more than the total number of games ({1})".format(n, total_games)
    # the real index of the current game
    game_idx = total_games - curr_game - 1
    # start at 9th index to get meaningful stats
    start_idx = 9
    team_avgs = np.zeros(num_features(season_games) - start_idx)
    for i in range(game_idx - n, game_idx):
        game_stats = np.array(season_games.iloc[i].tolist()[start_idx:])
        team_avgs += game_stats
    return team_avgs / n

def print_team_avgs(season_games, team_avgs):
    features = get_features(season_games)
    for stat, feature in zip(team_avgs, features):
        print("{0}:{1}".format(feature, stat))

In [46]:
""" 
get the 2018 season games for the Atlanta Hawks. 
"""
atl_reg_season = get_season_team(22017, "ATL", team_names)
"""
print the avgs of the 10 games before the 16th game that the Hawks played in the 2017-2018 season
"""
atl_team_avgs = get_team_avgs(atl_reg_season, 16, 10)
print_team_avgs(atl_reg_season, atl_team_avgs)

PTS:105.9
FGM:39.2
FGA:85.9
FG_PCT:0.45549999999999996
FG3M:11.3
FG3A:30.2
FG3_PCT:0.3699
FTM:16.2
FTA:20.3
FT_PCT:0.8005000000000001
OREB:9.5
DREB:28.8
REB:38.3
AST:24.0
STL:8.3
BLK:3.3
TOV:13.9
PF:18.3
PLUS_MINUS:-6.0


In [13]:
team_names["ATL"].loc[team_names["ATL"]["SEASON_ID"].str.contains("2018")]

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
18,22018,1610612737,ATL,Atlanta Hawks,0021801220,2019-04-10,ATL vs. IND,L,240,134,...,0.816,22,39,61,29,5.0,7,17,25,-1.0
19,22018,1610612737,ATL,Atlanta Hawks,0021801202,2019-04-07,ATL @ MIL,L,240,107,...,0.526,9,39,48,25,2.0,3,11,28,-8.0
20,22018,1610612737,ATL,Atlanta Hawks,0021801181,2019-04-05,ATL @ ORL,L,240,113,...,0.677,10,28,38,21,16.0,4,14,21,-36.0
21,22018,1610612737,ATL,Atlanta Hawks,0021801168,2019-04-03,ATL vs. PHI,W,240,130,...,0.786,11,33,44,29,7.0,7,11,26,8.0
22,22018,1610612737,ATL,Atlanta Hawks,0021801162,2019-04-02,ATL @ SAS,L,240,111,...,0.650,11,32,43,26,13.0,2,11,18,-6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,22018,1610612737,ATL,Atlanta Hawks,1521800024,2018-07-08,ATL vs. POR,L,198,68,...,0.615,9,34,43,18,11.0,7,15,17,-17.0
110,22018,1610612737,ATL,Atlanta Hawks,1521800014,2018-07-07,ATL vs. NYK,L,200,89,...,0.591,16,35,51,22,7.0,5,16,29,-6.0
111,22018,1610612737,ATL,Atlanta Hawks,1621800006,2018-07-05,ATL @ UTA,L,201,87,...,0.654,15,33,48,16,11.0,5,17,25,2.2
112,22018,1610612737,ATL,Atlanta Hawks,1621800003,2018-07-03,ATL vs. SAS,L,200,81,...,0.545,14,29,43,12,9.0,2,12,25,-26.2


In [40]:
def find_opponent(n, curr_team, season_id):
    full_df = get_season_team(season_id, curr_team, team_names).reset_index()
    df_of_game = full_df.iloc[[-n]]
    return df_of_game["MATCHUP"].values[0][-3::]

In [41]:
"""get the opponent of the Atlanta Hawks's 2nd game of the the 2018 season"""

find_opponent(2, "ATL", 22018)

'MEM'

In [104]:
"""
based on the averages of the last 10 games before the nth game
return the difference vector
"""
def find_diff_vector(n, curr_team, season_id):
    curr_team_season = get_season_team(season_id, curr_team, team_names)
    curr_team_avgs = get_team_avgs(curr_team_season, n, 10)
    
    opponent = find_opponent(n, curr_team, season_id)
    opponent_team_season = get_season_team(season_id, opponent, team_names)
    opponent_team_avgs = get_team_avgs(opponent_team_season, n, 10)
    
    diff = np.subtract(curr_team_avgs, opponent_team_avgs)
   
    return diff

In [105]:
"""diff vector between Atlanta Hawks' stats and its opponents' stats on the Hawk's 16th game"""
find_diff_vector(16, "ATL", 22018)

array([  1.8   ,  -0.1   ,   4.3   ,  -0.0252,   2.2   ,  11.4   ,
        -0.0619,  -0.2   ,   1.4   ,  -0.0579,   0.8   ,  -4.    ,
        -3.2   ,  -2.    ,   1.    ,   0.5   ,   2.1   ,   4.1   ,
       -13.6   ])

In [11]:
team_names["ATL"].columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')