1. Extract teams by id and abbreviations
2. Extract game data for each team
3. Split into seasons (format: xxxx = year)
    Note: 4xxxx - playoff, 2 - reg, 1 - preseason, non-summer league games gameID start with 00

In [1]:
from nba_api.stats.endpoints import commonplayerinfo, playercareerstats, leaguegamefinder, commonteamroster
from nba_api.stats.static import players, teams
from os import path
import pandas as pd
import pickle
import numpy as np

In [2]:
""" for saving and loading things to a pickle file. Don't need to add extension to the file name """
def save_obj(obj, filename, dirname='pickle_files', ):
    with open(path.join(dirname, filename + '.pkl'), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(filename, dirname='pickle_files'):
    with open(path.join(dirname, filename + '.pkl'), 'rb') as f:
        return pickle.load(f)

In [3]:
teams_df = pd.DataFrame(teams.get_teams())

In [9]:
team_ids = teams_df.loc[:, ("id","abbreviation")]

In [5]:
team_ids["abbreviation"]

0     ATL
1     BOS
2     CLE
3     NOP
4     CHI
5     DAL
6     DEN
7     GSW
8     HOU
9     LAC
10    LAL
11    MIA
12    MIL
13    MIN
14    BKN
15    NYK
16    ORL
17    IND
18    PHI
19    PHX
20    POR
21    SAC
22    SAS
23    OKC
24    TOR
25    UTA
26    MEM
27    WAS
28    DET
29    CHA
Name: abbreviation, dtype: object

In [10]:
# *** should only have to run the loading cell below ***
# get team info into dataframes
# store dictionary key = team abbreviation, value = df with game data
import time

team_names = {}
for i in range(len(team_ids)): 
    temp_id = team_ids["id"][i]
    temp_team_name = team_ids["abbreviation"][i]
    print('getting team: {0}...'.format(temp_team_name))
    team_names[temp_team_name] = leaguegamefinder.LeagueGameFinder(team_id_nullable=temp_id).get_data_frames()[0]
    time.sleep(10)

getting team: ATL...


ReadTimeout: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)

In [None]:
""" save dictionary to a file called team_names under pickle_files folder"""
save_obj(team_names, 'team_names')

In [None]:
""" load the team_names dictionary in pickle_files """
team_names = load_obj('team_names')

In [None]:
""" gets the season games given the seasonID and abbreviation of the team """
""" Season is based on the year that the season STARTED in. I.E 2016-2017 has seasonID 2016"""
""" Reg season starts with 2, playoffs 4, pre 1 """
""" 
For some reason summer league and preseason games are sometimes included in reg season IDs.
The function accounts for this by only looking at Game IDs that start with 0021 for reg reason,
0041 for playoffs, 0011 for preseason
"""
def get_season_team(seasonID, team_abr, team_names):
    seasonID = str(seasonID)
    gameID_prefix = seasonID[0]
    return team_names[team_abr].loc[(team_names[team_abr]["SEASON_ID"].str.contains(seasonID)) & \
                                    (team_names[team_abr]['GAME_ID'].str.startswith('00{0}1'.format(gameID_prefix)))]

""" gets the number of rows in the passed df"""
def num_rows(df):
    return len(df.index)

""" returns the number of features of the df"""
def num_features(df):
    return len(df.columns)

""" returns a list of the features from the start index moving forward. Using 9 for now """
def get_features(df, start_idx=9):
    return [feature for feature in df.columns.tolist()[start_idx:]]

""" 
util func: gets last n games given the curr game
in the df with the correct season, year
"""
def last_n_games(df, n, curr_game):
    n_rows = num_rows(df)
    # the first row is the latest game. I.E for reg season it's 82nd game
    
"""
returns avg team stats vector over the last n games given
the current game. Assumes games 0 indexed
"""
def get_team_avgs(season_games, curr_game, n):
    total_games = num_rows(season_games)
    assert curr_game > n, "curr game ({0}) > n ({1})".format(curr_game, n)
    assert total_games > n, "n ({0}) is more than the total number of games ({1})".format(n, total_games)
    # the real index of the current game
    game_idx = total_games - curr_game - 1
    # start at 9th index to get meaningful stats
    start_idx = 9
    team_avgs = np.zeros(num_features(season_games) - start_idx)
    for i in range(game_idx - n, game_idx):
        game_stats = np.array(season_games.iloc[i].tolist()[start_idx:])
        team_avgs += game_stats
    return team_avgs / n

def print_team_avgs(season_games, team_avgs):
    features = get_features(season_games)
    for stat, feature in zip(team_avgs, features):
        print("{0}:{1}".format(feature, stat))

In [None]:
""" 
get the 2018 season games for the Atlanta Hawks. 
"""
atl_reg_season = get_season_team(22017, "ATL", team_names)
"""
print the avgs of the 10 games before the 16th game that the Hawks played in the 2017-2018 season
"""
atl_team_avgs = get_team_avgs(atl_reg_season, 16, 10)
print_team_avgs(atl_reg_season, atl_team_avgs)

In [None]:
team_names["ATL"].loc[team_names["ATL"]["SEASON_ID"].str.contains("2018")]

In [None]:
def find_opponent(n, curr_team, season_id):
    full_df = get_season_team(season_id, curr_team, team_names).reset_index()
    df_of_game = full_df.iloc[[-n]]
    return df_of_game["MATCHUP"].values[0][-3::]

In [None]:
"""get the opponent of the Atlanta Hawks's 2nd game of the the 2018 season"""

find_opponent(2, "ATL", 22018)

In [1]:
"""
return the difference vector
"""
def find_diff_vector(curr_game, n, curr_team, season_id):
    curr_team_season = get_season_team(season_id, curr_team, team_names)
    curr_team_avgs = get_team_avgs(curr_team_season, curr_game, n)
    
    opponent = find_opponent(curr_game, curr_team, season_id)
    opponent_team_season = get_season_team(season_id, opponent, team_names)
    opponent_team_avgs = get_team_avgs(opponent_team_season, curr_game, n)
    
    diff = np.subtract(curr_team_avgs, opponent_team_avgs)
   
    return diff

In [2]:
"""diff vector between Atlanta Hawks' stats and its opponents' stats on the Hawks' 16th game
based on the averages of the last 15 games before the nth game"""
find_diff_vector(16, 15, "ATL", 22018)

NameError: name 'get_season_team' is not defined

In [None]:
# gets diff vectors for all the games 10-82 for given team and season
def get_season_diff_vectors(curr_team, season_id):
    diff_vector = []
    for i in range(10, 83):
        diff_vector.append(find_diff_vector(i, i-1, curr_team, season_id))
    return diff_vector

In [None]:
# get difference vectors for all teams for one season

def diff_vectors_per_season(season_id):
    team_per_season_dict = {}
    for team in team_ids["abbreviation"]:
        team_per_season_dict[team] = get_season_diff_vectors(team, season_id)
    
    return team_per_season_dict

In [None]:
# get diff vectors for all seasons for all teams

def ultimate_diff_vectors():
    ultimate_dict = {}
    seasons = list(range(22000, 22019))
    for s in seasons:
        ultimate_dict[s] = diff_vectors_per_season(s)
        
    return ultimate_dict

In [11]:
team_names["ATL"].columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')