In [1]:
from nba_api.stats.endpoints import commonplayerinfo, playercareerstats, leaguegamefinder, commonteamroster, leagueseasonmatchups
from nba_api.stats.static import players, teams
from os import path
import pandas as pd
import pickle
import numpy as np

In [2]:
""" for saving and loading things to a pickle file. Don't need to add extension to the file name """
def save_obj(obj, filename, dirname='./assets/pickle_files', ):
    with open(path.join(dirname, filename + '.pkl'), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(filename, dirname='./assets/pickle_files'):
    with open(path.join(dirname, filename + '.pkl'), 'rb') as f:
        return pickle.load(f)

In [3]:
teams_df = pd.DataFrame(teams.get_teams())
teams_df.head()

Unnamed: 0,id,full_name,abbreviation,nickname,city,state,year_founded
0,1610612737,Atlanta Hawks,ATL,Hawks,Atlanta,Atlanta,1949
1,1610612738,Boston Celtics,BOS,Celtics,Boston,Massachusetts,1946
2,1610612739,Cleveland Cavaliers,CLE,Cavaliers,Cleveland,Ohio,1970
3,1610612740,New Orleans Pelicans,NOP,Pelicans,New Orleans,Louisiana,2002
4,1610612741,Chicago Bulls,CHI,Bulls,Chicago,Illinois,1966


In [7]:
team_ids = teams_df.loc[:, ("id","abbreviation")]

In [8]:
team_names = load_obj('team_names')

In [9]:
""" load the team_names dictionary in pickle_files """
advanced_stats = load_obj('game_data')

In [10]:
advanced_stats['0021900099'].keys()

dict_keys(['home_team', 'away_team', 'game_date', 'home_win', 'home_team_stats', 'away_team_stats', 'advanced_player_stats', 'advanced_team_stats'])

In [11]:
advanced_stats['0021900099']['advanced_team_stats']

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,E_OFF_RATING,OFF_RATING,E_DEF_RATING,DEF_RATING,...,TM_TOV_PCT,EFG_PCT,TS_PCT,USG_PCT,E_USG_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE
0,21900099,1610612759,Spurs,SAS,San Antonio,240:00,95.1,98.0,101.8,104.9,...,14.7,0.462,0.494,1.0,0.2,105.58,102.5,85.42,102,0.44
1,21900099,1610612737,Hawks,ATL,Atlanta,240:00,101.8,104.9,95.1,98.0,...,17.5,0.539,0.556,1.0,0.2,105.58,102.5,85.42,103,0.56


In [12]:
advanced_stats['0021900099']['advanced_team_stats'].columns[6:]

Index(['E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING', 'DEF_RATING',
       'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TOV', 'AST_RATIO',
       'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'E_TM_TOV_PCT', 'TM_TOV_PCT',
       'EFG_PCT', 'TS_PCT', 'USG_PCT', 'E_USG_PCT', 'E_PACE', 'PACE',
       'PACE_PER40', 'POSS', 'PIE'],
      dtype='object')

In [13]:
""" updated teams_names dictionary where the key is team_id, value is team stats"""
team_names_test = {}

ids = team_ids["id"]
abr = team_ids["abbreviation"]

for i in range(len(ids)):
    real_value = team_names[abr[i]]
    test_key = ids[i]
    team_names_test[test_key] = real_value

In [14]:
""" all_abrs is a dictionary of team_names with cleaned team_abbreviations
    that have changed over the years"""

all_abrs = {}
for team in team_names.keys():
    df = team_names[team]
    abrs = set()
    for game in df.values.tolist():
        abrs.add(game[2])
    for abr in abrs:
        all_abrs[abr] = df
        
#all_abrs['CHA'][200:]

# Normalize Team Data

In [50]:
from sklearn.preprocessing import StandardScaler

In [None]:
adv_stats_headers = advanced_stats['0021900099']['advanced_team_stats'].columns[6:]

In [51]:
def normalize(prev_season):
    scaler = StandardScaler()
    season = []
    for i in team_ids["id"]:
        teams_df = get_season_team(prev_season, i, team_names_test)
        
        adv_stats = []
        for j in range(num_rows(teams_df)):
            
            game_id = teams_df.iloc[j]["GAME_ID"]
            adv_stats.append(find_advanced_stats(i, game_id))
        
        all_adv_stats = pd.DataFrame(data=adv_stats, columns=adv_stats_headers)
        
        teams_list = teams_df.drop(teams_df.columns[list(range(0, 9))], axis = 1)
        all_data_df  = pd.concat(list(teams_list, all_adv_stats))
        all_data = all_data_df.values.tolist()
        season += all_data
    
    scaler.fit(season)
    return scaler

In [98]:
normalize(22018)

StandardScaler(copy=True, with_mean=True, with_std=True)

# Calculations

In [88]:
""" extract advanced stats"""

def find_advanced_stats(team_id, game_id):
    t = team_names_test[team_id]
    t.loc[lambda t: t["GAME_ID"] == game_id]
    
    df = advanced_stats[game_id]['advanced_team_stats']
    return np.array(df.loc[lambda df: df["TEAM_ID"] == team_id].iloc[:, 6:])[0]

In [154]:
""" gets the season games given the seasonID and abbreviation of the team """
""" Season is based on the year that the season STARTED in. I.E 2016-2017 has seasonID 2016"""
""" Reg season starts with 2, playoffs 4, pre 1 """
""" 
For some reason summer league and preseason games are sometimes included in reg season IDs.
The function accounts for this by only looking at Game IDs that start with 0021 for reg reason,
0041 for playoffs, 0011 for preseason
"""

def get_season_team(seasonID, team_id, team_names):
    seasonID = str(seasonID)
    gameID_prefix = seasonID[0]
    return team_names[team_id].loc[(team_names[team_id]["SEASON_ID"].str.contains(seasonID)) & \
                                    (team_names[team_id]['GAME_ID'].str.startswith('00{0}'.format(gameID_prefix)))]


""" gets the number of rows in the passed df"""
def num_rows(df):
    return len(df.index)

""" returns the number of features of the df"""
def num_features(df):
    return len(df.columns)

""" returns a list of the features from the start index moving forward. Using 9 for now """
def get_features(df, start_idx=9):
    return [feature for feature in df.columns.tolist()[start_idx:]]

""" 
util func: gets last n games given the curr game
in the df with the correct season, year
"""
def last_n_games(df, n, curr_game):
    n_rows = num_rows(df)
    # the first row is the latest game. I.E for reg season it's 82nd game
    
"""
returns avg team stats vector over the last n games given
the current game. Assumes games 0 indexed
"""
def get_team_avgs(season_games, curr_game, n):
    total_games = num_rows(season_games)
    assert curr_game > n, "curr game ({0}) > n ({1})".format(curr_game, n)
    assert total_games > n, "n ({0}) is more than the total number of games ({1})".format(n, total_games)
    # the real index of the current game
    game_idx = total_games - curr_game - 1
    # start at 9th index to get meaningful stats
    start_idx = 9
    adv_stat_len = 23
    team_avgs = np.zeros(num_features(season_games) - start_idx + adv_stat_len)
    for i in range(game_idx - n, game_idx):
        
        game_id = season_games.iloc[i]["GAME_ID"]
        team_id = season_games.iloc[i]["TEAM_ID"]
        adv_stats = find_advanced_stats(team_id, game_id)
        basic_stats = np.array(season_games.iloc[i].tolist()[start_idx:])
        
        all_stats = np.concatenate((basic_stats, adv_stats), axis = 0)
     
        team_avgs = team_avgs + all_stats
        print(team_avgs)
    return team_avgs / n

def print_team_avgs(season_games, team_avgs):
    features = get_features(season_games)
    for stat, feature in zip(team_avgs, features):
        print("{0}:{1}".format(feature, stat))

In [155]:
advanced_stats['0021900099']['advanced_team_stats']

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CITY,MIN,E_OFF_RATING,OFF_RATING,E_DEF_RATING,DEF_RATING,...,TM_TOV_PCT,EFG_PCT,TS_PCT,USG_PCT,E_USG_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE
0,21900099,1610612759,Spurs,SAS,San Antonio,240:00,95.1,98.0,101.8,104.9,...,14.7,0.462,0.494,1.0,0.2,105.58,102.5,85.42,102,0.44
1,21900099,1610612737,Hawks,ATL,Atlanta,240:00,101.8,104.9,95.1,98.0,...,17.5,0.539,0.556,1.0,0.2,105.58,102.5,85.42,103,0.56


In [156]:
# TEST: get the 2018 season games for the Atlanta Hawks. 

atl_reg_season = get_season_team(22018, 1610612737, team_names_test)

# print the avgs of the 10 games before the 16th game that the Hawks played in the 2017-2018 season

atl_team_avgs = get_team_avgs(atl_reg_season, 16, 10)
atl_team_avgs
#print_team_avgs(atl_reg_season, atl_team_avgs)

[107.     43.     93.      0.462  11.     33.      0.333  10.     14.
   0.714  16.     28.     44.     21.      7.      2.     21.     30.
  -7.    102.7   104.9   109.7   111.8    -7.     -6.9     0.488   1.
  14.9     0.358   0.762   0.537  20.161  20.6     0.522   0.54    1.
   0.198 104.04  102.     85.    102.      0.391]
[213.     83.    186.      0.892  26.     75.      0.69   21.     30.
   1.402  25.     70.     95.     54.     16.     13.     39.     53.
   1.    199.9   206.8   198.6   206.      1.3     0.8     1.313   2.83
  36.7     0.644   1.46    1.041  36.669  37.9     1.033   1.07    2.
   0.396 213.68  206.    171.67  206.      0.933]
[330.    124.    278.      1.338  38.    113.      1.006  44.     61.
   2.144  33.    105.    138.     82.     29.     17.     56.     80.
 -13.    302.    311.3   314.4   321.9   -12.5   -10.7     1.996   4.48
  55.3     0.848   2.166   1.489  51.498  53.1     1.544   1.624   3.
   0.595 327.54  318.5   265.42  318.      1.365]
[441. 

array([108.7   ,  40.8   ,  90.5   ,   0.451 ,  10.9   ,  36.    ,
         0.3   ,  16.2   ,  21.7   ,   0.7397,  10.5   ,  32.5   ,
        43.    ,  25.3   ,  10.2   ,   6.2   ,  18.2   ,  23.2   ,
        -9.    , 100.8   , 102.5   , 108.99  , 110.74  ,  -8.19  ,
        -8.25  ,   0.6208,   1.489 ,  17.53  ,   0.2567,   0.7306,
         0.4831,  17.0011,  17.31  ,   0.511 ,   0.5439,   1.    ,
         0.199 , 108.008 , 106.2   ,  88.5   , 106.1   ,   0.4435])

In [157]:
def find_opponent_id(opp_abr, season_id):
    
    return all_abrs[opp_abr].loc[all_abrs[opp_abr]["SEASON_ID"].str.contains(str(season_id))]["TEAM_ID"].iloc[0]

In [158]:
def find_opponent(n, curr_team, season_id):
    full_df = get_season_team(season_id, curr_team, team_names_test).reset_index()
    
    df_of_game = full_df.iloc[[-(n + 1)]]
    opp_abr = df_of_game["MATCHUP"].values[0][-3::]
    return find_opponent_id(opp_abr, season_id)

In [159]:
# TEST: get the opponent of the Atlanta Hawks's 2nd game of the the 2018 season

find_opponent(1, 1610612737, 22019)

1610612753

In [160]:
def convert_to_01s(letter):
    return 1 if letter == "W" else 0

In [161]:
def check_home_team(team_df, curr_game):
    matchup = team_df["MATCHUP"].iloc[num_rows(team_df) - curr_game - 1]
    #print('matchup: ', matchup)
    return False if "@" in matchup else True

In [162]:
""" return the difference vector betweeen the current team and its opponent """
def find_diff_vector(curr_game, n, curr_team, season_id):
    
    #curr_team_season is a df
    curr_team_season = get_season_team(season_id, curr_team, team_names_test)
    curr_team_avgs = get_team_avgs(curr_team_season, curr_game, n)
    
    #check for home game
    if check_home_team(curr_team_season, curr_game):
    
        # find point differentials
        outcome = curr_team_season["PLUS_MINUS"].iloc[num_rows(curr_team_season) - curr_game - 1]

        opponent = find_opponent(curr_game, curr_team, season_id)
        opponent_team_season = get_season_team(season_id, opponent, team_names_test)
        opponent_team_avgs = get_team_avgs(opponent_team_season, curr_game, n)

        # normalize 
        scaler = normalize(season_id - 1)
        curr_team_avgs_normalized = scaler.transform([curr_team_avgs])
        opp_team_avgs_normalized = scaler.transform([opponent_team_avgs])

        diff = np.subtract(curr_team_avgs_normalized, opp_team_avgs_normalized)

        return diff, outcome
    return None, None

In [163]:
# TEST: diff vector between Atlanta Hawks' stats and its opponents' stats on the Hawks' 16th
# game based on the averages of the last 15 games before the nth game"""
find_diff_vector(15, 14, 1610612737, 22018)

[118.     37.     79.      0.468  12.     32.      0.375  32.     39.
   0.821  13.     36.     49.     23.      4.      5.     23.     29.
   8.    110.1   113.5   104.9   104.8     5.2     8.7     0.622   0.96
  16.1     0.348   0.755   0.566  22.396  23.1     0.544   0.614   1.
   0.198 106.    104.5    87.08  104.      0.53 ]
[245.     84.    164.      1.021  26.     61.      0.858  51.     64.
   1.581  21.     61.     82.     61.     13.     10.     37.     51.
  -9.    234.6   239.2   241.4   246.     -6.8    -6.7     1.431   3.67
  41.8     0.616   1.389   1.017  36.121  37.      1.179   1.275   2.
   0.4   209.76  206.    171.66  205.      0.986]
[353.    120.    250.      1.44   42.     99.      1.279  71.     96.
   2.206  37.     89.    126.     88.     23.     15.     61.     70.
 -30.    334.5   339.2   355.8   364.3   -21.2   -25.      2.181   4.8
  59.7     0.956   2.001   1.488  58.327  59.2     1.691   1.815   3.
   0.601 320.2   314.5   262.08  313.      1.376]
[460.

(None, None)

In [164]:
""" gets diff vectors for all the games 2-82 for given team and season""" 
def get_season_diff_vectors(curr_team, season_id):
    diff_vector = []
    win_losses = []
    df_len = num_rows(get_season_team(season_id, curr_team, team_names_test))
    for i in range(2, df_len):
        diff, outcome = find_diff_vector(i, min(8, i-1), curr_team, season_id)
        if outcome != None:
            diff_vector.append(diff)
            win_losses.append(outcome)
    return diff_vector, win_losses

In [165]:
# TEST: get_season_diff_vectors function
diffs, outcomes = get_season_diff_vectors(1610612737, 22012)
len(outcomes)

TypeError: unsupported operand type(s) for +: 'float' and 'NoneType'

In [23]:
""" get difference vectors for all teams for one season """

def diff_vectors_per_season(season_id):
    team_per_season_dict = {}
    outcomes_per_season_dict = {}
    
    for team in team_ids["id"]:
        print(team)
        diff, outcomes = get_season_diff_vectors(team, season_id)
        team_per_season_dict[team] = diff
        outcomes_per_season_dict[team] = outcomes
    
    return team_per_season_dict, outcomes_per_season_dict

In [24]:
""" get diff vectors for all seasons for all teams """

def ultimate_diff_vectors():
    ultimate_dict = {}
    ultimate_dict_outcomes = {}
    seasons = list(range(22009, 22019))

    for s in seasons:
        print(s)
        diff, outcomes = diff_vectors_per_season(s)
        ultimate_dict[s] = diff
        ultimate_dict_outcomes[s] = outcomes
        
    return ultimate_dict, ultimate_dict_outcomes

In [51]:
"""
key = season
value = dictionary with key as team id, values as difference numpy arrays
"""
ultimate_dict_points, ultimate_dict_points_outcomes = ultimate_diff_vectors()


22009
1610612737
1610612738
1610612739
1610612740
1610612741
1610612742
1610612743
1610612744
1610612745
1610612746
1610612747
1610612748
1610612749
1610612750
1610612751
1610612752
1610612753
1610612754
1610612755
1610612756
1610612757
1610612758
1610612759
1610612760
1610612761
1610612762
1610612763
1610612764
1610612765
1610612766
22010
1610612737
1610612738
1610612739
1610612740
1610612741
1610612742
1610612743
1610612744
1610612745
1610612746
1610612747
1610612748
1610612749
1610612750
1610612751
1610612752
1610612753
1610612754
1610612755
1610612756
1610612757
1610612758
1610612759
1610612760
1610612761
1610612762
1610612763
1610612764
1610612765
1610612766
22011
1610612737
1610612738
1610612739
1610612740
1610612741
1610612742
1610612743
1610612744
1610612745
1610612746
1610612747
1610612748
1610612749
1610612750
1610612751
1610612752
1610612753
1610612754
1610612755
1610612756
1610612757
1610612758
1610612759
1610612760
1610612761
1610612762
1610612763
1610612764
1610612765
161

In [25]:
""" pickle here!"""

#save_obj(ultimate_dict_points, 'ultimate_dict_points')
ultimate_dict_points_pickled = load_obj('ultimate_dict_points')

#save_obj(ultimate_dict_points_outcomes, 'ultimate_dict_points_outcome')
ultimate_dict_points_outcomes_pickled = load_obj('ultimate_dict_points_outcome')

## MODELS

In [26]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from pipeline import Model

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestRegressor

In [27]:
def massive_flatten(ultimate_lists):
    flattened_values = []
    for team in ultimate_lists.values():
        for games in team.values():
            flattened_values.extend(games)
    return flattened_values

In [28]:
def flatten_further(ult):
    for idx, vector in enumerate(ult):
        ult[idx] = vector.flatten()

In [50]:
X = massive_flatten(ultimate_dict_points_pickled)
y = massive_flatten(ultimate_dict_points_outcomes_pickled)
# X = massive_flatten(ultimate_dict_homeOnly_pickled)
# y = massive_flatten(ultimate_dict_homeOnly_outcomes_pickled)
flatten_further(X)

In [51]:
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.25)

In [52]:
m = Model(LinearRegression())
m.train(Xtr, ytr)
lin_preds = m.predict(Xte)

In [53]:
accuracy = lin_preds/yte
np.mean(accuracy)

0.22962292623643407

# Using point spread predictions (lin reg) to predict win/loss. About the same accuracy as directly predicting win/loss.

In [57]:
yte = np.array(yte)
both_win = (lin_preds > 0) & (yte > 0)
both_loss = (lin_preds < 0) & (yte < 0)
win_loss_acc = len(np.where(both_win | both_loss)[0]) / len(yte)
win_loss_acc

0.6777815583531813

In [71]:
decision_tree = DecisionTreeRegressor()
decision_tree.fit(Xtr, ytr)
decision_tree.score(Xte, yte)

-0.7133812190112241

In [73]:
random_tree = RandomForestRegressor()
random_tree.fit(Xtr, ytr)
random_tree.score(Xte, yte)



0.0816047978192056

In [74]:
from sklearn.neural_network import MLPRegressor

In [None]:
clf = MLPRegressor(solver='adam', activation='relu', alpha=5e-3,hidden_layer_sizes=(32,16),max_iter=3000, random_state=1)
clf.fit(Xtr, ytr)
#nn_preds = clf.predict(Xte)
clf.score(Xte, yte)

# Using the Logistic Regression model to predict 2019 game outcomes

In [93]:
# """get the opponent of the Atlanta Hawks's 2nd game of the the 2018 season"""

# find_opponent(2, 1610612737, 22018)

#diff, L = find_diff_vector(5, 4, 1610612737, 22019)  
#all_abrs['GSW']
for i in range(2, 7):
    print('i:', i)
    diff, outcome = find_diff_vector(i, i-1, 1610612737 + 7, 22019)  
    if (diff is None):
        opponent = find_opponent(i, 1610612737 + 7, 22019)
        print('opponent:', opponent)
        
        diff, outcome = find_diff_vector(i, i-1, opponent, 22019)
    print('predicted:', m.predict(diff)[0], 'actual:', outcome)

i: 2
matchup:  GSW @ NOP
opponent: 1610612740
matchup:  NOP @ HOU


ValueError: Expected 2D array, got scalar array instead:
array=None.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [90]:
features = get_features(all_abrs['ATL'])
features[9]
season = get_season_team(22018, 1610612737 + 7, team_names_test)
get_team_avgs(season, 6, 4)

array([128.25   ,  46.     ,  88.5    ,   0.51875,  16.5    ,  33.75   ,
         0.48925,  19.75   ,  25.25   ,   0.79025,  10.     ,  40.5    ,
        50.5    ,  33.25   ,   6.5    ,   6.     ,  13.25   ,  23.5    ,
        17.     ])

In [91]:
features = get_features(all_abrs['ATL'])
features[9]
season = get_season_team(22018, 1610612737 + 29, team_names_test)
get_team_avgs(season, 6, 4)

array([117.75 ,  44.75 ,  89.25 ,   0.503,  11.75 ,  32.5  ,   0.371,
        16.5  ,  20.5  ,   0.804,   9.75 ,  36.   ,  45.75 ,  27.25 ,
         6.5  ,   4.75 ,  16.75 ,  21.25 ,  12.75 ])