# NBA Game Outcome Projections

This project aims to predict the outcomes of NBA games. To that end, I am using a dataset of game data spanning 2004-2020.

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import warnings
from datetime import datetime, timedelta
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
# Load games.csv into df; it is merged with the details table on GAME_ID below.
df = pd.read_csv("games.csv")

In [3]:
# Load games_details.csv into df2; it is joined onto df by GAME_ID below.
df2 = pd.read_csv("games_details.csv")

In [4]:
# Join the two tables on their shared GAME_ID key (inner join, the pandas default).
df = pd.merge(df, df2, how="inner", on="GAME_ID")

In [5]:
# Discard four columns from the merged frame.
to_drop = [
    'GAME_STATUS_TEXT',
    'HOME_TEAM_ID',
    'VISITOR_TEAM_ID',
    'COMMENT',
]
df = df.drop(columns=to_drop)

In [6]:
# fix some nulls: a missing START_POSITION is labelled "Bench".
# The filled column is assigned back explicitly. Calling fillna(inplace=True)
# on a column selection is chained assignment, which does not update df under
# pandas Copy-on-Write; the blanket fillna(0) that follows would then turn
# these missing positions into 0 instead of "Bench".
df["START_POSITION"] = df["START_POSITION"].fillna("Bench")

In [7]:
# Every remaining missing value becomes 0 (which in this case is its true value).
df = df.fillna(value=0)

In [8]:
def team_stats(TEAM_ID,GAME_ID):
    '''Assemble the specified team's stats for the specified game.

    Reads the module-level ``df`` and sums the counting-stat columns over
    the rows whose GAME_ID and TEAM_ID both match the arguments.

    Parameters
    ----------
    TEAM_ID : value compared against df['TEAM_ID']
    GAME_ID : value compared against df['GAME_ID']

    Returns
    -------
    tuple
        17 values, in this order:
        FG2M, FG2A, FG3M, FG3A, FTM, FTA, AST, TO, OREB, DREB, BLK, STL, PF,
        home_g, away_g, win, loss.
        FG2A/FG2M are FGA/FGM minus the three-point attempts/makes.
        home_g/away_g and win/loss are complementary 0/1 flags.

    Raises
    ------
    IndexError
        If no row of df matches the team/game pair.
    '''
    # Build a single mask against df itself. Filtering twice in a chain
    # (df[mask_a][mask_b]) applies a full-length mask to an already-filtered
    # frame, which pandas only tolerates by reindexing the mask and emitting
    # a warning -- so no warning suppression is needed here any more.
    querydf = df[(df['GAME_ID'] == GAME_ID) & (df['TEAM_ID'] == TEAM_ID)]
    if querydf.empty:
        # Same exception type the old `.values[0]` lookup produced, but with
        # a message that says which pair was missing.
        raise IndexError(
            "no rows in df for TEAM_ID=%r, GAME_ID=%r" % (TEAM_ID, GAME_ID)
        )

    summed_columns = ('PF', 'TO', 'BLK', 'STL', 'AST', 'OREB', 'DREB',
                      'FTA', 'FTM', 'FG3A', 'FG3M', 'FGA', 'FGM')
    totals = {column: querydf[column].sum() for column in summed_columns}

    # Two-point figures are derived: all field goals minus three-pointers.
    FG2A = totals['FGA'] - totals['FG3A']
    FG2M = totals['FGM'] - totals['FG3M']

    # is the team the home team?
    home_id = querydf['TEAM_ID_home'].values[0]
    home_g = 1 if home_id == TEAM_ID else 0
    away_g = 1 - home_g

    # The team won if it was at home and the home side won, or if it was
    # away and the home side lost.
    home_win = querydf['HOME_TEAM_WINS'].values[0]
    team_won = (home_g == 1 and home_win == 1) or (away_g == 1 and home_win == 0)
    win = 1 if team_won else 0
    loss = 1 - win

    return (FG2M, FG2A, totals['FG3M'], totals['FG3A'], totals['FTM'],
            totals['FTA'], totals['AST'], totals['TO'], totals['OREB'],
            totals['DREB'], totals['BLK'], totals['STL'], totals['PF'],
            home_g, away_g, win, loss)

In [9]:
# Parse the GAME_DATE_EST strings into datetime values so they can be compared.
df["GAME_DATE_EST"] = pd.to_datetime(df["GAME_DATE_EST"])

In [10]:
def on_this_day(TEAM_ID,SEASON,DATE):
    '''Sum team_stats over a team's games in a season up to the day before DATE.

    Reads the module-level ``df``. A game is included when its SEASON and
    TEAM_ID match the arguments and its GAME_DATE_EST is no later than
    DATE minus one day.

    Parameters
    ----------
    TEAM_ID : value compared against df['TEAM_ID']
    SEASON : value compared against df['SEASON']
    DATE : anything pd.to_datetime accepts

    Returns
    -------
    numpy.ndarray
        Length-17 array holding the element-wise sum of the team_stats
        tuples of the included games; all zeros when no game qualifies.
    '''
    DATE = pd.to_datetime(DATE)
    upper_bound = DATE - timedelta(days=1)

    # Combine the three conditions into one mask on df. The previous chained
    # form (season[df[...] == ...]) indexed a filtered frame with a
    # full-length mask, which pandas only accepts by reindexing the mask and
    # emitting a warning.
    in_scope = (
        (df['SEASON'] == SEASON)
        & (df['TEAM_ID'] == TEAM_ID)
        & (df['GAME_DATE_EST'] <= upper_bound)
    )
    # unique game ids, in order of first appearance
    games = df.loc[in_scope, 'GAME_ID'].unique()

    # accumulator matching the 17 values team_stats returns
    results = np.zeros(17)
    for game_id in games:
        results += team_stats(TEAM_ID, game_id)
    return results

In [11]:
# Maps each GAME_ID to its concatenated home/away stats array; filled by the loop below.
temp_dict={}

In [None]:
# applies on_this_day for each game, and stores in temp_dict.
# the result is a dictionary of each team's stats going into the game,
# which is the information we can use as input for predictions on new,
# unseen data
with warnings.catch_warnings():
    # Silence warnings raised while the helper functions run so they do not
    # interleave with the progress bar.
    warnings.simplefilter("ignore")
    # Take the first row of every game once, instead of re-filtering the
    # whole frame four times per game. drop_duplicates keeps the first
    # occurrence, so the values and the game order match a per-game
    # `.values[0]` lookup over GAME_ID.unique().
    per_game = df.drop_duplicates(subset="GAME_ID")[
        ["GAME_ID", "TEAM_ID_home", "TEAM_ID_away", "SEASON", "GAME_DATE_EST"]
    ]
    with tqdm(total=len(per_game)) as pbar:
        for game_id, home_team_id, away_team_id, season, date in per_game.itertuples(index=False, name=None):
            home = on_this_day(home_team_id, season, date)
            away = on_this_day(away_team_id, season, date)
            # home stats first, then away stats
            temp_dict[game_id] = np.concatenate((home, away))
            pbar.update(1)

  3%|██                                                                          | 608/23096 [03:27<4:20:22,  1.44it/s]

In [None]:
# Turn the {GAME_ID: stats array} dict into a frame with one row per game,
# then save the raw result.
on_this_df = pd.DataFrame(data=temp_dict).transpose()
on_this_df.to_csv("raw_on_this_df.csv")

In [None]:
# Label the columns: the 17 team_stats field names, once prefixed with
# "home_" and once with "away_", in the same order the arrays were
# concatenated.
cols = [
    'FG2M', 'FG2A', 'FG3M', 'FG3A', 'FTM', 'FTA', 'AST', 'TO', 'OREB',
    'DREB', 'BLK', 'STL', 'PF', 'home_g', 'away_g', 'win', 'loss',
]
new_cols1 = ["home_" + stat for stat in cols]
new_cols2 = ["away_" + stat for stat in cols]
on_this_df.columns = new_cols1 + new_cols2
on_this_df.index.name = "GAME_ID"
df = on_this_df

In [None]:
# Remove the summed home_g/away_g flag columns for both sides.
to_drop = [
    'home_home_g',
    'home_away_g',
    'away_home_g',
    'away_away_g',
]
df = df.drop(columns=to_drop)

In [None]:
# Reload games.csv so its columns can be merged onto the per-game features by GAME_ID.
df2 = pd.read_csv("games.csv")

In [None]:
# Match the feature frame's GAME_ID index against the GAME_ID column of games.csv.
df = df.merge(df2, left_index=True, right_on="GAME_ID")

In [None]:
# Drop the identifier columns and the per-game box-score columns that came
# in with games.csv.
to_drop = [
    'GAME_DATE_EST', 'GAME_ID', 'GAME_STATUS_TEXT',
    'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'SEASON',
    'TEAM_ID_home', 'PTS_home', 'FG_PCT_home', 'FT_PCT_home',
    'FG3_PCT_home', 'AST_home', 'REB_home',
    'TEAM_ID_away', 'PTS_away', 'FG_PCT_away', 'FT_PCT_away',
    'FG3_PCT_away', 'AST_away', 'REB_away',
]
df = df.drop(columns=to_drop)

In [None]:
# Draw a heatmap of the pairwise column correlations on a 15x10 figure.
_ = plt.figure(figsize=(15, 10))
correlations = df.corr()
_ = sns.heatmap(correlations)

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().transpose()

In [None]:
# Save the resulting frame to clean.csv.
df.to_csv("clean.csv")