In [6]:
# import warnings
# warnings.filterwarnings("ignore")

import requests
import nba_api
from nba_api.stats.endpoints import teamdashboardbygeneralsplits, leaguedashteamstats

from datetime import date, timedelta
import time

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# from sklearn import model_selection
# from sklearn.utils import class_weight
# from sklearn.metrics import confusion_matrix

In [2]:
# Compile a dictionary of all 30 NBA teams and their Team ID
teams = {"Atlanta Hawks": 1610612737,
        "Boston Celtics": 1610612738,
        "Brooklyn Nets": 1610612751,
        "Charlotte Bobcats": 1610612766,
        "Charlotte Hornets": 1610612766,
        "Chicago Bulls": 1610612741,
        "Cleveland Cavaliers": 1610612739,
        "Dallas Mavericks": 1610612742,
        "Denver Nuggets": 1610612743,
        "Detroit Pistons": 1610612765,
        "Golden State Warriors": 1610612744,
        "Houston Rockets": 1610612745,
        "Indiana Pacers": 1610612754,
        "LA Clippers": 1610612746,
        "Los Angeles Clippers": 1610612746,
        "Los Angeles Lakers": 1610612747,
        "Memphis Grizzlies": 1610612763,
        "Miami Heat": 1610612748,
        "Milwaukee Bucks": 1610612749,
        "Minnesota Timberwolves": 1610612750,
        "New Jersey Nets": 1610612751,
        "New Orleans Hornets": 1610612740,
        "New Orleans Pelicans": 1610612740,
        "New York Knicks": 1610612752,
        "Oklahoma City Thunder": 1610612760,
        "Orlando Magic": 1610612753,
        "Philadelphia 76ers": 1610612755,
        "Phoenix Suns": 1610612756,
        "Portland Trail Blazers": 1610612757,
        "Sacramento Kings": 1610612758,
        "San Antonio Spurs": 1610612759,
        "Toronto Raptors": 1610612761,
        "Utah Jazz": 1610612762,
        "Washington Wizards": 1610612764,
    }

In [3]:
# Compile a dictionary of stats and their source 
available_stats = {'W_PCT': 'Base',
                   'FG_PCT': 'Base',
                   'FG3_PCT': 'Base',
                   'FT_PCT': 'Base',
                   'REB': 'Base',
                   'AST': 'Base',
                   'TOV': 'Base',
                   'STL': 'Base',
                   'BLK': 'Base',
                   'PLUS_MINUS': 'Base',
                   'OFF_RATING': 'Advanced',
                   'DEF_RATING': 'Advanced',
                   'TS_PCT': 'Advanced'}

In [4]:
# get_stats.py gets the team data for the model

# from team_names import teams


def get_team_stats_dict(team, start_date, end_date, season='2021-22'):
    """
    Returns the stats for the selected team in a dataframe, default year is 2021-22
    :param start_data: Day of games scheduled in form 'mm/dd/yyyy'
    :param end_data: Day of games scheduled in form 'mm/dd/yyyy'
    :param season: Day of games scheduled in form 'yyyy-yy'
    :return: A dictionary of game matchups {home_team:[away_team]}
    """

    time.sleep(1)

    
    general_team_info = teamdashboardbygeneralsplits.TeamDashboardByGeneralSplits(team_id=teams[team],
                                                                                  per_mode_detailed='Per100Possessions',
                                                                                  season=season,
                                                                                  date_from_nullable=start_date,
                                                                                  date_to_nullable=end_date,
                                                                                  timeout=120)
    general_team_dict = general_team_info.get_normalized_dict()
    general_team_dashboard = general_team_dict['OverallTeamDashboard'][0]


    win_percentage = general_team_dashboard['W_PCT']
    fg_percentage = general_team_dashboard['FG_PCT']
    fg3_percentage = general_team_dashboard['FG3_PCT']
    ft_percentage = general_team_dashboard['FT_PCT']
    rebounds = general_team_dashboard['REB']
    assists = general_team_dashboard['AST']
    turnovers = general_team_dashboard['TOV']
    steals = general_team_dashboard['STL']
    blocks = general_team_dashboard['BLK']
    plus_minus = general_team_dashboard['PLUS_MINUS']

    advanced_team_info = teamdashboardbygeneralsplits.TeamDashboardByGeneralSplits(team_id=teams[team],
                                                                                   measure_type_detailed_defense='Advanced',
                                                                                   season=season,
                                                                                   date_from_nullable=start_date,
                                                                                   date_to_nullable=end_date,
                                                                                   timeout=120)
    advanced_team_dict = advanced_team_info.get_normalized_dict()
    advanced_team_dashboard = advanced_team_dict['OverallTeamDashboard'][0]

    offensive_rating = advanced_team_dashboard['OFF_RATING']
    defensive_rating = advanced_team_dashboard['DEF_RATING']
    true_shooting_percentage = advanced_team_dashboard['TS_PCT']

    all_stats_dict = {
        'W_PCT': win_percentage,
        'FG_PCT': fg_percentage,
        'FG3_PCT': fg3_percentage,
        'FT_PCT': ft_percentage,
        'REB': rebounds,
        'AST': assists,
        'TOV': turnovers,
        'STL': steals,
        'BLK': blocks,
        'PLUS_MINUS': plus_minus,
        'OFF_RATING': offensive_rating,
        'DEF_RATING': defensive_rating,
        'TS_PCT': true_shooting_percentage
    }

    return all_stats_dict


get_team_stats_dict('Golden State Warriors', '10/19/2021', '04/10/2022', '2021-22')

{'W_PCT': 0.646,
 'FG_PCT': 0.469,
 'FG3_PCT': 0.364,
 'FT_PCT': 0.769,
 'REB': 45.9,
 'AST': 27.4,
 'TOV': 15.0,
 'STL': 8.9,
 'BLK': 4.6,
 'PLUS_MINUS': 5.6,
 'OFF_RATING': 112.1,
 'DEF_RATING': 106.6,
 'TS_PCT': 0.582}

In [5]:
# get_matches.py gets the daily matches for a specific date and the results of the games

from nba_api.stats.endpoints import leaguegamelog, scoreboard, leaguestandings
#from team_names import teams



def get_match_results(date, season):
    """
    Returns the matchup and result of the game

    :param date: Day of games scheduled in form 'mm/dd/yyyy'
    :param season: Season in form of 'yyyy-yy'
    :return: [{Golden State Warriors: Boston Celtics}], ['W']
    """

    game_log = leaguegamelog.LeagueGameLog(season=season, league_id='00', date_from_nullable=date,
                                           date_to_nullable=date, season_type_all_star='Regular Season', timeout=120)
    game_log_dict = game_log.get_normalized_dict()
    list_of_teams = game_log_dict['LeagueGameLog']

    daily_match = {}
    win_loss = []
    score = []
    game_id = []

    for i in range(0, len(list_of_teams), 2):

        if '@' in list_of_teams[i]['MATCHUP']:

            away_team = list_of_teams[i]['TEAM_NAME']
            home_team = list_of_teams[i + 1]['TEAM_NAME']

            win_loss.append(list_of_teams[i + 1]['WL'])

            game_id.append(list_of_teams[i + 1]['GAME_ID'])

            score.append(list_of_teams[i + 1]['PTS'])
            score.append(list_of_teams[i]['PTS'])

        else:
            away_team = list_of_teams[i + 1]['TEAM_NAME']
            home_team = list_of_teams[i]['TEAM_NAME']

            win_loss.append(list_of_teams[i]['WL'])

            game_id.append(list_of_teams[i]['GAME_ID'])

            score.append(list_of_teams[i]['PTS'])
            score.append(list_of_teams[i + 1]['PTS'])

        daily_match.update({home_team: away_team})

    match_results = [daily_match, win_loss, score, game_id]

    return match_results


def get_daily_matches(date):
    """
    This method creates a dictionary of daily game matchups.

    :param date: Day of games scheduled in form 'mm/dd/yyyy'
    :return: A dictionary of game matchups {home_team:away_team}
    """

    daily_match = scoreboard.Scoreboard(league_id='00', game_date=date, timeout=120)
    daily_match_dict = daily_match.get_normalized_dict()
    games = daily_match_dict['GameHeader']

    match = {}

    for game in games:

        home_team_id = game['HOME_TEAM_ID']

        for team, team_id in teams.items():
            if team_id == home_team_id:
                home_team = team

        away_team_id = game['VISITOR_TEAM_ID']

        for team, team_id in teams.items():
            if team_id == away_team_id:
                away_team = team

        match.update({home_team: away_team})

    return match



def main():
    print(f"""'get_daily_matches' returns a dictionary of the games on a specified date\n{get_daily_matches('12/25/22')}\n""")
    print(f"""'get_match_results' returns the matchup plus the result\n{get_match_results('10/19/2021', '2021-22')}""")

main()

'get_daily_matches' returns a dictionary of the games on a specified date
{'New York Knicks': 'Philadelphia 76ers', 'Dallas Mavericks': 'Los Angeles Lakers', 'Boston Celtics': 'Milwaukee Bucks', 'Golden State Warriors': 'Memphis Grizzlies', 'Denver Nuggets': 'Phoenix Suns'}

'get_match_results' returns the matchup plus the result
[{'Los Angeles Lakers': 'Golden State Warriors', 'Milwaukee Bucks': 'Brooklyn Nets'}, ['L', 'W'], [114, 121, 127, 104], ['0022100002', '0022100001']]


In [None]:
from datetime import date, timedelta
import pandas as pd
import requests

# from get_stats import get_team_stats_dict
# from get_matches import get_match_results
#from standardization import z_score, stat_std, stat_mean
#from available_stats import available_stats


# [{'Sacramento Kings': 'Boston Celtics', 'Charlotte Hornets': 'Philadelphia 76ers'}, ['W', 'L']]
# team stats is a dataframe
def to_dataframe(daily_games, start_date, end_date, season): #, mean_dict, std_dict):
    full_dataframe = []
    game_number = 0  # counter to match with the correct game
    daily_results = daily_games[1]  # win or loss for each game
    score = daily_games[2]
    game_id = daily_games[3]

    for home_team, away_team in daily_games[0].items():  # loops through matchups
        home_team_stats = get_team_stats_dict(home_team, start_date, end_date, season)
        away_team_stats = get_team_stats_dict(away_team, start_date, end_date, season)

        current_game = [home_team, away_team]
        
        current_game.append(game_id[game_number])

        current_game.append(score.pop(0))

        for stat, stat_type in available_stats.items():
            current_game.append(home_team_stats[stat])
        
        current_game.append(score.pop(0))

        for stat, stat_type in available_stats.items():
            current_game.append(away_team_stats[stat])


        #for stat, stat_type in available_stats.items():
        #    z_score_diff = z_score_difference(home_team_stats[stat], away_team_stats[stat], mean_dict[stat], std_dict[stat])

         #   current_game.append(z_score_diff)

        if daily_results[game_number] == 'W':
            result = 1
        else:
            result = 0

        current_game.append(result)
        game_number += 1

        print(current_game)

        full_dataframe.append(current_game)

    return full_dataframe



def date_range(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)
        

def training_set(start_year, start_month, start_day, end_year, end_month, end_day, season, season_start):
    start_date = date(start_year, start_month, start_day)
    end_date = date(end_year, end_month, end_day)

    total_games = []

    for single_date in date_range(start_date, end_date):
        current_date = single_date.strftime('%m/%d/%Y')
        print(current_date)

        previous_day = single_date - timedelta(days=1)
        previous_day_formatted = previous_day.strftime('%m/%d/%Y')

        #mean_std_dictionary = mean_std_dict(season_start, previous_day_formatted, season)
        #mean_dict = mean_std_dictionary[0]
        #std_dict = mean_std_dictionary[1]

        current_day_games = get_match_results(current_date, season)
        current_day_games_with_stats = to_dataframe(current_day_games, season_start, previous_day_formatted, season)

        for game in current_day_games_with_stats:
            game.append(current_date)
            total_games.append(game)

    print(total_games)
    return total_games


def make_dataframe(game_list):
    games = pd.DataFrame(game_list,
                         columns=['Home', 'Away', 'Game_ID', 'H_Score', 'H_W_PCT', 'H_FG_PCT', 'H_FG3_PCT', 'H_FT_PCT',
                                  'H_REB', 'H_AST', 'H_TOV', 'H_STL',
                                  'H_BLK', 'H_PLUS_MINUS', 'H_OFF_RATING', 'H_DEF_RATING', 'H_TS_PCT', 'A_Score',
                                  'A_W_PCT', 'A_FG_PCT', 'A_FG3_PCT',
                                  'A_FT_PCT', 'A_REB', 'A_AST', 'A_TOV', 'A_STL',
                                  'A_BLK', 'A_PLUS_MINUS', 'A_OFF_RATING', 'A_DEF_RATING', 'A_TS_PCT', 'Result',
                                  'Date'])

    print(games)
    return games


def main():
    attempts = 10

    for i in range(attempts):
        try:
            all_games = training_set(start_year=2016, start_month=10, start_day=28, end_year=2017, end_month=4, end_day=5,
                             season='2016-17', season_start='10/25/2016')
            df = make_dataframe(all_games)

            print(df)
            df.to_csv(r'C:\Users\alvaro\OneDrive\Documents\School\Flatiron\Projects\NBA_Prediction_Model\data\nba_df_2016.csv', index=False)
        except requests.exceptions.ReadTimeout:
            if i < attempts - 1:
                continue
            else:
                raise
        break

if __name__ == '__main__':
    main()


10/28/2016
['Brooklyn Nets', 'Indiana Pacers', '0021600019', 103, 0.0, 0.443, 0.341, 0.8, 42.3, 21.2, 15.4, 7.7, 2.9, -4.8, 112.5, 117.3, 0.553, 94, 1.0, 0.505, 0.526, 0.765, 44.8, 25.9, 13.8, 9.5, 5.2, 7.8, 112.1, 105.2, 0.602, 1]
['Miami Heat', 'Charlotte Hornets', '0021600022', 91, 1.0, 0.485, 0.25, 0.625, 54.2, 28.1, 12.5, 5.2, 7.3, 12.5, 112.5, 102.1, 0.519, 97, 1.0, 0.449, 0.304, 0.833, 53.1, 28.1, 10.4, 5.2, 10.4, 11.5, 111.5, 102.1, 0.537, 0]
['Detroit Pistons', 'Orlando Magic', '0021600020', 108, 0.0, 0.417, 0.182, 0.773, 41.2, 17.5, 14.4, 8.2, 0.0, -18.6, 93.8, 112.4, 0.486, 82, 0.0, 0.382, 0.333, 0.786, 47.9, 21.3, 11.7, 5.3, 4.3, -12.8, 102.1, 112.5, 0.474, 1]
['Utah Jazz', 'Los Angeles Lakers', '0021600024', 96, 0.0, 0.488, 0.333, 1.0, 34.1, 20.9, 15.4, 9.9, 5.5, -9.9, 114.3, 122.8, 0.584, 89, 1.0, 0.506, 0.343, 0.818, 38.6, 20.8, 15.8, 9.9, 4.0, 5.9, 118.8, 112.9, 0.608, 1]
['Dallas Mavericks', 'Houston Rockets', '0021600023', 98, 0.0, 0.433, 0.375, 0.722, 42.6, 22.6, 13.

['New Orleans Pelicans', 'Milwaukee Bucks', '0021600051', 113, 0.0, 0.421, 0.19, 0.766, 40.6, 23.6, 11.5, 9.6, 4.8, -10.2, 94.2, 104.8, 0.482, 117, 0.333, 0.427, 0.273, 0.793, 44.8, 24.1, 11.7, 9.7, 4.8, -8.3, 99.7, 107.6, 0.494, 0]
11/02/2016
['New York Knicks', 'Houston Rockets', '0021600058', 99, 0.333, 0.43, 0.314, 0.703, 42.9, 20.1, 14.6, 5.4, 6.5, -11.9, 98.0, 110.2, 0.511, 118, 0.5, 0.471, 0.368, 0.721, 41.3, 24.9, 15.1, 6.8, 2.1, -1.3, 112.5, 113.8, 0.581, 0]
['Phoenix Suns', 'Portland Trail Blazers', '0021600063', 118, 0.0, 0.435, 0.268, 0.696, 42.5, 17.3, 15.7, 9.0, 2.9, -10.9, 95.5, 106.7, 0.508, 115, 0.5, 0.465, 0.38, 0.802, 38.4, 20.8, 14.7, 7.5, 4.6, -4.8, 105.8, 110.9, 0.567, 1]
['Brooklyn Nets', 'Detroit Pistons', '0021600057', 109, 0.25, 0.415, 0.289, 0.82, 45.5, 19.3, 15.9, 9.3, 3.9, -6.8, 101.7, 108.6, 0.516, 101, 0.75, 0.458, 0.324, 0.71, 52.4, 22.6, 13.9, 8.2, 3.4, 9.5, 105.0, 95.0, 0.516, 1]
['Boston Celtics', 'Chicago Bulls', '0021600061', 107, 0.667, 0.508, 0.39

['Dallas Mavericks', 'Milwaukee Bucks', '0021600090', 86, 0.0, 0.416, 0.333, 0.815, 42.2, 18.9, 13.1, 7.4, 4.1, -9.0, 99.8, 108.8, 0.518, 75, 0.667, 0.471, 0.347, 0.82, 46.4, 27.2, 15.0, 10.0, 6.0, 4.1, 110.2, 105.6, 0.56, 1]
['Toronto Raptors', 'Sacramento Kings', '0021600089', 91, 0.8, 0.464, 0.27, 0.809, 47.2, 16.7, 14.0, 9.8, 4.0, 7.7, 107.3, 100.2, 0.54, 96, 0.286, 0.443, 0.333, 0.772, 41.9, 22.9, 16.0, 7.2, 3.6, -6.5, 103.9, 111.8, 0.535, 0]
['Memphis Grizzlies', 'Portland Trail Blazers', '0021600088', 94, 0.5, 0.4, 0.357, 0.753, 45.0, 20.1, 16.9, 8.7, 5.6, -6.0, 98.0, 104.6, 0.502, 100, 0.5, 0.453, 0.378, 0.818, 40.8, 21.2, 14.4, 7.7, 4.8, -2.1, 105.6, 107.7, 0.56, 0]
11/07/2016
['Philadelphia 76ers', 'Utah Jazz', '0021600093', 84, 0.0, 0.426, 0.341, 0.735, 42.0, 22.2, 17.5, 7.3, 6.9, -11.6, 94.5, 105.9, 0.51, 109, 0.571, 0.448, 0.383, 0.75, 43.5, 19.9, 14.4, 7.1, 5.2, 1.1, 106.4, 104.8, 0.552, 0]
['Golden State Warriors', 'New Orleans Pelicans', '0021600099', 116, 0.667, 0.486,

['Charlotte Hornets', 'Toronto Raptors', '0021600123', 111, 0.857, 0.442, 0.332, 0.771, 45.6, 25.2, 11.6, 8.5, 5.6, 8.7, 107.9, 99.9, 0.545, 113, 0.714, 0.456, 0.288, 0.786, 45.8, 19.1, 13.0, 9.5, 4.5, 6.3, 108.0, 102.0, 0.538, 0]
['Washington Wizards', 'Cleveland Cavaliers', '0021600122', 94, 0.286, 0.447, 0.321, 0.786, 45.9, 20.4, 16.6, 9.2, 4.9, -2.4, 105.0, 106.5, 0.527, 105, 0.857, 0.44, 0.364, 0.797, 44.4, 22.9, 12.0, 8.7, 4.1, 7.1, 112.9, 105.9, 0.559, 0]
['Oklahoma City Thunder', 'LA Clippers', '0021600126', 108, 0.75, 0.437, 0.325, 0.726, 46.6, 20.0, 17.5, 9.0, 5.1, 3.0, 102.1, 99.0, 0.52, 110, 0.875, 0.446, 0.338, 0.746, 47.4, 21.3, 12.3, 10.3, 5.5, 17.2, 107.4, 90.1, 0.54, 0]
11/12/2016
['Chicago Bulls', 'Washington Wizards', '0021600133', 106, 0.556, 0.451, 0.357, 0.813, 49.6, 24.0, 15.0, 8.0, 4.8, 5.0, 111.3, 105.8, 0.544, 95, 0.25, 0.446, 0.31, 0.787, 46.1, 19.9, 16.5, 8.9, 4.9, -3.5, 104.1, 106.7, 0.523, 1]
['Phoenix Suns', 'Brooklyn Nets', '0021600139', 104, 0.333, 0.44

['Orlando Magic', 'New Orleans Pelicans', '0021600159', 89, 0.364, 0.41, 0.317, 0.719, 46.4, 22.6, 13.3, 6.0, 5.6, -10.7, 99.4, 110.1, 0.491, 82, 0.182, 0.435, 0.287, 0.768, 42.5, 22.4, 13.3, 8.3, 6.1, -7.3, 100.4, 107.3, 0.519, 1]
['Indiana Pacers', 'Cleveland Cavaliers', '0021600160', 103, 0.455, 0.46, 0.356, 0.78, 42.1, 24.3, 14.5, 9.6, 6.4, -1.5, 105.1, 106.6, 0.542, 93, 0.9, 0.445, 0.373, 0.765, 46.4, 23.0, 12.7, 7.8, 5.0, 7.3, 113.2, 106.0, 0.559, 1]
['Atlanta Hawks', 'Milwaukee Bucks', '0021600163', 107, 0.8, 0.48, 0.363, 0.682, 45.4, 25.8, 18.0, 10.7, 5.1, 9.9, 107.6, 97.8, 0.563, 100, 0.556, 0.461, 0.33, 0.801, 46.0, 24.5, 15.8, 9.4, 5.3, 1.9, 105.8, 103.7, 0.549, 1]
['Sacramento Kings', 'San Antonio Spurs', '0021600168', 105, 0.364, 0.446, 0.331, 0.773, 43.5, 22.3, 15.0, 7.5, 3.8, -4.0, 104.9, 109.9, 0.537, 110, 0.727, 0.443, 0.396, 0.844, 47.0, 22.9, 12.9, 8.3, 6.4, 5.5, 108.4, 102.6, 0.547, 0]
['Denver Nuggets', 'Phoenix Suns', '0021600166', 120, 0.3, 0.426, 0.356, 0.77, 48

['Oklahoma City Thunder', 'Indiana Pacers', '0021600195', 111, 0.615, 0.449, 0.338, 0.742, 46.3, 20.6, 16.6, 7.5, 5.1, 1.9, 104.8, 102.7, 0.534, 115, 0.462, 0.464, 0.351, 0.775, 41.6, 23.8, 14.5, 9.2, 7.0, -2.1, 104.9, 106.7, 0.544, 0]
11/21/2016
['Philadelphia 76ers', 'Miami Heat', '0021600199', 101, 0.231, 0.437, 0.351, 0.719, 42.3, 22.3, 17.5, 8.4, 5.5, -10.4, 98.1, 109.0, 0.527, 94, 0.333, 0.428, 0.351, 0.656, 48.1, 20.9, 14.8, 8.0, 6.7, -0.4, 101.1, 102.0, 0.501, 1]
['San Antonio Spurs', 'Dallas Mavericks', '0021600206', 96, 0.769, 0.456, 0.394, 0.846, 45.4, 24.1, 12.7, 8.5, 6.5, 5.7, 109.8, 103.8, 0.559, 91, 0.167, 0.401, 0.311, 0.802, 42.5, 17.9, 13.3, 8.6, 4.4, -7.8, 95.9, 103.9, 0.495, 1]
['LA Clippers', 'Toronto Raptors', '0021600207', 123, 0.857, 0.466, 0.383, 0.745, 44.9, 22.4, 12.3, 10.3, 5.2, 14.1, 112.3, 98.2, 0.567, 115, 0.615, 0.456, 0.326, 0.807, 45.1, 19.1, 11.9, 9.1, 4.3, 3.5, 110.4, 107.1, 0.549, 1]
['Detroit Pistons', 'Houston Rockets', '0021600203', 96, 0.429, 0.

['Milwaukee Bucks', 'Toronto Raptors', '0021600232', 99, 0.462, 0.446, 0.326, 0.788, 45.6, 24.7, 15.5, 9.3, 5.3, -0.9, 103.8, 104.8, 0.532, 105, 0.6, 0.461, 0.344, 0.81, 43.7, 19.7, 12.1, 9.5, 4.4, 3.3, 111.0, 107.8, 0.556, 0]
['Utah Jazz', 'Atlanta Hawks', '0021600231', 95, 0.5, 0.454, 0.34, 0.759, 46.7, 19.5, 13.6, 6.5, 5.4, 2.6, 105.6, 102.6, 0.548, 68, 0.667, 0.469, 0.339, 0.705, 46.1, 24.8, 17.7, 9.7, 5.6, 5.7, 105.1, 99.2, 0.552, 1]
['Indiana Pacers', 'Brooklyn Nets', '0021600234', 118, 0.438, 0.449, 0.348, 0.778, 42.1, 22.4, 15.1, 9.5, 6.3, -4.4, 102.2, 106.4, 0.53, 97, 0.286, 0.438, 0.325, 0.797, 42.5, 20.0, 16.5, 7.8, 4.3, -7.5, 102.9, 110.7, 0.546, 1]
['Sacramento Kings', 'Houston Rockets', '0021600239', 104, 0.4, 0.449, 0.356, 0.784, 43.2, 22.7, 14.9, 7.2, 3.9, -2.4, 106.8, 110.2, 0.546, 117, 0.6, 0.463, 0.368, 0.747, 45.5, 25.2, 17.4, 7.3, 5.1, 3.0, 110.9, 107.6, 0.572, 0]
['Denver Nuggets', 'Oklahoma City Thunder', '0021600235', 129, 0.4, 0.43, 0.352, 0.74, 49.8, 20.6, 16.

['Dallas Mavericks', 'San Antonio Spurs', '0021600272', 87, 0.188, 0.409, 0.315, 0.798, 43.2, 19.2, 13.9, 8.4, 4.2, -9.4, 97.1, 106.5, 0.503, 94, 0.778, 0.46, 0.392, 0.84, 44.7, 24.3, 13.6, 8.4, 6.3, 5.1, 109.9, 104.7, 0.564, 0]
['Denver Nuggets', 'Miami Heat', '0021600273', 98, 0.412, 0.433, 0.359, 0.753, 49.4, 21.2, 15.8, 6.4, 4.4, -1.9, 105.6, 107.7, 0.531, 106, 0.294, 0.426, 0.329, 0.67, 47.4, 20.7, 14.3, 7.5, 6.4, -2.3, 101.7, 104.2, 0.504, 0]
['Minnesota Timberwolves', 'New York Knicks', '0021600271', 104, 0.294, 0.449, 0.358, 0.771, 43.5, 22.6, 15.5, 8.6, 4.9, -1.0, 107.2, 107.5, 0.544, 106, 0.471, 0.445, 0.352, 0.784, 46.3, 22.5, 14.0, 7.7, 5.8, -3.4, 105.8, 109.7, 0.528, 0]
['Toronto Raptors', 'Memphis Grizzlies', '0021600268', 120, 0.647, 0.466, 0.368, 0.809, 43.7, 19.9, 12.1, 9.3, 4.7, 5.0, 112.4, 107.8, 0.564, 105, 0.611, 0.422, 0.332, 0.785, 45.0, 20.9, 15.0, 8.5, 5.9, -0.4, 102.3, 102.6, 0.52, 1]
['Oklahoma City Thunder', 'Washington Wizards', '0021600270', 126, 0.579, 0.

['Chicago Bulls', 'Portland Trail Blazers', '0021600308', 110, 0.579, 0.441, 0.322, 0.807, 50.7, 22.1, 14.0, 8.2, 5.5, 4.2, 108.0, 103.6, 0.525, 112, 0.524, 0.457, 0.364, 0.798, 42.5, 22.2, 13.3, 6.8, 4.7, -2.3, 109.7, 112.0, 0.56, 0]
['Brooklyn Nets', 'Washington Wizards', '0021600306', 113, 0.263, 0.432, 0.325, 0.777, 42.4, 20.1, 16.3, 7.7, 4.3, -8.6, 102.3, 111.1, 0.539, 118, 0.333, 0.45, 0.35, 0.769, 44.4, 21.7, 15.4, 8.9, 4.1, -2.9, 104.8, 107.8, 0.531, 0]
['Houston Rockets', 'Boston Celtics', '0021600311', 107, 0.65, 0.466, 0.378, 0.753, 44.6, 24.7, 16.4, 7.5, 5.1, 3.9, 113.2, 109.0, 0.579, 106, 0.6, 0.45, 0.36, 0.787, 43.4, 25.2, 12.8, 7.4, 4.9, 0.9, 108.2, 107.4, 0.551, 1]
['Milwaukee Bucks', 'San Antonio Spurs', '0021600309', 96, 0.556, 0.457, 0.339, 0.776, 45.8, 24.9, 15.1, 9.1, 5.7, 1.9, 105.7, 103.9, 0.545, 97, 0.8, 0.461, 0.4, 0.82, 44.5, 24.2, 13.7, 8.5, 6.2, 5.1, 109.8, 104.7, 0.565, 0]
['Atlanta Hawks', 'Oklahoma City Thunder', '0021600307', 99, 0.476, 0.451, 0.325, 0.6

['Charlotte Hornets', 'Orlando Magic', '0021600337', 109, 0.591, 0.438, 0.346, 0.747, 46.7, 24.1, 11.8, 6.6, 5.3, 2.7, 106.6, 104.2, 0.537, 88, 0.435, 0.423, 0.333, 0.712, 47.1, 22.1, 14.4, 6.9, 6.2, -4.8, 100.4, 104.9, 0.506, 1]
['Minnesota Timberwolves', 'Detroit Pistons', '0021600342', 90, 0.273, 0.452, 0.351, 0.78, 43.8, 22.7, 15.2, 8.2, 4.9, -2.1, 108.3, 109.7, 0.546, 117, 0.5, 0.451, 0.342, 0.75, 47.6, 22.2, 12.3, 7.3, 4.4, 3.0, 104.8, 101.8, 0.522, 0]
['Oklahoma City Thunder', 'Houston Rockets', '0021600341', 99, 0.636, 0.459, 0.34, 0.716, 45.7, 22.4, 15.6, 7.7, 5.1, 2.6, 106.4, 103.5, 0.542, 102, 0.682, 0.469, 0.377, 0.763, 44.8, 24.9, 16.5, 8.1, 5.1, 5.4, 113.4, 107.7, 0.583, 0]
12/10/2016
['San Antonio Spurs', 'Brooklyn Nets', '0021600353', 130, 0.783, 0.459, 0.393, 0.816, 44.3, 24.2, 13.7, 8.6, 6.1, 4.9, 108.8, 104.0, 0.559, 101, 0.286, 0.437, 0.329, 0.776, 42.7, 20.2, 16.4, 7.7, 4.3, -7.8, 103.2, 111.0, 0.544, 1]
['Chicago Bulls', 'Miami Heat', '0021600350', 105, 0.545, 0.4

['Miami Heat', 'Indiana Pacers', '0021600377', 95, 0.32, 0.432, 0.333, 0.673, 47.1, 21.5, 13.8, 7.6, 5.9, -3.3, 103.4, 106.7, 0.509, 89, 0.52, 0.458, 0.371, 0.804, 42.2, 22.3, 14.8, 9.0, 6.0, -1.9, 105.1, 106.8, 0.548, 1]
['Utah Jazz', 'Oklahoma City Thunder', '0021600382', 109, 0.6, 0.464, 0.361, 0.781, 46.4, 20.9, 14.0, 7.0, 5.8, 5.4, 109.2, 103.4, 0.567, 89, 0.6, 0.456, 0.328, 0.712, 45.9, 21.9, 15.9, 7.9, 5.1, 1.5, 105.7, 103.8, 0.538, 1]
['Dallas Mavericks', 'Detroit Pistons', '0021600381', 85, 0.25, 0.421, 0.335, 0.793, 42.3, 20.5, 13.6, 8.2, 3.9, -7.5, 100.5, 107.9, 0.518, 95, 0.5, 0.451, 0.347, 0.753, 47.2, 22.1, 12.3, 7.6, 4.2, 3.2, 105.0, 101.9, 0.523, 0]
['Washington Wizards', 'Charlotte Hornets', '0021600375', 109, 0.391, 0.454, 0.354, 0.775, 43.0, 21.9, 14.9, 9.2, 3.8, -2.4, 105.5, 108.0, 0.536, 106, 0.56, 0.436, 0.35, 0.753, 47.0, 23.8, 12.1, 6.5, 5.1, 2.1, 106.5, 104.7, 0.536, 1]
['Memphis Grizzlies', 'Cleveland Cavaliers', '0021600379', 93, 0.654, 0.419, 0.327, 0.781, 4

['Denver Nuggets', 'Dallas Mavericks', '0021600417', 117, 0.407, 0.444, 0.357, 0.745, 48.4, 21.8, 16.2, 6.6, 4.5, -2.1, 107.3, 109.5, 0.538, 107, 0.259, 0.422, 0.34, 0.798, 41.8, 20.6, 13.3, 8.6, 3.9, -6.4, 101.1, 107.3, 0.521, 1]
['Minnesota Timberwolves', 'Phoenix Suns', '0021600416', 115, 0.269, 0.452, 0.344, 0.773, 44.6, 22.7, 15.4, 7.6, 4.9, -3.1, 107.5, 110.1, 0.543, 108, 0.296, 0.44, 0.335, 0.765, 44.4, 17.7, 15.6, 8.3, 4.2, -6.5, 103.3, 109.3, 0.529, 1]
['Oklahoma City Thunder', 'Atlanta Hawks', '0021600415', 108, 0.593, 0.455, 0.33, 0.718, 45.7, 22.0, 15.7, 8.0, 5.0, 1.1, 106.0, 104.5, 0.538, 110, 0.481, 0.459, 0.332, 0.709, 44.7, 24.6, 16.8, 8.9, 5.4, -1.9, 103.8, 105.6, 0.541, 0]
12/20/2016
['Philadelphia 76ers', 'New Orleans Pelicans', '0021600418', 93, 0.259, 0.433, 0.356, 0.735, 44.3, 23.2, 17.3, 7.8, 5.8, -7.6, 99.8, 107.3, 0.528, 108, 0.31, 0.439, 0.331, 0.757, 44.0, 22.7, 13.1, 7.8, 6.1, -4.8, 101.8, 106.6, 0.526, 0]
['Golden State Warriors', 'Utah Jazz', '0021600427',

['Denver Nuggets', 'Atlanta Hawks', '0021600451', 108, 0.414, 0.447, 0.358, 0.743, 48.2, 22.3, 16.0, 6.5, 4.2, -2.2, 107.8, 110.2, 0.541, 109, 0.483, 0.46, 0.332, 0.711, 44.3, 24.5, 16.6, 8.8, 5.4, -2.0, 103.9, 105.8, 0.542, 0]
['Minnesota Timberwolves', 'Sacramento Kings', '0021600450', 105, 0.321, 0.452, 0.344, 0.783, 44.9, 22.9, 15.6, 7.6, 5.0, -2.3, 107.6, 109.5, 0.544, 109, 0.414, 0.446, 0.347, 0.768, 43.3, 22.2, 14.5, 8.3, 4.4, -2.2, 106.0, 108.7, 0.54, 0]
['Charlotte Hornets', 'Chicago Bulls', '0021600442', 103, 0.552, 0.437, 0.356, 0.757, 46.7, 24.2, 12.2, 6.7, 5.0, 1.9, 106.8, 105.2, 0.537, 91, 0.5, 0.44, 0.307, 0.801, 50.3, 22.2, 14.3, 7.9, 5.6, 1.9, 106.1, 104.1, 0.522, 1]
['Memphis Grizzlies', 'Houston Rockets', '0021600448', 115, 0.613, 0.414, 0.327, 0.784, 46.8, 21.0, 14.4, 9.1, 5.4, 0.0, 101.9, 101.8, 0.51, 109, 0.733, 0.467, 0.378, 0.742, 45.4, 25.7, 16.1, 8.1, 4.8, 7.2, 113.7, 106.4, 0.582, 1]
['New Orleans Pelicans', 'Miami Heat', '0021600449', 91, 0.323, 0.439, 0.338

['Phoenix Suns', 'Toronto Raptors', '0021600490', 99, 0.281, 0.445, 0.341, 0.762, 43.7, 17.9, 15.8, 8.6, 4.3, -7.1, 103.9, 110.7, 0.535, 91, 0.71, 0.472, 0.393, 0.808, 44.3, 20.9, 12.1, 9.6, 4.9, 8.7, 115.7, 107.0, 0.574, 1]
['Cleveland Cavaliers', 'Boston Celtics', '0021600488', 124, 0.767, 0.457, 0.398, 0.76, 45.6, 23.2, 14.0, 7.5, 4.3, 7.4, 113.3, 106.1, 0.572, 118, 0.594, 0.447, 0.351, 0.796, 43.4, 25.6, 12.8, 7.9, 4.5, 2.2, 108.9, 106.8, 0.549, 1]
['Utah Jazz', 'Philadelphia 76ers', '0021600489', 100, 0.594, 0.467, 0.362, 0.768, 46.2, 20.8, 15.0, 6.7, 5.7, 4.2, 108.2, 103.7, 0.569, 83, 0.233, 0.437, 0.357, 0.74, 43.6, 23.0, 17.4, 7.9, 5.9, -7.6, 100.2, 107.7, 0.532, 1]
['Charlotte Hornets', 'Miami Heat', '0021600486', 91, 0.563, 0.441, 0.355, 0.768, 46.9, 24.3, 12.2, 6.8, 5.1, 2.6, 107.9, 105.5, 0.541, 82, 0.313, 0.438, 0.338, 0.663, 47.2, 21.4, 13.9, 7.2, 5.7, -3.3, 103.2, 106.6, 0.513, 1]
['Memphis Grizzlies', 'Oklahoma City Thunder', '0021600487', 114, 0.588, 0.418, 0.334, 0.78

['Los Angeles Lakers', 'Memphis Grizzlies', '0021600528', 116, 0.324, 0.445, 0.353, 0.777, 44.3, 20.6, 16.0, 8.3, 3.6, -6.3, 105.5, 111.7, 0.539, 102, 0.611, 0.423, 0.346, 0.781, 46.1, 21.5, 13.9, 9.1, 5.4, 1.0, 104.3, 103.1, 0.521, 1]
['Phoenix Suns', 'Miami Heat', '0021600527', 99, 0.286, 0.442, 0.341, 0.761, 44.0, 17.9, 15.7, 8.6, 4.3, -6.7, 103.5, 109.8, 0.53, 90, 0.286, 0.44, 0.346, 0.663, 46.7, 22.1, 13.9, 7.1, 5.9, -3.6, 103.1, 106.9, 0.516, 1]
['Boston Celtics', 'Utah Jazz', '0021600522', 115, 0.588, 0.45, 0.355, 0.801, 42.6, 25.7, 12.8, 8.0, 4.5, 1.9, 109.6, 107.8, 0.555, 104, 0.629, 0.466, 0.367, 0.758, 46.4, 20.6, 14.9, 6.8, 5.7, 4.9, 108.0, 102.9, 0.567, 1]
['Detroit Pistons', 'Indiana Pacers', '0021600523', 116, 0.444, 0.449, 0.341, 0.74, 46.4, 22.1, 12.8, 7.4, 3.9, 0.1, 104.3, 104.5, 0.524, 121, 0.486, 0.454, 0.358, 0.815, 42.7, 22.4, 14.2, 8.4, 5.6, -1.3, 105.0, 106.0, 0.546, 0]
['Dallas Mavericks', 'Washington Wizards', '0021600524', 113, 0.294, 0.429, 0.344, 0.794, 41.

['Los Angeles Lakers', 'Orlando Magic', '0021600567', 111, 0.35, 0.448, 0.357, 0.77, 44.7, 20.9, 15.6, 8.1, 3.5, -5.0, 106.5, 111.3, 0.542, 95, 0.421, 0.439, 0.339, 0.714, 44.9, 23.2, 13.4, 7.0, 5.9, -4.7, 103.9, 108.4, 0.52, 1]
['Phoenix Suns', 'Cleveland Cavaliers', '0021600565', 116, 0.324, 0.443, 0.343, 0.765, 44.5, 18.0, 15.8, 8.5, 4.3, -6.0, 103.7, 109.4, 0.531, 120, 0.771, 0.456, 0.388, 0.758, 45.8, 22.6, 13.9, 7.6, 4.1, 7.0, 113.1, 106.3, 0.569, 0]
['Brooklyn Nets', 'Philadelphia 76ers', '0021600560', 95, 0.229, 0.44, 0.337, 0.783, 43.1, 20.3, 16.7, 7.3, 4.4, -8.4, 102.9, 111.1, 0.548, 105, 0.265, 0.435, 0.353, 0.753, 43.7, 23.0, 17.1, 7.9, 5.9, -7.2, 100.6, 107.8, 0.532, 0]
['LA Clippers', 'Miami Heat', '0021600561', 98, 0.641, 0.465, 0.379, 0.757, 44.6, 22.7, 13.4, 7.9, 4.9, 5.4, 110.8, 105.4, 0.567, 86, 0.289, 0.439, 0.343, 0.669, 46.4, 22.0, 14.0, 7.1, 6.0, -4.2, 103.0, 107.4, 0.516, 1]
['Portland Trail Blazers', 'Detroit Pistons', '0021600559', 124, 0.421, 0.459, 0.362, 0.

['Portland Trail Blazers', 'Orlando Magic', '0021600598', 109, 0.439, 0.461, 0.363, 0.779, 42.8, 22.7, 13.9, 7.0, 5.0, -2.3, 109.0, 111.3, 0.561, 115, 0.4, 0.437, 0.334, 0.713, 44.9, 23.2, 13.5, 7.0, 5.7, -5.1, 103.4, 108.3, 0.518, 0]
['Utah Jazz', 'Detroit Pistons', '0021600600', 110, 0.6, 0.463, 0.368, 0.755, 46.4, 20.8, 15.2, 7.0, 5.6, 3.8, 107.6, 103.6, 0.564, 77, 0.439, 0.452, 0.34, 0.744, 45.9, 22.0, 12.6, 7.6, 4.1, -0.7, 105.1, 106.1, 0.527, 1]
['Atlanta Hawks', 'Boston Celtics', '0021600597', 101, 0.579, 0.455, 0.339, 0.715, 45.2, 24.4, 15.9, 8.9, 5.3, 0.1, 104.6, 104.4, 0.539, 103, 0.615, 0.453, 0.37, 0.809, 42.5, 25.8, 13.0, 7.8, 4.7, 2.4, 110.7, 108.4, 0.561, 0]
['Sacramento Kings', 'Cleveland Cavaliers', '0021600599', 108, 0.421, 0.45, 0.358, 0.778, 42.5, 22.7, 14.5, 8.3, 4.2, -2.6, 106.8, 109.8, 0.548, 120, 0.737, 0.453, 0.384, 0.763, 45.7, 22.1, 14.3, 7.6, 4.1, 5.9, 112.4, 106.6, 0.566, 0]
['Minnesota Timberwolves', 'Oklahoma City Thunder', '0021600596', 96, 0.333, 0.457,

['Detroit Pistons', 'Atlanta Hawks', '0021600633', 118, 0.442, 0.45, 0.341, 0.742, 45.8, 22.0, 12.5, 7.5, 4.1, -1.3, 104.7, 106.2, 0.525, 95, 0.585, 0.456, 0.345, 0.719, 44.9, 24.6, 15.8, 8.9, 5.2, 0.4, 105.3, 104.8, 0.542, 1]
['Sacramento Kings', 'Indiana Pacers', '0021600635', 100, 0.4, 0.452, 0.359, 0.783, 42.5, 22.7, 14.9, 8.4, 4.2, -2.9, 107.1, 110.3, 0.551, 106, 0.525, 0.461, 0.366, 0.819, 42.6, 22.7, 14.2, 8.5, 5.5, -1.0, 106.2, 107.1, 0.553, 0]
['Charlotte Hornets', 'Portland Trail Blazers', '0021600629', 107, 0.488, 0.44, 0.352, 0.789, 46.4, 23.9, 12.3, 6.7, 5.2, 1.0, 107.7, 107.0, 0.541, 85, 0.419, 0.458, 0.36, 0.78, 42.8, 22.5, 13.9, 7.0, 5.1, -2.7, 108.8, 111.5, 0.558, 1]
['Washington Wizards', 'Memphis Grizzlies', '0021600628', 104, 0.525, 0.471, 0.368, 0.779, 43.3, 23.5, 14.8, 8.8, 4.0, 0.2, 108.0, 107.9, 0.555, 101, 0.581, 0.427, 0.348, 0.78, 45.7, 21.8, 13.7, 9.1, 4.9, 0.5, 104.7, 104.2, 0.524, 1]
['New Orleans Pelicans', 'Orlando Magic', '0021600631', 118, 0.381, 0.438

['Detroit Pistons', 'Sacramento Kings', '0021600669', 104, 0.467, 0.452, 0.344, 0.738, 46.1, 22.2, 12.7, 7.5, 4.0, -0.7, 105.1, 106.0, 0.528, 109, 0.372, 0.452, 0.355, 0.78, 42.3, 22.8, 14.9, 8.4, 4.3, -3.3, 106.8, 110.5, 0.55, 0]
['Milwaukee Bucks', 'Houston Rockets', '0021600670', 127, 0.465, 0.471, 0.368, 0.761, 44.1, 25.6, 14.3, 8.3, 5.6, 0.4, 108.6, 108.3, 0.559, 114, 0.723, 0.467, 0.367, 0.761, 44.5, 25.7, 15.4, 8.2, 4.7, 7.1, 114.7, 107.7, 0.586, 1]
['Utah Jazz', 'Oklahoma City Thunder', '0021600673', 95, 0.644, 0.465, 0.367, 0.749, 46.6, 21.1, 15.0, 7.1, 5.4, 4.8, 108.4, 103.5, 0.565, 97, 0.568, 0.457, 0.325, 0.742, 45.9, 21.8, 15.3, 7.9, 5.2, 0.8, 107.8, 106.8, 0.545, 0]
['Atlanta Hawks', 'LA Clippers', '0021600667', 105, 0.591, 0.456, 0.344, 0.719, 44.5, 24.5, 15.8, 8.9, 5.1, 0.4, 105.1, 104.6, 0.541, 115, 0.644, 0.468, 0.372, 0.753, 44.8, 23.1, 13.4, 8.0, 4.8, 5.4, 110.6, 105.3, 0.566, 0]
['Indiana Pacers', 'New York Knicks', '0021600672', 103, 0.512, 0.462, 0.369, 0.815, 42

['New Orleans Pelicans', 'San Antonio Spurs', '0021600701', 119, 0.391, 0.44, 0.357, 0.749, 44.4, 22.8, 12.5, 7.6, 6.0, -2.9, 103.3, 106.3, 0.53, 103, 0.8, 0.481, 0.411, 0.817, 44.9, 25.5, 13.5, 8.3, 5.9, 9.4, 112.8, 103.5, 0.579, 1]
01/28/2017
['Golden State Warriors', 'LA Clippers', '0021600707', 144, 0.848, 0.499, 0.385, 0.788, 44.4, 30.4, 14.9, 9.1, 6.3, 12.4, 115.2, 103.1, 0.602, 98, 0.638, 0.468, 0.38, 0.75, 44.9, 23.0, 13.5, 7.8, 4.8, 5.2, 110.9, 105.8, 0.568, 1]


# Data

## 2016 - 17

In [None]:
df_2016 = pd.read_csv('./data/nba_df_2016.csv')
df_2016['Date'] = pd.to_datetime(df_2016['Date'])
df_2016['Season'] = '2017-18'

df_2016_2 = pd.read_csv('./data/nba_df_2016_v2.csv')
df_2016_2['Date'] = pd.to_datetime(df_2016_2['Date'])
df_2016_2['Season'] = '2017-18'

print(len(df_2017), len(df_2017_2))

In [None]:
frames = [df_2017, df_2017_2]
df_2017_final = pd.concat(frames)
len(df_2017_final)

## 2017 - 18

In [2]:
df_2017 = pd.read_csv('./data/nba_df_2017.csv')
df_2017['Date'] = pd.to_datetime(df_2017['Date'])
df_2017['Season'] = '2017-18'

df_2017_2 = pd.read_csv('./data/nba_df_2017_v2.csv')
df_2017_2['Date'] = pd.to_datetime(df_2017_2['Date'])
df_2017_2['Season'] = '2017-18'

df_2017_3 = pd.read_csv('./data/nba_df_2017_v3.csv')
df_2017_3['Date'] = pd.to_datetime(df_2017_3['Date'])
df_2017_3['Season'] = '2017-18'

print(len(df_2017), len(df_2017_2), len(df_2017_3))

171 546 493


In [3]:
frames = [df_2017, df_2017_2, df_2017_3]
df_2017_final = pd.concat(frames)
len(df_2017_final)

1210

## 2018 - 19

In [4]:
df_2018 = pd.read_csv('./data/nba_df_2018.csv')
df_2018['Date'] = pd.to_datetime(df_2018['Date'])
df_2018['Season'] = '2018-19'

print(len(df_2018))

1203


## 2019 - 20

In [5]:
df_2019 = pd.read_csv('./data/nba_df_2019.csv')
df_2019['Date'] = pd.to_datetime(df_2019['Date'])
df_2019['Season'] = '2019-20'

df_2019_2 = pd.read_csv('./data/nba_df_2019_2.csv')
df_2019_2['Date'] = pd.to_datetime(df_2019_2['Date'])
df_2019_2['Season'] = '2019-20'

print(len(df_2019), len(df_2019_2))

951 84


In [6]:
frames = [df_2019, df_2019_2]
df_2019_final = pd.concat(frames)
len(df_2019_final)

1035

## 2020 - 21

In [7]:
df_2021 = pd.read_csv('./data/nba_df_2020_v0.csv')
df_2021['Date'] = pd.to_datetime(df_2021['Date'])
df_2021['Season'] = '2020-21'

df_2021_2 = pd.read_csv('./data/nba_df_2020.csv')
df_2021_2['Date'] = pd.to_datetime(df_2021_2['Date'])
df_2021_2['Season'] = '2020-21'

df_2021_3 = pd.read_csv('./data/nba_df_2020_v2.csv')
df_2021_3['Date'] = pd.to_datetime(df_2021_3['Date'])
df_2021_3['Season'] = '2020-21'

print(len(df_2021), len(df_2021_2), len(df_2021_3))

5 582 469


In [8]:
frames = [df_2021, df_2021_2, df_2021_3]
df_2021_final = pd.concat(frames)
len(df_2021_final)

1056

## 2021 - 22

In [9]:
import pandas as pd

df_2022 = pd.read_csv('./data/nba_game_2022.csv')
df_2022['Date'] = pd.to_datetime(df_2022['Date'])
df_2022['Season'] = '2021-22'

df_2022_1 = pd.read_csv('./data/nba_game_2022_v1.csv')
df_2022_1['Date'] = pd.to_datetime(df_2022_1['Date'])
df_2022_1['Season'] = '2021-22'

df_2022_2 = pd.read_csv('./data/nba_game_2022_v2.csv')
df_2022_2['Date'] = pd.to_datetime(df_2022_2['Date'])
df_2022_2['Season'] = '2021-22'

df_2022_3 = pd.read_csv('./data/nba_game_2022_v3.csv')
df_2022_3['Date'] = pd.to_datetime(df_2022_3['Date'])
df_2022_3['Season'] = '2021-22'

print(len(df_2022), len(df_2022_1), len(df_2022_2), len(df_2022_3))

61 159 784 199


In [10]:
frames = [df_2022, df_2022_1, df_2022_2, df_2022_3]
df_2022_final = pd.concat(frames)

print(f"Length of 2022 data: {len(df_2022_final)}\n")

Length of 2022 data: 1203



## Merge DataFrames

In [11]:
frames = [df_2017_final, df_2018, df_2019_final, df_2021_final, df_2022_final]
df = pd.concat(frames)
df = df.reset_index(drop=True)

df

Unnamed: 0,Home,Away,Game_ID,H_Score,H_W_PCT,H_FG_PCT,H_FG3_PCT,H_FT_PCT,H_REB,H_AST,...,A_TOV,A_STL,A_BLK,A_PLUS_MINUS,A_OFF_RATING,A_DEF_RATING,A_TS_PCT,Result,Date,Season
0,Philadelphia 76ers,Boston Celtics,21700019,92,0.000,0.462,0.429,0.737,45.3,23.6,...,13.5,11.5,3.0,-5.5,99.5,106.1,0.499,0,2017-10-20,2017-18
1,Phoenix Suns,Los Angeles Lakers,21700026,130,0.000,0.315,0.259,0.722,33.0,10.0,...,19.0,7.6,6.7,-15.2,87.6,101.9,0.455,0,2017-10-20,2017-18
2,Brooklyn Nets,Orlando Magic,21700022,126,0.000,0.479,0.400,0.906,37.4,19.1,...,14.4,7.7,8.7,6.7,111.5,103.8,0.564,1,2017-10-20,2017-18
3,Milwaukee Bucks,Cleveland Cavaliers,21700021,97,1.000,0.500,0.333,0.833,45.0,19.0,...,17.3,3.1,4.1,3.1,104.1,100.0,0.543,0,2017-10-20,2017-18
4,Indiana Pacers,Portland Trail Blazers,21700018,96,1.000,0.520,0.265,0.781,40.2,24.8,...,17.8,7.9,6.9,47.5,122.8,76.0,0.606,0,2017-10-20,2017-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5702,Orlando Magic,Miami Heat,22101227,125,0.259,0.433,0.329,0.786,44.1,23.6,...,15.0,7.6,3.3,4.8,113.0,108.2,0.584,1,2022-04-10,2021-22
5703,Denver Nuggets,Los Angeles Lakers,22101220,141,0.593,0.482,0.353,0.796,44.6,28.0,...,14.3,7.5,5.1,-3.1,109.7,112.7,0.566,0,2022-04-10,2021-22
5704,Memphis Grizzlies,Boston Celtics,22101223,110,0.691,0.462,0.353,0.735,48.7,25.7,...,13.8,7.4,6.0,7.1,113.3,106.2,0.577,0,2022-04-10,2021-22
5705,Philadelphia 76ers,Detroit Pistons,22101228,118,0.617,0.465,0.366,0.820,43.5,24.4,...,14.2,7.8,4.8,-7.7,105.5,113.2,0.533,1,2022-04-10,2021-22


In [12]:
df.isnull().sum()

Home            0
Away            0
Game_ID         0
H_Score         0
H_W_PCT         0
H_FG_PCT        0
H_FG3_PCT       0
H_FT_PCT        0
H_REB           0
H_AST           0
H_TOV           0
H_STL           0
H_BLK           0
H_PLUS_MINUS    0
H_OFF_RATING    0
H_DEF_RATING    0
H_TS_PCT        0
A_Score         0
A_W_PCT         0
A_FG_PCT        0
A_FG3_PCT       0
A_FT_PCT        0
A_REB           0
A_AST           0
A_TOV           0
A_STL           0
A_BLK           0
A_PLUS_MINUS    0
A_OFF_RATING    0
A_DEF_RATING    0
A_TS_PCT        0
Result          0
Date            0
Season          0
dtype: int64

In [13]:
duplicates = df[df.duplicated()]
duplicates

Unnamed: 0,Home,Away,Game_ID,H_Score,H_W_PCT,H_FG_PCT,H_FG3_PCT,H_FT_PCT,H_REB,H_AST,...,A_TOV,A_STL,A_BLK,A_PLUS_MINUS,A_OFF_RATING,A_DEF_RATING,A_TS_PCT,Result,Date,Season


# Last N Games Win %

In [14]:
import warnings
warnings.filterwarnings("ignore")
prev_game_df = df[df['Date'] < '12/25/2022'][(df['Home'] == "Golden State Warriors") | (df['Away'] == 'Golden State Warriors')].sort_values(by = 'Date').tail(10)
prev_game_df
h_df = prev_game_df.iloc[:, range(0, 32, 31)]

h_df = h_df.loc[h_df['Home'] == 'Golden State Warriors'] 
print(h_df)

                       Home  Result
5621  Golden State Warriors       0
5641  Golden State Warriors       1
5677  Golden State Warriors       1


In [15]:
def get_avg_win_pct_last_n_games(team, game_date, df, n):
    prev_game_df = df[df['Date'] < game_date][(df['Home'] == team) | (df['Away'] == team)].sort_values(by = 'Date').tail(n)
    
    wins = 0 
    
    result_df = prev_game_df.iloc[:, range(0,32,31)]
    h_df = result_df.loc[result_df['Home'] == team] 
    
    h_wins = h_df.loc[h_df['Result'] == 1]
    
    wins += len(h_wins)
      
    a_df = result_df.loc[result_df['Home'] != team]
    a_wins = a_df.loc[a_df['Result'] == 0]
    
    wins += len(a_wins)

    return wins/n
get_avg_win_pct_last_n_games('Golden State Warriors', '12/25/2022', df, 10)

0.6

In [16]:
for season in df['Season'].unique() :
    
    season_stats = df[df['Season'] == season].sort_values(by='Date').reset_index(drop=True)
    
    for index, row in df.iterrows() : 
        game_id = row['Game_ID']
        game_date = row['Date']
        h_team = row['Home']
        a_team = row['Away']
        
        df.loc[index,'Home_W_Pct_10'] = get_avg_win_pct_last_n_games(h_team, game_date, df, 10)
        
        df.loc[index,'Away_W_Pct_10'] = get_avg_win_pct_last_n_games(a_team, game_date, df, 10)
        

In [17]:
df[df['Season'] == '2021-22'].tail()


Unnamed: 0,Home,Away,Game_ID,H_Score,H_W_PCT,H_FG_PCT,H_FG3_PCT,H_FT_PCT,H_REB,H_AST,...,A_BLK,A_PLUS_MINUS,A_OFF_RATING,A_DEF_RATING,A_TS_PCT,Result,Date,Season,Home_W_Pct_10,Away_W_Pct_10
5702,Orlando Magic,Miami Heat,22101227,125,0.259,0.433,0.329,0.786,44.1,23.6,...,3.3,4.8,113.0,108.2,0.584,1,2022-04-10,2021-22,0.3,0.6
5703,Denver Nuggets,Los Angeles Lakers,22101220,141,0.593,0.482,0.353,0.796,44.6,28.0,...,5.1,-3.1,109.7,112.7,0.566,0,2022-04-10,2021-22,0.6,0.2
5704,Memphis Grizzlies,Boston Celtics,22101223,110,0.691,0.462,0.353,0.735,48.7,25.7,...,6.0,7.1,113.3,106.2,0.577,0,2022-04-10,2021-22,0.8,0.7
5705,Philadelphia 76ers,Detroit Pistons,22101228,118,0.617,0.465,0.366,0.82,43.5,24.4,...,4.8,-7.7,105.5,113.2,0.533,1,2022-04-10,2021-22,0.6,0.4
5706,Minnesota Timberwolves,Chicago Bulls,22101224,120,0.568,0.457,0.358,0.778,43.5,25.2,...,4.2,-0.4,112.6,113.1,0.578,0,2022-04-10,2021-22,0.5,0.3


In [None]:
df

# ELO Rating
- every team starts with a 1500
$$R_{i+1} = k * (S_{team} - E_{team} + R_{i})$$
- S team is 1 if the team wins and 0 if they lose
- E team is the expected win probability of the team 
$$E_{team} = \frac{1}{1+10^{\frac{opp\_elo - team\_elo}{400}}}$$
- k is a moving constant that depends on margin of victory and difference in Elo ratings
$$k = 20\frac{(MOV_{winner} + 3)^{0.8}}{7.5 + 0.006(elo\_difference_{winner})} $$
- team year by year carryover 
$$(R * 0.75) + (0.25 * 1505)$$

In [18]:
df.head()

Unnamed: 0,Home,Away,Game_ID,H_Score,H_W_PCT,H_FG_PCT,H_FG3_PCT,H_FT_PCT,H_REB,H_AST,...,A_BLK,A_PLUS_MINUS,A_OFF_RATING,A_DEF_RATING,A_TS_PCT,Result,Date,Season,Home_W_Pct_10,Away_W_Pct_10
0,Philadelphia 76ers,Boston Celtics,21700019,92,0.0,0.462,0.429,0.737,45.3,23.6,...,3.0,-5.5,99.5,106.1,0.499,0,2017-10-20,2017-18,0.0,0.0
1,Phoenix Suns,Los Angeles Lakers,21700026,130,0.0,0.315,0.259,0.722,33.0,10.0,...,6.7,-15.2,87.6,101.9,0.455,0,2017-10-20,2017-18,0.0,0.0
2,Brooklyn Nets,Orlando Magic,21700022,126,0.0,0.479,0.4,0.906,37.4,19.1,...,8.7,6.7,111.5,103.8,0.564,1,2017-10-20,2017-18,0.0,0.0
3,Milwaukee Bucks,Cleveland Cavaliers,21700021,97,1.0,0.5,0.333,0.833,45.0,19.0,...,4.1,3.1,104.1,100.0,0.543,0,2017-10-20,2017-18,0.0,0.0
4,Indiana Pacers,Portland Trail Blazers,21700018,96,1.0,0.52,0.265,0.781,40.2,24.8,...,6.9,47.5,122.8,76.0,0.606,0,2017-10-20,2017-18,0.0,0.0


In [19]:
# Home and road team win probabilities implied by Elo ratings and home court adjustment 
import math
import time
def win_probs(home_elo, away_elo, home_court_advantage) :
    h = math.pow(10, home_elo/400)
    r = math.pow(10, away_elo/400)
    a = math.pow(10, home_court_advantage/400) 

    denom = r + a*h
    home_prob = a*h / denom
    away_prob = r / denom 
  
    return home_prob, away_prob

  #odds the home team will win based on elo ratings and home court advantage

def home_odds_on(home_elo, away_elo, home_court_advantage) :
    h = math.pow(10, home_elo/400)
    r = math.pow(10, away_elo/400)
    a = math.pow(10, home_court_advantage/400)
    return a*h/r

#this function determines the constant used in the elo rating, based on margin of victory and difference in elo ratings
def elo_k(MOV, elo_diff):
    k = 20 # Optimal K is 20 https://fivethirtyeight.com/features/how-we-calculate-nba-elo-ratings/
    if MOV>0:
        multiplier=(MOV+3)**(0.8)/(7.5+0.006*(elo_diff))
    else:
        multiplier=(-MOV+3)**(0.8)/(7.5+0.006*(-elo_diff))
    return k*multiplier


# Updates the home and away teams elo ratings after a game 

def update_elo(home_score, away_score, home_elo, away_elo, home_court_advantage) :
    home_prob, away_prob = win_probs(home_elo, away_elo, home_court_advantage) 

    if (home_score - away_score > 0) :
        home_win = 1 
        away_win = 0 
    else :
        home_win = 0 
        away_win = 1 
  
    k = elo_k(home_score - away_score, home_elo - away_elo)

    updated_home_elo = home_elo + k * (home_win - home_prob) 
    updated_away_elo = away_elo + k * (away_win - away_prob)
    
    return updated_home_elo, updated_away_elo


# Takes into account prev season elo
# The reason we revert to a mean of 1505 rather than 1500 is that 
# there are liable to be a couple of relatively recent expansion teams in the league at any given time
def get_prev_elo(team, date, season, team_stats, elo_df) :
    prev_game = team_stats[team_stats['Date'] < game_date][(team_stats['Home'] == team) | (team_stats['Away'] == team)].sort_values(by = 'Date').tail(1).iloc[0] 

    if team == prev_game['Home'] :
        elo_rating = elo_df[elo_df['Game_ID'] == prev_game['Game_ID']]['H_Team_Elo_After'].values[0]
    else :
        elo_rating = elo_df[elo_df['Game_ID'] == prev_game['Game_ID']]['A_Team_Elo_After'].values[0]
  
    if prev_game['Season'] != season :
        return (0.75 * elo_rating) + (0.25 * 1505) # Year-to-Year Carry-Over
    else :
        return elo_rating

In [20]:
df.sort_values(by = 'Date', inplace = True)
df.reset_index(inplace=True, drop = True)
elo_df = pd.DataFrame(columns=['Game_ID', 'H_Team', 'A_Team', 'H_Team_Elo_Before', 'A_Team_Elo_Before', 'H_Team_Elo_After', 'A_Team_Elo_After'])
teams_elo_df = pd.DataFrame(columns=['Game_ID','Team', 'Elo', 'Date', 'Where_Played', 'Season']) 

for index, row in df.iterrows(): 
    game_id = row['Game_ID']
    game_date = row['Date']
    season = row['Season']
    h_team, a_team = row['Home'], row['Away']
    h_score, a_score = row['H_Score'], row['A_Score'] 

    if (h_team not in elo_df['H_Team'].values and h_team not in elo_df['A_Team'].values) :
        h_team_elo_before = 1500
    else :
        h_team_elo_before = get_prev_elo(h_team, game_date, season, df, elo_df)

    if (a_team not in elo_df['H_Team'].values and a_team not in elo_df['A_Team'].values) :
        a_team_elo_before = 1500
    else :
        a_team_elo_before = get_prev_elo(a_team, game_date, season, df, elo_df)

    h_team_elo_after, a_team_elo_after = update_elo(h_score, a_score, h_team_elo_before, a_team_elo_before, 69)

    new_row = {'Game_ID': game_id, 'H_Team': h_team, 'A_Team': a_team, 'H_Team_Elo_Before': h_team_elo_before, 'A_Team_Elo_Before': a_team_elo_before, \
                                                                        'H_Team_Elo_After' : h_team_elo_after, 'A_Team_Elo_After': a_team_elo_after}
    teams_row_one = {'Game_ID': game_id,'Team': h_team, 'Elo': h_team_elo_before, 'Date': game_date, 'Where_Played': 'Home', 'Season': season}
    teams_row_two = {'Game_ID': game_id,'Team': a_team, 'Elo': a_team_elo_before, 'Date': game_date, 'Where_Played': 'Away', 'Season': season}
  
    elo_df = elo_df.append(new_row, ignore_index = True)
    teams_elo_df = teams_elo_df.append(teams_row_one, ignore_index=True)
    teams_elo_df = teams_elo_df.append(teams_row_two, ignore_index=True)

In [21]:
#teams_elo_df.set_index(["Team"], append=True)
#dataset = teams_elo_df.pivot(index="Team",values="Elo", columns="Date")
dates = list(set([d.strftime("%m-%d-%Y") for d in teams_elo_df["Date"]]))
dates = sorted(dates, key=lambda x: time.strptime(x, '%m-%d-%Y'))
teams = df["Away"]
dataset = pd.DataFrame(columns=dates)
dataset["Team"] = teams.drop_duplicates()
dataset = dataset.set_index("Team")

for index, row in teams_elo_df.iterrows():
    date = row["Date"].strftime("%m-%d-%Y")
    team = row["Team"]
    elo = row["Elo"]
    dataset[date][team] = elo

teams_elo_df['Elo'] = teams_elo_df['Elo'].astype(float)

elo_df

Unnamed: 0,Game_ID,H_Team,A_Team,H_Team_Elo_Before,A_Team_Elo_Before,H_Team_Elo_After,A_Team_Elo_After
0,21700019,Philadelphia 76ers,Boston Celtics,1500,1500,1487.588205,1512.411795
1,21700026,Phoenix Suns,Los Angeles Lakers,1500,1500,1494.220957,1505.779043
2,21700022,Brooklyn Nets,Orlando Magic,1500,1500,1505.657856,1494.342144
3,21700021,Milwaukee Bucks,Cleveland Cavaliers,1500,1500,1481.093199,1518.906801
4,21700018,Indiana Pacers,Portland Trail Blazers,1500,1500,1481.783902,1518.216098
...,...,...,...,...,...,...,...
5702,22101225,New Orleans Pelicans,Golden State Warriors,1534.445418,1574.302989,1516.647138,1592.101269
5703,22101219,Dallas Mavericks,San Antonio Spurs,1585.484564,1490.992295,1590.900483,1485.576375
5704,22101222,LA Clippers,Oklahoma City Thunder,1526.825496,1338.090516,1537.08683,1327.829182
5705,22101217,Charlotte Hornets,Washington Wizards,1539.195491,1428.163212,1545.957475,1421.401228


In [None]:
df.head()

In [22]:
df = df.merge(elo_df.drop(columns=['H_Team', 'A_Team']), on ='Game_ID')
df.head()

Unnamed: 0,Home,Away,Game_ID,H_Score,H_W_PCT,H_FG_PCT,H_FG3_PCT,H_FT_PCT,H_REB,H_AST,...,A_TS_PCT,Result,Date,Season,Home_W_Pct_10,Away_W_Pct_10,H_Team_Elo_Before,A_Team_Elo_Before,H_Team_Elo_After,A_Team_Elo_After
0,Philadelphia 76ers,Boston Celtics,21700019,92,0.0,0.462,0.429,0.737,45.3,23.6,...,0.499,0,2017-10-20,2017-18,0.0,0.0,1500,1500,1487.588205,1512.411795
1,Phoenix Suns,Los Angeles Lakers,21700026,130,0.0,0.315,0.259,0.722,33.0,10.0,...,0.455,0,2017-10-20,2017-18,0.0,0.0,1500,1500,1494.220957,1505.779043
2,Brooklyn Nets,Orlando Magic,21700022,126,0.0,0.479,0.4,0.906,37.4,19.1,...,0.564,1,2017-10-20,2017-18,0.0,0.0,1500,1500,1505.657856,1494.342144
3,Milwaukee Bucks,Cleveland Cavaliers,21700021,97,1.0,0.5,0.333,0.833,45.0,19.0,...,0.543,0,2017-10-20,2017-18,0.0,0.0,1500,1500,1481.093199,1518.906801
4,Indiana Pacers,Portland Trail Blazers,21700018,96,1.0,0.52,0.265,0.781,40.2,24.8,...,0.606,0,2017-10-20,2017-18,0.0,0.0,1500,1500,1481.783902,1518.216098


# Merging Dataset

# Standardization and Z Score

In [None]:
df.head()

# Evaluate Different Models - No Z Score

In [23]:
df.to_csv(r'C:/Users/alvaro/OneDrive/Documents/School/Flatiron/Projects/NBA_Prediction_Model/data/nba_raw.csv', 
          index=False)
print(f'The final dataset consists of three seasons and {len(df)} games.')
df = df.reset_index(drop=True)
df.tail()


The final dataset consists of three seasons and 5707 games.


Unnamed: 0,Home,Away,Game_ID,H_Score,H_W_PCT,H_FG_PCT,H_FG3_PCT,H_FT_PCT,H_REB,H_AST,...,A_TS_PCT,Result,Date,Season,Home_W_Pct_10,Away_W_Pct_10,H_Team_Elo_Before,A_Team_Elo_Before,H_Team_Elo_After,A_Team_Elo_After
5702,New Orleans Pelicans,Golden State Warriors,22101225,107,0.444,0.457,0.332,0.789,46.0,25.4,...,0.58,0,2022-04-10,2021-22,0.6,0.5,1534.445418,1574.302989,1516.647138,1592.101269
5703,Dallas Mavericks,San Antonio Spurs,22101219,130,0.63,0.46,0.348,0.771,44.8,24.2,...,0.556,1,2022-04-10,2021-22,0.8,0.7,1585.484564,1490.992295,1590.900483,1485.576375
5704,LA Clippers,Oklahoma City Thunder,22101222,138,0.506,0.457,0.372,0.792,44.3,24.1,...,0.531,1,2022-04-10,2021-22,0.5,0.4,1526.825496,1338.090516,1537.08683,1327.829182
5705,Charlotte Hornets,Washington Wizards,22101217,124,0.519,0.467,0.363,0.741,43.9,27.6,...,0.569,1,2022-04-10,2021-22,0.6,0.5,1539.195491,1428.163212,1545.957475,1421.401228
5706,Minnesota Timberwolves,Chicago Bulls,22101224,120,0.568,0.457,0.358,0.778,43.5,25.2,...,0.578,0,2022-04-10,2021-22,0.5,0.3,1574.46694,1450.007328,1563.891946,1460.582322


In [24]:
df = pd.read_csv('./data/nba_raw.csv')

In [25]:
print(df.corr()['Result'].abs().sort_values(ascending=False))

Result               1.000000
H_Score              0.461871
A_Score              0.461385
H_Team_Elo_After     0.363042
A_Team_Elo_After     0.308683
H_Team_Elo_Before    0.266572
H_PLUS_MINUS         0.234252
Home_W_Pct_10        0.215551
H_W_PCT              0.214225
A_Team_Elo_Before    0.209484
A_W_PCT              0.169454
A_PLUS_MINUS         0.169382
H_OFF_RATING         0.157338
Away_W_Pct_10        0.157198
H_TS_PCT             0.154047
H_DEF_RATING         0.152833
H_FG_PCT             0.140374
A_OFF_RATING         0.136559
A_TS_PCT             0.129107
H_REB                0.114393
A_FG_PCT             0.101940
H_FG3_PCT            0.090817
A_DEF_RATING         0.089300
H_AST                0.080855
A_REB                0.076496
A_AST                0.073782
H_BLK                0.072406
A_BLK                0.066711
H_FT_PCT             0.064787
A_FG3_PCT            0.061637
A_TOV                0.054370
A_FT_PCT             0.051991
H_STL                0.041594
H_TOV     

In [None]:
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(44, 34))
    
    mask = np.triu(np.ones_like(df.corr(), dtype=bool))
    ax = sns.heatmap(abs(df.corr()),mask=mask,annot=True)
    fig.savefig('images/Corelation_Heatmap');

In [26]:
# Cast "H_Team_Elo_Before" and "A_Team_Elo_Before" as floats
df["H_Team_Elo_Before"] = df.H_Team_Elo_Before.astype(float)
df["A_Team_Elo_Before"] = df.A_Team_Elo_Before.astype(float)

In [27]:
# Remove columns that have data on post-game stats
df = df.drop(['H_Team_Elo_After', 'A_Team_Elo_After', 'H_Score', 'A_Score'], axis=1)

In [28]:
# Remove irrelevant columns
df = df.drop(['Home', 'Away', 'Game_ID', 'Date', 'Season'], axis=1)

In [29]:
df.columns

Index(['H_W_PCT', 'H_FG_PCT', 'H_FG3_PCT', 'H_FT_PCT', 'H_REB', 'H_AST',
       'H_TOV', 'H_STL', 'H_BLK', 'H_PLUS_MINUS', 'H_OFF_RATING',
       'H_DEF_RATING', 'H_TS_PCT', 'A_W_PCT', 'A_FG_PCT', 'A_FG3_PCT',
       'A_FT_PCT', 'A_REB', 'A_AST', 'A_TOV', 'A_STL', 'A_BLK', 'A_PLUS_MINUS',
       'A_OFF_RATING', 'A_DEF_RATING', 'A_TS_PCT', 'Result', 'Home_W_Pct_10',
       'Away_W_Pct_10', 'H_Team_Elo_Before', 'A_Team_Elo_Before'],
      dtype='object')

In [30]:
df.to_csv(r'C:\Users\alvaro\OneDrive\Documents\School\Flatiron\Projects\NBA_Prediction_Model\data\nba.csv', 
          index=False)

In [7]:
df = pd.read_csv('./data/nba.csv')

In [None]:
df

# EDA

In [None]:
plt.figure(figsize=(44, 34))
correlation = df[['H_W_PCT', 
                  'H_REB', 
                  'H_AST',
                  'H_TOV', 
                  'H_STL', 
                  'H_BLK', 
                  'H_PLUS_MINUS', 
                  'H_OFF_RATING',
                  'H_DEF_RATING', 
                  'H_TS_PCT', 
                  'H_Team_Elo_Before', 
                  'Home_W_Pct_10', 
                  'Result'
                  ]]

sns.heatmap(correlation.corr(), annot=True);
# correlation
# sns.heatmap(df.corr(), annot=True);
# sns.heatmap(df['Result'].corr(), annot=True);

In [None]:
print(df.corr()['Result'].abs().sort_values(ascending=False))

In [None]:
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(44, 34))
    
    mask = np.triu(np.ones_like(df.corr(), dtype=bool))
    ax = sns.heatmap(abs(df.corr()),mask=mask,annot=True)
    fig.savefig('images/Corelation_Heatmap_2');

In [None]:
df

In [13]:
X = df.drop(columns = 'Result')

y = df['Result']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

print(f'X train shape: {X_train.shape}')
print(f'X test shape: {X_test.shape}')

X train shape: (3823, 30)
X test shape: (1884, 30)


In [None]:
print(f"""Raw Counts 
{df["Result"].value_counts()}\n
Percentages 
{df["Result"].value_counts(normalize=True)}


We would get an accuracy score of {np.round(df["Result"].value_counts(normalize=True)[1], 4)} with a baseline model, i.e. about 56.6% accuracy

This is because about 55.79% of the results are wins""")

In [None]:
print(f"""Train percent wins
{y_train.value_counts(normalize=True)}\n""")

print(f"""Test percent wins: 
{y_test.value_counts(normalize=True)}\n""")

In [None]:
class ModelWithCV():
    '''Structure to save the model and more easily see its crossvalidation'''
    
    def __init__(self, model, model_name, X, y, cv_now=True):
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()
        
    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation and return results.
        
        Args: 
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)  
        '''
        
        cv_X = X if X else self.X
        cv_y = y if y else self.y

        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)

        
    def print_cv_summary(self):
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)

        
    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given 
        Axis for plotting.
        '''
        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )

        return ax

## Dummy

In [None]:
# Create Dummy/Baseliner



estimator = DummyRegressor(strategy='mean')

# Create Dummy/Baseliner
from sklearn.dummy import DummyRegressor

pipe = Pipeline(steps=[
    ('estimator', DummyRegressor(strategy='mean'))
])

cv = ModelWithCV(pipe, 'estimator', X_train, y_train)

In [None]:
f, ax = plt.subplots()

cv.plot_cv(ax);

In [None]:
cv.print_cv_summary()

## Logistic Regression

In [69]:
pipe = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=2)),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=4)),
    ('estimator', LogisticRegression(random_state=42))
])

In [70]:
param_grid = {}
param_grid['estimator__C'] = [100, 10, 1.0, 0.1, 0.01]
param_grid['estimator__solver'] = ['newton-cg', 'lbfgs', 'liblinear']
param_grid['estimator__penalty'] = ['l2']
# param_grid['estimator__class_weight'] = ['balanced', None]
# param_grid['estimator__n_jobs'] = [-1]
# param_grid['estimator__l1_ratio'] = [0, 1]

In [71]:
grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy', 
                           n_jobs=-1,
                           verbose=1)

In [72]:
grid_search.fit(X_train, y_train)

Fitting 30 folds for each of 30 candidates, totalling 900 fits


KeyboardInterrupt: 

In [None]:
# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

print(classification_report(y_test, y_pred))

## Ridge

In [None]:
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('estimator', RidgeClassifier())
])


param_grid = {}
param_grid['estimator__alpha'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
# param_grid['estimator__learning_rate'] = [0.001, 0.01, 0.1]
# param_grid['estimator__subsample'] = [0.5, 0.7, 1.0]
# param_grid['estimator__max_depth'] = [3, 7, 9]


grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy', 
                           random_state=42,
                           n_jobs=-1,
                           verbose=2)


grid_search.fit(X_train, y_train)


# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

print(classification_report(y_test, y_pred))

## Random Forest - Random Search CV

In [38]:
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('estimator', RandomForestClassifier(random_state=42))
])

In [39]:
param_grid = {}
param_grid['estimator__n_estimators'] = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
param_grid['estimator__max_features'] = ['auto', 'sqrt', 'log2']
param_grid['estimator__max_depth'] = [int(x) for x in np.linspace(10, 110, num=11)]
param_grid['estimator__min_samples_split'] = [2, 5, 10]
param_grid['estimator__min_samples_leaf'] = [1, 2, 4]
#param_grid['estimator__bootstrap'] = [True, False]


In [40]:
grid_search = RandomizedSearchCV(estimator=pipe, 
                                 param_distributions=param_grid, 
                                 cv=10, 
                                 return_train_score=True, 
                                 scoring='accuracy', 
                                 n_iter=100, 
                                 random_state=42, 
                                 n_jobs=-1, 
                                 verbose=2)


In [41]:
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


KeyboardInterrupt: 

In [None]:
# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

print(classification_report(y_test, y_pred))

## Random Forest - Grid Search CV

In [None]:
pipe = Pipeline(steps=[
    ('estimator', RandomForestClassifier(random_state=42))
])

In [None]:
param_grid = {}
param_grid['estimator__n_estimators'] = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
param_grid['estimator__max_features'] = ['auto', 'sqrt']
param_grid['estimator__max_depth'] = [int(x) for x in np.linspace(10, 110, num=11)]
param_grid['estimator__min_samples_split'] = [2, 5, 10]
param_grid['estimator__min_samples_leaf'] = [1, 2, 4]
# param_grid['estimator__bootstrap'] = [True, False]


In [None]:
grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy',
                           n_jobs=-1,
                           verbose=1)


In [None]:
grid_search.fit(X_train, y_train)

In [None]:
# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

print(classification_report(y_test, y_pred))

In [None]:
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('estimator', RandomForestClassifier(random_state=42))
])


param_grid = {}
param_grid['estimator__n_estimators'] = [10, 100, 1000]
param_grid['estimator__max_features'] = ['sqrt', 'log2']
# param_grid['estimator__max_depth'] = [int(x) for x in np.linspace(10, 110, num=11)]
# param_grid['estimator__min_samples_split'] = [2, 5, 10]
# param_grid['estimator__min_samples_leaf'] = [1, 2, 4]
# param_grid['estimator__bootstrap'] = [True, False]

grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy',
                           n_jobs=-1,
                           verbose=2)


grid_search.fit(X_train, y_train)


# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

print(classification_report(y_test, y_pred))

In [None]:
pipe = ImPipeline(steps=[
    ('sm', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('estimator', RandomForestClassifier(random_state=42))
])


param_grid = {}
param_grid['estimator__n_estimators'] = [10, 100, 1000]
param_grid['estimator__max_features'] = ['sqrt', 'log2']
param_grid['estimator__k_neighbors'] = [3, 5, 7]

# param_grid['estimator__max_depth'] = [int(x) for x in np.linspace(10, 110, num=11)]
# param_grid['estimator__min_samples_split'] = [2, 5, 10]
# param_grid['estimator__min_samples_leaf'] = [1, 2, 4]

grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy',
                           n_jobs=-1,
                           verbose=2)


grid_search.fit(X_train, y_train)


# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

print(classification_report(y_test, y_pred))

## Gaussian Naive Bayes

In [None]:
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('estimator', GaussianNB())
])

In [None]:
param_grid = {}
param_grid['estimator__var_smoothing'] = np.logspace(0,-11, num=100)
#param_grid['estimator__var_smoothing'] = [1e-11, 1e-10, 1e-9]


In [None]:
grid_search = RandomizedSearchCV(estimator=pipe, 
                                 param_distributions=param_grid, 
                                 cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                                 return_train_score=True, 
                                 scoring='accuracy', 
                                 random_state=42,
                                 n_jobs = -1,
                                 verbose=2)


In [None]:
grid_search.fit(X_train, y_train)

In [None]:
# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

print(classification_report(y_test, y_pred))

In [None]:
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('estimator', GaussianNB())
])

In [None]:
param_grid = {}
param_grid['estimator__var_smoothing'] = np.logspace(0,-11, num=100)


In [None]:
grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy', 
                           random_state=42,
                           n_jobs=-1,
                           verbose=2)


In [None]:
grid_search.fit(X_train, y_train)

In [None]:
# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

print(classification_report(y_test, y_pred))

## K Nearest Neighbors

In [None]:
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('estimator', KNeighborsClassifier())
])

In [None]:
param_grid = {}
param_grid['estimator__n_neighbors'] = range(1, 21, 2)
param_grid['estimator__p'] = [1, 2]
param_grid['estimator__weights'] = ['uniform', 'distance']
param_grid['estimator__metric'] = ['euclidean', 'manhattan', 'minkowski']
# param_grid['estimator__leaf_size'] = (20, 40, 1)



In [None]:
grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy', 
                           random_state=42,
                           n_jobs = -1,
                           verbose=2)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

print(classification_report(y_test, y_pred))

## Gradient Boosting

In [None]:
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('estimator', GradientBoostingClassifier(random_state=42))
])


param_grid = {}
param_grid['estimator__n_estimators'] = [10, 100, 1000]
param_grid['estimator__learning_rate'] = [0.001, 0.01, 0.1]
param_grid['estimator__subsample'] = [0.5, 0.7, 1.0]
param_grid['estimator__max_depth'] = [3, 7, 9]
# param_grid['estimator__min_samples_leaf'] = [1, 2, 4]
# param_grid['estimator__bootstrap'] = [True, False]

grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy', 
                           random_state=42,
                           n_jobs=-1,
                           verbose=2)


grid_search.fit(X_train, y_train)


# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

print(classification_report(y_test, y_pred))

## Support Vector Machine

In [None]:
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('estimator', SVC(random_state=42))
])

kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']


param_grid = {}
param_grid['estimator__kernel'] = ['poly', 'rbf', 'sigmoid']
param_grid['estimator__C'] = [50, 10, 1.0, 0.1, 0.01]
param_grid['estimator__gamma'] = ['scale']

grid_search = RandomizedSearchCV(estimator=pipe, 
                                 param_distributions=param_grid, 
                                 cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                                 return_train_score=True, 
                                 scoring='accuracy', 
                                 n_iter=100, 
                                 random_state=42, 
                                 n_jobs=-1, 
                                 verbose=2)

grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42), 
                           return_train_score=True, 
                           scoring='accuracy', 
                           random_state=42,
                           n_jobs = -1,
                           verbose=2)


grid_search.fit(X_train, y_train)


# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

print(classification_report(y_test, y_pred))

## XGB

In [None]:
pipe = Pipeline(steps=[
    ('estimator', xgboost.XGBRegressor(random_state=42, objective='reg:squarederror'))
])



In [None]:
param_grid = {}
param_grid['estimator__min_child_weight'] = [1, 5, 10],
param_grid['estimator__gamma'] = [0.5, 1, 1.5, 2, 5],
param_grid['estimator__subsample'] = [0.6, 0.8, 1.0],
param_grid['estimator__colsample_bytree'] = [0.6, 0.8, 1.0],
param_grid['estimator__max_depth'] = [3, 4, 5]

In [None]:
grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=10, 
                           return_train_score=True, 
                           scoring='accuracy',
                           n_jobs = -1,
                           verbose=1)


In [None]:
grid_search.fit(X_train, y_train)

In [None]:
# Mean training score
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Mean test score
grid_test_score = grid_search.score(X_test, y_test)

best_grid = grid_search.best_estimator_
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Optimal Parameters: {grid_search.best_params_}\n")
print(f"Testing Accuracy: {grid_search.best_score_:.2%}\n")

print(classification_report(y_test, y_pred))

In [None]:
# Plot a confusion matrix on the test data
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(final_model, X_test, y_test);

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd


#script to test the effectivenes of each model, uses default parameters
#test six different classification models 
def run_exps(X_train, y_train, X_test, y_test) :
    '''
    Lightweight script to test many models and find winners
    :param X_train: training split
    :param y_train: training target vector
    :param X_test: test split
    :param y_test: test target vector
    :return: DataFrame of predictions
    '''
    
    dfs = []
    
    models = [
          ('LogReg', LogisticRegression()), 
          ('RF', RandomForestClassifier()),
          ('KNN', KNeighborsClassifier()),
          ('SVM', SVC()), 
          ('GNB', GaussianNB()),
          ('XGB', XGBClassifier())
        ]
    
    results = []
    
    names = []
    
    scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']
    
    target_names = ['win', 'loss']
    
    for name, model in models:
        
        kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=90210)
        cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        
        print(name)
        print(classification_report(y_test, y_pred, target_names=target_names))
        
        results.append(cv_results)
        names.append(name)
        
        this_df = pd.DataFrame(cv_results)
        this_df['model'] = name
        dfs.append(this_df)
        
    final = pd.concat(dfs, ignore_index=True)
    
    return final
final = run_exps(X_train, y_train, X_test, y_test)
final

# Model Performance

In [None]:
bootstraps = []
for model in list(set(final.model.values)):
    model_df = final.loc[final.model == model]
    bootstrap = model_df.sample(n=30, replace=True)
    bootstraps.append(bootstrap)
        
bootstrap_df = pd.concat(bootstraps, ignore_index=True)
results_long = pd.melt(bootstrap_df,id_vars=['model'],var_name='metrics', value_name='values')
time_metrics = ['fit_time','score_time'] # fit time metrics
## PERFORMANCE METRICS
results_long_nofit = results_long.loc[~results_long['metrics'].isin(time_metrics)] # get df without fit data
results_long_nofit = results_long_nofit.sort_values(by='values')
## TIME METRICS
results_long_fit = results_long.loc[results_long['metrics'].isin(time_metrics)] # df with fit data
results_long_fit = results_long_fit.sort_values(by='values')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(20, 12))
sns.set(font_scale=2.5)
g = sns.boxplot(x="model", y="values", hue="metrics", data=results_long_nofit, palette="Set3")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('Comparison of Model by Classification Metric')
plt.savefig('./benchmark_models_performance.png',dpi=300)

In [None]:
plt.figure(figsize=(20, 12))
sns.set(font_scale=2.5)
g = sns.boxplot(x="model", y="values", hue="metrics", data=results_long_fit, palette="Set3")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('Comparison of Model by Fit and Score Time')
plt.savefig('./benchmark_models_time.png',dpi=300)

In [None]:
metrics = list(set(results_long_nofit.metrics.values))
bootstrap_df.groupby(['model'])[metrics].agg([np.std, np.mean])

# Final Selected Model
- grid search for parameters 
- Gaussian NB

In [None]:
# Gaussian NB only has one parameter 'var_smoothing'
# Portion of the largest variance of all features that is added to variances for calculation stability.
# Number of different combinations of parameters 

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

nb_classifier = GaussianNB()

target_names = ['Win', 'Loss']

params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=90210)

gs_NB = GridSearchCV(estimator=nb_classifier, 
                 param_grid=params_NB, 
                 cv=kfold,   
                 verbose=1, 
                 scoring='accuracy', n_jobs=-1) 

gs_NB.fit(X_train, y_train)

best_gs_grid = gs_NB.best_estimator_
best_gs_grid.fit(X_train, y_train)
y_pred_best_gs = best_gs_grid.predict(X_test)

print(classification_report(y_test, y_pred_best_gs, target_names=target_names))
gs_NB.best_params_

In [None]:
from sklearn.metrics import confusion_matrix
confusionMatrix = confusion_matrix(y_test, y_pred_best_gs)
print(confusionMatrix)

# Confusion Matrix

In [None]:
confusionMatrix = confusion_matrix(y_test, y_pred_best_gs)  

    # Code below prints model accuracy information
print('Coefficient Information:')

for i in range(len(featureColumns)):  

    logregCoefficients = logreg.coef_

    currentFeature = featureColumns[i]
    currentCoefficient = logregCoefficients[0][i]

    print(currentFeature + ': ' + str(currentCoefficient))

print('----------------------------------')

print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred))
print("Precision:", metrics.precision_score(Y_test, Y_pred))
print("Recall:", metrics.recall_score(Y_test, Y_pred))

print('----------------------------------')

print('Confusion Matrix:')
print(confusionMatrix)

# Saving Model

In [None]:
import pickle

# Saves the model in folder to be used in future
# filename should be end in '.pkl'
def save_model(model, filename):

    with open(filename, 'wb') as file:
        pickle.dump(model, file)
save_model()