In [1]:
# Team display name -> numeric team ID. The ID is what the nba_api endpoints in this
# notebook take as `team_id` and what the scoreboard reports as HOME_TEAM_ID / VISITOR_TEAM_ID.
# Several names deliberately share one ID (Bobcats/Hornets, LA/Los Angeles Clippers,
# Brooklyn/New Jersey Nets, New Orleans Hornets/Pelicans), so a reverse lookup by ID is ambiguous.
teams = {
        "Atlanta Hawks": 1610612737,
        "Boston Celtics": 1610612738,
        "Brooklyn Nets": 1610612751,
        "Charlotte Bobcats": 1610612766,
        "Charlotte Hornets": 1610612766,
        "Chicago Bulls": 1610612741,
        "Cleveland Cavaliers": 1610612739,
        "Dallas Mavericks": 1610612742,
        "Denver Nuggets": 1610612743,
        "Detroit Pistons": 1610612765,
        "Golden State Warriors": 1610612744,
        "Houston Rockets": 1610612745,
        "Indiana Pacers": 1610612754,
        "LA Clippers": 1610612746,
        "Los Angeles Clippers": 1610612746,
        "Los Angeles Lakers": 1610612747,
        "Memphis Grizzlies": 1610612763,
        "Miami Heat": 1610612748,
        "Milwaukee Bucks": 1610612749,
        "Minnesota Timberwolves": 1610612750,
        "New Jersey Nets": 1610612751,
        "New Orleans Hornets": 1610612740,
        "New Orleans Pelicans": 1610612740,
        "New York Knicks": 1610612752,
        "Oklahoma City Thunder": 1610612760,
        "Orlando Magic": 1610612753,
        "Philadelphia 76ers": 1610612755,
        "Phoenix Suns": 1610612756,
        "Portland Trail Blazers": 1610612757,
        "Sacramento Kings": 1610612758,
        "San Antonio Spurs": 1610612759,
        "Toronto Raptors": 1610612761,
        "Utah Jazz": 1610612762,
        "Washington Wizards": 1610612764,
    }

# Stat field -> measure type ('Base' or 'Advanced'). The keys are the fields that
# get_team_stats_dict returns; to_dataframe walks the keys in this order to lay out the
# per-team feature columns (the values are unpacked there but not used).
available_stats = {'W_PCT': 'Base',
                   'FG_PCT': 'Base',
                   'FG3_PCT': 'Base',
                   'FT_PCT': 'Base',
                   'REB': 'Base',
                   'AST': 'Base',
                   'TOV': 'Base',
                   'STL': 'Base',
                   'BLK': 'Base',
                   'PLUS_MINUS': 'Base',
                   'OFF_RATING': 'Advanced',
                   'DEF_RATING': 'Advanced',
                   'TS_PCT': 'Advanced'}


In [2]:
# get_stats.py gets the team data for the model

# from team_names import teams
import nba_api
from nba_api.stats.endpoints import teamdashboardbygeneralsplits, leaguedashteamstats
import time
import pandas as pd



def _fetch_overall_dashboard(team_id, start_date, end_date, season, **query_options):
    """Request one general-splits dashboard and return its first 'OverallTeamDashboard' row as a dict."""
    dashboard = teamdashboardbygeneralsplits.TeamDashboardByGeneralSplits(team_id=team_id,
                                                                          season=season,
                                                                          date_from_nullable=start_date,
                                                                          date_to_nullable=end_date,
                                                                          timeout=120,
                                                                          **query_options)
    overall_rows = dashboard.get_normalized_dict()['OverallTeamDashboard']

    if not overall_rows:
        # Same exception type the bare `[0]` lookup used to raise, but with a usable message.
        raise IndexError(f"No 'OverallTeamDashboard' row for team_id={team_id} "
                         f"between {start_date} and {end_date} in season {season}")

    return overall_rows[0]


def get_team_stats_dict(team, start_date, end_date, season='2021-22'):
    """
    Returns a team's stats over a date range as a dictionary, default season is 2021-22

    :param team: Team name, must be a key of `teams` (e.g. 'Golden State Warriors')
    :param start_date: First day of the stats window, passed to the API as `date_from_nullable`
                       (callers in this notebook use the form 'mm/dd/yyyy')
    :param end_date: Last day of the stats window, passed to the API as `date_to_nullable`
    :param season: Season in form of 'yyyy-yy'
    :return: {'W_PCT', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'PLUS_MINUS',
              'OFF_RATING', 'DEF_RATING', 'TS_PCT'} -> value; the first ten come from the
              'Per100Possessions' dashboard, the last three from the 'Advanced' dashboard
    :raises KeyError: if `team` is not a key of `teams`
    :raises IndexError: if the API returns no overall row for the requested window
    """

    # Pause before hitting the API; this function is called twice per game by the scraper.
    time.sleep(1)

    team_id = teams[team]

    general_team_dashboard = _fetch_overall_dashboard(team_id, start_date, end_date, season,
                                                      per_mode_detailed='Per100Possessions')
    advanced_team_dashboard = _fetch_overall_dashboard(team_id, start_date, end_date, season,
                                                       measure_type_detailed_defense='Advanced')

    base_fields = ('W_PCT', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'PLUS_MINUS')
    advanced_fields = ('OFF_RATING', 'DEF_RATING', 'TS_PCT')

    # Key order is kept identical to the original literal: base fields first, then advanced.
    all_stats_dict = {field: general_team_dashboard[field] for field in base_fields}
    all_stats_dict.update({field: advanced_team_dashboard[field] for field in advanced_fields})

    return all_stats_dict


# Example: Warriors stats for the window 10/19/2021 - 04/10/2022 of the 2021-22 season (calls the API).
get_team_stats_dict('Golden State Warriors', '10/19/2021', '04/10/2022', '2021-22')

{'W_PCT': 0.646,
 'FG_PCT': 0.469,
 'FG3_PCT': 0.364,
 'FT_PCT': 0.769,
 'REB': 45.9,
 'AST': 27.4,
 'TOV': 15.0,
 'STL': 8.9,
 'BLK': 4.6,
 'PLUS_MINUS': 5.6,
 'OFF_RATING': 112.1,
 'DEF_RATING': 106.6,
 'TS_PCT': 0.582}

In [3]:
# get_matches.py gets the daily matches for a specific date and the results of the games

from nba_api.stats.endpoints import leaguegamelog, scoreboard, leaguestandings
#from team_names import teams



def get_match_results(date, season):
    """
    Returns the regular-season matchups played on one date together with their results

    :param date: Day of games scheduled in form 'mm/dd/yyyy'
    :param season: Season in form of 'yyyy-yy'
    :return: [daily_match, win_loss, score, game_id], all in the same game order, e.g.
             [{'Los Angeles Lakers': 'Golden State Warriors'}, ['L'], [114, 121], ['0022100002']]
             - daily_match: {home_team: away_team}
             - win_loss: the home team's 'WL' value per game
             - score: flat list [home_pts, away_pts, home_pts, away_pts, ...]
             - game_id: 'GAME_ID' per game
    :raises ValueError: if a game does not have exactly two team rows in the game log
    """

    game_log = leaguegamelog.LeagueGameLog(season=season, league_id='00', date_from_nullable=date,
                                           date_to_nullable=date, season_type_all_star='Regular Season', timeout=120)
    game_log_dict = game_log.get_normalized_dict()
    list_of_teams = game_log_dict['LeagueGameLog']

    # Pair the two team rows of each game by GAME_ID instead of trusting that they are
    # adjacent in the response; games keep the order in which they first appear.
    rows_by_game = {}
    for team_row in list_of_teams:
        rows_by_game.setdefault(team_row['GAME_ID'], []).append(team_row)

    daily_match = {}
    win_loss = []
    score = []
    game_id = []

    for current_game_id, team_rows in rows_by_game.items():

        if len(team_rows) != 2:
            raise ValueError(f"Expected 2 team rows for game {current_game_id} on {date}, got {len(team_rows)}")

        first_row, second_row = team_rows

        # An '@' in a row's MATCHUP marks that row's team as the visitor.
        if '@' in first_row['MATCHUP']:
            away_row, home_row = first_row, second_row
        else:
            home_row, away_row = first_row, second_row

        win_loss.append(home_row['WL'])
        game_id.append(home_row['GAME_ID'])

        score.append(home_row['PTS'])
        score.append(away_row['PTS'])

        daily_match[home_row['TEAM_NAME']] = away_row['TEAM_NAME']

    match_results = [daily_match, win_loss, score, game_id]

    return match_results


def get_daily_matches(date):
    """
    This method creates a dictionary of daily game matchups.

    Team IDs from the scoreboard are translated back to names through `teams`. Because
    `teams` lists several names per ID, the alias that the game-log data in this notebook
    uses is preferred ('Brooklyn Nets', 'Charlotte Hornets', 'LA Clippers',
    'New Orleans Pelicans'), so the names line up with the training data.
    Games with a team ID that is not in `teams` are left out of the result.

    :param date: Day of games scheduled in form 'mm/dd/yyyy'
    :return: A dictionary of game matchups {home_team:away_team}
    """

    daily_match = scoreboard.Scoreboard(league_id='00', game_date=date, timeout=120)
    daily_match_dict = daily_match.get_normalized_dict()
    games = daily_match_dict['GameHeader']

    # Aliases in `teams` that share an ID with the name used by the game-log data.
    superseded_names = {'Charlotte Bobcats', 'New Jersey Nets', 'New Orleans Hornets', 'Los Angeles Clippers'}
    name_by_id = {team_id: team for team, team_id in teams.items() if team not in superseded_names}

    match = {}

    for game in games:

        home_team = name_by_id.get(game['HOME_TEAM_ID'])
        away_team = name_by_id.get(game['VISITOR_TEAM_ID'])

        # Previously an unknown ID silently reused the name from the previous game
        # (or raised UnboundLocalError on the first game); skip such games instead.
        if home_team is None or away_team is None:
            continue

        match[home_team] = away_team

    return match



def main():
    """Demo both helpers: print the matchups for one date, then matchups plus results for another."""
    scheduled = get_daily_matches('12/25/22')
    print("'get_daily_matches' returns a dictionary of the games on a specified date\n" + f"{scheduled}" + "\n")

    played = get_match_results('10/19/2021', '2021-22')
    print("'get_match_results' returns the matchup plus the result\n" + f"{played}")

main()

'get_daily_matches' returns a dictionary of the games on a specified date
{'New York Knicks': 'Philadelphia 76ers', 'Dallas Mavericks': 'Los Angeles Lakers', 'Boston Celtics': 'Milwaukee Bucks', 'Golden State Warriors': 'Memphis Grizzlies', 'Denver Nuggets': 'Phoenix Suns'}

'get_match_results' returns the matchup plus the result
[{'Los Angeles Lakers': 'Golden State Warriors', 'Milwaukee Bucks': 'Brooklyn Nets'}, ['L', 'W'], [114, 121, 127, 104], ['0022100002', '0022100001']]


In [11]:
from datetime import date, timedelta
import pandas as pd
import requests

# from get_stats import get_team_stats_dict
# from get_matches import get_match_results
#from standardization import z_score, stat_std, stat_mean
#from available_stats import available_stats


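# daily_games is the list returned by get_match_results, e.g.
# [{'Sacramento Kings': 'Boston Celtics', 'Charlotte Hornets': 'Philadelphia 76ers'}, ['W', 'L'], [home_pts, away_pts, ...], [game_id, ...]]
# each team's stats come back from get_team_stats_dict as a plain dict keyed by stat name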
def to_dataframe(daily_games, start_date, end_date, season):
    """
    Build one feature row per game from a day's results (returns a list of lists, not a DataFrame).

    :param daily_games: [daily_match, win_loss, score, game_id] as returned by get_match_results;
                        it is only read, never modified
    :param start_date: First day of the stats window handed to get_team_stats_dict
    :param end_date: Last day of the stats window handed to get_team_stats_dict
    :param season: Season in form of 'yyyy-yy'
    :return: A list of rows, each
             [home, away, game_id, home_score, <home stats...>, away_score, <away stats...>, result]
             with the stats in `available_stats` key order and result 1 when the home
             team's entry in win_loss is 'W', else 0
    """
    matchups = daily_games[0]
    daily_results = daily_games[1]  # win or loss for each game
    score = daily_games[2]  # flat [home_pts, away_pts, ...], two entries per game
    game_id = daily_games[3]

    full_dataframe = []

    for game_number, (home_team, away_team) in enumerate(matchups.items()):  # loops through matchups
        home_team_stats = get_team_stats_dict(home_team, start_date, end_date, season)
        away_team_stats = get_team_stats_dict(away_team, start_date, end_date, season)

        # Index into `score` instead of pop(0) so the caller's list is left intact.
        home_score = score[2 * game_number]
        away_score = score[2 * game_number + 1]

        current_game = [home_team, away_team, game_id[game_number], home_score]
        current_game.extend(home_team_stats[stat] for stat in available_stats)

        current_game.append(away_score)
        current_game.extend(away_team_stats[stat] for stat in available_stats)

        current_game.append(1 if daily_results[game_number] == 'W' else 0)

        print(current_game)

        full_dataframe.append(current_game)

    return full_dataframe



def date_range(start_date, end_date):
    """Yield every date from start_date up to, but not including, end_date."""
    span_in_days = int((end_date - start_date).days)
    offset = 0
    while offset < span_in_days:
        yield start_date + timedelta(offset)
        offset += 1
        

def training_set(start_year, start_month, start_day, end_year, end_month, end_day, season, season_start):
    """
    Collect one feature row per game for every day from the start date up to (not including) the end date.

    Each day's team stats cover season_start through the day before the game, so a row
    never contains stats from the game it describes. The game date ('mm/dd/yyyy') is
    appended as the last element of every row.

    :return: A list of rows as produced by to_dataframe, each extended with the game date
    """
    first_day = date(start_year, start_month, start_day)
    last_day = date(end_year, end_month, end_day)

    total_games = []

    for game_day in date_range(first_day, last_day):
        game_day_label = game_day.strftime('%m/%d/%Y')
        print(game_day_label)

        # Stats window ends the day before the games being described.
        stats_cutoff = (game_day - timedelta(days=1)).strftime('%m/%d/%Y')

        day_results = get_match_results(game_day_label, season)
        day_rows = to_dataframe(day_results, season_start, stats_cutoff, season)

        for row in day_rows:
            row.append(game_day_label)
        total_games.extend(day_rows)

    print(total_games)
    return total_games


def make_dataframe(game_list):
    """
    Turn the list of game rows into a labelled DataFrame, print it and return it.

    Column layout: Home, Away, Game_ID, H_Score, 13 'H_' stats, A_Score, 13 'A_' stats, Result, Date.
    """
    stat_fields = ['W_PCT', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'REB', 'AST', 'TOV', 'STL',
                   'BLK', 'PLUS_MINUS', 'OFF_RATING', 'DEF_RATING', 'TS_PCT']

    column_names = ['Home', 'Away', 'Game_ID', 'H_Score']
    column_names += ['H_' + field for field in stat_fields]
    column_names.append('A_Score')
    column_names += ['A_' + field for field in stat_fields]
    column_names += ['Result', 'Date']

    games = pd.DataFrame(game_list, columns=column_names)

    print(games)
    return games


def main():
    """
    Scrape the 2017-18 training rows for 01/29/2018 up to (not including) 04/12/2018 and save them as CSV.

    The whole scrape is restarted, up to 10 times in total, when a request times out;
    the timeout from the final attempt is re-raised.
    NOTE: this definition replaces the earlier demo `main` defined in the get_matches cell.
    """
    import os  # local import: only needed to make sure the output folder exists

    attempts = 10
    # Relative path instead of a user-specific absolute one; this is the location the
    # dataset cells read 'nba_df_2017_v3.csv' back from.
    # NOTE(review): assumes the notebook runs from the project folder - confirm.
    output_path = './data/nba_df_2017_v3.csv'

    for attempt in range(attempts):
        try:
            all_games = training_set(start_year=2018, start_month=1, start_day=29, end_year=2018, end_month=4, end_day=12,
                                     season='2017-18', season_start='10/17/2017')
        except requests.exceptions.ReadTimeout:
            if attempt == attempts - 1:
                raise
            continue

        df = make_dataframe(all_games)  # make_dataframe already prints the frame

        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        df.to_csv(output_path, index=False)
        break

if __name__ == '__main__':
    main()


01/29/2018
['Milwaukee Bucks', 'Philadelphia 76ers', '0021700741', 107, 0.542, 0.481, 0.354, 0.768, 39.9, 23.3, 14.2, 8.9, 5.2, -0.5, 108.6, 109.0, 0.574, 95, 0.522, 0.466, 0.364, 0.762, 46.4, 25.2, 17.6, 8.0, 5.0, 1.4, 106.1, 104.5, 0.564, 1]
['Atlanta Hawks', 'Minnesota Timberwolves', '0021700739', 105, 0.286, 0.45, 0.369, 0.789, 41.6, 24.1, 15.8, 8.2, 4.3, -4.8, 105.2, 110.1, 0.55, 100, 0.615, 0.476, 0.35, 0.796, 43.2, 23.3, 12.9, 9.1, 4.4, 3.3, 112.6, 109.2, 0.564, 1]
['Indiana Pacers', 'Charlotte Hornets', '0021700738', 105, 0.54, 0.48, 0.374, 0.77, 42.8, 23.0, 13.6, 8.2, 4.2, 1.1, 109.2, 108.3, 0.564, 96, 0.417, 0.44, 0.354, 0.73, 46.9, 20.7, 13.1, 6.8, 4.9, 0.0, 106.4, 106.7, 0.535, 1]
['Dallas Mavericks', 'Miami Heat', '0021700742', 88, 0.32, 0.448, 0.366, 0.767, 42.0, 23.0, 12.6, 7.1, 3.9, -2.0, 105.6, 107.4, 0.549, 95, 0.571, 0.455, 0.363, 0.758, 44.1, 22.8, 15.6, 7.3, 5.4, -0.8, 104.8, 105.8, 0.553, 0]
['Denver Nuggets', 'Boston Celtics', '0021700743', 110, 0.531, 0.46, 0.35

['Oklahoma City Thunder', 'New Orleans Pelicans', '0021700772', 100, 0.577, 0.457, 0.353, 0.71, 46.1, 21.8, 14.5, 9.6, 5.2, 4.1, 109.6, 105.3, 0.545, 114, 0.54, 0.486, 0.365, 0.763, 42.1, 25.9, 15.3, 7.4, 5.1, 0.0, 108.5, 108.6, 0.582, 0]
02/03/2018
['San Antonio Spurs', 'Utah Jazz', '0021700781', 111, 0.63, 0.456, 0.36, 0.781, 46.7, 23.7, 14.5, 7.8, 5.8, 3.3, 106.0, 102.7, 0.544, 120, 0.451, 0.455, 0.371, 0.793, 42.8, 22.0, 14.7, 9.5, 5.0, 0.5, 105.8, 105.5, 0.557, 0]
['LA Clippers', 'Chicago Bulls', '0021700775', 113, 0.5, 0.463, 0.354, 0.749, 43.5, 22.1, 15.1, 8.0, 4.9, 0.3, 108.2, 107.5, 0.561, 103, 0.353, 0.439, 0.362, 0.763, 44.8, 23.8, 13.9, 7.1, 3.7, -5.7, 103.8, 109.4, 0.533, 1]
['Orlando Magic', 'Washington Wizards', '0021700778', 98, 0.3, 0.46, 0.358, 0.752, 41.8, 23.5, 14.8, 7.3, 4.9, -4.7, 106.3, 110.6, 0.553, 115, 0.569, 0.465, 0.374, 0.764, 43.3, 23.9, 14.4, 7.9, 4.6, 1.7, 108.2, 106.9, 0.559, 0]
['Cleveland Cavaliers', 'Houston Rockets', '0021700779', 88, 0.6, 0.472, 0.

['Golden State Warriors', 'Dallas Mavericks', '0021700815', 121, 0.759, 0.509, 0.391, 0.809, 43.5, 29.8, 15.7, 8.0, 7.7, 7.2, 113.7, 106.5, 0.611, 103, 0.315, 0.445, 0.362, 0.766, 42.2, 23.1, 12.8, 7.3, 3.9, -2.2, 105.4, 107.4, 0.546, 1]
['Los Angeles Lakers', 'Oklahoma City Thunder', '0021700816', 106, 0.415, 0.456, 0.33, 0.692, 45.3, 22.8, 15.7, 8.0, 4.8, -2.9, 104.1, 106.8, 0.536, 81, 0.564, 0.455, 0.35, 0.713, 46.1, 21.7, 14.6, 9.6, 5.2, 3.9, 109.4, 105.3, 0.544, 1]
['Orlando Magic', 'Atlanta Hawks', '0021700811', 100, 0.321, 0.462, 0.357, 0.755, 41.8, 23.6, 14.8, 7.4, 4.9, -4.4, 106.6, 110.7, 0.555, 98, 0.315, 0.451, 0.37, 0.795, 41.5, 24.4, 15.7, 8.2, 4.3, -4.1, 105.8, 109.9, 0.553, 1]
['Portland Trail Blazers', 'Charlotte Hornets', '0021700814', 109, 0.537, 0.453, 0.38, 0.797, 45.9, 19.7, 14.4, 6.9, 5.3, 1.0, 107.3, 106.5, 0.549, 103, 0.434, 0.444, 0.358, 0.734, 46.5, 21.1, 13.0, 6.9, 4.7, 0.0, 107.5, 107.9, 0.54, 1]
['Toronto Raptors', 'New York Knicks', '0021700812', 113, 0.69

['Milwaukee Bucks', 'Atlanta Hawks', '0021700848', 97, 0.564, 0.478, 0.35, 0.773, 40.5, 23.2, 14.2, 8.8, 5.4, 0.1, 108.1, 107.9, 0.571, 92, 0.316, 0.451, 0.37, 0.797, 41.5, 24.4, 15.6, 8.1, 4.4, -4.2, 106.0, 110.2, 0.554, 1]
['Dallas Mavericks', 'Sacramento Kings', '0021700851', 109, 0.316, 0.446, 0.36, 0.767, 42.2, 23.2, 12.8, 7.4, 3.9, -2.4, 105.7, 108.0, 0.547, 114, 0.309, 0.452, 0.383, 0.725, 41.9, 21.8, 15.0, 8.7, 4.2, -8.6, 102.7, 111.2, 0.53, 0]
['Denver Nuggets', 'San Antonio Spurs', '0021700852', 117, 0.536, 0.464, 0.361, 0.76, 45.7, 24.5, 15.5, 7.9, 4.7, 0.7, 109.4, 108.8, 0.561, 109, 0.603, 0.457, 0.358, 0.777, 46.6, 23.9, 14.2, 7.7, 5.8, 3.4, 106.5, 103.1, 0.544, 1]
['Minnesota Timberwolves', 'Houston Rockets', '0021700849', 108, 0.593, 0.48, 0.359, 0.797, 42.9, 23.6, 13.0, 9.1, 4.4, 3.3, 112.8, 109.4, 0.568, 126, 0.764, 0.463, 0.365, 0.786, 43.9, 22.0, 14.1, 8.8, 4.7, 8.6, 114.8, 106.6, 0.595, 0]
['Toronto Raptors', 'Miami Heat', '0021700847', 115, 0.709, 0.47, 0.356, 0.8,

['New Orleans Pelicans', 'Miami Heat', '0021700879', 124, 0.544, 0.483, 0.365, 0.76, 42.3, 25.7, 15.0, 7.6, 5.2, 0.2, 108.4, 108.4, 0.577, 123, 0.517, 0.452, 0.359, 0.756, 44.4, 22.8, 15.5, 7.3, 5.4, -0.9, 104.8, 106.0, 0.55, 1]
02/24/2018
['Philadelphia 76ers', 'Orlando Magic', '0021700884', 116, 0.554, 0.466, 0.362, 0.759, 46.4, 25.3, 17.2, 8.2, 4.9, 1.9, 106.6, 104.4, 0.563, 105, 0.31, 0.461, 0.357, 0.762, 41.8, 23.6, 14.8, 7.4, 4.9, -4.3, 106.5, 110.6, 0.555, 1]
['Golden State Warriors', 'Oklahoma City Thunder', '0021700887', 112, 0.763, 0.512, 0.395, 0.81, 43.4, 30.0, 15.7, 7.9, 7.8, 8.0, 114.4, 106.5, 0.614, 80, 0.567, 0.454, 0.352, 0.709, 46.2, 21.8, 14.3, 9.4, 5.1, 3.5, 109.8, 106.1, 0.544, 1]
['New York Knicks', 'Boston Celtics', '0021700886', 112, 0.4, 0.463, 0.354, 0.793, 45.1, 23.3, 15.5, 6.9, 5.5, -2.3, 105.9, 108.1, 0.545, 121, 0.683, 0.447, 0.371, 0.765, 45.6, 22.8, 14.2, 7.4, 4.9, 3.8, 106.3, 102.6, 0.549, 0]
['Phoenix Suns', 'Portland Trail Blazers', '0021700889', 104,

['Dallas Mavericks', 'Oklahoma City Thunder', '0021700919', 110, 0.311, 0.445, 0.362, 0.77, 42.0, 23.3, 12.8, 7.4, 4.0, -2.7, 105.7, 108.2, 0.547, 111, 0.565, 0.452, 0.351, 0.709, 46.3, 21.8, 14.5, 9.3, 5.1, 3.0, 109.4, 106.2, 0.542, 0]
['Washington Wizards', 'Golden State Warriors', '0021700917', 101, 0.59, 0.469, 0.371, 0.772, 43.4, 25.0, 14.5, 8.2, 4.5, 1.9, 108.7, 107.1, 0.562, 109, 0.77, 0.511, 0.398, 0.811, 43.6, 30.1, 15.6, 8.0, 7.7, 8.5, 114.6, 106.2, 0.615, 0]
['Memphis Grizzlies', 'Phoenix Suns', '0021700918', 102, 0.305, 0.444, 0.347, 0.798, 42.5, 22.2, 15.6, 7.7, 5.3, -4.8, 103.8, 108.7, 0.543, 110, 0.29, 0.44, 0.332, 0.75, 43.7, 20.8, 15.4, 6.7, 4.5, -8.8, 103.2, 111.8, 0.533, 0]
03/01/2018
['Cleveland Cavaliers', 'Philadelphia 76ers', '0021700923', 97, 0.6, 0.475, 0.367, 0.785, 42.4, 23.7, 14.3, 7.2, 3.8, 0.3, 111.2, 110.8, 0.583, 108, 0.542, 0.465, 0.36, 0.761, 46.4, 25.4, 17.2, 8.3, 4.9, 1.8, 106.6, 104.6, 0.562, 0]
['Miami Heat', 'Los Angeles Lakers', '0021700922', 113

['Utah Jazz', 'Orlando Magic', '0021700955', 94, 0.524, 0.456, 0.369, 0.792, 43.7, 22.1, 15.0, 9.0, 5.0, 1.6, 106.3, 104.8, 0.56, 80, 0.317, 0.462, 0.36, 0.76, 41.6, 23.8, 14.8, 7.5, 4.9, -4.2, 106.6, 110.6, 0.557, 1]
['Indiana Pacers', 'Milwaukee Bucks', '0021700951', 92, 0.571, 0.478, 0.369, 0.774, 43.4, 22.8, 13.8, 8.7, 4.3, 1.6, 109.1, 107.7, 0.563, 89, 0.54, 0.475, 0.353, 0.776, 40.7, 23.0, 14.3, 8.8, 5.5, -0.4, 108.1, 108.5, 0.569, 1]
03/06/2018
['Golden State Warriors', 'Brooklyn Nets', '0021700963', 114, 0.778, 0.512, 0.397, 0.812, 43.3, 30.1, 15.6, 8.0, 7.7, 8.4, 114.5, 106.2, 0.615, 101, 0.313, 0.436, 0.349, 0.774, 44.9, 23.0, 15.3, 6.1, 4.2, -4.1, 105.2, 109.0, 0.545, 1]
['LA Clippers', 'New Orleans Pelicans', '0021700964', 116, 0.548, 0.469, 0.353, 0.759, 43.8, 22.4, 14.8, 8.0, 4.9, 1.1, 109.3, 107.9, 0.567, 121, 0.581, 0.483, 0.363, 0.762, 42.5, 25.6, 14.8, 7.6, 5.3, 0.7, 108.7, 108.3, 0.576, 0]
['Portland Trail Blazers', 'New York Knicks', '0021700962', 111, 0.594, 0.452,

['Oklahoma City Thunder', 'San Antonio Spurs', '0021700990', 104, 0.567, 0.454, 0.349, 0.711, 46.1, 21.6, 14.6, 9.2, 5.2, 3.0, 109.5, 106.2, 0.544, 94, 0.569, 0.459, 0.353, 0.776, 46.2, 23.9, 14.0, 7.8, 5.8, 3.0, 106.9, 104.0, 0.545, 1]
03/11/2018
['Los Angeles Lakers', 'Cleveland Cavaliers', '0021701000', 127, 0.446, 0.462, 0.344, 0.705, 45.2, 23.3, 15.5, 7.9, 4.8, -1.7, 106.0, 107.5, 0.546, 113, 0.585, 0.475, 0.366, 0.781, 42.5, 23.9, 14.1, 7.2, 3.9, 0.2, 111.2, 111.0, 0.582, 1]
['New York Knicks', 'Toronto Raptors', '0021700992', 106, 0.364, 0.463, 0.351, 0.79, 44.8, 23.3, 15.3, 6.9, 5.5, -3.4, 105.8, 109.1, 0.544, 132, 0.738, 0.47, 0.354, 0.8, 43.9, 24.0, 13.3, 8.3, 6.1, 8.6, 112.9, 104.3, 0.574, 0]
['Brooklyn Nets', 'Philadelphia 76ers', '0021700999', 97, 0.318, 0.437, 0.347, 0.776, 44.7, 23.1, 15.2, 6.1, 4.2, -3.9, 105.4, 109.1, 0.546, 120, 0.547, 0.467, 0.365, 0.761, 46.3, 25.6, 17.1, 8.0, 4.9, 1.9, 107.1, 104.9, 0.565, 0]
['Boston Celtics', 'Indiana Pacers', '0021700998', 97, 0

['Denver Nuggets', 'Detroit Pistons', '0021701026', 120, 0.544, 0.47, 0.368, 0.765, 44.9, 25.4, 15.4, 8.0, 4.7, 1.2, 110.8, 109.7, 0.569, 113, 0.448, 0.447, 0.368, 0.746, 44.5, 23.1, 14.2, 7.9, 3.9, -1.5, 105.7, 107.1, 0.539, 1]
['Memphis Grizzlies', 'Chicago Bulls', '0021701024', 110, 0.269, 0.442, 0.347, 0.791, 42.3, 22.3, 15.6, 7.7, 5.2, -5.8, 103.1, 109.0, 0.539, 111, 0.343, 0.438, 0.357, 0.764, 44.8, 23.6, 13.7, 7.4, 3.7, -6.0, 103.8, 109.9, 0.532, 0]
03/16/2018
['Philadelphia 76ers', 'Brooklyn Nets', '0021701030', 120, 0.552, 0.467, 0.365, 0.757, 46.4, 25.8, 17.1, 8.0, 4.9, 2.3, 107.3, 104.8, 0.565, 116, 0.309, 0.436, 0.349, 0.779, 44.6, 23.0, 15.2, 6.2, 4.2, -4.4, 105.3, 109.4, 0.545, 1]
['Golden State Warriors', 'Sacramento Kings', '0021701033', 93, 0.765, 0.51, 0.394, 0.815, 43.5, 29.8, 15.6, 7.9, 7.7, 7.8, 114.2, 106.5, 0.613, 98, 0.319, 0.452, 0.378, 0.739, 42.2, 22.2, 14.4, 8.4, 4.3, -8.0, 103.5, 111.4, 0.531, 0]
['Los Angeles Lakers', 'Miami Heat', '0021701034', 91, 0.456,

['Minnesota Timberwolves', 'LA Clippers', '0021701060', 123, 0.563, 0.48, 0.359, 0.802, 43.0, 23.3, 12.9, 8.7, 4.3, 2.3, 113.0, 110.7, 0.569, 109, 0.536, 0.47, 0.351, 0.755, 44.0, 22.3, 14.7, 7.8, 4.7, 1.0, 109.5, 108.2, 0.566, 1]
['New Orleans Pelicans', 'Dallas Mavericks', '0021701061', 115, 0.571, 0.482, 0.363, 0.771, 42.6, 25.7, 14.6, 7.5, 5.7, 0.5, 108.4, 108.0, 0.573, 105, 0.314, 0.446, 0.364, 0.77, 42.4, 23.4, 12.8, 7.4, 3.8, -2.5, 106.0, 108.2, 0.547, 1]
03/21/2018
['Philadelphia 76ers', 'Memphis Grizzlies', '0021701066', 119, 0.565, 0.467, 0.367, 0.758, 46.6, 25.9, 16.9, 8.0, 4.9, 2.5, 107.6, 104.9, 0.565, 105, 0.271, 0.443, 0.35, 0.79, 42.5, 22.4, 15.6, 7.7, 5.2, -5.5, 103.4, 109.2, 0.54, 1]
['San Antonio Spurs', 'Washington Wizards', '0021701071', 98, 0.577, 0.457, 0.354, 0.777, 46.2, 24.0, 13.9, 7.8, 5.8, 3.4, 106.6, 103.3, 0.543, 90, 0.571, 0.469, 0.374, 0.774, 43.5, 25.2, 14.7, 8.0, 4.4, 1.4, 108.8, 107.6, 0.563, 1]
['Chicago Bulls', 'Denver Nuggets', '0021701069', 102, 0

['Indiana Pacers', 'Miami Heat', '0021701096', 113, 0.575, 0.473, 0.369, 0.78, 43.5, 22.6, 13.7, 9.0, 4.3, 1.5, 108.6, 107.2, 0.558, 107, 0.534, 0.456, 0.363, 0.751, 44.4, 23.4, 14.8, 7.7, 5.3, 0.3, 106.6, 106.4, 0.553, 1]
['Sacramento Kings', 'Boston Celtics', '0021701099', 93, 0.329, 0.452, 0.382, 0.737, 42.3, 22.4, 14.4, 8.4, 4.3, -7.6, 103.4, 110.9, 0.531, 104, 0.681, 0.449, 0.375, 0.77, 45.7, 23.0, 14.3, 7.3, 4.8, 4.0, 107.0, 103.0, 0.552, 0]
['Toronto Raptors', 'LA Clippers', '0021701097', 106, 0.74, 0.472, 0.36, 0.799, 44.1, 24.3, 13.4, 7.9, 6.1, 8.4, 113.6, 105.2, 0.578, 117, 0.528, 0.471, 0.354, 0.755, 43.9, 22.4, 14.8, 7.8, 4.7, 0.8, 109.7, 108.6, 0.567, 0]
['Washington Wizards', 'New York Knicks', '0021701098', 97, 0.556, 0.468, 0.375, 0.773, 43.6, 25.3, 14.7, 8.0, 4.4, 1.1, 108.6, 107.7, 0.562, 101, 0.356, 0.464, 0.351, 0.786, 45.0, 23.4, 15.2, 7.0, 5.3, -3.5, 106.2, 109.5, 0.544, 0]
['Oklahoma City Thunder', 'Portland Trail Blazers', '0021701100', 105, 0.595, 0.455, 0.353,

['Utah Jazz', 'Memphis Grizzlies', '0021701135', 107, 0.56, 0.458, 0.364, 0.788, 44.4, 22.7, 15.1, 9.0, 5.2, 3.4, 106.7, 103.4, 0.56, 97, 0.28, 0.442, 0.35, 0.785, 42.5, 22.4, 15.5, 7.7, 5.1, -6.1, 103.3, 109.6, 0.538, 1]
['Atlanta Hawks', 'Philadelphia 76ers', '0021701130', 91, 0.28, 0.448, 0.363, 0.791, 41.9, 23.9, 15.7, 8.0, 4.3, -5.6, 105.0, 110.7, 0.55, 101, 0.595, 0.469, 0.368, 0.755, 46.7, 26.3, 16.7, 8.0, 5.0, 3.4, 107.9, 104.4, 0.567, 0]
['Dallas Mavericks', 'Minnesota Timberwolves', '0021701134', 92, 0.307, 0.446, 0.363, 0.768, 42.4, 23.4, 12.7, 7.4, 3.9, -2.6, 105.8, 108.3, 0.547, 93, 0.566, 0.479, 0.36, 0.804, 42.9, 23.4, 12.9, 8.5, 4.3, 2.3, 112.9, 110.6, 0.569, 0]
['Oklahoma City Thunder', 'Denver Nuggets', '0021701133', 125, 0.579, 0.455, 0.352, 0.714, 45.9, 21.8, 14.4, 9.2, 5.3, 3.2, 109.7, 106.4, 0.546, 126, 0.533, 0.47, 0.375, 0.767, 44.7, 25.4, 15.3, 8.0, 4.9, 1.2, 111.3, 110.1, 0.571, 0]
03/31/2018
['New York Knicks', 'Detroit Pistons', '0021701139', 109, 0.355, 0.4

['Orlando Magic', 'Dallas Mavericks', '0021701170', 105, 0.299, 0.452, 0.353, 0.762, 42.1, 23.6, 14.8, 7.8, 4.9, -4.5, 105.0, 109.3, 0.546, 100, 0.308, 0.446, 0.363, 0.768, 42.4, 23.3, 12.7, 7.3, 3.9, -2.6, 105.6, 108.0, 0.546, 1]
['Detroit Pistons', 'Philadelphia 76ers', '0021701169', 108, 0.481, 0.449, 0.369, 0.747, 45.1, 23.2, 13.9, 7.9, 4.0, -0.3, 106.2, 106.4, 0.54, 115, 0.61, 0.47, 0.366, 0.755, 46.8, 26.5, 16.5, 8.1, 5.0, 3.9, 108.0, 103.9, 0.567, 0]
['Atlanta Hawks', 'Miami Heat', '0021701172', 86, 0.282, 0.447, 0.361, 0.789, 42.3, 24.0, 15.8, 7.9, 4.3, -5.5, 104.5, 110.1, 0.548, 115, 0.538, 0.456, 0.361, 0.755, 44.4, 23.4, 14.9, 7.8, 5.4, 0.6, 106.3, 105.8, 0.552, 0]
['Toronto Raptors', 'Boston Celtics', '0021701171', 96, 0.714, 0.472, 0.36, 0.797, 44.2, 24.5, 13.4, 7.8, 6.1, 7.7, 113.4, 105.7, 0.577, 78, 0.688, 0.451, 0.379, 0.773, 45.5, 23.0, 14.3, 7.5, 4.7, 4.1, 107.1, 103.0, 0.553, 1]
['New Orleans Pelicans', 'Memphis Grizzlies', '0021701173', 123, 0.558, 0.481, 0.36, 0.77

['Brooklyn Nets', 'Chicago Bulls', '0021701205', 114, 0.338, 0.441, 0.356, 0.771, 44.3, 23.4, 15.1, 6.1, 4.7, -3.8, 106.2, 109.7, 0.551, 105, 0.338, 0.435, 0.355, 0.761, 44.9, 23.6, 13.9, 7.7, 3.5, -6.7, 103.6, 110.3, 0.529, 1]
['LA Clippers', 'New Orleans Pelicans', '0021701212', 100, 0.525, 0.471, 0.356, 0.75, 43.9, 22.3, 14.7, 7.7, 4.6, 0.4, 109.7, 109.2, 0.566, 113, 0.575, 0.482, 0.363, 0.773, 42.9, 25.9, 14.5, 7.7, 5.7, 0.8, 108.4, 107.7, 0.572, 0]
['Miami Heat', 'Oklahoma City Thunder', '0021701206', 93, 0.538, 0.455, 0.361, 0.758, 44.5, 23.3, 14.8, 7.8, 5.5, 0.7, 106.3, 105.7, 0.552, 115, 0.575, 0.452, 0.351, 0.714, 46.0, 21.7, 14.3, 9.3, 5.2, 3.1, 109.7, 106.4, 0.544, 0]
['Detroit Pistons', 'Toronto Raptors', '0021701204', 98, 0.475, 0.449, 0.373, 0.743, 45.0, 23.3, 13.9, 7.8, 4.0, -0.4, 106.4, 106.8, 0.541, 108, 0.725, 0.471, 0.359, 0.794, 44.4, 24.6, 13.5, 7.8, 6.1, 8.0, 113.0, 105.1, 0.575, 0]
['Milwaukee Bucks', 'Orlando Magic', '0021701208', 102, 0.538, 0.48, 0.356, 0.781,

# Dataset

## 2017 - 18

In [None]:
import pandas as pd

# Load the three 2017-18 extracts, parse 'Date' and tag every row with its season.
df_2017, df_2017_2, df_2017_3 = [
    pd.read_csv(csv_path).assign(
        Date=lambda frame: pd.to_datetime(frame['Date']),
        Season='2017-18',
    )
    for csv_path in (
        './data/nba_df_2017.csv',
        './data/nba_df_2017_v2.csv',
        './data/nba_df_2017_v3.csv',
    )
]

print(len(df_2017), len(df_2017_2), len(df_2017_3))

In [None]:
# Stack the 2017-18 extracts (row labels of the individual files are kept) and show the row count.
frames = [df_2017, df_2017_2, df_2017_3]
df_2017_final = pd.concat(frames, axis=0)
df_2017_final.shape[0]

## 2018 - 19

In [None]:
# Load the 2018-19 extract, parse 'Date' and tag every row with its season.
df_2018 = pd.read_csv('./data/nba_df_2018.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date']),
    Season='2018-19',
)

print(len(df_2018))

## 2019 - 20

In [None]:
# Load the two 2019-20 extracts, parse 'Date' and tag every row with its season.
df_2019, df_2019_2 = [
    pd.read_csv(csv_path).assign(
        Date=lambda frame: pd.to_datetime(frame['Date']),
        Season='2019-20',
    )
    for csv_path in (
        './data/nba_df_2019.csv',
        './data/nba_df_2019_2.csv',
    )
]

print(len(df_2019), len(df_2019_2))

In [None]:
# Stack the 2019-20 extracts (row labels of the individual files are kept) and show the row count.
frames = [df_2019, df_2019_2]
df_2019_final = pd.concat(frames, axis=0)
df_2019_final.shape[0]

## 2020 - 21

In [None]:
# Load the three 2020-21 extracts, parse 'Date' and tag every row with its season.
# (The variables are named df_2021* although the files are named nba_df_2020*.)
df_2021, df_2021_2, df_2021_3 = [
    pd.read_csv(csv_path).assign(
        Date=lambda frame: pd.to_datetime(frame['Date']),
        Season='2020-21',
    )
    for csv_path in (
        './data/nba_df_2020_v0.csv',
        './data/nba_df_2020.csv',
        './data/nba_df_2020_v2.csv',
    )
]

print(len(df_2021), len(df_2021_2), len(df_2021_3))

In [None]:
# Stack the 2020-21 extracts (row labels of the individual files are kept) and show the row count.
frames = [df_2021, df_2021_2, df_2021_3]
df_2021_final = pd.concat(frames, axis=0)
df_2021_final.shape[0]

## 2021 - 22

In [None]:
import pandas as pd

# Load the four 2021-22 extracts, parse 'Date' and tag every row with its season.
df_2022, df_2022_1, df_2022_2, df_2022_3 = [
    pd.read_csv(csv_path).assign(
        Date=lambda frame: pd.to_datetime(frame['Date']),
        Season='2021-22',
    )
    for csv_path in (
        './data/nba_game_2022.csv',
        './data/nba_game_2022_v1.csv',
        './data/nba_game_2022_v2.csv',
        './data/nba_game_2022_v3.csv',
    )
]

print(len(df_2022), len(df_2022_1), len(df_2022_2), len(df_2022_3))

In [None]:
# Stack the 2021-22 extracts (row labels of the individual files are kept) and report the row count.
frames = [df_2022, df_2022_1, df_2022_2, df_2022_3]
df_2022_final = pd.concat(frames, axis=0)

print(f"Length of 2022 data: {df_2022_final.shape[0]}\n")

## Merge DataFrames

In [None]:
# Combine the per-season frames into one frame with fresh 0..n-1 row labels.
# NOTE(review): df_2017_final is built above but is not part of this list - confirm
# whether the 2017-18 season is left out on purpose.
frames = [df_2018, df_2019_final, df_2021_final, df_2022_final]
df = pd.concat(frames, ignore_index=True)

df

In [None]:
# Number of missing values in each column of the merged frame.
df.isnull().sum()

In [None]:
# Rows that repeat an earlier row in every column.
duplicates = df.loc[df.duplicated()]
duplicates

# Last N Games Win %

In [None]:
import warnings
warnings.filterwarnings("ignore")
# Last 10 Warriors games (home or away) before the cutoff date.
# One combined mask on `df` replaces the chained df[...][...] indexing, which applied a
# mask built from the full frame to an already-filtered frame.
prev_game_df = (
    df.loc[
        (df['Date'] < pd.to_datetime('12/25/2022'))
        & ((df['Home'] == 'Golden State Warriors') | (df['Away'] == 'Golden State Warriors'))
    ]
    .sort_values(by='Date')
    .tail(10)
)

# Select the two columns by name instead of by position (0 and 31).
h_df = prev_game_df[['Home', 'Result']]

# Keep only the games the Warriors played at home.
h_df = h_df.loc[h_df['Home'] == 'Golden State Warriors']
print(h_df)

In [None]:
def get_avg_win_pct_last_n_games(team, game_date, df, n):
    """
    Share of games won by `team` among its most recent games before `game_date`.

    :param team: Team name as it appears in the 'Home' / 'Away' columns
    :param game_date: Cutoff; only games with 'Date' strictly before it count
                      (a Timestamp or a date string such as '12/25/2022')
    :param df: Games frame with 'Home', 'Away', 'Date' and 'Result' columns, where
               'Result' is 1 for a home win and 0 for an away win
    :param n: Maximum number of most recent games to look at
    :return: wins / games found (at most n games); 0.0 when the team has no earlier game.
             The divisor is the number of games actually found, so a team with fewer
             than n earlier games is no longer scored as if it had lost the missing ones.
    """
    cutoff = pd.to_datetime(game_date)
    involves_team = (df['Home'] == team) | (df['Away'] == team)

    # Single combined mask on `df` (the old chained df[...][...] relied on reindexing).
    prev_game_df = df.loc[(df['Date'] < cutoff) & involves_team].sort_values(by='Date').tail(n)

    games_found = len(prev_game_df)
    if games_found == 0:
        return 0.0

    played_at_home = prev_game_df['Home'] == team
    home_side_won = prev_game_df['Result'] == 1
    away_side_won = prev_game_df['Result'] == 0

    # A win is a home win when the team hosted, or an away win when it visited.
    wins = int((played_at_home & home_side_won).sum() + (~played_at_home & away_side_won).sum())

    return wins / games_found
# Example: Warriors' win share over their last 10 games before 12/25/2022.
get_avg_win_pct_last_n_games('Golden State Warriors', '12/25/2022', df, 10)

In [None]:
# Add each team's win share over its last 10 games as 'Home_W_Pct_10' / 'Away_W_Pct_10'.
# Each row is now processed once: the inner loop used to walk the whole of `df` for
# every season, recomputing identical values once per season.
# The lookback is still taken from the full frame, so early-season rows can reach into
# the previous season.
for season in df['Season'].unique() :

    season_rows = df[df['Season'] == season]

    # Not used in this cell; kept so the name stays defined as before.
    season_stats = season_rows.sort_values(by='Date').reset_index(drop=True)

    # season_rows keeps df's own row labels, so `index` addresses the right row of df.
    for index, row in season_rows.iterrows() :
        game_id = row['Game_ID']
        game_date = row['Date']
        h_team = row['Home']
        a_team = row['Away']

        df.loc[index,'Home_W_Pct_10'] = get_avg_win_pct_last_n_games(h_team, game_date, df, 10)

        df.loc[index,'Away_W_Pct_10'] = get_avg_win_pct_last_n_games(a_team, game_date, df, 10)
        

In [None]:
# Last rows of the 2021-22 season.
df[df['Season'] == '2021-22'].tail()


In [None]:
# Display the merged frame.
df

# ELO Rating
- every team starts with a 1500
$$R_{i+1} = R_{i} + k * (S_{team} - E_{team})$$
- S team is 1 if the team wins and 0 if they lose
- E team is the expected win probability of the team 
$$E_{team} = \frac{1}{1+10^{\frac{opp\_elo - team\_elo}{400}}}$$
- k is a moving constant that depends on margin of victory and difference in Elo ratings
$$k = 20\frac{(MOV_{winner} + 3)^{0.8}}{7.5 + 0.006(elo\_difference_{winner})} $$
- team year by year carryover 
$$(R * 0.75) + (0.25 * 1505)$$

In [None]:
# Preview the first rows of the merged frame.
df.head()

In [None]:
# Home and road team win probabilities implied by Elo ratings and home court adjustment 
import math
import time
def win_probs(home_elo, away_elo, home_court_advantage):
    """Return (home win probability, away win probability) implied by Elo.

    Each rating is mapped to a strength of 10 ** (rating / 400); the home side's
    strength is additionally multiplied by the home-court term. The two
    probabilities are the shares of the combined strength.
    """
    def strength(points):
        return math.pow(10, points / 400)

    boosted_home = strength(home_court_advantage) * strength(home_elo)
    visitor = strength(away_elo)
    total = visitor + boosted_home

    return boosted_home / total, visitor / total

  #odds the home team will win based on elo ratings and home court advantage

def home_odds_on(home_elo, away_elo, home_court_advantage):
    """Return the home team's odds: its boosted strength divided by the away strength.

    Strength is 10 ** (rating / 400); the home side is scaled by the
    home-court term before the ratio is taken.
    """
    boosted_home = math.pow(10, home_court_advantage / 400) * math.pow(10, home_elo / 400)
    return boosted_home / math.pow(10, away_elo / 400)

#this function determines the constant used in the elo rating, based on margin of victory and difference in elo ratings
def elo_k(MOV, elo_diff):
    """Return the K-factor for one game.

    Args:
        MOV: Home score minus away score.
        elo_diff: Home Elo minus away Elo.

    Both inputs are flipped to the winner's point of view when the home side
    did not win (MOV <= 0), then fed into
    20 * (margin + 3) ** 0.8 / (7.5 + 0.006 * rating_gap).
    """
    base_k = 20  # Optimal K is 20 https://fivethirtyeight.com/features/how-we-calculate-nba-elo-ratings/

    if MOV > 0:
        margin, rating_gap = MOV, elo_diff
    else:
        margin, rating_gap = -MOV, -elo_diff

    scale = (margin + 3) ** 0.8 / (7.5 + 0.006 * rating_gap)
    return base_k * scale


# Updates the home and away teams elo ratings after a game 

def update_elo(home_score, away_score, home_elo, away_elo, home_court_advantage):
    """Return the (home, away) Elo ratings after a single game.

    The expected results come from win_probs (which includes the home-court
    term); the K-factor comes from elo_k using the raw score margin and the
    raw rating difference. A game that is not a home win (including a tie in
    the scores) is scored as an away win.
    """
    margin = home_score - away_score
    expected_home, expected_away = win_probs(home_elo, away_elo, home_court_advantage)

    home_result = 1 if margin > 0 else 0
    away_result = 1 - home_result

    k = elo_k(margin, home_elo - away_elo)

    return (
        home_elo + k * (home_result - expected_home),
        away_elo + k * (away_result - expected_away),
    )


# Takes into account prev season elo
# The reason we revert to a mean of 1505 rather than 1500 is that 
# there are liable to be a couple of relatively recent expansion teams in the league at any given time
def get_prev_elo(team, date, season, team_stats, elo_df):
    """Return the Elo rating `team` carries into a game played on `date`.

    Args:
        team: Team name as it appears in the 'Home' / 'Away' columns.
        date: Date of the upcoming game; only strictly earlier games are considered.
        season: Season label of the upcoming game.
        team_stats: Game log with 'Date', 'Home', 'Away', 'Game_ID' and 'Season'.
        elo_df: Per-game ratings with 'Game_ID', 'H_Team_Elo_After' and
            'A_Team_Elo_After'.

    Returns:
        The post-game rating from the team's most recent earlier game. If that
        game belongs to a different season, the rating is regressed toward 1505:
        0.75 * rating + 0.25 * 1505.

    Raises:
        IndexError: if the team has no game before `date`, or that game has no
            row in `elo_df`.
    """
    # Use the `date` argument (not a notebook-level `game_date`) and a single
    # combined mask so the selection does not depend on global state.
    involved = (team_stats['Home'] == team) | (team_stats['Away'] == team)
    earlier = team_stats['Date'] < date
    prev_game = team_stats[earlier & involved].sort_values(by='Date').tail(1).iloc[0]

    # Pick the column that matches the side the team played on in that game.
    rating_column = 'H_Team_Elo_After' if team == prev_game['Home'] else 'A_Team_Elo_After'
    elo_rating = elo_df.loc[elo_df['Game_ID'] == prev_game['Game_ID'], rating_column].values[0]

    if prev_game['Season'] != season:
        return (0.75 * elo_rating) + (0.25 * 1505)  # Year-to-Year Carry-Over
    return elo_rating

In [None]:
# Walk the game log chronologically and record every team's Elo before and after each game.
# Rows are collected in plain lists and turned into DataFrames once at the end:
# DataFrame.append no longer exists in pandas 2.x and growing a frame row by row is quadratic.
HOME_COURT_ADVANTAGE = 69  # Elo points credited to the home side in update_elo
INITIAL_ELO = 1500         # rating assigned to a team's first game in the log
CARRY_OVER_MEAN = 1505     # value ratings regress toward between seasons

df = df.sort_values(by='Date').reset_index(drop=True)

elo_rows = []
team_elo_rows = []
last_seen = {}  # team -> (season, Elo after that team's most recent game)


def _elo_before_game(team, season):
    """Rating a team brings into a game: 1500 on debut, regressed toward 1505 on a season change."""
    if team not in last_seen:
        return INITIAL_ELO
    prev_season, prev_elo = last_seen[team]
    if prev_season != season:
        return (0.75 * prev_elo) + (0.25 * CARRY_OVER_MEAN)  # Year-to-Year Carry-Over
    return prev_elo


for index, row in df.iterrows():
    game_id = row['Game_ID']
    game_date = row['Date']
    season = row['Season']
    h_team, a_team = row['Home'], row['Away']
    h_score, a_score = row['H_Score'], row['A_Score']

    h_team_elo_before = _elo_before_game(h_team, season)
    a_team_elo_before = _elo_before_game(a_team, season)

    h_team_elo_after, a_team_elo_after = update_elo(
        h_score, a_score, h_team_elo_before, a_team_elo_before, HOME_COURT_ADVANTAGE
    )

    # Remember the post-game ratings so the next game of either team starts from them.
    last_seen[h_team] = (season, h_team_elo_after)
    last_seen[a_team] = (season, a_team_elo_after)

    elo_rows.append({'Game_ID': game_id, 'H_Team': h_team, 'A_Team': a_team,
                     'H_Team_Elo_Before': h_team_elo_before, 'A_Team_Elo_Before': a_team_elo_before,
                     'H_Team_Elo_After': h_team_elo_after, 'A_Team_Elo_After': a_team_elo_after})
    team_elo_rows.append({'Game_ID': game_id, 'Team': h_team, 'Elo': h_team_elo_before,
                          'Date': game_date, 'Where_Played': 'Home', 'Season': season})
    team_elo_rows.append({'Game_ID': game_id, 'Team': a_team, 'Elo': a_team_elo_before,
                          'Date': game_date, 'Where_Played': 'Away', 'Season': season})

elo_df = pd.DataFrame(elo_rows, columns=['Game_ID', 'H_Team', 'A_Team', 'H_Team_Elo_Before', 'A_Team_Elo_Before', 'H_Team_Elo_After', 'A_Team_Elo_After'])
teams_elo_df = pd.DataFrame(team_elo_rows, columns=['Game_ID', 'Team', 'Elo', 'Date', 'Where_Played', 'Season'])

In [None]:
# Wide Elo table: one row per team, one column per game date (MM-DD-YYYY, chronological),
# each cell holding the rating the team carried into its game on that date.
dates = sorted(
    {d.strftime("%m-%d-%Y") for d in teams_elo_df["Date"]},
    key=lambda x: time.strptime(x, '%m-%d-%Y'),
)

# Kept under its own name so the `teams` name -> id dictionary defined in the
# first cell is not overwritten.
team_names = df["Away"].drop_duplicates()
dataset = pd.DataFrame(index=pd.Index(team_names, name="Team"), columns=dates)

for index, row in teams_elo_df.iterrows():
    # Single .loc assignment; chained dataset[date][team] = ... may write to a temporary copy.
    dataset.loc[row["Team"], row["Date"].strftime("%m-%d-%Y")] = row["Elo"]

teams_elo_df['Elo'] = teams_elo_df['Elo'].astype(float)

elo_df.head()

In [None]:
df.head()

In [None]:
df = df.merge(elo_df.drop(columns=['H_Team', 'A_Team']), on ='Game_ID')
df.head()

# Merging Dataset

# Standardization and Z Score

In [None]:
df.head()

# Evaluate Different Models - No Z Score

In [None]:
# Persist the merged game log. The path is relative to the project root (the same
# ./data folder the later read_csv cell loads from) rather than an absolute path
# that exists on one machine only.
df.to_csv('./data/nba_raw.csv', index=False)
print(f'The final dataset consists of three seasons and {len(df)} games.')
df = df.reset_index(drop=True)
df.tail()


In [None]:
df.head()
df = df.drop(labels=['H_Team_Elo_After', 'A_Team_Elo_After'], axis=1)
df.head()

In [None]:
df["H_Team_Elo_Before"] = df.H_Team_Elo_Before.astype(float)
df["A_Team_Elo_Before"] = df.A_Team_Elo_Before.astype(float)
df = df.drop(['Home', 'Away', 'Game_ID', 'H_Score', 'A_Score', 'Date', 'Season'], axis=1)
#df.head()
#df.columns
#df.info()
df.columns

In [None]:
# Write the modelling table to the same relative location the next cell reads back
# ('./data/nba.csv'), so the notebook runs from any checkout of the project.
df.to_csv('./data/nba.csv', index=False)

In [None]:
import pandas as pd
df = pd.read_csv('./data/nba.csv')

In [None]:
df

# EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

plt.figure(figsize=(44, 34))
correlation = df[['H_W_PCT', 
                  'H_REB', 
                  'H_AST',
                  'H_TOV', 
                  'H_STL', 
                  'H_BLK', 
                  'H_PLUS_MINUS', 
                  'H_OFF_RATING',
                  'H_DEF_RATING', 
                  'H_TS_PCT', 
                  'H_Team_Elo_Before', 
                  'Home_W_Pct_10', 
                  'Result'
                  ]]

sns.heatmap(correlation.corr(), annot=True);
# correlation
# sns.heatmap(df.corr(), annot=True);
# sns.heatmap(df['Result'].corr(), annot=True);

In [None]:
print(df.corr()['Result'].abs().sort_values(ascending=False))

In [None]:
import numpy as np
# Absolute pairwise correlations, lower triangle only. The matrix is computed once
# and reused for both the mask shape and the plotted values.
corr_matrix = df.corr()

with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(44, 34))

    # True above and on the diagonal -> those cells are hidden by heatmap.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    ax = sns.heatmap(corr_matrix.abs(), mask=mask, annot=True, ax=ax)
    fig.savefig('images/Corelation_Heatmap');

In [None]:
final_df

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#import final dataset
#final_df = pd.read_csv('')

#drop non numeric columns
#df.drop(columns = ['Home', 'Away', 'Game_ID', 'Date', 'Season'], axis = 1, inplace = True )

# NOTE(review): `final_df` is not assigned in the cells visible here (the read_csv
# line above is commented out); presumably it should be the frame loaded from
# ./data/nba.csv as `df` — confirm before running on a fresh kernel.
# Features: every column except the target.
X = final_df.drop(columns = 'Result')

# Target: the 'Result' column.
y = final_df['Result']

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

print(f'X train shape: {X_train.shape}')
print(f'X test shape: {X_test.shape}')

In [None]:
print(f"""Raw Counts 
{final_df["Result"].value_counts()}\n
Percentages 
{final_df["Result"].value_counts(normalize=True)}


We would get an accuracy score of 0.566312 with a baseline model, i.e. about 56.6% accuracy

This is because about 56.6% of the results are wins""")

In [None]:
print(f"""Train percent wins
{y_train.value_counts(normalize=True)}\n""")

print(f"""Test percent wins: 
{y_test.value_counts(normalize=True)}\n""")

In [None]:
from sklearn.model_selection import cross_val_score


class ModelWithCV():
    '''Bundle an estimator with its training data and cross-validation scores.

    Attributes set by the constructor: model, name, X, y.
    Attributes filled by cross_validate(): cv_results (per-fold scores),
    cv_mean, cv_median and cv_std. They are None until cross-validation runs.
    '''

    def __init__(self, model, model_name, X, y, cv_now=True):
        '''
        Args:
          model: Estimator (or pipeline) passed to cross_val_score.
          model_name: Label used in summaries and plot titles.
          X: Training features.
          y: Training target.
          cv_now: When True (default), run cross_validate() immediately.
        '''
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()

    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation and store the results on the object.

        Args:
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)
        '''
        # Compare against None explicitly: the truth value of a DataFrame or
        # ndarray is ambiguous, so `X if X else ...` raises when data is passed.
        cv_X = self.X if X is None else X
        cv_y = self.y if y is None else y

        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)

    def print_cv_summary(self):
        '''Print the mean and standard deviation of the stored CV scores.'''
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)

    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given
        Axis for plotting. Returns the same Axis.
        '''
        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )

        return ax

## Dummy

In [None]:
# Baseline: always predict the most frequent class.
# A classifier (not DummyRegressor) is used so the cross-validation score is
# accuracy, which is what ModelWithCV.print_cv_summary reports.
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipe = Pipeline(steps=[
    ('estimator', DummyClassifier(strategy='most_frequent'))
])

dummy_pipe = ModelWithCV(pipe, 'estimator', X_train, y_train)
cv = dummy_pipe  # both names are used by the plotting / summary cells that follow

In [None]:
f, ax = plt.subplots()

cv.plot_cv(ax);

In [None]:
cv.print_cv_summary()

In [None]:
dummy_pipe.print_cv_summary()

## Logistic Regression

In [None]:
# Import relevant class and function
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA

pipe = Pipeline(steps=[
    #('scaler', StandardScaler()),
    #('pca', PCA(n_components=4)),
    ('estimator', LogisticRegression(random_state=42))
])

In [None]:
# Search space for the logistic-regression step of the pipeline
# (keys are prefixed with the pipeline step name 'estimator').
param_grid = {
    'estimator__C': np.logspace(-4, 4, 50),
    'estimator__solver': ['liblinear'],
    'estimator__penalty': ['l1', 'l2'],
    'estimator__class_weight': ['balanced', None],
    'estimator__max_iter': [1000],
}

In [None]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(pipe, param_grid, cv=10, return_train_score=True, scoring='accuracy', verbose=1)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Train accuracy averaged over every candidate and fold of the search (not only the winner).
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Accuracy of the best candidate on the held-out test split.
grid_test_score = grid_search.score(X_test, y_test)

# The search already refit the best candidate on the full training split
# (refit=True is the default), so it is used as-is instead of being fit again.
best_grid = grid_search.best_estimator_
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Optimal Parameters: {grid_search.best_params_}\n")
# best_score_ is the mean cross-validated accuracy of the best candidate, not a test-set score.
print(f"Best CV Accuracy: {grid_search.best_score_:.2%}\n")

print(classification_report(y_test, y_pred))

## Random Forest - Random Search CV

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

pipe = Pipeline(steps=[
    ('estimator', RandomForestClassifier(random_state=42))
])

In [None]:
# Search space for the random-forest step (keys are prefixed with the pipeline step name).
param_grid = {}
param_grid['estimator__n_estimators'] = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# 'auto' is no longer accepted by recent scikit-learn releases and was an alias of
# 'sqrt' for classifiers, so it only duplicated the second entry.
param_grid['estimator__max_features'] = ['sqrt', 'log2']
param_grid['estimator__max_depth'] = [int(x) for x in np.linspace(10, 110, num = 11)]
param_grid['estimator__min_samples_split'] = [2, 5, 10]
param_grid['estimator__min_samples_leaf'] = [1, 2, 4]
param_grid['estimator__bootstrap'] = [True, False]


In [None]:
from sklearn.model_selection import RandomizedSearchCV

grid_search = RandomizedSearchCV(estimator=pipe, 
                                 param_distributions=param_grid, 
                                 cv=10, return_train_score=True, 
                                 scoring='accuracy', 
                                 n_iter=100, 
                                 random_state=42, 
                                 n_jobs = -1, 
                                 verbose=1)


In [None]:
grid_search.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Train accuracy averaged over every sampled candidate and fold (not only the winner).
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Accuracy of the best candidate on the held-out test split.
grid_test_score = grid_search.score(X_test, y_test)

# The search already refit the best candidate on the full training split
# (refit=True is the default); fitting the forest again would only repeat that work.
best_grid = grid_search.best_estimator_
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Optimal Parameters: {grid_search.best_params_}\n")
# best_score_ is the mean cross-validated accuracy of the best candidate, not a test-set score.
print(f"Best CV Accuracy: {grid_search.best_score_:.2%}\n")

print(classification_report(y_test, y_pred))

## Random Forest - Grid Search CV

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

pipe = Pipeline(steps=[
    ('estimator', RandomForestClassifier(random_state=42))
])

In [None]:
# Exhaustive search space for the random-forest step.
# NOTE(review): this is 10 * 2 * 11 * 3 * 3 * 2 = 3,960 combinations before
# cross-validation folds — consider narrowing it around the randomized-search winner.
param_grid = {}
param_grid['estimator__n_estimators'] = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# 'auto' is no longer accepted by recent scikit-learn releases and was an alias of
# 'sqrt' for classifiers, so it only duplicated the second entry.
param_grid['estimator__max_features'] = ['sqrt', 'log2']
param_grid['estimator__max_depth'] = [int(x) for x in np.linspace(10, 110, num=11)]
param_grid['estimator__min_samples_split'] = [2, 5, 10]
param_grid['estimator__min_samples_leaf'] = [1, 2, 4]
param_grid['estimator__bootstrap'] = [True, False]


In [None]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=10, 
                           return_train_score=True, 
                           scoring='accuracy',
                           n_jobs = -1,
                           verbose=1)


In [None]:
grid_search.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Train accuracy averaged over every candidate and fold of the search (not only the winner).
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Accuracy of the best candidate on the held-out test split.
grid_test_score = grid_search.score(X_test, y_test)

# The search already refit the best candidate on the full training split
# (refit=True is the default); fitting the forest again would only repeat that work.
best_grid = grid_search.best_estimator_
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Optimal Parameters: {grid_search.best_params_}\n")
# best_score_ is the mean cross-validated accuracy of the best candidate, not a test-set score.
print(f"Best CV Accuracy: {grid_search.best_score_:.2%}\n")

print(classification_report(y_test, y_pred))

## Gaussian Naive Bayes

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB

pipe = Pipeline(steps=[
    ('estimator', GaussianNB())
])

In [None]:
param_grid = {}
param_grid['estimator__var_smoothing'] = np.logspace(0,-9, num=100)
#param_grid['estimator__var_smoothing'] = [1e-11, 1e-10, 1e-9]


In [None]:
from sklearn.model_selection import RandomizedSearchCV

grid_search = RandomizedSearchCV(estimator=pipe, 
                                 param_distributions=param_grid, 
                                 cv=10, 
                                 return_train_score=True, 
                                 scoring='accuracy', 
                                 random_state=42,
                                 n_jobs = -1,
                                 verbose=1)


In [None]:
grid_search.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Train accuracy averaged over every sampled candidate and fold (not only the winner).
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Accuracy of the best candidate on the held-out test split.
grid_test_score = grid_search.score(X_test, y_test)

# The search already refit the best candidate on the full training split
# (refit=True is the default), so it is used as-is instead of being fit again.
best_grid = grid_search.best_estimator_
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Optimal Parameters: {grid_search.best_params_}\n")
# best_score_ is the mean cross-validated accuracy of the best candidate, not a test-set score.
print(f"Best CV Accuracy: {grid_search.best_score_:.2%}\n")

print(classification_report(y_test, y_pred))

## XGB

In [None]:
from sklearn.pipeline import Pipeline
import xgboost as xgb

# The package is imported as `xgb`, so that is the name that must be used here.
# A classifier is required: the search below scores with 'accuracy' against the
# binary 'Result' target, which a regressor's continuous predictions cannot satisfy.
pipe = Pipeline(steps=[
    ('estimator', xgb.XGBClassifier(random_state=42))
])



In [None]:
# Search space for the XGBoost step. Each value must be a plain list of candidates;
# a trailing comma after the list would wrap it in a 1-tuple, making the whole
# list a single (invalid) candidate value.
param_grid = {}
param_grid['estimator__min_child_weight'] = [1, 5, 10]
param_grid['estimator__gamma'] = [0.5, 1, 1.5, 2, 5]
param_grid['estimator__subsample'] = [0.6, 0.8, 1.0]
param_grid['estimator__colsample_bytree'] = [0.6, 0.8, 1.0]
param_grid['estimator__max_depth'] = [3, 4, 5]

In [None]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=pipe, 
                           param_grid=param_grid, 
                           cv=10, 
                           return_train_score=True, 
                           scoring='accuracy',
                           n_jobs = -1,
                           verbose=1)


In [None]:
grid_search.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Train accuracy averaged over every candidate and fold of the search (not only the winner).
grid_train_score = np.mean(grid_search.cv_results_['mean_train_score'])

# Accuracy of the best candidate on the held-out test split.
grid_test_score = grid_search.score(X_test, y_test)

# The search already refit the best candidate on the full training split
# (refit=True is the default), so it is used as-is instead of being fit again.
best_grid = grid_search.best_estimator_
y_pred = best_grid.predict(X_test)

print(f"Mean Training Score: {grid_train_score:.2%}\n")
print(f"Mean Test Score: {grid_test_score:.2%}\n")

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2%}\n")
print(f"Optimal Parameters: {grid_search.best_params_}\n")
# best_score_ is the mean cross-validated accuracy of the best candidate, not a test-set score.
print(f"Best CV Accuracy: {grid_search.best_score_:.2%}\n")

print(classification_report(y_test, y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd

#script to test the effectivenes of each model, uses default parameters
#test six different classification models 
def run_exps(X_train, y_train, X_test, y_test) :
    '''
    Lightweight script to test many models with default parameters.

    For every model: run 5-fold cross-validation on the training split, fit on
    the whole training split, and print a classification report for the test split.

    :param X_train: training split
    :param y_train: training target vector
    :param X_test: test split
    :param y_test: test target vector
    :return: DataFrame with one row per (model, fold) holding the
             cross_validate output columns plus a 'model' column
    '''

    models = [
          ('LogReg', LogisticRegression()),
          ('RF', RandomForestClassifier()),
          ('KNN', KNeighborsClassifier()),
          ('SVM', SVC()),
          ('GNB', GaussianNB()),
          ('XGB', XGBClassifier())
        ]

    scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']

    # classification_report pairs target_names with the labels in sorted order
    # (0 first, then 1); Result == 1 is a home win, so 0 must be named 'loss'.
    target_names = ['loss', 'win']

    dfs = []

    for name, model in models:

        kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=90210)
        cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        print(name)
        print(classification_report(y_test, y_pred, target_names=target_names))

        this_df = pd.DataFrame(cv_results)
        this_df['model'] = name
        dfs.append(this_df)

    final = pd.concat(dfs, ignore_index=True)

    return final
final = run_exps(X_train, y_train, X_test, y_test)
final

# Model Performance

In [None]:
# Resample each model's cross-validation rows (30 draws with replacement) so the
# box plots below have a distribution to show. The seed and the first-appearance
# model order make the resample repeatable between runs.
bootstraps = []
for model in final['model'].unique():
    model_df = final.loc[final.model == model]
    bootstrap = model_df.sample(n=30, replace=True, random_state=42)
    bootstraps.append(bootstrap)

bootstrap_df = pd.concat(bootstraps, ignore_index=True)
results_long = pd.melt(bootstrap_df,id_vars=['model'],var_name='metrics', value_name='values')
time_metrics = ['fit_time','score_time'] # fit time metrics
## PERFORMANCE METRICS
results_long_nofit = results_long.loc[~results_long['metrics'].isin(time_metrics)] # get df without fit data
results_long_nofit = results_long_nofit.sort_values(by='values')
## TIME METRICS
results_long_fit = results_long.loc[results_long['metrics'].isin(time_metrics)] # df with fit data
results_long_fit = results_long_fit.sort_values(by='values')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(20, 12))
sns.set(font_scale=2.5)
g = sns.boxplot(x="model", y="values", hue="metrics", data=results_long_nofit, palette="Set3")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('Comparison of Model by Classification Metric')
plt.savefig('./benchmark_models_performance.png',dpi=300)

In [None]:
plt.figure(figsize=(20, 12))
sns.set(font_scale=2.5)
g = sns.boxplot(x="model", y="values", hue="metrics", data=results_long_fit, palette="Set3")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('Comparison of Model by Fit and Score Time')
plt.savefig('./benchmark_models_time.png',dpi=300)

In [None]:
# Standard deviation and mean of every performance metric per model.
# Sorted names give a stable column order; the list is not called `metrics` so it
# cannot be confused with the sklearn.metrics module.
metric_names = sorted(set(results_long_nofit.metrics.values))
bootstrap_df.groupby(['model'])[metric_names].agg(['std', 'mean'])

# Final Selected Model
- grid search for parameters 
- Gaussian NB

In [None]:
# Gaussian NB only has one parameter 'var_smoothing'
# Portion of the largest variance of all features that is added to variances for calculation stability.
# Number of different combinations of parameters 

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

nb_classifier = GaussianNB()

# classification_report pairs target_names with the labels in sorted order
# (0 first, then 1); Result == 1 is a home win, so 0 must be named 'Loss'.
target_names = ['Loss', 'Win']

# 100 log-spaced candidates from 1 down to 1e-9.
params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=90210)

gs_NB = GridSearchCV(estimator=nb_classifier,
                 param_grid=params_NB,
                 cv=kfold,
                 verbose=1,
                 scoring='accuracy', n_jobs=-1)

gs_NB.fit(X_train, y_train)

# The search already refit the best candidate on the full training split
# (refit=True is the default), so it is used as-is instead of being fit again.
best_gs_grid = gs_NB.best_estimator_
y_pred_best_gs = best_gs_grid.predict(X_test)

print(classification_report(y_test, y_pred_best_gs, target_names=target_names))
gs_NB.best_params_

In [None]:
from sklearn.metrics import confusion_matrix
confusionMatrix = confusion_matrix(y_test, y_pred_best_gs)
print(confusionMatrix)

# Confusion Matrix

In [None]:
# Summary metrics for the selected Gaussian NB model on the test split.
# Only names produced by the cells above are used (y_test, y_pred_best_gs).
# No per-feature coefficients are printed: Gaussian NB does not expose a coef_ attribute.
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

confusionMatrix = confusion_matrix(y_test, y_pred_best_gs)

print('----------------------------------')

print("Accuracy:", accuracy_score(y_test, y_pred_best_gs))
print("Precision:", precision_score(y_test, y_pred_best_gs))
print("Recall:", recall_score(y_test, y_pred_best_gs))

print('----------------------------------')

print('Confusion Matrix:')
print(confusionMatrix)

# Saving Model

In [None]:
import pickle

# Saves the model in folder to be used in future
# filename should be end in '.pkl'
def save_model(model, filename):
    """Pickle `model` and write the bytes to `filename`, replacing any existing file."""
    with open(filename, mode='wb') as handle:
        handle.write(pickle.dumps(model))
# save_model needs both the fitted estimator and a target path.
# NOTE(review): the file name is a placeholder — confirm where the model should be stored.
save_model(best_gs_grid, 'nba_gaussian_nb.pkl')