In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from datetime import date
import torch

torch.manual_seed(42)

<torch._C.Generator at 0x706c4b7b8250>

In [2]:
season_dates = {
    '2005-2006': { 'start': date(2005, 11, 1), 'end': date(2006, 6, 20) },
    '2006-2007': { 'start': date(2006, 10, 31), 'end': date(2007, 6, 14) },
    '2007-2008': { 'start': date(2007, 10, 30), 'end': date(2008, 6, 17) },
    '2008-2009': { 'start': date(2008, 10, 28), 'end': date(2009, 6, 14) },
    '2009-2010': { 'start': date(2009, 10, 27), 'end': date(2010, 6, 17) },
    '2010-2011': { 'start': date(2010, 10, 26), 'end': date(2011, 6, 12) },
    '2011-2012': { 'start': date(2011, 12, 25), 'end': date(2012, 6, 21) },
    '2012-2013': { 'start': date(2012, 10, 30), 'end': date(2013, 6, 20) },
    '2013-2014': { 'start': date(2013, 10, 29), 'end': date(2014, 6, 15) },
    '2014-2015': { 'start': date(2014, 10, 28), 'end': date(2015, 6, 16) },
    '2015-2016': { 'start': date(2015, 10, 27), 'end': date(2016, 6, 19) },
    '2016-2017': { 'start': date(2016, 10, 25), 'end': date(2017, 6, 12) },
    '2017-2018': { 'start': date(2017, 10, 17), 'end': date(2018, 6, 8) },
    '2018-2019': { 'start': date(2018, 10, 16), 'end': date(2019, 6, 13) },
    '2019-2020': { 'start': date(2019, 10, 22), 'end': date(2020, 10, 11) },
    '2020-2021': { 'start': date(2020, 12, 22), 'end': date(2021, 7, 22) },
    '2021-2022': { 'start': date(2021, 10, 19), 'end': date(2022, 6, 16) },
    '2022-2023': { 'start': date(2022, 10, 18), 'end': date(2023, 6, 15) },
    '2023-2024': { 'start': date(2023, 10, 17), 'end': date(2024, 6, 13) },
}

numeric_columns = ['SEC', 'PTS', 'FGM', 'FGA', 'FG_PCT', 'FG3M',
                   'FG3A', 'FG3_PCT',  'FTM', 'FTA', 'FT_PCT', 'OREB', 
                   'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF']

non_numeric_colums = ['GAME_ID', 'PLAYER_ID', 'SEASON', 'GAME_DATE', 'TEAM_ID']

v3_non_numeric_colums = ['teamCity', 'teamTricode', 'teamName', 'teamSlug', 'firstName', 'familyName', 'nameI', 'playerSlug', 'position', 'comment', 'jerseyNum', 'minutes']

In [3]:
# Load games
games = pd.read_csv('../data/games.csv')

# Convert GAME_DATE to datetime
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

# Sort by GAME_DATE
games.sort_values('GAME_DATE', inplace=True)

# Keep only games from 2010-2011 season onwards and before the 2022-2023 season
games = games[games['GAME_DATE'] >= '2010-10-26']

# Extract game ids
game_ids = games['GAME_ID'].unique()

# Load games_details
games_details = pd.read_csv('../data/games_details.csv')

# Keep only games in game_ids
games_details = games_details[games_details['GAME_ID'].isin(game_ids)]
games_details = games_details[games_details['COMMENT'].isna()]

# Drop unnecessary columns
games_details.drop(['PLAYER_NAME', 'TEAM_ABBREVIATION', 'TEAM_CITY', 'START_POSITION', 'NICKNAME', 'COMMENT'], axis=1, inplace=True)

# Load advanced_box_scores
advanced_box_scores = pd.read_csv('../data/advanced_box_scores.csv')
advanced_box_scores = advanced_box_scores[advanced_box_scores['comment'].isna()]
advanced_box_scores.drop(v3_non_numeric_colums, axis=1, inplace=True)
advanced_box_scores.drop(['offensiveReboundPercentage', 'defensiveReboundPercentage', 'reboundPercentage', 'usagePercentage'], axis=1, inplace=True)
advanced_box_scores.rename(columns={'gameId': 'GAME_ID', 'teamId': 'TEAM_ID', 'personId': 'PLAYER_ID'}, inplace=True)
advanced_box_scores = advanced_box_scores[advanced_box_scores['GAME_ID'].isin(game_ids)]

# Load usage stats
usage_stats = pd.read_csv('../data/usage_stats.csv')
usage_stats = usage_stats[usage_stats['COMMENT'].isna()]
usage_stats.drop(['MIN', 'TEAM_ABBREVIATION', 'TEAM_CITY', 'COMMENT', 'PLAYER_NAME', 'NICKNAME', 'START_POSITION'], axis=1, inplace=True)
usage_stats = usage_stats[usage_stats['GAME_ID'].isin(game_ids)]

# Add game date to games_details
games_details = games_details.merge(games[['GAME_ID', 'GAME_DATE']], on='GAME_ID', how='left')

# Drop rows with missing values (Player did not play)
games_details.dropna(inplace=True)

# Convert MIN to seconds
def convert_to_seconds(x):
    x = x.split(':')
    return int(float(x[0]))*60 + int(float(x[1])) if len(x) == 2 else int(float(x[0]))*60

games_details['MIN'] = games_details['MIN'].apply(convert_to_seconds)
games_details.rename(columns={'MIN': 'SEC'}, inplace=True)

# Add a column for the season that the game was played in
def get_season(x):
    x = x.date()
    for key, value in season_dates.items():
        if x >= value['start'] and x <= value['end']:
            return key
    return None

games_details['SEASON'] = games_details['GAME_DATE'].apply(get_season)

# only keep advanced_box_scores that are in games_details
games_details = games_details[games_details['GAME_ID'].isin(advanced_box_scores['GAME_ID'])]

# Only keep players that are in advanced_box_scores
games_details = games_details[games_details['PLAYER_ID'].isin(advanced_box_scores['PLAYER_ID'].unique())]

print(len(games_details['GAME_ID'].unique()))
print(len(advanced_box_scores['GAME_ID'].unique()))

# Merge advanced_box_scores with games_details
games_details = games_details.merge(advanced_box_scores, on=['GAME_ID', 'TEAM_ID', 'PLAYER_ID'], how='left')

# Merge usage_stats with games_details
games_details = games_details.merge(usage_stats, on=['GAME_ID', 'TEAM_ID', 'PLAYER_ID'], how='left')

games_details.dropna(inplace=True)

# Sort by GAME_DATE, GAME_ID, TEAM_ID, and MIN
games_details.sort_values(['GAME_DATE', 'GAME_ID', 'TEAM_ID', 'SEC'], ascending=[True, True, True, False], inplace=True)
games_details.reset_index(drop=True, inplace=True)

print(len(games_details))
print(games_details.head())
print(games_details.columns)

  games_details = pd.read_csv('../data/games_details.csv')


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Create a new dataframe and store the rolling averages of the past 10 games in each season for each player
window = 10
min_periods = 10
rolling_averages = pd.DataFrame()
numeric_columns = games_details.columns.drop(non_numeric_colums)
for season in season_dates.keys():
    season_games_details = games_details[games_details['SEASON'] == season]
    for team in season_games_details['TEAM_ID'].unique():
        team_games_details = season_games_details[season_games_details['TEAM_ID'] == team].sort_values('GAME_DATE')
        for player in team_games_details['PLAYER_ID'].unique():
            player_games_details = team_games_details[team_games_details['PLAYER_ID'] == player].sort_values('GAME_DATE')
            player_games_details.reset_index(drop=True, inplace=True)
            # For each numeric column, calculate the rolling average of the past 10 games
            for column in numeric_columns:
                player_games_details.loc[:, column] = player_games_details[column].rolling(window=window, min_periods=min_periods).mean()
            # Make column for number of days since last game
            player_games_details.loc[:, 'DAYS_SINCE_LAST_GAME'] = player_games_details['GAME_DATE'].diff().dt.days
            # Shift the game id and game date columns up by 1
            player_games_details.loc[:, 'GAME_ID'] = player_games_details['GAME_ID'].shift(-1)
            player_games_details.loc[:, 'GAME_DATE'] = player_games_details['GAME_DATE'].shift(-1)
            rolling_averages = pd.concat([rolling_averages, player_games_details], ignore_index=True)

In [None]:
# Calculate the 10 game win percentage for each team
games = pd.read_csv('../data/games.csv')
team_win_percentage = pd.DataFrame()

for season in games['SEASON_ID'].unique():
    season_games = games[games['SEASON_ID'] == season]
    for team in season_games['TEAM_ID'].unique():
        team_games = season_games[season_games['TEAM_ID'] == team].sort_values('GAME_DATE')
        ten_game_win_percentage = team_games['WL'].apply(lambda x: 1 if x == 'W' else 0).rolling(window=10, min_periods=10).mean()
        cumilative_win_percentage = team_games['WL'].apply(lambda x: 1 if x == 'W' else 0).expanding().mean()
        home_win_percentage = team_games[team_games['HOME_TEAM'] == 1]['WL'].apply(lambda x: 1 if x == 'W' else 0).expanding().mean()
        away_win_percentage = team_games[team_games['HOME_TEAM'] == 0]['WL'].apply(lambda x: 1 if x == 'W' else 0).expanding().mean()
        win_percentages = pd.DataFrame({
            'GAME_ID': team_games['GAME_ID'],
            'TEAM_ID': team_games['TEAM_ID'],
            '10_GAME_WIN_PCT': ten_game_win_percentage,
            'CUM_WIN_PCT': cumilative_win_percentage,
            'HOME_WIN_PCT': home_win_percentage,
            'AWAY_WIN_PCT': away_win_percentage
        })
        win_percentages['GAME_ID'] = win_percentages['GAME_ID'].shift(-1)
        win_percentages.fillna(method='ffill', inplace=True)
        win_percentages.dropna(inplace=True)
        team_win_percentage = pd.concat([team_win_percentage, win_percentages], ignore_index=True)

print(len(team_win_percentage))

# Remove games that do not have 2 teams
team_win_percentage = team_win_percentage.groupby('GAME_ID').filter(lambda x: len(x) == 2)
team_win_percentage.dropna(inplace=True)

# Standardize the numeric columns
team_data_numeric_columns = team_win_percentage.columns.drop(['GAME_ID', 'TEAM_ID'])
scaler = StandardScaler()
team_win_percentage[team_data_numeric_columns] = scaler.fit_transform(team_win_percentage[team_data_numeric_columns])

print(len(team_win_percentage))
team_win_percentage.to_csv('../data/team_level_data.csv', index=False)

  win_percentages.fillna(method='ffill', inplace=True)


39826
38400


In [None]:
rolling_averages.dropna(inplace=True)
# Sort by GAME_DATE, GAME_ID, TEAM_ID, and SEC
rolling_averages.sort_values(['GAME_DATE', 'GAME_ID', 'TEAM_ID', 'SEC'], ascending=[True, True, True, False], inplace=True)
rolling_averages.reset_index(drop=True, inplace=True)
print(len(rolling_averages))
print(len(rolling_averages[rolling_averages['PLAYER_ID'] == 203507]))
print(rolling_averages.head())

In [None]:
# Standardize the data
scaler = StandardScaler()
rolling_averages[numeric_columns] = scaler.fit_transform(rolling_averages[numeric_columns])
print(rolling_averages.head())
rolling_averages.to_csv('../data/rolling_averages.csv', index=False)

In [None]:
rolling_averages = pd.read_csv('../data/rolling_averages_deep.csv')
games = pd.read_csv('../data/games.csv')
print(len(rolling_averages))

In [None]:
rolling_averages = pd.read_csv('../data/rolling_averages_deep.csv')
games = pd.read_csv('../data/games.csv')
game_ids = rolling_averages['GAME_ID'].unique()
# Create a map from GAME_ID to the home and away team ids of that game
game_id_to_teams = {}
for game_id in game_ids:
    game = games[games['GAME_ID'] == game_id]
    home_team_id = game[game['HOME_TEAM'] == 1]['TEAM_ID'].values[0]
    away_team_id = game[game['HOME_TEAM'] == 0]['TEAM_ID'].values[0]
    game_id_to_teams[game_id] = {'HOME_TEAM_ID': home_team_id, 'AWAY_TEAM_ID': away_team_id}

In [None]:
# Compute average players per team per game
players_per_team_per_game = rolling_averages.groupby(['GAME_ID', 'TEAM_ID']).size().reset_index(name='PLAYERS_PER_TEAM_PER_GAME')
average = players_per_team_per_game['PLAYERS_PER_TEAM_PER_GAME'].mean()
print(average)

9.41503633314701


In [None]:
# split games_details into home and away teams per game
# keep only the top num_players players per team
# If there are less than num_players players, drop the game
num_players = 9

home_games_details = pd.DataFrame()
away_games_details = pd.DataFrame()
for game_id in rolling_averages['GAME_ID'].unique():
    game_details = rolling_averages[rolling_averages['GAME_ID'] == game_id]
    home_team_id = game_id_to_teams[game_id]['HOME_TEAM_ID']
    away_team_id = game_id_to_teams[game_id]['AWAY_TEAM_ID']
    home_team_details = game_details[game_details['TEAM_ID'] == home_team_id]
    away_team_details = game_details[game_details['TEAM_ID'] == away_team_id]
    home_team_details = home_team_details.sort_values('SEC', ascending=False)
    away_team_details = away_team_details.sort_values('SEC', ascending=False)
    if len(home_team_details) < num_players:
        null_rows = pd.DataFrame(np.nan, index=np.arange(num_players - len(home_team_details)), columns=home_team_details.columns)
        home_team_details = pd.concat([home_team_details, null_rows], ignore_index=True)
        # Foward fill GAME_ID TEAM_ID PLAYER_ID and GAME_DATE
        home_team_details['GAME_ID'] = home_team_details['GAME_ID'].ffill()
        home_team_details['TEAM_ID'] = home_team_details['TEAM_ID'].ffill()
        home_team_details['PLAYER_ID'] = home_team_details['PLAYER_ID'].ffill()
        home_team_details['GAME_DATE'] = home_team_details['GAME_DATE'].ffill()
        home_games_details.fillna(0, inplace=True)
    if len(away_team_details) < num_players:
        null_rows = pd.DataFrame(np.nan, index=np.arange(num_players - len(away_team_details)), columns=away_team_details.columns)
        away_team_details = pd.concat([away_team_details, null_rows], ignore_index=True)
        # Foward fill GAME_ID TEAM_ID PLAYER_ID and GAME_DATE
        away_team_details['GAME_ID'] = away_team_details['GAME_ID'].ffill()
        away_team_details['TEAM_ID'] = away_team_details['TEAM_ID'].ffill()
        away_team_details['PLAYER_ID'] = away_team_details['PLAYER_ID'].ffill()
        away_team_details['GAME_DATE'] = away_team_details['GAME_DATE'].ffill()
        away_games_details.fillna(0, inplace=True)
    home_team_details = home_team_details.head(num_players)
    away_team_details = away_team_details.head(num_players)
    home_games_details = pd.concat([home_games_details, home_team_details], ignore_index=True)
    away_games_details = pd.concat([away_games_details, away_team_details], ignore_index=True)
home_games_details.reset_index(drop=True, inplace=True)
away_games_details.reset_index(drop=True, inplace=True)

In [None]:
print(len(home_games_details))
print(len(away_games_details))
print(home_games_details.tail())
print(away_games_details.head())
print(home_games_details.columns)
df = pd.DataFrame()

178317
178317
           GAME_ID       TEAM_ID  PLAYER_ID     SEC  FGM  FGA  FG_PCT  FG3M  \
178312  22300893.0  1.610613e+09   203484.0  1744.6  3.3  8.0  0.3700   1.7   
178313  22300893.0  1.610613e+09  1631128.0  1284.0  2.1  5.7  0.3623   0.6   
178314  22300893.0  1.610613e+09   202704.0  1270.9  3.0  7.6  0.3756   0.6   
178315  22300893.0  1.610613e+09  1631212.0  1103.7  2.0  4.9  0.3966   0.5   
178316  22300893.0  1.610613e+09  1630192.0   669.9  1.4  3.8  0.3166   0.0   

        FG3A  FG3_PCT  ...  effectiveFieldGoalPercentage  \
178312   4.4   0.3233  ...                        0.4616   
178313   2.1   0.3167  ...                        0.4133   
178314   2.8   0.2093  ...                        0.4310   
178315   1.6   0.3166  ...                        0.4500   
178316   0.4   0.0000  ...                        0.3166   

        trueShootingPercentage  usagePercentage  estimatedUsagePercentage  \
178312                  0.5332           0.1406                    0.1429

In [None]:
home_games_details[1080:1190]

Unnamed: 0,GAME_ID,TEAM_ID,PLAYER_ID,SEC,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,...,effectiveFieldGoalPercentage,trueShootingPercentage,usagePercentage,estimatedUsagePercentage,estimatedPace,pace,pacePer40,possessions,PIE,DAYS_SINCE_LAST_GAME
1080,20500261.0,1.610613e+09,2731.0,2064.7,4.7,12.1,0.3918,0.0,0.0,0.0000,...,0.3918,0.4641,0.2106,0.2121,95.989,94.652,78.875,67.7,0.1061,3.0
1081,20500261.0,1.610613e+09,2222.0,1977.8,6.2,12.2,0.4954,0.9,2.8,0.1625,...,0.5282,0.5405,0.1974,0.1989,97.327,96.183,80.151,65.8,0.1337,3.0
1082,20500261.0,1.610613e+09,1510.0,1915.9,3.6,9.7,0.3476,0.1,0.2,0.1000,...,0.3539,0.4366,0.1808,0.1827,96.837,95.518,79.598,63.2,0.1427,3.0
1083,20500261.0,1.610613e+09,2056.0,1502.6,4.1,7.8,0.4916,0.0,0.0,0.0000,...,0.4916,0.5297,0.1699,0.1709,98.010,96.570,80.474,50.5,0.0836,3.0
1084,20500261.0,1.610613e+09,2408.0,1194.7,3.1,5.9,0.4164,0.0,0.0,0.0000,...,0.4164,0.5498,0.2120,0.2178,91.179,92.265,76.887,37.9,0.1168,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1185,20500272.0,1.610613e+09,1800.0,1455.6,2.5,6.9,0.3488,2.0,5.5,0.3291,...,0.4799,0.5046,0.1423,0.1440,96.621,95.974,79.979,48.8,0.0259,3.0
1186,20500272.0,1.610613e+09,673.0,595.1,0.7,2.1,0.2917,0.0,0.0,0.0000,...,0.2917,0.3572,0.1199,0.1284,96.567,101.826,84.855,20.4,0.0796,3.0
1187,20500272.0,1.610613e+09,2739.0,458.9,0.4,1.8,0.1250,0.2,0.8,0.0400,...,0.1375,0.2137,0.1041,0.1068,86.508,96.082,80.067,14.9,0.3137,3.0
1188,20500273.0,1.610613e+09,2200.0,2263.8,6.5,13.9,0.5069,0.0,0.2,0.0000,...,0.5069,0.5515,0.2456,0.2488,89.728,88.459,73.714,69.9,0.1852,3.0


In [None]:
from torch.utils.data import Dataset

class GamesDataset(Dataset):
    def __init__(self, data):
        self.player_data, self.team_data, self.target = data
        
    def __len__(self):
        return self.target.shape[0]
    
    def __getitem__(self, index):
        return self.player_data[index], self.team_data[index], self.target[index]

In [None]:
games = pd.read_csv('../data/games.csv')
team_level_data = pd.read_csv('../data/team_level_data.csv')
games_outcomes = games[['GAME_ID', 'HOME_TEAM_WON']]

def get_team_player_details(team_games_details, game_id):
    team = team_games_details[team_games_details['GAME_ID'] == game_id]
    team = team.drop(['GAME_ID', 'GAME_DATE', 'SEASON', 'TEAM_ID', 'PLAYER_ID'], axis=1)
    team = team.astype(float)
    team = team.to_numpy()
    return team
    
def get_team_level_data(team_level_data, game_id, team_id):
    team = team_level_data[(team_level_data['GAME_ID'] == game_id) & (team_level_data['TEAM_ID'] == team_id)]
    team = team.drop(['GAME_ID', 'TEAM_ID'], axis=1)
    team = team.astype(float)
    team = team.to_numpy()
    team = team[0]
    return team

seasons = {}
game_ids = home_games_details[home_games_details['GAME_ID'].isin(team_level_data['GAME_ID'])]['GAME_ID'].unique()
for i in range(len(game_ids)):
    season = rolling_averages[rolling_averages['GAME_ID'] == game_ids[i]]['SEASON'].values[0]
    home_team_id = home_games_details[home_games_details['GAME_ID'] == game_ids[i]]['TEAM_ID'].values[0]
    away_team_id = away_games_details[away_games_details['GAME_ID'] == game_ids[i]]['TEAM_ID'].values[0]
    home_team_player_details = get_team_player_details(home_games_details, game_ids[i])
    away_team_player_details = get_team_player_details(away_games_details, game_ids[i])
    home_team_level_data = get_team_level_data(team_level_data, game_ids[i], home_team_id)
    away_team_level_data = get_team_level_data(team_level_data, game_ids[i], away_team_id)
    matchup = np.array([home_team_player_details, away_team_player_details])
    team_data = np.array([home_team_level_data, away_team_level_data])
    outcome = games_outcomes[games_outcomes['GAME_ID'] == game_ids[i]]['HOME_TEAM_WON'].values[0]
    seasons[season] = seasons.get(season, []) + [(matchup, team_data, outcome)]

In [None]:
sample = seasons['2017-2018'][0][0]
num_players = sample.shape[1]
num_features = sample.shape[2]
num_team_features = seasons['2017-2018'][0][1].shape[1]
print(sample.shape)
print(num_team_features)
# Print total number of games
total = 0
for season in seasons:
    total += len(seasons[season])
    print(season, len(seasons[season]))
print(total)

(2, 9, 43)
4
2005-2006 1054
2006-2007 1055
2007-2008 1057
2008-2009 1059
2009-2010 1055
2010-2011 1060
2011-2012 814
2012-2013 1057
2013-2014 1059
2014-2015 1057
2015-2016 1057
2016-2017 1058
2017-2018 1059
2018-2019 1056
2019-2020 889
2020-2021 909
2021-2022 1058
2022-2023 1058
2023-2024 739
19210


In [None]:
from model_definitions import train_test_split
from sklearn.preprocessing import StandardScaler

splits = []
for season in list(seasons.keys())[11:]:
    train, test = train_test_split(seasons, season)
    player_features_train = np.array([game[0] for game in train])
    player_features_train = player_features_train.reshape(-1, num_features)
    player_feature_scaler = StandardScaler()
    player_feature_scaler.fit(player_features_train)
    player_features_train = player_feature_scaler.transform(player_features_train)
    player_features_train = player_features_train.reshape(-1, 2, num_players, num_features)
    player_features_test = np.array([game[0] for game in test])
    player_features_test = player_features_test.reshape(-1, num_features)
    player_features_test = player_feature_scaler.transform(player_features_test)
    player_features_test = player_features_test.reshape(-1, 2, num_players, num_features)
    team_features_train = np.array([game[1] for game in train])
    team_features_train = team_features_train.reshape(-1, num_team_features)
    team_feature_scaler = StandardScaler()
    team_feature_scaler.fit(team_features_train)
    team_features_train = team_feature_scaler.transform(team_features_train)
    team_features_train = team_features_train.reshape(-1, 2, num_team_features)
    team_features_test = np.array([game[1] for game in test])
    team_features_test = team_features_test.reshape(-1, num_team_features)
    team_features_test = team_feature_scaler.transform(team_features_test)
    team_features_test = team_features_test.reshape(-1, 2, num_team_features)
    outcomes_train = np.array([game[2] for game in train])
    outcomes_test = np.array([game[2] for game in test])
    splits.append(((player_features_train, team_features_train, outcomes_train), (player_features_test, team_features_test, outcomes_test)))
    
train = splits[0][0]
sample = train[0]
print(sample)

(11384, 2, 4)


In [None]:
import torch
from torch.utils.data import DataLoader
from torch import nn

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

Using cuda device


In [None]:
import importlib
import dataloaders
importlib.reload(dataloaders)
from dataloaders import *
import model_definitions
importlib.reload(model_definitions)
from model_definitions import *
import utils
importlib.reload(utils)
from torch.utils.data import DataLoader

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

device = 'cuda' if torch.cuda.is_available() else 'cpu'

games, rolling_averages_deep, rolling_averages_wide, team_level_data, _, _ = utils.load_data()
games = games[games['SEASON_ID'] >= 22017]
games = games[games['GAME_ID'].isin(rolling_averages_deep['GAME_ID'])]
games = games[games['GAME_ID'].isin(rolling_averages_wide['GAME_ID'])]
games = games[games['GAME_ID'].isin(team_level_data['GAME_ID'])]

def get_predictions(model, dataloader):
    predictions = []
    outcomes = []
    model.eval()
    for player_data, team_data, outcome in dataloader:
        player_data = player_data.to(device)
        team_data = team_data.to(device)
        outcome = outcome.to(device)
        prediction = model(player_data, team_data)
        predictions.append(prediction)
        outcomes.append(outcome)
    return predictions, outcomes

model_wide = NeuralNetwork(10, 119, 2, hidden_layers=[111, 301])
model_wide.load_state_dict(torch.load('../models/model_widev1.pth'))
model_wide.to(device)

model_deep = NeuralNetwork(10, 38, 2, hidden_layers=[251, 19])
model_deep.load_state_dict(torch.load('../models/modelv6.pth'))
model_deep.to(device)

wide_dataset = GamesDataset(games, rolling_averages_wide, team_level_data)
deep_dataset = GamesDataset(games, rolling_averages_deep, team_level_data)

wide_dataloader = DataLoader(wide_dataset, batch_size=1, shuffle=False)
deep_dataloader = DataLoader(deep_dataset, batch_size=1, shuffle=False)

wide_predictions, _ = get_predictions(model_wide, wide_dataloader)
deep_predictions, outcomes = get_predictions(model_deep, deep_dataloader)

In [None]:
wide_predictions = torch.tensor([wide_predictions]).cpu().detach().numpy()
deep_predictions = torch.tensor([deep_predictions]).cpu().detach().numpy()
print(wide_predictions.shape)
print(deep_predictions.shape)

meta_dataset = np.concatenate((wide_predictions, deep_predictions), axis=0)
print(meta_dataset.shape)


In [None]:
from sklearn.metrics import classification_report

meta_dataset = meta_dataset.T
outcomes = torch.tensor(outcomes).cpu().detach().numpy()
X_train, X_test, y_train, y_test = train_test_split(meta_dataset, outcomes, test_size=0.1, shuffle=False)

model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

output = model.predict(X_test)
accuracy = accuracy_score(y_test, output)
report = classification_report(y_test, output)
print(accuracy)
print(report)
print(model.coef_)
predictions = np.zeros((len(y_test), 2))
predictions[:, 0] = model.predict_proba(X_test)[:, 0]
predictions[:, 1] = model.predict_proba(X_test)[:, 1]
print(predictions.shape)

print(class_specific_ece(y_test, predictions, 2, 20, .8))

NameError: name 'meta_dataset' is not defined

In [None]:
# Perform hyperparameter tuning.  Optimize for calibration and Brier score loss
import optuna
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau
from tqdm import trange
import importlib
import model_definitions
importlib.reload(model_definitions)
from model_definitions import *

sigmoid = nn.Sigmoid()

torch.manual_seed(42)

def objective(trial):
    loss_fn = BrierScoreLoss()
    lr = trial.suggest_float('lr', 1e-6, 1e-1, log=True)
    hidden_layers = [0, 0]
    hidden_layers[0] = trial.suggest_int('hidden_layer_0', 1, 512)
    hidden_layers[1] = trial.suggest_int('hidden_layer_1', 1, 512)
    stepsize = trial.suggest_int('stepsize', 1, 10)
    gamma = trial.suggest_float('gamma', 0.1, 1.0)
    epochs = trial.suggest_int('epochs', 1, 30)
    shuffle = trial.suggest_categorical('shuffle', [True, False])
        
    
    # ece_scores = []
    # accuracy_scores = []
    class_specific_ece_scores = []
    for train, test in splits:
        train_data = GamesDataset(train)
        train_dataloader = DataLoader(train_data, batch_size=256, shuffle=shuffle)
        test_data = GamesDataset(test)
        test_dataloader = DataLoader(test_data, batch_size=512, shuffle=shuffle)
        model = NeuralNetwork(num_players, num_features, num_team_features, hidden_layers=hidden_layers)
        optimizer = optim.Adam(model.parameters(), lr=lr)
        lr_scheduler = StepLR(optimizer, step_size=stepsize, gamma=gamma)
        model.to(device)
        for epoch in range(epochs):
            model.train()
            for matchup, team_data, outcome in train_dataloader:
                matchup = matchup.to(dtype=torch.float32)
                matchup = matchup.to(device)
                team_data = team_data.to(dtype=torch.float32)
                team_data = team_data.to(device)
                outcome = outcome.view(-1, 1).to(dtype=torch.float32)
                outcome = outcome.to(device)
                optimizer.zero_grad()
                output = sigmoid(model(matchup, team_data))
                loss = loss_fn(output, outcome)
                loss.backward()
                optimizer.step()
            lr_scheduler.step()

        model.eval()
        y_true = []
        y_pred = []
        with torch.no_grad():
            for matchup, team_data, outcome in test_dataloader:
                matchup = matchup.to(dtype=torch.float32)
                matchup = matchup.to(device)
                team_data = team_data.to(dtype=torch.float32)
                team_data = team_data.to(device)
                outcome = outcome.view(-1, 1).to(dtype=torch.float32)
                outcome = outcome.to(device)
                output = sigmoid(model(matchup, team_data))
# temp_scaled_model.set_temperature(train_dataloader)

# temp_scaled_model.eval() 
                y_true += list(outcome.cpu().numpy())
                y_pred += list(output.cpu().numpy())
        y_true = np.array(y_true)
        y_pred = np.array(y_pred)
        predictions = np.zeros((len(y_true), 2))
        predictions[:, 0] = 1 - y_pred[:, 0]
        predictions[:, 1] = y_pred[:, 0]
        class_specific_ece_scores.append(class_specific_ece(y_true, predictions, 2, n_bins=20, min_bins_filled=0.8))
        # ece_scores.append(expected_calibration_error(predictions, y_true, M=10))
        # accuracy_scores.append(np.mean(np.round(y_pred) == y_true))
    return np.mean(class_specific_ece_scores)

study = optuna.create_study(directions=['minimize'])
study.optimize(objective, n_trials=100000)
# temp_scaled_model.set_temperature(train_dataloader)

# temp_scaled_model.eval() 

[I 2024-03-07 02:18:43,579] A new study created in memory with name: no-name-bf21d7dc-b6a1-454a-a1bf-6a2da52acd5a
[I 2024-03-07 02:19:16,581] Trial 0 finished with value: 1.1293154761904762 and parameters: {'lr': 1.8697862597320003e-05, 'hidden_layer_0': 106, 'hidden_layer_1': 305, 'stepsize': 3, 'gamma': 0.513516676423884, 'epochs': 26, 'shuffle': True}. Best is trial 0 with value: 1.1293154761904762.
[I 2024-03-07 02:19:26,719] Trial 1 finished with value: 0.1711811786687111 and parameters: {'lr': 8.134046206979838e-05, 'hidden_layer_0': 501, 'hidden_layer_1': 443, 'stepsize': 3, 'gamma': 0.6102529023049055, 'epochs': 8, 'shuffle': True}. Best is trial 1 with value: 0.1711811786687111.
[I 2024-03-07 02:19:56,581] Trial 2 finished with value: 0.15926879997252552 and parameters: {'lr': 0.0009975031728562374, 'hidden_layer_0': 138, 'hidden_layer_1': 341, 'stepsize': 7, 'gamma': 0.9506462237290636, 'epochs': 24, 'shuffle': True}. Best is trial 2 with value: 0.15926879997252552.
[I 2024-0

KeyboardInterrupt: 

In [None]:
for trial in study.best_trials:
    print(trial.number)
    print(trial.values)
    print(trial.params)

876
[0.03724365296175605]
{'lr': 0.0001838368527157626, 'hidden_layer_0': 189, 'hidden_layer_1': 13, 'stepsize': 7, 'gamma': 0.7234329400126882, 'epochs': 12, 'shuffle': False}


In [None]:
# {'lr': 0.0004184459608600268, 'hidden_layer_0': 92, 'hidden_layer_1': 210, 'stepsize': 1, 'gamma': 0.6908620771848416, 'batch_size': 306, 'epochs': 6, 'shuffle': False}

import torch
from tqdm import trange
from sklearn.metrics import classification_report
# from dataloaders import GamesDataset
import pandas as pd
from torch.utils.data import DataLoader
import importlib
import model_definitions
importlib.reload(model_definitions)
from model_definitions import *
from torch import optim
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau

sigmoid = nn.Sigmoid()

# best_seed = 3126602462524780931
# # seed = torch.seed()
# torch.manual_seed(best_seed)

torch.manual_seed(42)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

# Train with optimal hyperparameters on full dataset
# optimal_params = {'lr': 0.0002571686895477055, 'hidden_layer_0': 110, 'hidden_layer_1': 121, 'stepsize': 4, 'gamma': 0.9738997244939319, 'epochs': 7, 'shuffle': False}
optimal_params = {'lr': 0.0001838368527157626, 'hidden_layer_0': 189, 'hidden_layer_1': 13, 'stepsize': 7, 'gamma': 0.7234329400126882, 'epochs': 12, 'shuffle': False}

# train_dataset = GamesDataset(games, players)
train, test = train_test_split(seasons, '2023-2024')
train = train + test
train_player_features = np.array([game[0] for game in train])
train_player_features = train_player_features.reshape(-1, num_features)
train_player_feature_scaler = StandardScaler()
train_player_feature_scaler.fit(train_player_features)
train_player_features = train_player_feature_scaler.transform(train_player_features)
train_player_features = train_player_features.reshape(-1, 2, num_players, num_features)
train_team_features = np.array([game[1] for game in train])
train_team_features = train_team_features.reshape(-1, num_team_features)
train_team_feature_scaler = StandardScaler()
train_team_feature_scaler.fit(train_team_features)
train_team_features = train_team_feature_scaler.transform(train_team_features)
train_team_features = train_team_features.reshape(-1, 2, num_team_features)
train_outcomes = np.array([game[2] for game in train])
print(train_player_features.shape)
print(train_team_features.shape)
print(train_outcomes.shape)
train_dataset = GamesDataset((train_player_features, train_team_features, train_outcomes))

num_players = train_dataset[0][0].shape[1]
num_features = train_dataset[0][0].shape[2]

train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=optimal_params['shuffle'])
loss_fn = BrierScoreLoss()

model = NeuralNetwork(num_players, num_features, num_team_features, hidden_layers=[optimal_params['hidden_layer_0'], optimal_params['hidden_layer_1']])
optimizer = optim.Adam(model.parameters(), lr=optimal_params['lr'])
model.to(device)

lr_scheduler = StepLR(optimizer, step_size=optimal_params['stepsize'], gamma=optimal_params['gamma'])

for epoch in trange(12):
    model.train()
    for matchup, team_data, outcome in train_dataloader:
        matchup = matchup.to(dtype=torch.float32)
        matchup = matchup.to(device)
        team_data = team_data.to(dtype=torch.float32)
        team_data = team_data.to(device)
        outcome = outcome.view(-1, 1).to(dtype=torch.float32)
        outcome = outcome.to(device)
        optimizer.zero_grad()
        output = sigmoid(model(matchup, team_data))
        loss = loss_fn(output, outcome)
        loss.backward()
        optimizer.step()
    lr_scheduler.step()
    
# model.eval()
# y_true = []
# y_pred = []
# with torch.no_grad():
#     for matchup, team_data, outcome in test_dataloader:
#         matchup = matchup.to(dtype=torch.float32)
#         matchup = matchup.to(device)
#         team_data = team_data.to(dtype=torch.float32)
#         team_data = team_data.to(device)
#         outcome = outcome.view(-1, 1).to(dtype=torch.float32)
#         outcome = outcome.to(device)
#         output = sigmoid(model(matchup, team_data))
#         y_true += list(outcome.cpu().numpy())
#         y_pred += list(output.cpu().numpy())
# y_true = np.array(y_true)
# y_pred = np.array(y_pred)
# output = np.zeros((len(y_true), 2))
# output[:, 0] = 1 - y_pred[:, 0]
# output[:, 1] = y_pred[:, 0]
# classwise_ece = class_specific_ece(y_true, output, 2, n_bins=20, min_bins_filled=0.8)
# accuracy_score = np.mean(np.round(y_pred) == y_true)
# print(classwise_ece)
# print(accuracy_score)
# print(classification_report(y_true, np.round(y_pred)))
# # print(seed)

# # Perform temperature scaling
# temp_scaled_model = TemperatureScaledModel(model)
# temp_scaled_model.to(device)
# temp_scaled_model.set_temperature(train_dataloader)

# temp_scaled_model.eval()
# y_true = []
# y_pred = []
# with torch.no_grad():
#     for matchup, team_data, outcome in test_dataloader:
#         matchup = matchup.to(dtype=torch.float32)
#         matchup = matchup.to(device)
#         team_data = team_data.to(dtype=torch.float32)
#         team_data = team_data.to(device)
#         outcome = outcome.view(-1, 1).to(dtype=torch.double)
#         outcome = outcome.to(device)
#         output = sigmoid(temp_scaled_model(matchup, team_data))
#         y_true += list(outcome.cpu().numpy())
#         y_pred += list(output.cpu().numpy())
# y_true = np.array(y_true)
# y_pred = np.array(y_pred)
# predictions = np.zeros((len(y_true), 2))
# predictions[:, 0] = 1 - y_pred[:, 0]
# predictions[:, 1] = y_pred[:, 0]
# classwise_ece = class_specific_ece(y_true, predictions, 2, n_bins=20, min_bins_filled=0)
# accuracy_score = np.mean(np.round(y_pred) == y_true)
# print(classwise_ece)
# print(accuracy_score)
# print(classification_report(y_true, np.round(y_pred)))

Using cuda device
(19210, 2, 9, 43)
(19210, 2, 4)
(19210,)


100%|██████████| 12/12 [00:03<00:00,  3.47it/s]


In [None]:
# save hyperparameters and mean and standard deviation of player and team features
import pickle
with open('../models/model_deepv4_params.pkl', 'wb') as f:
    pickle.dump(optimal_params, f)
    pickle.dump(train_player_feature_scaler.mean_, f)
    pickle.dump(train_player_feature_scaler.scale_, f)
    pickle.dump(train_team_feature_scaler.mean_, f)
    pickle.dump(train_team_feature_scaler.scale_, f)

# Save model to modelv1.pth
torch.save(model.state_dict(), '../models/model_deepv4.pth')

In [None]:
# Read pickled data
import pickle
with open('../models/model_deepv4_params.pkl', 'rb') as f:
    optimal_params = pickle.load(f)
    train_player_feature_mean = pickle.load(f)
    train_player_feature_std = pickle.load(f)
    train_team_feature_mean = pickle.load(f)
    train_team_feature_std = pickle.load(f)
    
print(train_player_feature_mean)
print(train_player_feature_std)
print(train_team_feature_mean)
print(train_team_feature_std)

[ 1.46095003e+03  3.94165510e+00  8.54259645e+00  4.21475242e-01
  8.97843716e-01  2.48118833e+00  2.17736092e-01  1.83994910e+00
  2.39821505e+00  4.53830260e-01  1.06595060e+00  3.24592805e+00
  4.31187865e+00  2.31546157e+00  7.56972642e-01  4.84755336e-01
  1.37140928e+00  2.02280843e+00  1.06211030e+01  1.71945746e-01
  1.00649209e+02  1.00900961e+02  1.02196884e+02  1.02399539e+02
 -1.54763824e+00 -1.49866973e+00  1.34037392e-01  1.07097749e+00
  1.53577228e+01  4.23852745e-02  1.27493091e-01  8.49494453e-02
  1.03029461e+01  4.68945350e-01  5.03141791e-01  1.80415094e-01
  1.84167615e-01  9.38915952e+01  9.41691958e+01  7.84743191e+01
  4.90298956e+01  8.68212708e-02  2.45330557e+00]
[5.67703502e+02 2.36507098e+00 4.89035699e+00 1.28782081e-01
 9.01653920e-01 2.26191723e+00 1.64295863e-01 1.65334154e+00
 2.03983109e+00 2.41767981e-01 9.45503002e-01 2.01141529e+00
 2.74243966e+00 2.06377507e+00 5.05143190e-01 5.36836089e-01
 8.98250817e-01 8.48902124e-01 6.54116506e+00 4.39351704

In [None]:
# Load model
model = NeuralNetwork(10, 38, 2, hidden_layers=[optimal_params['hidden_layer_0'], optimal_params['hidden_layer_1']])
model.load_state_dict(torch.load('../models/modelv6.pth'))
model.to('cuda')

In [None]:
bankroll = 1000

In [None]:
# Simulate a season
import importlib
import simulation
importlib.reload(simulation)
import model_definitions
importlib.reload(model_definitions)
from simulation import *
from model_definitions import *


betting_rules = {
    'hundreth_uniform': uniform_betting(bankroll, bankroll_fraction=0.01),
    'tenths_uniform': uniform_betting(bankroll, bankroll_fraction=0.1),
    'tenths_kelly': kelly_criterion(kelly_fraction=0.1),
    'full_kelly': kelly_criterion(kelly_fraction=1),
    'eighth_kelly': kelly_criterion(kelly_fraction=0.125),
    'sixth_kelly': kelly_criterion(kelly_fraction=0.166),
    'tenths_uniform_threshold_1': uniform_betting(bankroll, bankroll_fraction=0.1, threshold=0.1),
    'tenths_uniform_threshold_2': uniform_betting(bankroll, bankroll_fraction=0.1, threshold=0.2),
    'tenths_uniform_threshold_3': uniform_betting(bankroll, bankroll_fraction=0.1, threshold=0.3),
    'eighth_kelly_threshold_1': kelly_criterion(kelly_fraction=0.125, threshold=0.1),
    'eighth_kelly_threshold_2': kelly_criterion(kelly_fraction=0.125, threshold=0.2),
    'eighth_kelly_threshold_3': kelly_criterion(kelly_fraction=0.125, threshold=0.3),
    'full_kelly_threshold_1': kelly_criterion(kelly_fraction=1, threshold=0.1),
    'full_kelly_threshold_2': kelly_criterion(kelly_fraction=1, threshold=0.2),
    'full_kelly_threshold_3': kelly_criterion(kelly_fraction=1, threshold=0.3),
}

bankroll_history, oddbet_history, bet_history, win_history, model_predictions, bookmaker_predictions, labels, bets_placed = run_simulation(bankroll, betting_rules, model, '2023-2024')

plot_bankroll_histories(bankroll_history, bets_placed, betting_rules)

In [None]:
for key, _ in betting_rules.items():
    plot_bankroll_history(bankroll_history, bets_placed, key, betting_rules)

In [None]:
print_statistics(model_predictions, labels, bookmaker_predictions, bets_placed, np.sum(win_history == 1), bankroll_history)