- [1. Import Packages and Functions](#1)
- [2. Assembling Datasets](#2)
- [3. Model Buildings](#3)
    - [3.1 Random Forest Classifier](#3_1)
    - [3.2 AdaBoost Tree](#3_2)
    - [3.3 XGBoost (Extreme Gradient Boosting)](#3_3)
    - [3.4 CatBoost](#3_4)

## 1. Import Packages and Functions <a id='1'></a>

In [220]:
## Import desired packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
import statsmodels.api as sm
import gc
import matplotlib
from tqdm import tqdm
import itertools
import pickle

from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve, TimeSeriesSplit, ParameterGrid
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeClassifier, LassoCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostClassifier

from sklearn.cluster import KMeans, DBSCAN

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, silhouette_score, f1_score, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression, SequentialFeatureSelector, RFE

from catboost import CatBoostClassifier
import xgboost as xgb

from datetime import datetime

In [8]:
def add_target(team):
    team['target'] = team['won'].shift(-1)
    return team

def rest_days(team):
    team['date_time'] = pd.to_datetime(team['date'])
    team['rest'] = (team['date_time'] - team['date_time'].shift(1)).dt.days
    team['rest'] = team['rest'].fillna(0)
    team['rest'] = team['rest'].astype(int)
    del team['date_time']
    return team

def winrate(team):
    total = team['Wins'] + team['Losses']
    total_opp = team['Wins_opp'] + team['Losses_opp']
    team['winrate'] = team['Wins'] / total
    team['winrate_opp'] = team['Wins_opp'] / total_opp
    return team

def differential(team):
    team['differential'] = team['Total'] - team['Total_opp']
    return team

def find_team_exp_average_5(team):
    numeric_columns = team.select_dtypes(include=np.number)
    rolling = numeric_columns.ewm(span=5, adjust=False).mean()
    return rolling

def find_team_exp_average_9(team):
    numeric_columns = team.select_dtypes(include=np.number)
    rolling = numeric_columns.ewm(span=9, adjust=False).mean()
    return rolling

def find_team_exp_average_12(team):
    numeric_columns = team.select_dtypes(include=np.number)
    rolling = numeric_columns.ewm(span=12, adjust=False).mean()
    return rolling

def find_team_average_15(team):
    numeric_columns = team.select_dtypes(include=np.number)
    rolling = numeric_columns.rolling(15).mean()
    return rolling

def find_team_average_10(team):
    numeric_columns = team.select_dtypes(include=np.number)
    rolling = numeric_columns.rolling(10).mean()
    return rolling

def find_team_average_5(team):
    numeric_columns = team.select_dtypes(include=np.number)
    rolling = numeric_columns.rolling(5).mean()
    return rolling

def find_team_average_3(team):
    numeric_columns = team.select_dtypes(include=np.number)
    rolling = numeric_columns.rolling(3).mean()
    return rolling

def rolling(data):
    df_rolling_3 = data[list(valid_columns) + ['Teams','won', "season"]]
    df_rolling_3 = df_rolling_3.groupby(['Teams', 'season'], group_keys = False).apply(find_team_average_3)
    df_rolling_5 = data[list(valid_columns) + ['Teams','won', "season"]]
    df_rolling_5 = df_rolling_5.groupby(['Teams', 'season'], group_keys = False).apply(find_team_average_5)
    df_rolling_10 = data[list(valid_columns) + ['Teams','won', "season"]]
    df_rolling_10 = df_rolling_10.groupby(['Teams', 'season'], group_keys = False).apply(find_team_average_10)
    df_rolling_15 = data[list(valid_columns) + ['Teams','won', "season"]]
    df_rolling_15 = df_rolling_15.groupby(['Teams', 'season'], group_keys = False).apply(find_team_average_15)
    df_exp_rolling_5 = data[list(valid_columns) + ['Teams','won', "season"]]
    df_exp_rolling_5 = df_exp_rolling_5.groupby(['Teams', 'season'], group_keys = False).apply(find_team_exp_average_5)
    df_exp_rolling_9 = data[list(valid_columns) + ['Teams','won', "season"]]
    df_exp_rolling_9 = df_exp_rolling_9.groupby(['Teams', 'season'], group_keys = False).apply(find_team_exp_average_9)
    df_exp_rolling_12 = data[list(valid_columns) + ['Teams','won', "season"]]
    df_exp_rolling_12 = df_exp_rolling_12.groupby(['Teams', 'season'], group_keys = False).apply(find_team_exp_average_12)
    exp_rolling_columns_5 = [f"{col}_exp_5" for col in df_exp_rolling_5.columns]
    exp_rolling_columns_9 = [f"{col}_exp_9" for col in df_exp_rolling_9.columns]
    exp_rolling_columns_12 = [f"{col}_exp_12" for col in df_exp_rolling_12.columns]
    rolling_columns_15 = [f"{col}_15" for col in df_rolling_15.columns]
    rolling_columns_10 = [f"{col}_10" for col in df_rolling_10.columns]
    rolling_columns_5 = [f"{col}_5" for col in df_rolling_5.columns]
    rolling_columns_3 = [f"{col}_3" for col in df_rolling_3.columns]
    df_exp_rolling_12.columns = exp_rolling_columns_12
    df_exp_rolling_9.columns = exp_rolling_columns_9
    df_exp_rolling_5.columns = exp_rolling_columns_5
    df_rolling_15.columns = rolling_columns_15
    df_rolling_10.columns = rolling_columns_10
    df_rolling_5.columns = rolling_columns_5
    df_rolling_3.columns = rolling_columns_3
    df = pd.concat([data, df_rolling_3, df_rolling_5, df_rolling_10, df_rolling_15,df_exp_rolling_5,df_exp_rolling_9, df_exp_rolling_12], axis=1)
    return df

def ratio(feature):
    feature_opp = 'OPP_' + str(feature)
    free = nba[feature] / nba[feature_opp]
    return free

def ratios(nba):
    regard = []
    disregard = [col for col in nba.columns if "OPP_" in col]
    for col in disregard:
        col = col[4:100]
        if col in nba.columns:
            regard.append(col)
    nba_ratio = nba[regard].apply(ratio)
    nba_ratios_columns = [f"{col}_ratio" for col in nba_ratio.columns]
    nba_ratio.columns
    return regard

def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby("Teams", group_keys=False).apply(lambda x: shift_col(x, col_name))

def date_change(datetime_str):
    # Parse the datetime string into a datetime object
    datetime_obj = datetime.strptime(datetime_str, '%m/%d/%Y')

    # Format the datetime object into a new string structure
    new_datetime_str = datetime_obj.strftime('%Y-%m-%d')

    return new_datetime_str

def haircut(df, date):
    df[date] = df[date].str[:10]
    return df

def convert_date_format(df):
    # Create a boolean mask to identify values in the "m/d/y" format
    mask = df['Date'].str.contains(r'\d{1,2}/\d{1,2}/\d{2}')
    
    # Apply the conversion only to values that match the mask
    df.loc[mask, 'Date'] = nba.loc[mask, 'Date'].apply(date_change)
    return df

## 2. Assembling Dataset <a id='3'></a>

In [208]:
folder_path = "/Users/liqingyang/Documents/GitHub/sports_trading/sports_betting/data/raw_data/NBA_2018_2024.csv"
df = pd.read_csv(folder_path, index_col=0)

folder_path = "/Users/liqingyang/Documents/GitHub/sports_trading/sports_betting/nba_api/data/teams_stats/processed_cumulative_season_stats_2019_2024.csv"
nba = pd.read_csv(folder_path, index_col=0)

In [209]:
# nba dataframe does not include the 2018 season
df = df[~df['season'].isin([2018])]
df = df.reset_index(drop=True)
df = haircut(df, 'date')

# rename nba columns to match df
nba = haircut(nba, 'Date')
nba = convert_date_format(nba)
nba.rename(columns={'Date': 'date_next', 'Teams':'Teams_x'}, inplace=True)

# construct winrate for team
df = winrate(df)
# construct differential points
df = differential(df)
# construct target
df = df.groupby("Teams", group_keys=False).apply(add_target)
# construct resting
df = df.groupby(["Teams",'season'], group_keys=False).apply(rest_days)
# games yet to play are 2
df.loc[pd.isnull(df['target']), 'target'] = 2
# convert win/loss to 1/0
df['target'] = df['target'].astype(int)

# remove metadata and target for df
removed = ['target', 'date', 'Teams_opp', 'Teams',
           'season','won', 'Wins', 'Losses', 
           'Wins_opp', 'Losses_opp']
valid_columns = df.columns[~df.columns.isin(removed)]

# scale the data
scaler = MinMaxScaler()
df[valid_columns] = scaler.fit_transform(df[valid_columns])

# construct rolling features to df
df = rolling(df).copy()
df = df.dropna()

# remove metadata for nba ranking 
removed = ['date_next', 'Teams_x']
valid_columns = nba.columns[~nba.columns.isin(removed)]

# scale the ranking data
scaler = MinMaxScaler()
nba[valid_columns] = scaler.fit_transform(nba[valid_columns])

# construct current game metadata
df['home_next'] = add_col(df, 'home')
df['team_next_opp'] = add_col(df, 'Teams_opp')
df['date_next'] = add_col(df, 'date')
df = df.copy()

# merge stats from opposing teams
full = df.merge(df,
               left_on=['Teams', 'date_next'],
               right_on = ['team_next_opp', 'date_next'])
full['date_next_dt'] = pd.to_datetime(full['date_next'])
full['date_next_prev'] = (full['date_next_dt']-timedelta(days=1)).dt.strftime('%Y-%m-%d')

# merge stats from nba dataframe
nba_renamed_home = nba.rename(columns={col: col + '_for_home' for col in nba.columns if col not in ['Teams_x', 'date_next']})
nba_renamed_away = nba.rename(columns={col: col + '_for_away' for col in nba.columns if col not in ['Teams_x', 'date_next']})

for_home = pd.merge(full, nba_renamed_home, left_on=['Teams_x', 'date_next_prev'],right_on=['Teams_x', 'date_next'], how='left')
complete = pd.merge(for_home, nba_renamed_away, left_on=['team_next_opp_x', 'date_next_prev'], right_on=['Teams_x', 'date_next'], how='left')
complete = complete.dropna()

# remove metadata and useless data
disregard = list(complete.columns[complete.dtypes == 'object']) 
disregard = disregard + ["home_opp_5_x","target_y", 
                         "Wins_x", "Losses_x", "Wins_opp_x", 
                         "Losses_opp_x", "season_x" , "won_x" , 
                         "home_5_x" ,"home_10_x" ,"season_5_x", 
                         "season_10_x" , "Wins_y" , "Losses_y" , 
                         "Wins_opp_y" , "Losses_opp_y" , "season_y" , 
                         "won_y" ,"home_5_y", "home_10_y","season_5_y", 
                         "season_10_y","home_opp_5_y", "home_opp_10_y"]
regard = complete.columns[~complete.columns.isin(disregard)]

In [210]:
# want to exclude January 2024 - March 2024 data from dataframe to use for out of sample testing
complete = complete[~complete['date_next'].str.contains('2024-03')|complete['date_next'].str.contains('2024-02')|complete['date_next'].str.contains('2024-01')]
complete = complete.reset_index(drop=True)

In [211]:
same_value_columns = [column for column, is_equal in columns_with_same_values.items() if is_equal]
complete_cleaning = complete.copy()
# Dropping duplicated columns
complete_cleaning = complete_cleaning.T.drop_duplicates(keep='first').T
for column in complete_cleaning.columns:
    complete_cleaning[column] = pd.to_numeric(complete_cleaning[column], errors='ignore')
complete_cleaning

Unnamed: 0,mp_x,fg_x,fga_x,fg%_x,3p_x,3pa_x,3p%_x,ft_x,fta_x,ft%_x,...,OPP_OREB_RANK_opponent_for_away,OPP_DREB_RANK_opponent_for_away,OPP_REB_RANK_opponent_for_away,OPP_AST_RANK_opponent_for_away,OPP_TOV_RANK_opponent_for_away,OPP_STL_RANK_opponent_for_away,OPP_BLK_RANK_opponent_for_away,OPP_PF_RANK_opponent_for_away,OPP_PFD1_opponent_for_away,OPP_PTS_RANK_opponent_for_away
0,0.00,0.727273,0.516667,0.705742,0.407407,0.350000,0.452781,0.404762,0.431373,0.674355,...,0.448276,0.517241,0.448276,0.724138,0.758621,0.862069,0.517241,0.793103,0.206897,0.689655
1,0.00,0.250000,0.483333,0.188995,0.259259,0.533333,0.187581,0.428571,0.470588,0.648575,...,0.379310,0.206897,0.310345,0.448276,0.206897,0.206897,0.724138,0.206897,0.965517,0.241379
2,0.00,0.363636,0.400000,0.373206,0.037037,0.133333,0.126779,0.595238,0.607843,0.720488,...,0.620690,0.137931,0.310345,0.241379,1.000000,0.241379,0.655172,0.137931,0.931034,0.206897
3,0.00,0.454545,0.516667,0.399522,0.407407,0.383333,0.420440,0.166667,0.196078,0.582090,...,0.896552,0.655172,0.655172,0.862069,0.241379,0.413793,0.448276,0.620690,0.137931,0.758621
4,0.00,0.363636,0.416667,0.361244,0.222222,0.316667,0.267788,0.285714,0.333333,0.592944,...,0.827586,0.517241,0.655172,0.758621,0.172414,0.344828,0.310345,0.620690,0.448276,0.655172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10407,0.00,0.477273,0.450000,0.473684,0.333333,0.316667,0.401035,0.404762,0.392157,0.763908,...,0.655172,0.482759,0.586207,0.172414,0.172414,0.241379,0.758621,0.379310,0.000000,0.965517
10408,0.00,0.522727,0.583333,0.430622,0.444444,0.566667,0.322122,0.142857,0.254902,0.321574,...,0.482759,0.620690,0.620690,0.793103,0.586207,0.172414,0.689655,0.241379,0.724138,0.689655
10409,0.25,0.704545,0.650000,0.576555,0.555556,0.516667,0.447607,0.190476,0.176471,0.773406,...,0.517241,0.000000,0.000000,0.241379,0.724138,0.310345,0.103448,0.310345,0.793103,0.103448
10410,0.00,0.409091,0.383333,0.440191,0.518519,0.483333,0.441138,0.357143,0.333333,0.796472,...,0.137931,0.655172,0.517241,0.758621,0.206897,0.068966,0.517241,0.448276,0.551724,0.413793


#### Getting ranking spread

In [212]:
ranks = [i for i in complete_cleaning.columns if 'RANK' in i]
ranks_home = [i for i in ranks if 'for_home' in i]
ranks_away = [i for i in ranks if 'for_away' in i]
spread_columns_names = [i[:-4] for i in ranks_home]
temp = complete_cleaning.copy()
for i in spread_columns_names:
    complete_cleaning[i[:-4] + 'spread'] = complete_cleaning[i + 'home'] - complete_cleaning[i + 'away']
complete_cleaning

  complete_cleaning[i[:-4] + 'spread'] = complete_cleaning[i + 'home'] - complete_cleaning[i + 'away']
  complete_cleaning[i[:-4] + 'spread'] = complete_cleaning[i + 'home'] - complete_cleaning[i + 'away']
  complete_cleaning[i[:-4] + 'spread'] = complete_cleaning[i + 'home'] - complete_cleaning[i + 'away']
  complete_cleaning[i[:-4] + 'spread'] = complete_cleaning[i + 'home'] - complete_cleaning[i + 'away']
  complete_cleaning[i[:-4] + 'spread'] = complete_cleaning[i + 'home'] - complete_cleaning[i + 'away']
  complete_cleaning[i[:-4] + 'spread'] = complete_cleaning[i + 'home'] - complete_cleaning[i + 'away']
  complete_cleaning[i[:-4] + 'spread'] = complete_cleaning[i + 'home'] - complete_cleaning[i + 'away']
  complete_cleaning[i[:-4] + 'spread'] = complete_cleaning[i + 'home'] - complete_cleaning[i + 'away']
  complete_cleaning[i[:-4] + 'spread'] = complete_cleaning[i + 'home'] - complete_cleaning[i + 'away']
  complete_cleaning[i[:-4] + 'spread'] = complete_cleaning[i + 'home'] - 

Unnamed: 0,mp_x,fg_x,fga_x,fg%_x,3p_x,3pa_x,3p%_x,ft_x,fta_x,ft%_x,...,OPP_FT_PCT_RANK_opponent_spread,OPP_OREB_RANK_opponent_spread,OPP_DREB_RANK_opponent_spread,OPP_REB_RANK_opponent_spread,OPP_AST_RANK_opponent_spread,OPP_TOV_RANK_opponent_spread,OPP_STL_RANK_opponent_spread,OPP_BLK_RANK_opponent_spread,OPP_PF_RANK_opponent_spread,OPP_PTS_RANK_opponent_spread
0,0.00,0.727273,0.516667,0.705742,0.407407,0.350000,0.452781,0.404762,0.431373,0.674355,...,-0.793103,-0.206897,-0.413793,-0.344828,-0.172414,-0.413793,-0.241379,0.379310,-0.241379,-0.379310
1,0.00,0.250000,0.483333,0.188995,0.259259,0.533333,0.187581,0.428571,0.470588,0.648575,...,-0.724138,0.310345,0.517241,0.482759,0.551724,-0.137931,0.793103,-0.034483,0.689655,0.620690
2,0.00,0.363636,0.400000,0.373206,0.037037,0.133333,0.126779,0.595238,0.607843,0.720488,...,0.310345,0.206897,0.379310,0.344828,0.517241,-0.827586,0.103448,-0.344828,0.482759,0.448276
3,0.00,0.454545,0.516667,0.399522,0.407407,0.383333,0.420440,0.166667,0.196078,0.582090,...,0.655172,0.068966,0.344828,0.344828,0.034483,0.103448,0.413793,0.551724,-0.482759,0.137931
4,0.00,0.363636,0.416667,0.361244,0.222222,0.316667,0.267788,0.285714,0.333333,0.592944,...,-0.310345,-0.206897,-0.379310,-0.344828,-0.517241,0.827586,-0.103448,0.344828,-0.482759,-0.448276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10407,0.00,0.477273,0.450000,0.473684,0.333333,0.316667,0.401035,0.404762,0.392157,0.763908,...,-0.172414,-0.275862,0.034483,-0.103448,0.517241,-0.034483,-0.103448,-0.137931,0.206897,-0.482759
10408,0.00,0.522727,0.583333,0.430622,0.444444,0.566667,0.322122,0.142857,0.254902,0.321574,...,0.034483,0.068966,-0.482759,-0.344828,-0.517241,0.241379,0.517241,-0.379310,0.137931,-0.103448
10409,0.25,0.704545,0.650000,0.576555,0.555556,0.516667,0.447607,0.190476,0.176471,0.773406,...,-0.068966,0.448276,1.000000,1.000000,0.758621,-0.620690,0.413793,0.689655,-0.172414,0.896552
10410,0.00,0.409091,0.383333,0.440191,0.518519,0.483333,0.441138,0.357143,0.333333,0.796472,...,0.586207,0.448276,0.068966,0.241379,-0.172414,0.724138,0.275862,-0.448276,0.379310,0.379310


In [213]:
# complete is dataframe with both ranking and stats 
home = complete_cleaning[complete_cleaning['home_x'] == 1]
home = home.reset_index(drop=True)

In [214]:
rankings_cols = [i for i in home.columns if 'rank' in i.lower() and 'spread' not in i.lower()]
no_rankings = home.columns[~home.columns.isin(rankings_cols)]
no_rankings_df = home[no_rankings]
no_rankings_df

Unnamed: 0,mp_x,fg_x,fga_x,fg%_x,3p_x,3pa_x,3p%_x,ft_x,fta_x,ft%_x,...,OPP_FT_PCT_RANK_opponent_spread,OPP_OREB_RANK_opponent_spread,OPP_DREB_RANK_opponent_spread,OPP_REB_RANK_opponent_spread,OPP_AST_RANK_opponent_spread,OPP_TOV_RANK_opponent_spread,OPP_STL_RANK_opponent_spread,OPP_BLK_RANK_opponent_spread,OPP_PF_RANK_opponent_spread,OPP_PTS_RANK_opponent_spread
0,0.00,0.727273,0.516667,0.705742,0.407407,0.350000,0.452781,0.404762,0.431373,0.674355,...,-0.793103,-0.206897,-0.413793,-0.344828,-0.172414,-0.413793,-0.241379,0.379310,-0.241379,-0.379310
1,0.00,0.363636,0.400000,0.373206,0.037037,0.133333,0.126779,0.595238,0.607843,0.720488,...,0.310345,0.206897,0.379310,0.344828,0.517241,-0.827586,0.103448,-0.344828,0.482759,0.448276
2,0.00,0.613636,0.600000,0.516746,0.296296,0.400000,0.291074,0.500000,0.568627,0.618725,...,0.793103,0.206897,0.413793,0.344828,0.172414,0.413793,0.241379,-0.379310,0.241379,0.379310
3,0.25,0.590909,0.433333,0.619617,0.259259,0.266667,0.358344,0.428571,0.411765,0.773406,...,0.034483,0.344828,0.448276,0.448276,-0.241379,0.034483,-0.517241,-0.137931,-0.724138,-0.275862
4,0.00,0.590909,0.500000,0.564593,0.592593,0.550000,0.452781,0.214286,0.274510,0.521031,...,-0.344828,-0.137931,0.034483,0.068966,-0.103448,0.379310,-0.482759,-0.586207,-0.448276,-0.068966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5206,0.00,0.545455,0.433333,0.566986,0.296296,0.450000,0.260026,0.238095,0.254902,0.660787,...,0.068966,-0.448276,-1.000000,-1.000000,-0.758621,0.620690,-0.413793,-0.689655,0.172414,-0.896552
5207,0.50,0.522727,0.783333,0.313397,0.407407,0.450000,0.364812,0.690476,0.686275,0.750339,...,-0.586207,-0.448276,-0.068966,-0.241379,0.172414,-0.724138,-0.275862,0.448276,-0.379310,-0.379310
5208,0.00,0.568182,0.583333,0.478469,0.407407,0.416667,0.390686,0.428571,0.529412,0.548168,...,-0.034483,-0.068966,0.482759,0.344828,0.517241,-0.241379,-0.517241,0.379310,-0.137931,0.103448
5209,0.00,0.659091,0.650000,0.528708,0.444444,0.466667,0.386805,0.166667,0.117647,1.000000,...,0.172414,0.275862,-0.034483,0.103448,-0.517241,0.034483,0.103448,0.137931,-0.206897,0.482759


### Currently Only Training with No Ranking

In [215]:
numerical_df = no_rankings_df.select_dtypes(include=['number'])
non_numerical_df = no_rankings_df.select_dtypes(exclude=['number'])

numerical_df['target_x']

0       0
1       0
2       1
3       0
4       1
       ..
5206    1
5207    0
5208    0
5209    0
5210    0
Name: target_x, Length: 5211, dtype: int64

In [221]:
non_numerical_df

Unnamed: 0,Teams_x_x,Teams_opp_x,date_x,won_x,team_next_opp_x,date_next_x,Teams_opp_y,date_y,won_y,date_next_prev
0,DEN,ATL,2018-11-15,True,NOP,2018-11-17,NYK,2018-11-16,True,2018-11-16
1,WAS,BRK,2018-11-16,False,POR,2018-11-18,MIN,2018-11-16,False,2018-11-17
2,NOP,NYK,2018-11-16,True,DEN,2018-11-17,ATL,2018-11-15,True,2018-11-16
3,BOS,TOR,2018-11-16,True,UTA,2018-11-17,PHI,2018-11-16,False,2018-11-16
4,MIL,CHI,2018-11-16,True,DEN,2018-11-19,NOP,2018-11-17,False,2018-11-18
...,...,...,...,...,...,...,...,...,...,...
5206,LAC,LAL,2024-02-28,False,WAS,2024-03-01,LAL,2024-02-29,False,2024-02-29
5207,CHI,CLE,2024-02-28,True,MIL,2024-03-01,CHO,2024-02-29,True,2024-02-29
5208,TOR,DAL,2024-02-28,False,GSW,2024-03-01,NYK,2024-02-29,True,2024-02-29
5209,IND,NOP,2024-02-28,True,NOP,2024-03-01,IND,2024-02-28,False,2024-02-29


## 3. Model Building <a id='3'></a>

In [222]:
def backtest(data, model, predictors, true):
    X = data[predictors]
    y = data['target_x']
    
    test_size = len(y) // 4
    
    best_accuracy = 0
    best_f1 = 0
    best_model = None
    fold = 0
    
    tscv = TimeSeriesSplit(n_splits=3, test_size=test_size)
    accuracy_scores = []
    
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, predictions)
        f1 = f1_score(y_test, predictions)
        accuracy_scores.append(accuracy)
    
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_f1 = f1
            best_model = model
            fold = len(accuracy_scores)

        print(f'Accuracy for fold {len(accuracy_scores)}: {accuracy}')
        print(f'F1 for fold {len(accuracy_scores)}: {f1}')
        print("\n")

    if (True):
        count = len(predictors)
        accuracy = round(best_accuracy, 3) * 100
        save_path = '/Users/liqingyang/Documents/GitHub/sports_trading/sports_betting/ml_notebooks/weights/'
        file_path = save_path + f'ridge_with_random_forest_only_spread_{count}_predictors_{accuracy}%_2019_2024.pkl'
        with open(file_path, 'wb') as f:
            pickle.dump(best_model, f)
        
        save_path = '/Users/liqingyang/Documents/GitHub/sports_trading/sports_betting/ml_notebooks/factors/'
        file_path = save_path + f'predictors_ridge_with_random_forest_only_spread_{count}_predictors_{accuracy}%_2019_2024.txt'
        with open(file_path, 'w') as f:
            for predictor in predictors:
                f.write(f'{predictor},')
    
        print('-----------------------------------Saved the best model into directory-----------------------------------')
        print(f'Best accuracy: {best_accuracy}')
        print(f'Best f1 score: {best_f1}')

In [223]:
X = numerical_df.loc[:, ~numerical_df.columns.isin(['target_x'])]
y = numerical_df['target_x']

### 3.1 Random Forest Classifier <a id='3_1'></a>

In [None]:
# rf = RandomForestClassifier(n_estimators=350)
# split = TimeSeriesSplit(n_splits=3)

# # can also try direction backward
# sfs = SequentialFeatureSelector(rf, n_features_to_select = 70, direction='forward', cv=split)
# sfs.fit(X, y)
# predictors = list(regard[sfs.get_support()])

rf = RandomForestClassifier(n_estimators=350)
split = TimeSeriesSplit(n_splits=3)

# Using RFE for feature selection instead of SequentialFeatureSelector
rfe = RFE(estimator=rf, n_features_to_select=70)
rfe.fit(X, y)

# Get the support for the features, assuming 'X' is a DataFrame
predictors = X.columns[rfe.support_]

In [None]:
backtest(numerical_df, rf, predictors, False)

### 3.2 AdaBoost Tree <a id='3_2'></a>

In [None]:
adaboost = AdaBoostClassifier(n_estimators=100, learning_rate=1.1)
split = TimeSeriesSplit(n_splits=3)

# Using RFE for feature selection instead of SequentialFeatureSelector
rfe = RFE(estimator=adaboost, n_features_to_select=70)
rfe.fit(X, y)

predictors = X.columns[rfe.support_]

In [None]:
backtest(numerical_df, adaboost, predictors, False)

### 3.3 XGBoost (Extreme Gradient Boosting) <a id='3_3'></a>

In [None]:
xgb = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.1, max_depth=4)
split = TimeSeriesSplit(n_splits=3)

# Using RFE for feature selection instead of SequentialFeatureSelector
rfe = RFE(estimator=xgb, n_features_to_select=70)
rfe.fit(X, y)

predictors = X.columns[rfe.support_]

In [None]:
backtest(numerical_df, xgb, predictors, False)

### 3.4 CatBoost Trees

In [None]:
catboost = CatBoostClassifier(iterations=100, learning_rate=1.1, depth=2, loss_function='Logloss', verbose=False)
split = TimeSeriesSplit(n_splits=3)

# Using RFE for feature selection instead of SequentialFeatureSelector
rfe = RFE(estimator=catboost, n_features_to_select=70)
rfe.fit(X, y)

predictors = X.columns[rfe.support_]

In [None]:
backtest(numerical_df, catboost, predictors, False)