## 1. Import Packages and Dataset 

In [6]:
import pandas as pd
import numpy as np
import sklearn
import torch.nn as nn
import sklearn.model_selection
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
pd.set_option('display.max_columns', None)

In [3]:
def add_target(team):
    team['target'] = team['won'].shift(-1)
    return team

def winrate(team):
    total = team['Wins'] + team['Losses']
    total_opp = team['Wins_opp'] + team['Losses_opp']
    team['winrate'] = team['Wins'] / total
    team['winrate_opp'] = team['Wins_opp'] / total_opp
    return team

def differential(team):
    team['differential'] = team['Total'] - team['Total_opp']
    return team

def find_team_exp_average_5(team):
    numeric_columns = team.select_dtypes(include=np.number)
    rolling = numeric_columns.ewm(span=5, adjust=False).mean()
    return rolling

def find_team_exp_average_9(team):
    numeric_columns = team.select_dtypes(include=np.number)
    rolling = numeric_columns.ewm(span=9, adjust=False).mean()
    return rolling

def find_team_exp_average_12(team):
    numeric_columns = team.select_dtypes(include=np.number)
    rolling = numeric_columns.ewm(span=12, adjust=False).mean()
    return rolling

def find_team_average_15(team):
    numeric_columns = team.select_dtypes(include=np.number)
    rolling = numeric_columns.rolling(15).mean()
    return rolling

def find_team_average_10(team):
    numeric_columns = team.select_dtypes(include=np.number)
    rolling = numeric_columns.rolling(10).mean()
    return rolling

def find_team_average_5(team):
    numeric_columns = team.select_dtypes(include=np.number)
    rolling = numeric_columns.rolling(5).mean()
    return rolling

def find_team_average_3(team):
    numeric_columns = team.select_dtypes(include=np.number)
    rolling = numeric_columns.rolling(3).mean()
    return rolling

def rolling(data):
    df_rolling_3 = data[list(valid_columns) + ['Teams','won', "season"]]
    df_rolling_3 = df_rolling_3.groupby(['Teams', 'season'], group_keys = False).apply(find_team_average_3)
    df_rolling_5 = data[list(valid_columns) + ['Teams','won', "season"]]
    df_rolling_5 = df_rolling_5.groupby(['Teams', 'season'], group_keys = False).apply(find_team_average_5)
    df_rolling_10 = data[list(valid_columns) + ['Teams','won', "season"]]
    df_rolling_10 = df_rolling_10.groupby(['Teams', 'season'], group_keys = False).apply(find_team_average_10)
    df_rolling_15 = data[list(valid_columns) + ['Teams','won', "season"]]
    df_rolling_15 = df_rolling_15.groupby(['Teams', 'season'], group_keys = False).apply(find_team_average_15)
    df_exp_rolling_5 = data[list(valid_columns) + ['Teams','won', "season"]]
    df_exp_rolling_5 = df_exp_rolling_5.groupby(['Teams', 'season'], group_keys = False).apply(find_team_exp_average_5)
    df_exp_rolling_9 = data[list(valid_columns) + ['Teams','won', "season"]]
    df_exp_rolling_9 = df_exp_rolling_9.groupby(['Teams', 'season'], group_keys = False).apply(find_team_exp_average_9)
    df_exp_rolling_12 = data[list(valid_columns) + ['Teams','won', "season"]]
    df_exp_rolling_12 = df_exp_rolling_12.groupby(['Teams', 'season'], group_keys = False).apply(find_team_exp_average_12, include_groups=False)
    exp_rolling_columns_5 = [f"{col}_exp_5" for col in df_exp_rolling_5.columns]
    exp_rolling_columns_9 = [f"{col}_exp_9" for col in df_exp_rolling_9.columns]
    exp_rolling_columns_12 = [f"{col}_exp_12" for col in df_exp_rolling_12.columns]
    rolling_columns_15 = [f"{col}_15" for col in df_rolling_15.columns]
    rolling_columns_10 = [f"{col}_10" for col in df_rolling_10.columns]
    rolling_columns_5 = [f"{col}_5" for col in df_rolling_5.columns]
    rolling_columns_3 = [f"{col}_3" for col in df_rolling_3.columns]
    df_exp_rolling_12.columns = exp_rolling_columns_12
    df_exp_rolling_9.columns = exp_rolling_columns_9
    df_exp_rolling_5.columns = exp_rolling_columns_5
    df_rolling_15.columns = rolling_columns_15
    df_rolling_10.columns = rolling_columns_10
    df_rolling_5.columns = rolling_columns_5
    df_rolling_3.columns = rolling_columns_3
    df = pd.concat([data, df_rolling_3, df_rolling_5, df_rolling_10, df_rolling_15,df_exp_rolling_5,df_exp_rolling_9, df_exp_rolling_12], axis=1)
    return df

def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby("Teams", group_keys=False).apply(lambda x: shift_col(x, col_name))

def date_change(datetime_str):
    # Parse the datetime string into a datetime object
    datetime_obj = datetime.strptime(datetime_str, '%m/%d/%Y')

    # Format the datetime object into a new string structure
    new_datetime_str = datetime_obj.strftime('%Y-%m-%d')

    return new_datetime_str

In [4]:
folder_path = "/Users/benjamincheng/Documents/GitHub/Sports-Betting/data/raw_data/NBA_2018_2024.csv"
df = pd.read_csv(folder_path, index_col=0)

folder_path = "/Users/benjamincheng/Documents/GitHub/Sports-Betting/nba_api/data/teams_stats/processed_cumulative_season_stats_2019_2024.csv"
nba = pd.read_csv(folder_path, index_col=0)

### 2. Data Wrangling and Preprocessing for Modeling

In [7]:
# nba dataframe does not include the 2018 season
df = df[~df['season'].isin([2018])]
df = df.reset_index(drop=True)

# rename nba columns to match df
nba.rename(columns={'Date': 'date_next', 'Teams':'Teams_x'}, inplace=True)

# construct winrate for team
df = winrate(df)
# construct differential points
df = differential(df)
# construct target
df = df.groupby("Teams", group_keys=False).apply(add_target)

# games yet to play are 2
df.loc[pd.isnull(df['target']), 'target'] = 2
# convert win/loss to 1/0
df['target'] = df['target'].astype(int)

# remove metadata and target
removed = ['target', 'date', 'Teams_opp', 'Teams',
           'season','won', 'Wins', 'Losses', 
           'Wins_opp', 'Losses_opp']
valid_columns = df.columns[~df.columns.isin(removed)]

# scale the data
scaler = MinMaxScaler()
df[valid_columns] = scaler.fit_transform(df[valid_columns])

# construct rolling features to df
df = rolling(df).copy()
df = df.dropna()

# construct current game metadata
df['home_next'] = add_col(df, 'home')
df['team_next_opp'] = add_col(df, 'Teams_opp')
df['date_next'] = add_col(df, 'date')
df = df.copy()

# merge stats from opposing teams
full = df.merge(df,
               left_on=['Teams', 'date_next'],
               right_on = ['team_next_opp', 'date_next'])

# merge stats from nba dataframe
full = pd.merge(full, nba, on=['Teams_x', 'date_next'], how='left')
full = full.dropna()

# remove metadata and useless data
disregard = list(full.columns[full.dtypes == 'object']) 
disregard = disregard + ["home_opp_5_x","target_x","target_y", 
                         "Wins_x", "Losses_x", "Wins_opp_x", 
                         "Losses_opp_x", "season_x" , "won_x" , 
                         "home_5_x" ,"home_10_x" ,"season_5_x", 
                         "season_10_x" , "Wins_y" , "Losses_y" , 
                         "Wins_opp_y" , "Losses_opp_y" , "season_y" , 
                         "won_y" ,"home_5_y", "home_10_y","season_5_y", 
                         "season_10_y","home_opp_5_y", "home_opp_10_y"]
regard = full.columns[~full.columns.isin(disregard)]

  df = df.groupby("Teams", group_keys=False).apply(add_target)
  df_rolling_3 = df_rolling_3.groupby(['Teams', 'season'], group_keys = False).apply(find_team_average_3)
  df_rolling_5 = df_rolling_5.groupby(['Teams', 'season'], group_keys = False).apply(find_team_average_5)
  df_rolling_10 = df_rolling_10.groupby(['Teams', 'season'], group_keys = False).apply(find_team_average_10)
  df_rolling_15 = df_rolling_15.groupby(['Teams', 'season'], group_keys = False).apply(find_team_average_15)
  df_exp_rolling_5 = df_exp_rolling_5.groupby(['Teams', 'season'], group_keys = False).apply(find_team_exp_average_5)
  df_exp_rolling_9 = df_exp_rolling_9.groupby(['Teams', 'season'], group_keys = False).apply(find_team_exp_average_9)
  return df.groupby("Teams", group_keys=False).apply(lambda x: shift_col(x, col_name))
  return df.groupby("Teams", group_keys=False).apply(lambda x: shift_col(x, col_name))
  return df.groupby("Teams", group_keys=False).apply(lambda x: shift_col(x, col_name))


In [9]:
# save processed dataframe
save_path = '/Users/benjamincheng/Documents/GitHub/Sports-Betting/data/processed_data/'
file = 'processed_2019_2024.csv'
file_name = save_path + file
full.to_csv(file_name)

## 3. Modeling

In [66]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)

# can also try direction backward
sfs = SequentialFeatureSelector(rr, n_features_to_select = 70, direction='forward', cv=split)

In [67]:
sfs.fit(full[regard], full['target_x'])

In [68]:
predictors = list(regard[sfs.get_support()])

In [69]:
predictors

['efg%_x',
 'usg%_x',
 'eFG%_x',
 'usg%_opp_x',
 'winrate_x',
 'tov_3_x',
 'usg%_3_x',
 'usg%_opp_3_x',
 'tov%maxes_opp_3_x',
 'winrate_3_x',
 'usg%_5_x',
 'usg%_opp_5_x',
 'fga_10_x',
 'usg%_10_x',
 'ortg_10_x',
 'drtg_10_x',
 '3pamaxes_10_x',
 'drtgmaxes_10_x',
 'ORtg_10_x',
 '3pa_opp_10_x',
 'usg%_opp_10_x',
 'ortg_opp_10_x',
 'drtg_opp_10_x',
 'differential_10_x',
 'tov_15_x',
 'usg%_15_x',
 'usg%_opp_15_x',
 '3p%maxes_opp_15_x',
 'tov_exp_5_x',
 'ast%_exp_5_x',
 'usg%_exp_5_x',
 'stl%maxes_exp_5_x',
 'usg%_opp_exp_5_x',
 'winrate_exp_5_x',
 'usg%_exp_9_x',
 'ts%_opp_exp_9_x',
 'usg%_opp_exp_9_x',
 '+/-maxes_opp_exp_9_x',
 'usg%_exp_12_x',
 '3pa_opp_exp_12_x',
 'usg%_opp_exp_12_x',
 '3pamaxes_opp_exp_12_x',
 'usg%_y',
 'usg%_opp_y',
 'usg%_3_y',
 'usg%_opp_3_y',
 'usg%_5_y',
 'usg%_opp_5_y',
 'usg%_10_y',
 'usg%_opp_10_y',
 'stl%maxes_opp_10_y',
 'usg%_15_y',
 'usg%_opp_15_y',
 'stl%maxes_opp_15_y',
 'usg%_exp_5_y',
 'usg%_opp_exp_5_y',
 'usg%_exp_9_y',
 'stl%_opp_exp_9_y',
 'usg%_

In [79]:
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import accuracy_score,f1_score
import pickle 

def backtest(data, model, predictors, true):
    X = data[predictors]
    y = data['target_x']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.20, random_state=42)
    
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    result = pd.Series(predictions, index = y_test.index)
    
    final = pd.concat([y_test, result], axis=1)
    final.columns = ['Actual', 'Predictions']
    if (True):
        with open('70_87.3%_ridge_classifier_2019-2024.pkl', 'wb') as f:
            pickle.dump(model, f)
    print(model.coef_)
    return final

In [80]:
final = backtest(full, rr, predictors, False)

[[-1.62641137e-01  0.00000000e+00 -1.62641137e-01  0.00000000e+00
  -9.62305732e+00  1.74947981e-01  0.00000000e+00  0.00000000e+00
  -8.62025456e-03 -4.55184334e+00  0.00000000e+00  0.00000000e+00
   1.55051862e-01  0.00000000e+00  3.88640686e-02 -3.31291176e-01
   6.00263675e-02  4.29007434e-01  3.88640686e-02  2.27879600e-01
   0.00000000e+00 -3.31291176e-01  3.88640686e-02  1.27507706e-01
  -5.52326079e-01  0.00000000e+00  0.00000000e+00  1.10658538e-01
   2.20684009e-01  6.30930866e-03  0.00000000e+00  7.96480812e-01
   0.00000000e+00 -3.87215967e+00  0.00000000e+00  6.20079359e-01
   0.00000000e+00  1.11541847e+00  0.00000000e+00 -3.10677707e-01
   0.00000000e+00 -2.29087179e-01  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00 -1.37408096e-01  0.00000000e+00
   0.00000000e+00 -1.00269910e-01  0.00000000e+00  0.00000000e+00
   0.00000000e+00  8.58078342e-01  0.00000000e+00 -2.18016405e-01
   0.00000

In [81]:
final

Unnamed: 0,Actual,Predictions
7348,1,1
7349,1,1
7350,0,0
7351,0,0
7352,0,0
...,...,...
10669,1,1
10670,1,1
10671,1,1
10672,0,0


In [82]:
accuracy = accuracy_score(final['Actual'], final['Predictions'])
f1 = f1_score(final['Actual'], final['Predictions'])

In [83]:
print("The accuracy is: ", accuracy)
print("The f1 is: ", f1)

The accuracy is:  0.8541893362350381
The f1 is:  0.8559139784946237


In [52]:
# save the predictors 
# file_path = 'Predictions/Factors/70_87.3%_predictors_ridge_classifier_2019-2024.txt'
# with open(file_path, 'w') as f:
#     for predictor in predictors:
#         f.write(f'{predictor},')

In [53]:
def backtest_time_series(data, model, predictors, true):
    X = data[predictors]
    y = data['target_x']
    
    test_size = len(y) // 4
    
    tscv = TimeSeriesSplit(n_splits=3, test_size=test_size)
    
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        result = pd.Series(predictions, index=y_test.index)
        
        final = pd.concat([y_test, result], axis=1)
        final.columns = ['Actual', 'Predictions']
        
        
    if (True):
        with open('time_series_ridge_classifier_2018-2024.pkl', 'wb') as f:
            pickle.dump(model, f)
    return final

In [54]:
final_time_series = backtest_time_series(full, rr, predictors, False)

In [55]:
accuracy_time = accuracy_score(final_time_series['Actual'], final_time_series['Predictions'])
f1_time = f1_score(final_time_series['Actual'], final_time_series['Predictions'])

In [56]:
print("The accuracy is: ", accuracy_time)
print("The f1 is: ", f1_time)

The accuracy is:  0.850609756097561
The f1 is:  0.8533561351004704
