# Modelling

In [232]:
import pandas as pd
import numpy as np

In [233]:
df = pd.read_csv("data/games_with_features.csv", index_col="id")

In [234]:
df.head()

Unnamed: 0_level_0,date,home_team_score,period,postseason,season,status,visitor_team_score,home_team.id,home_team.abbreviation,home_team.conference,...,visitor_team.full_name,winner,home_team_avg_score_historical,visitor_team_avg_score_historical,home_team_id_year,visitor_team_id_year,home_team_avg_score,visitor_team_avg_score,home_avg_score_diff,visitor_avg_score_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
47179,2019-01-30,126,4,False,2018,Final,94,2,BOS,East,...,Charlotte Hornets,1,105.8,98.2,2 2018,4 2018,112.8,108.3,3.62,-3.831707
48751,2019-02-09,112,4,False,2018,Final,123,2,BOS,East,...,LA Clippers,0,105.8,100.7,2 2018,13 2018,112.8,113.1,3.62,0.581818
48739,2019-02-08,117,4,False,2018,Final,110,23,PHI,East,...,Denver Nuggets,1,103.3,104.2,23 2018,8 2018,117.9,108.2,8.725532,-4.670213
48740,2019-02-08,119,4,False,2018,Final,106,30,WAS,East,...,Cleveland Cavaliers,1,103.4,98.3,30 2018,6 2018,116.4,103.8,7.429268,-8.419512
48746,2019-02-08,102,4,False,2018,Final,96,26,SAC,West,...,Miami Heat,1,105.5,96.0,26 2018,16 2018,114.9,105.4,5.129268,-6.670732


### Ultra Baseline

In [242]:
# predict winner only using home_team_avg_score when playing at home
# vs visitor_team_avg_score when playing as visitor
home_should_win = df[df["home_team_avg_score"].gt(df["visitor_team_avg_score"])]
home_should_win["winner"].value_counts(normalize=True)

1    0.664878
0    0.335122
Name: winner, dtype: float64

In [243]:
s2018 = df[df["season"].eq(2018)]
# predict winner only using home_team_avg_score when playing at home
# vs visitor_team_avg_score when playing as visitor
home_should_win = s2018[s2018["home_team_avg_score"].gt(s2018["visitor_team_avg_score"])]
home_should_win["winner"].value_counts(normalize=True)

1    0.669248
0    0.330752
Name: winner, dtype: float64

> Home team wins 66% of the time when their avg score is higher   
> there is a flaw with this baseline in that it uses the average score from all games   
> that occured that season and correlates it with the winner of games that occured before   
> that average score was known

In [58]:
df[(df["home_team_avg_score"] - 5).gt(df["visitor_team_avg_score"])]["winner"].value_counts(normalize=True)

1    0.717187
0    0.282813
Name: winner, dtype: float64

> same experiment as above but only using teams that are heavy favourites (10 pt average more).  
> win percantage seems to increse 1% per 1pt advantage

In [61]:
s2020 = df[df["season"].eq(2020)]
s2019 = df[df["season"].eq(2019)]
s2018 = df[df["season"].eq(2018)]
s2017 = df[df["season"].eq(2017)]
s2016 = df[df["season"].eq(2016)]
s2015 = df[df["season"].eq(2015)]

In [206]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

### Baseline

In [297]:
lr = LogisticRegression(max_iter=1000)
nb = GaussianNB()
knn = KNeighborsClassifier()
rf = RandomForestClassifier()
xgb = XGBClassifier(eval_metric="logloss", use_label_encoder=False)

models = [lr, nb, knn, rf, xgb]

In [222]:
def train_model(model, train_data, test_data):
    X_train = train_data.drop("winner", axis=1)
    y_train = train_data["winner"]
    X_test = test_data.drop("winner", axis=1)
    y_test = test_data["winner"]                     

    model = model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(accuracy_score(y_test, y_pred))
    #print(cross_val_score(model, feats, target).mean())

In [231]:
train_data = df[df["season"].isin([2017])][["home_team_avg_score", "visitor_team_avg_score", "winner"]]
test_data = df[df["season"].isin([2018])][["home_team_avg_score", "visitor_team_avg_score", "winner"]]
for model in models:
    train_model(model, train_data, test_data)

0.6399694889397407
0.6414950419527079
0.5713196033562167
0.5812356979405034
0.5408085430968727


#### using score diff feat

In [261]:
features = ["winner", "home_team_avg_score", "visitor_team_avg_score", "home_avg_score_diff", "visitor_avg_score_diff"]
train_data = df[df["season"].isin(range(2015,2018))][features]
test_data = df[df["season"].isin([2018])][features]
for model in models:
    train_model(model, train_data, test_data)

0.6361556064073226
0.6376811594202898
0.566742944317315
0.5903890160183066
0.5881006864988558


> xgboost and random forest predict around 80% correctly when using data from the same season.  
> unfortunately when trying to predict results of future seasons the result is horrible.

##### The Big problem with this so far is that averages for the season are used to predict outcomes of games that contributed to those averages

# New Baseline

In [298]:
stats = pd.read_csv("data/stats_feats2020.csv", index_col="game_id")

In [300]:
stats = pd.read_csv("data/stats_diff2020.csv", index_col="game_id")

In [301]:
X = stats.drop(["winner", "game_date", "season"], axis=1)
y = stats["winner"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)

for model in models:
    model = model
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_test)
    
    print(model)
    print(accuracy_score(y_train, y_train_pred))
    print(accuracy_score(y_test, y_pred))
    print(cross_val_score(model, X_test, y_test).mean())

LogisticRegression(max_iter=1000)
0.8149466192170819
0.8014184397163121
0.7659774436090225
GaussianNB()
0.7805456702253856
0.7553191489361702
0.7412907268170426
KNeighborsClassifier()
0.830367734282325
0.7411347517730497
0.7094611528822055
RandomForestClassifier()
1.0
0.7943262411347518
0.7519423558897244
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)
1.0
0.7907801418439716
0.7199248120300752


> this seems to good to be true   

## Using stats from 2015 - 2020

In [344]:
lr = LogisticRegression(max_iter=1000)
nb = GaussianNB()
knn = KNeighborsClassifier()
rf = RandomForestClassifier(n_estimators=100)
xgb = XGBClassifier(n_estimators=100, eval_metric="logloss", use_label_encoder=False)

models = [lr, nb, knn, rf, xgb]

In [345]:
stats = pd.read_csv("data/stats_feats.csv", index_col="game_id")

In [346]:
stats = pd.read_csv("data/stats_diff.csv", index_col="game_id")

In [347]:
X = stats.drop(["winner", "game_date", "season"], axis=1)
y = stats["winner"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)

for model in models:
    model = model
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_test)
    
    print(model)
    print(accuracy_score(y_train, y_train_pred))
    print(accuracy_score(y_test, y_pred))
    print(cross_val_score(model, X_test, y_test).mean())

LogisticRegression(max_iter=1000)
0.6042614811750103
0.6315136476426799
0.6234544160913794
GaussianNB()
0.5732312784443525
0.5887096774193549
0.5669749822125647
KNeighborsClassifier()
0.7139015308233347
0.5359801488833746
0.5651673941887967
RandomForestClassifier()
1.0
0.5874689826302729
0.580020383439417
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)
0.9940008274720729
0.5874689826302729
0.565128934

> The test data is no longer leaking into the training data. But the model still overfits

In [329]:
stats

Unnamed: 0_level_0,ast,blk,dreb,fg3_pct,fg3a,fg3m,fg_pct,fga,fgm,ft_pct,...,ftm,oreb,pf,pts,reb,stl,turnover,winner,game_date,season
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30003,-11.0,-1.0,-25.5,-0.031928,-7.0,-4.0,0.027298,-43.5,-16.0,0.001452,...,-7.5,-4.5,-14.0,-43.5,-30.0,7.5,-1.0,1,2016-03-17,2015
30007,10.0,1.5,-1.0,0.061259,4.5,1.0,0.022100,10.0,6.0,-0.106318,...,-8.0,-2.0,-3.0,5.0,-3.0,5.0,3.0,1,2016-03-19,2015
30015,5.5,4.5,-7.0,0.266854,4.5,5.0,0.054572,-2.0,5.0,-0.110832,...,-8.0,-6.0,-5.0,7.0,-13.0,1.0,-6.0,0,2016-03-21,2015
29327,3.0,-0.5,-8.0,0.200218,14.0,6.0,-0.025511,9.0,1.0,-0.170871,...,-10.0,0.0,-2.0,-2.0,-8.0,3.0,-7.5,1,2016-03-25,2015
30972,4.0,-2.0,-29.0,0.077327,-15.5,-0.5,0.074432,-51.5,-10.5,0.030245,...,-6.5,-10.0,-15.0,-28.0,-39.0,0.5,-5.0,0,2016-04-01,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264689,-4.5,-0.5,-3.0,-12.449231,-12.0,-8.0,-5.912404,4.0,3.0,-19.677308,...,-1.5,3.5,4.5,-3.5,0.5,-2.5,-5.5,1,2021-05-03,2020
264801,10.0,0.5,2.5,1.664103,-16.0,-7.0,6.241220,9.0,15.0,0.304157,...,2.0,-1.0,1.0,25.0,1.5,0.0,1.5,1,2021-05-16,2020
423334,9.0,1.0,11.5,-11.306818,-2.0,0.0,0.393636,9.5,12.5,-15.695455,...,-1.0,2.5,1.5,24.0,14.0,0.5,-2.5,1,2021-05-20,2020
430019,6.0,1.5,5.5,5.009805,-1.5,0.5,9.536515,12.5,13.0,12.927381,...,-1.5,3.5,-1.5,25.0,9.0,-3.0,0.5,0,2021-05-29,2020


In [334]:
train = stats[stats["season"].isin([2015,2016,2017,2018,2019])]
test = stats[stats["season"].eq(2020)]

In [335]:
X_train = train.drop(["winner", "game_date", "season"], axis=1)
y_train = train["winner"]
X_test = test.drop(["winner", "game_date", "season"], axis=1)
y_test = test["winner"]

In [337]:
for model in models:
    model = model
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_test)
    
    print(model)
    print(accuracy_score(y_train, y_train_pred))
    print(accuracy_score(y_test, y_pred))
    print(cross_val_score(model, X_test, y_test).mean())

LogisticRegression(max_iter=1000)
0.604091059056417
0.5489690721649485
0.5498150066597602
GaussianNB()
0.5747278126031012
0.5446735395189003
0.5292326476246856
KNeighborsClassifier()
0.7040580666446717
0.5077319587628866
0.5368876720438064
RandomForestClassifier()
1.0
0.5498281786941581
0.5541142518869321
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)
0.9731111844275817
0.5223367697594502
0.537797839

## Using all stats

In [344]:
lr = LogisticRegression(max_iter=1000)
nb = GaussianNB()
knn = KNeighborsClassifier()
rf = RandomForestClassifier(n_estimators=100)
xgb = XGBClassifier(n_estimators=100, eval_metric="logloss", use_label_encoder=False)

models = [lr, nb, knn, rf, xgb]

In [348]:
stats = pd.read_csv("data/all_stats_feats.csv", index_col="game_id")

In [346]:
stats = pd.read_csv("data/all_stats_diff.csv", index_col="game_id")

In [349]:
X = stats.drop(["winner", "game_date", "season"], axis=1)
y = stats["winner"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)

for model in models:
    model = model
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_test)
    
    print(model)
    print(accuracy_score(y_train, y_train_pred))
    print(accuracy_score(y_test, y_pred))
    print(cross_val_score(model, X_test, y_test).mean())

LogisticRegression(max_iter=1000)
0.646621661633209
0.6394884092725819
0.6358469454665105
GaussianNB()
0.6025641025641025
0.6032507327471356
0.6005859641619711
KNeighborsClassifier()
0.7380825487061052
0.5949018562927436
0.5878844058255538
RandomForestClassifier()
1.0
0.6324718003375077
0.6205701748502541
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)
0.8204239947888908
0.628386179944933
0.6108887859

In [351]:
train = stats[stats["season"].lt(2018)]
test = stats[stats["season"].eq(2018)]

In [352]:
X_train = train.drop(["winner", "game_date", "season"], axis=1)
y_train = train["winner"]
X_test = test.drop(["winner", "game_date", "season"], axis=1)
y_test = test["winner"]

In [353]:
for model in models:
    model = model
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_test)
    
    print(model)
    print(accuracy_score(y_train, y_train_pred))
    print(accuracy_score(y_test, y_pred))
    print(cross_val_score(model, X_test, y_test).mean())

LogisticRegression(max_iter=1000)
0.6487454329889424
0.6147978642257819


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6224131425420137
GaussianNB()
0.5155943768298289
0.5034324942791762
0.5957333178533073
KNeighborsClassifier()
0.7420213409470348
0.5766590389016019
0.5477345949554466
RandomForestClassifier()
1.0
0.593440122044241
0.5789597422575683
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)
0.8124561445958044
0.5682684973302822
0.52939076422953
