# Modelling

In [232]:
import pandas as pd
import numpy as np

In [233]:
# Read the game table (one row per game) and use the "id" column as the index.
df = pd.read_csv(filepath_or_buffer="data/games_with_features.csv", index_col="id")

In [234]:
# Peek at the first five rows to sanity-check the loaded columns.
df.head(5)

Unnamed: 0_level_0,date,home_team_score,period,postseason,season,status,visitor_team_score,home_team.id,home_team.abbreviation,home_team.conference,...,visitor_team.full_name,winner,home_team_avg_score_historical,visitor_team_avg_score_historical,home_team_id_year,visitor_team_id_year,home_team_avg_score,visitor_team_avg_score,home_avg_score_diff,visitor_avg_score_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
47179,2019-01-30,126,4,False,2018,Final,94,2,BOS,East,...,Charlotte Hornets,1,105.8,98.2,2 2018,4 2018,112.8,108.3,3.62,-3.831707
48751,2019-02-09,112,4,False,2018,Final,123,2,BOS,East,...,LA Clippers,0,105.8,100.7,2 2018,13 2018,112.8,113.1,3.62,0.581818
48739,2019-02-08,117,4,False,2018,Final,110,23,PHI,East,...,Denver Nuggets,1,103.3,104.2,23 2018,8 2018,117.9,108.2,8.725532,-4.670213
48740,2019-02-08,119,4,False,2018,Final,106,30,WAS,East,...,Cleveland Cavaliers,1,103.4,98.3,30 2018,6 2018,116.4,103.8,7.429268,-8.419512
48746,2019-02-08,102,4,False,2018,Final,96,26,SAC,West,...,Miami Heat,1,105.5,96.0,26 2018,16 2018,114.9,105.4,5.129268,-6.670732


### Ultra Baseline

In [242]:
# Ultra baseline: keep only the games where the home side's average score
# (home_team_avg_score) is higher than the visiting side's average score
# (visitor_team_avg_score), then look at the share of each `winner` value.
home_should_win = df.loc[df["home_team_avg_score"] > df["visitor_team_avg_score"]]
home_should_win["winner"].value_counts(normalize=True)

1    0.664878
0    0.335122
Name: winner, dtype: float64

In [243]:
# Same ultra baseline as above, restricted to the games of the 2018 season:
# keep the games where the home side's average score beats the visiting
# side's average score and look at the share of each `winner` value.
s2018 = df.loc[df["season"] == 2018]
home_should_win = s2018.loc[s2018["home_team_avg_score"] > s2018["visitor_team_avg_score"]]
home_should_win["winner"].value_counts(normalize=True)

1    0.669248
0    0.330752
Name: winner, dtype: float64

> The home team wins about 66% of the time when its average score is higher.  
> There is a flaw in this baseline, however: it uses the average score over all games
> that occurred that season and correlates it with the winner of games that occurred before
> that average score was known.

In [58]:
# Heavy favourites only: games where the home side's average score is more
# than 5 points above the visiting side's average score.
df.loc[
    df["home_team_avg_score"] - 5 > df["visitor_team_avg_score"],
    "winner",
].value_counts(normalize=True)

1    0.717187
0    0.282813
Name: winner, dtype: float64

> Same experiment as above, but only using home teams that are heavy favourites (average score more than 5 pts higher than the visitor's).  
> Win percentage seems to increase by roughly 1% per 1 pt of average-score advantage.

In [61]:
# One sub-frame per season, newest first; each holds the rows of `df` whose
# `season` equals that year.
s2020, s2019, s2018, s2017, s2016, s2015 = (
    df[df["season"] == year] for year in (2020, 2019, 2018, 2017, 2016, 2015)
)

In [206]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

### Baseline

In [215]:
# Candidate classifiers, all with library-default hyper-parameters.
# The random forest (and XGBoost) are seeded explicitly: without a fixed
# random_state the forest is rebuilt differently on every run, so the
# accuracies reported further down would not be reproducible.
RANDOM_STATE = 42

lr = LogisticRegression()
nb = GaussianNB()
knn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state=RANDOM_STATE)
xgb = XGBClassifier(
    eval_metric="logloss",
    use_label_encoder=False,
    random_state=RANDOM_STATE,
)

# Evaluation order used by the comparison loops below.
models = [lr, nb, knn, rf, xgb]

In [222]:
def train_model(model, train_data, test_data):
    """Fit ``model`` on ``train_data`` and score it on ``test_data``.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so the caller's object is trained after this call.
    train_data, test_data : pandas.DataFrame
        Each must contain a ``"winner"`` column, which is used as the target;
        every other column is used as a feature.

    Returns
    -------
    The value of ``accuracy_score(y_test, y_pred)``. It is also printed, so
    existing callers that ignore the return value see the same output.
    """
    target = "winner"

    X_train = train_data.drop(columns=target)
    y_train = train_data[target]
    X_test = test_data.drop(columns=target)
    y_test = test_data[target]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
    # Returned as well as printed so results can be collected and compared.
    return accuracy

In [231]:
# Baseline split: fit on the 2017 season, evaluate on the 2018 season, using
# only the two average-score columns plus the `winner` target.
train_data, test_data = (
    df.loc[
        df["season"] == season,
        ["home_team_avg_score", "visitor_team_avg_score", "winner"],
    ]
    for season in (2017, 2018)
)

# Score every candidate model on the same split.
for model in models:
    train_model(model, train_data, test_data)

0.6399694889397407
0.6414950419527079
0.5713196033562167
0.5812356979405034
0.5408085430968727


#### using score diff feat

In [261]:
# Columns kept for this experiment: the `winner` target plus the average-score
# columns and the two score-diff columns.
features = [
    "winner",
    "home_team_avg_score",
    "visitor_team_avg_score",
    "home_avg_score_diff",
    "visitor_avg_score_diff",
]

# Fit on seasons 2015-2017, evaluate on season 2018.
train_data = df.loc[df["season"].isin([2015, 2016, 2017]), features]
test_data = df.loc[df["season"] == 2018, features]

# Score every candidate model on the same split.
for model in models:
    train_model(model, train_data, test_data)

0.6361556064073226
0.6376811594202898
0.566742944317315
0.5903890160183066
0.5881006864988558


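> XGBoost and random forest predict around 80% correctly when trained and evaluated on data from the same season.  
> Unfortunately, when trying to predict the results of a future season, their accuracy drops to roughly 54–59% in the runs above, which is worse than logistic regression and naive Bayes (about 64%).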

##### The big problem so far is that season-long averages are used to predict the outcomes of the very games that contributed to those averages