# MODEL BAKEOFF
- Begin by gathering the dataset, and cleaning columns as needed
- Start by modeling against home-team-win (work out over/under later)
- Choose from a flat selection of classification models (linear, logistic, and ensemble methods) without hyperparameter tuning

In [46]:
# Data access, numerics, and the candidate classifiers used in the bake-off below.
import sqlite3
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import warnings
# Suppresses every warning category for the rest of the session.
# NOTE(review): this also hides convergence / deprecation warnings raised by the models.
warnings.filterwarnings('ignore')

import sys
import os

# Add the directory two levels above sys.path[0] to the import path (at position 1).
# Presumably that is where include_columns.py lives - confirm against the repo layout.
sys.path.insert(1, os.path.join(sys.path[0], '../..'))
from include_columns import include_columns_corr_ml

In [47]:
# Load the whole game table from the project's SQLite database into a DataFrame.
dataset = "games_2018-present"

# Location of the SQLite file. Set the NFL_DB_PATH environment variable to run
# on another machine; without it the original local path is used unchanged.
DB_PATH = os.environ.get(
    "NFL_DB_PATH",
    "/Users/aidenflynn/ML_Python/python-nfl/Data/v2.sqlite",
)

# sqlite3.connect() silently creates an empty database when the file is missing,
# which would otherwise surface later as a confusing "no such table" error.
if not os.path.isfile(DB_PATH):
    raise FileNotFoundError(f"SQLite database not found: {DB_PATH}")

# The connection is left open under the name `db`, exactly as before.
db = sqlite3.connect(DB_PATH)
# The table name contains a hyphen, so it is double-quoted inside the SQL text.
df = pd.read_sql_query(f"select * from \"{dataset}\"", db, index_col="index")
df.head()

Unnamed: 0_level_0,SEASON,AWAY_TEAM_NAME,AWAY_TEAM_PREV_RANK,HOME_TEAM_NAME,HOME_TEAM_PREV_RANK,AWAY_SCORE,HOME_SCORE,WEEK,AWAY_UNIQ_STARTERS_QB,AWAY_UNIQ_STARTERS_DEFENSE,...,DIV_MATCH,SCORE,OU,OU_COVER,SPREAD,SPREAD_COVER,WIND_SPEED,TEMP,SURFACE,ROOF
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2018,ARI,18.0,ATL,7.0,14,40,15,2,21,...,0.0,54,43.5,1.0,10.0,0.0,084,72,72084,Cardinals (deferred)
0,2018,ARI,18.0,GNB,19.0,20,17,13,2,19,...,0.0,37,41.0,0.0,13.5,0.0,wind 20 mph,34 degrees,"34 degrees, wind 20 mph",outdoors
0,2018,ARI,18.0,KAN,8.0,14,26,10,2,17,...,0.0,40,49.5,0.0,16.5,0.0,wind 5 mph,43 degrees,"43 degrees, wind 5 mph",outdoors
0,2018,ARI,18.0,LAC,15.0,10,45,12,2,18,...,0.0,55,43.0,1.0,14.0,0.0,wind 3 mph,74 degrees,"74 degrees, wind 3 mph",outdoors
0,2018,ARI,18.0,LAR,10.0,0,34,2,1,11,...,1.0,34,43.5,0.0,13.5,0.0,wind 2 mph,86 degrees,"86 degrees, wind 2 mph",outdoors


In [48]:
# Nulls in the *_PASS_YDSSCRM_* aggregate columns are replaced with 0.0.
scrm_columns = [
    f"{side}_PASS_YDSSCRM_{stat}"
    for side in ("AWAY", "HOME")
    for stat in ("MAX", "MEAN", "MIN", "STD")
]
df[scrm_columns] = df[scrm_columns].fillna(0.0)

In [49]:
# Every null that is still left is replaced in place by the sentinel -1 ("unavailable").
df.fillna(value=-1, inplace=True)
# Cell output: number of rows that still contain at least one null.
df.isna().any(axis="columns").sum()

0

Combine the W/L columns into a win percentage, then drop the raw W and L columns. Watch out for division by zero.

In [50]:
def _win_record_features(wins, losses):
    """Return ``(win_pct, games)`` for a pair of win / loss count Series.

    ``games`` is ``wins + losses``. ``win_pct`` is ``wins / games`` wherever
    ``games > 0`` and the sentinel ``-1`` everywhere else, so rows without a
    positive game count never surface a divide-by-zero result.
    """
    games = wins + losses
    win_pct = np.where(games > 0, wins / games, -1)
    return win_pct, games


# (source column prefix, name of the derived games-count column)
# The AWAY_* counterparts are currently disabled; add
# ("AWAY_PASS_PLAYOFF", "AWAY_PASS_PLAYOFF_GAMES") and
# ("AWAY_PASS_BIG_GAME", "AWAY_PASS_BIG_GAMES") to derive them as well.
record_specs = (
    ("HOME_PASS_PLAYOFF", "HOME_PASS_PLAYOFF_GAMES"),
    ("HOME_PASS_BIG_GAME", "HOME_PASS_BIG_GAMES"),
)

for prefix, games_col in record_specs:
    pct_values, games_values = _win_record_features(df[f"{prefix}_W"], df[f"{prefix}_L"])
    df[f"{prefix}_WINPCT"] = pct_values
    df[games_col] = games_values

df.tail()

Unnamed: 0_level_0,SEASON,AWAY_TEAM_NAME,AWAY_TEAM_PREV_RANK,HOME_TEAM_NAME,HOME_TEAM_PREV_RANK,AWAY_SCORE,HOME_SCORE,WEEK,AWAY_UNIQ_STARTERS_QB,AWAY_UNIQ_STARTERS_DEFENSE,...,SPREAD,SPREAD_COVER,WIND_SPEED,TEMP,SURFACE,ROOF,HOME_PASS_PLAYOFF_WINPCT,HOME_PASS_PLAYOFF_GAMES,HOME_PASS_BIG_GAME_WINPCT,HOME_PASS_BIG_GAMES
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2024,WAS,31.0,NYG,27.0,27,22,9,1,16,...,4.0,0.0,relative humidity 35%,52 degrees,"52 degrees, relative humidity 35%, wind 6 mph",outdoors,0.5,2,0.44,25
0,2024,WAS,31.0,PHI,11.0,18,26,11,1,16,...,3.5,0.0,relative humidity 63%,46 degrees,"46 degrees, relative humidity 63%, wind 9 mph",outdoors,0.4,5,0.565217,23
0,2024,WAS,31.0,PHI,11.0,23,55,21,1,21,...,6.0,0.0,relative humidity 45%,40 degrees,"40 degrees, relative humidity 45%, wind 11 mph",outdoors,0.571429,7,0.6,25
0,2024,WAS,31.0,TAM,7.0,20,37,1,-1,-1,...,3.5,0.0,relative humidity 60%,93 degrees,"93 degrees, relative humidity 60%, wind 6 mph",outdoors,0.5,4,0.487179,39
0,2024,WAS,31.0,TAM,7.0,23,20,19,1,20,...,3.0,0.0,relative humidity 57%,58 degrees,"58 degrees, relative humidity 57%, wind 2 mph",outdoors,0.5,4,0.488372,43


In [56]:
# target home team win; drop dud and standard columns
TARGET = df['Home-Team-Win']
df_dropped = df.drop(
    [
        'ROOF', 'SURFACE', 'TEMP', 'WIND_SPEED', 'SPREAD_COVER', 'SPREAD', 'OU_COVER', 'OU', 'SCORE', 'Home-Team-Win', 'HOME_SCORE', 
        'AWAY_SCORE', 'HOME_TEAM_NAME', 'AWAY_TEAM_NAME', 'SEASON', 'AWAY_PASS_PLAYOFF_L',
        'AWAY_PASS_PLAYOFF_W','AWAY_PASS_BIG_GAME_W', 'AWAY_PASS_BIG_GAME_L'
    ], 
    axis=1
)

In [57]:
# Print a summary of the feature frame (entry count, column count, dtypes, memory usage).
df_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1942 entries, 0 to 0
Columns: 544 entries, AWAY_TEAM_PREV_RANK to HOME_PASS_BIG_GAMES
dtypes: float64(525), int64(19)
memory usage: 8.1 MB


In [58]:
# split data for training
x_train, x_test, y_train, y_test = train_test_split(df_dropped, TARGET, test_size=.3)

In [59]:
# Bake-off: each enabled classifier is built with random_state=42 (plus any
# extra kwargs listed), fit on the training split, and scored by accuracy on
# the test split.
# Entries are (label used in the printout, estimator class, extra constructor kwargs).
# Commented-out entries are candidates that are currently disabled.
candidates = [
    ("HistGradientBoostingClassifier", HistGradientBoostingClassifier, {}),  # 1) defaults
    # ("LinearSVC", LinearSVC, {}),  # 2) defaults
    ("RandomForestClassifier", RandomForestClassifier, {}),  # 3) defaults
    # ("LogisticRegression", LogisticRegression, {"max_iter": 100000}),  # 4) max_iter added to avoid warnings
    ("GradientBoostingClassifier", GradientBoostingClassifier, {}),  # 5) defaults
    # ("SVC", SVC, {}),  # 6) defaults
    ("XGBoost", xgb.XGBClassifier, {}),  # 7)
    # ("SGDClassifier", SGDClassifier, {}),  # 8)
    # ("GaussianProcessClassifier", GaussianProcessClassifier, {}),  # 9)
    # ("DecisionTreeClassifier", DecisionTreeClassifier, {}),  # 10)
]

for label, estimator_cls, extra_kwargs in candidates:
    # Construct lazily, right before fitting, one model at a time.
    model = estimator_cls(random_state=42, **extra_kwargs)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{label} Accuracy: {accuracy:.4f}")

HistGradientBoostingClassifier Accuracy: 0.6278
RandomForestClassifier Accuracy: 0.6158
GradientBoostingClassifier Accuracy: 0.6175
XGBoost Accuracy: 0.6038


In [60]:
# USE COLUMNS FOUND FROM ANY POSITIVE CORR WITH HOME-TEAM-WIN
df_corr = df_dropped[include_columns_corr_ml]
x_train, x_test, y_train, y_test = train_test_split(df_corr, TARGET, test_size=.3)

In [61]:
# Bake-off: each enabled classifier is built with random_state=42 (plus any
# extra kwargs listed), fit on the training split, and scored by accuracy on
# the test split.
# Entries are (label used in the printout, estimator class, extra constructor kwargs).
# Commented-out entries are candidates that are currently disabled.
candidates = [
    ("HistGradientBoostingClassifier", HistGradientBoostingClassifier, {}),  # 1) defaults
    # ("LinearSVC", LinearSVC, {}),  # 2) defaults
    ("RandomForestClassifier", RandomForestClassifier, {}),  # 3) defaults
    # ("LogisticRegression", LogisticRegression, {"max_iter": 100000}),  # 4) max_iter added to avoid warnings
    ("GradientBoostingClassifier", GradientBoostingClassifier, {}),  # 5) defaults
    # ("SVC", SVC, {}),  # 6) defaults
    ("XGBoost", xgb.XGBClassifier, {}),  # 7)
    # ("SGDClassifier", SGDClassifier, {}),  # 8)
    # ("GaussianProcessClassifier", GaussianProcessClassifier, {}),  # 9)
    ("DecisionTreeClassifier", DecisionTreeClassifier, {}),  # 10)
]

for label, estimator_cls, extra_kwargs in candidates:
    # Construct lazily, right before fitting, one model at a time.
    model = estimator_cls(random_state=42, **extra_kwargs)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{label} Accuracy: {accuracy:.4f}")

HistGradientBoostingClassifier Accuracy: 0.6072
RandomForestClassifier Accuracy: 0.6072
GradientBoostingClassifier Accuracy: 0.6003
XGBoost Accuracy: 0.5918
DecisionTreeClassifier Accuracy: 0.5334
