# MODEL BAKEOFF
- Begin by gathering the dataset, and cleaning columns as needed
- Start for modeling against home-team-win (work out over under later)
- Choose from a flat selection of classification models, linear, logistic, and ensemble methods w/o hyperparameters

In [2]:
import sqlite3
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [3]:
dataset = "games_2018-present"
db = sqlite3.connect(f"/Users/aidenflynn/ML_Python/python-nfl/Data/v2.sqlite")
df = pd.read_sql_query(f"select * from \"{dataset}\"", db, index_col="index")
df.head()

Unnamed: 0_level_0,SEASON,AWAY_TEAM_NAME,AWAY_TEAM_PREV_RANK,HOME_TEAM_NAME,HOME_TEAM_PREV_RANK,AWAY_SCORE,HOME_SCORE,WEEK,AWAY_UNIQ_STARTERS_QB,AWAY_UNIQ_STARTERS_DEFENSE,...,DIV_MATCH,SCORE,OU,OU_COVER,SPREAD,SPREAD_COVER,WIND_SPEED,TEMP,SURFACE,ROOF
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2018,ARI,18.0,ATL,7.0,14,40,15,2,21,...,0.0,54,43.5,1.0,10.0,0.0,084,72,72084,Cardinals (deferred)
0,2018,ARI,18.0,GNB,19.0,20,17,13,2,19,...,0.0,37,41.0,0.0,13.5,0.0,wind 20 mph,34 degrees,"34 degrees, wind 20 mph",outdoors
0,2018,ARI,18.0,KAN,8.0,14,26,10,2,17,...,0.0,40,49.5,0.0,16.5,0.0,wind 5 mph,43 degrees,"43 degrees, wind 5 mph",outdoors
0,2018,ARI,18.0,LAC,15.0,10,45,12,2,18,...,0.0,55,43.0,1.0,14.0,0.0,wind 3 mph,74 degrees,"74 degrees, wind 3 mph",outdoors
0,2018,ARI,18.0,LAR,10.0,0,34,2,1,11,...,1.0,34,43.5,0.0,13.5,0.0,wind 2 mph,86 degrees,"86 degrees, wind 2 mph",outdoors


In [4]:
# fill scrm RELATED column Null with 0.0
df['AWAY_PASS_YDSSCRM_MAX'] = df['AWAY_PASS_YDSSCRM_MAX'].fillna(0.0)
df['AWAY_PASS_YDSSCRM_MEAN'] = df['AWAY_PASS_YDSSCRM_MEAN'].fillna(0.0)
df['AWAY_PASS_YDSSCRM_MIN'] = df['AWAY_PASS_YDSSCRM_MIN'].fillna(0.0)
df['AWAY_PASS_YDSSCRM_STD'] = df['AWAY_PASS_YDSSCRM_STD'].fillna(0.0)
df['HOME_PASS_YDSSCRM_MAX'] = df['HOME_PASS_YDSSCRM_MAX'].fillna(0.0)
df['HOME_PASS_YDSSCRM_MEAN'] = df['HOME_PASS_YDSSCRM_MEAN'].fillna(0.0)
df['HOME_PASS_YDSSCRM_MIN'] = df['HOME_PASS_YDSSCRM_MIN'].fillna(0.0)
df['HOME_PASS_YDSSCRM_STD'] = df['HOME_PASS_YDSSCRM_STD'].fillna(0.0)

In [5]:
# All other null values can be safely filled with -1: unavailable
df.fillna(-1, inplace=True) 
df.isnull().any(axis=1).sum()

0

In [7]:
df = df.query('OU_COVER >= 0.0 & OU >= 0.0')

In [8]:
# target home team win; drop dud and standard columns
TARGET = df['OU_COVER']
df_dropped = df.drop(['ROOF', 'SURFACE', 'TEMP', 'WIND_SPEED', 'SPREAD_COVER', 'SPREAD', 'OU_COVER', 'OU', 'SCORE', 'Home-Team-Win', 'HOME_SCORE', 'AWAY_SCORE', 'HOME_TEAM_NAME', 'AWAY_TEAM_NAME', 'SEASON'], axis=1)

In [9]:
df_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1879 entries, 0 to 0
Columns: 544 entries, AWAY_TEAM_PREV_RANK to DIV_MATCH
dtypes: float64(523), int64(21)
memory usage: 7.8 MB


In [10]:
# split data for training
x_train, x_test, y_train, y_test = train_test_split(df_dropped, TARGET, test_size=.3)

In [12]:
# 1) HistGradientBoostingClassifier (with defaults)
model = HistGradientBoostingClassifier(random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"HistGradientBoostingClassifier Accuracy: {accuracy:.4f}")

# 2) LinearSVC (with defaults)
model = LinearSVC(random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"LinearSVC Accuracy: {accuracy:.4f}")

# 3) RandomForestClassifier (with defaults)
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"RandomForestClassifier Accuracy: {accuracy:.4f}")

# 4) LogisticRegression (add max_iter to avoid warnings)
model = LogisticRegression(max_iter=100000, random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"LogisticRegression Accuracy: {accuracy:.4f}")

# 5) GradientBoostingClassifier (with defaults)
model = GradientBoostingClassifier(random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"GradientBoostingClassifier Accuracy: {accuracy:.4f}")

# 6) SVC (with defaults)
model = SVC(random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"SVC Accuracy: {accuracy:.4f}")

# 7) XGBoost 
model = xgb.XGBClassifier(random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Accuracy: {accuracy:.4f}")

# 8) SGDClassifier
model = SGDClassifier(random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"SGDClassifier Accuracy: {accuracy:.4f}")

# 9) GaussianProcessClassifier
model = GaussianProcessClassifier(random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"GaussianProcessClassifier Accuracy: {accuracy:.4f}")

# 10) DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"DecisionTreeClassifier Accuracy: {accuracy:.4f}")

HistGradientBoostingClassifier Accuracy: 0.4716
LinearSVC Accuracy: 0.4787
RandomForestClassifier Accuracy: 0.5018
LogisticRegression Accuracy: 0.4805
GradientBoostingClassifier Accuracy: 0.4911
SVC Accuracy: 0.5266
XGBoost Accuracy: 0.4911
SGDClassifier Accuracy: 0.5142
GaussianProcessClassifier Accuracy: 0.5301
DecisionTreeClassifier Accuracy: 0.5053
