# Projected Points Model
* Uses fantasy football data and NFL game spreads to predict player scores on a given week
* Brock Ricker
* https://github.com/brock-ricker
* Created 07/06/2022

In [1]:
#import modules here
import pandas as pd
import numpy as np
import math
from sqlalchemy import create_engine, inspect
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
# create sqlite engine for fantasy_league db
engine = create_engine("sqlite:///fantasy_football_data.db", echo=True)
#create connection to the engine
conn = engine.connect()

In [3]:
#creating inspector
inspector = inspect(engine)

In [4]:
#using the inspector
inspector.get_table_names()

2022-07-09 09:53:25,271 INFO sqlalchemy.engine.Engine SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
2022-07-09 09:53:25,272 INFO sqlalchemy.engine.Engine [raw sql] ()


['adp',
 'league_teams',
 'points_allowed',
 'projected_scores',
 'simple_players',
 'spreads',
 'weekly_matchup',
 'weekly_players',
 'weekly_teams']

In [5]:
#Defining dictionairy to bin game time into categorical
time_dict = {'8:30':"night", 
'1:00':"morning", 
'4:04':"afternoon", 
'4:25':"afternoon", 
'7:10':"night", 
'10:20':"night", 
'8:24':"night", 
'9:29':"night", 
'12:30':"afternoon",
'4:30':"afternoon",
'8:20':"night",
'8:15':"night",
'7:15':"night",
'10:10':"night",
'7:05':"night",
'7:00':"night",
'5:00':"afternoon",
'3:39':"afternoon",
'8:05':"night",
'4:00':"afternoon"
}

In [6]:
#SQL query for relevant data
sql = """
SELECT wp.score AS player_score, wp.week, wp.year, wp.status, sp.name, sp.team AS player_team, sp.position, s.day, s.time, s.team, s.opp, s.spread, s.predicted_score, pa.points AS team_points_allowed, ps.projected_scores, adp.averagePick
FROM weekly_players AS wp
LEFT JOIN 
simple_players AS sp
ON wp.id = sp.id AND wp.year = sp.year
LEFT JOIN
spreads AS s
ON wp.week = s.week AND wp.year = s.year AND sp.team = s.team
LEFT JOIN
points_allowed AS pa
ON s.opp = pa.team AND s.year = pa.year AND sp.position = pa.pos
LEFT JOIN
projected_scores as ps
ON wp.id = ps.id AND wp.week = ps.week AND wp.year = ps.year
LEFT JOIN
adp
ON wp.id = adp.id AND wp.year = adp.year;
"""
df = pd.read_sql_query(sql, engine)

2022-07-09 09:53:25,333 INFO sqlalchemy.engine.Engine 
SELECT wp.score AS player_score, wp.week, wp.year, wp.status, sp.name, sp.team AS player_team, sp.position, s.day, s.time, s.team, s.opp, s.spread, s.predicted_score, pa.points AS team_points_allowed, ps.projected_scores, adp.averagePick
FROM weekly_players AS wp
LEFT JOIN 
simple_players AS sp
ON wp.id = sp.id AND wp.year = sp.year
LEFT JOIN
spreads AS s
ON wp.week = s.week AND wp.year = s.year AND sp.team = s.team
LEFT JOIN
points_allowed AS pa
ON s.opp = pa.team AND s.year = pa.year AND sp.position = pa.pos
LEFT JOIN
projected_scores as ps
ON wp.id = ps.id AND wp.week = ps.week AND wp.year = ps.year
LEFT JOIN
adp
ON wp.id = adp.id AND wp.year = adp.year;

2022-07-09 09:53:25,333 INFO sqlalchemy.engine.Engine [raw sql] ()


In [7]:
#filling in blank projected scores to 0
df["player_score"] = df["player_score"].fillna(0)
df["projected_scores"] = df["projected_scores"].fillna(0)

In [8]:
#dropping FAs
df = df[~(df["player_team"]=="FA")]
df = df[~(df["player_team"]=="FA*")]

In [9]:
#remaining null values are from bye weeks and cancelled games, I think it is best to just drop these completely
df_clean = df.dropna()

In [10]:
#binning time slots of games to morning/afternoon/night
df_clean["time"].replace(time_dict,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["time"].replace(time_dict,inplace=True)


In [11]:
#split into features and target
X = df_clean[["week","position","day","time","spread","predicted_score","team_points_allowed","averagePick","projected_scores"]]
y = df_clean["player_score"]

In [12]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

In [13]:
#create column transformer - isntantiate tools for column selector here

#column selectors
num_selector = make_column_selector(dtype_include="number")
cat_selector = make_column_selector(dtype_include="object")

#instantiate encoder
ohe = OneHotEncoder(sparse="False",handle_unknown="ignore")

#instantiate scaler
scaler = StandardScaler()

In [14]:
#make column transformer, not all models will need data scaled, but it wont hurt so I will scale everything for simplicity

transformer = make_column_transformer((scaler,num_selector),(ohe,cat_selector),remainder = "passthrough")

In [15]:
#fit transformer on X_train
transformer.fit(X_train)

In [16]:
X_train_processed = transformer.transform(X_train)
X_test_processed = transformer.transform(X_test)

In [17]:
#Building Function to evaluate all models - this will be used later
def eval_model(true,pred):
    r2 = r2_score(true,pred)
    mae = mean_absolute_error(true,pred)
    mse = mean_squared_error(true,pred)
    rmse = np.sqrt(mse)
    print(f" R2: {r2:,.4f} \n MAE: {mae:,.2f} \n MSE: {mse:,.2f} \n RMSE: {rmse:,.2f}")

In [18]:
gbr = GradientBoostingRegressor(random_state=42,loss="squared_error")

In [19]:
gbr_params = {
    'max_depth': [1,2,3,4,5,6],
    'n_estimators': [100,200,300,400,500]
}

In [20]:
gbr_grid = GridSearchCV(gbr,gbr_params)

In [21]:
gbr_grid.fit(X_train_processed,y_train)

In [22]:
gbr_best = gbr_grid.best_estimator_

In [23]:
eval_model(y_test,gbr_best.predict(X_test_processed))

 R2: 0.4409 
 MAE: 4.52 
 MSE: 39.04 
 RMSE: 6.25


In [24]:
gbr_mae = GradientBoostingRegressor(random_state=42, loss="absolute_error")

In [25]:
gbr_mae_params = {
    'max_depth': [1,2,3,4,5,6,7,8,9,10],
    'n_estimators': [50,100,200,300,400,500]
}

In [26]:
gbr_mae_grid = GridSearchCV(gbr_mae,gbr_mae_params)
gbr_mae_grid.fit(X_train_processed,y_train)

In [27]:
gbr_mae_best = gbr_mae_grid.best_estimator_

In [28]:
eval_model(y_test,gbr_mae_best.predict(X_test_processed))

 R2: 0.4150 
 MAE: 4.37 
 MSE: 40.85 
 RMSE: 6.39


In [29]:
eval_model(y_test,gbr_best.predict(X_test_processed))

 R2: 0.4409 
 MAE: 4.52 
 MSE: 39.04 
 RMSE: 6.25


In [30]:
import pickle

filename = "points_projection_model_mse.sav"
pickle.dump(gbr_best, open(filename, "wb"))

filename = "points_projection_model_mae.sav"
pickle.dump(gbr_mae, open(filename, "wb"))