# Modeling

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyRegressor
import xgboost as xgb
import joblib

### Feature Selection

I will begin selecting predictive features to use in our model based on our EDA and domain knowledge to limit multicollinearity and select good predictors.

In [2]:
# Load the data frames
df = pd.read_csv('qb.csv')

In [3]:
# Function to convert height from 'feet-inches' to inches
def height_to_inches(height):
    feet, inches = height.split('-')
    return int(feet) * 12 + int(inches)

# Apply the function to the Height column
df['Height'] = df['Height'].apply(height_to_inches)
df['Height'].head()

0    75
1    73
2    76
3    75
4    73
Name: Height, dtype: int64

In [4]:
df.columns

Index(['Player', 'Tm', 'Age', 'G_P', 'GS', 'Att_R_P', 'Yds_R_P', 'TD_R_P',
       '1D_R', 'Succ%_R', 'Y/A_R_P', 'Y/G_R', 'Fmb', 'Cmp_P', 'Att_P', 'Pct_P',
       'Yds_P', 'TD_P', 'TD%', 'Int_P', 'Int%', '1D', 'Succ%', 'Y/A_P',
       'AY/A_P', 'Y/C', 'Y/G', 'Rate_P', 'QBR', 'Sk', 'SackYds', 'Sk%', 'NY/A',
       'ANY/A', 'G', 'Cmp', 'Att', 'Pct', 'Yds', 'Y/A', 'AY/A', 'TD', 'Int',
       'Rate', 'Att_R', 'Yds_R', 'Avg_R', 'TD_R', 'Conf', 'School', 'Height',
       'Weight', '40yd', 'Vertical', 'Broad Jump', '3Cone', 'Shuttle',
       'Drafted', 'OvrPick'],
      dtype='object')

In [5]:
X = df.drop(columns=['Player', 'Tm', 'G_P', 'GS', 'Att_R_P', 'Yds_R_P', 'TD_R_P',
       '1D_R', 'Succ%_R', 'Y/A_R_P', 'Y/G_R', 'Fmb', 'Cmp_P', 'Att_P', 'Pct_P',
       'Yds_P', 'TD_P', 'TD%', 'Int_P', 'Int%', '1D', 'Succ%', 'Y/A_P',
       'AY/A_P', 'Y/C', 'Y/G', 'Cmp', 'Att', 'Rate_P', 'QBR', 'Sk', 'SackYds', 'Sk%', 'NY/A',
       'ANY/A', 'School'])
X.columns

Index(['Age', 'G', 'Pct', 'Yds', 'Y/A', 'AY/A', 'TD', 'Int', 'Rate', 'Att_R',
       'Yds_R', 'Avg_R', 'TD_R', 'Conf', 'Height', 'Weight', '40yd',
       'Vertical', 'Broad Jump', '3Cone', 'Shuttle', 'Drafted', 'OvrPick'],
      dtype='object')

In [6]:
models = []

### Predicting Passing Yards

In [7]:
y1 = df['Yds_P']

In [8]:
X['Conf'] = X['Conf'].astype('category')

In [9]:
X_train, X_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.2, random_state=42)

In [10]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 23 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Age         84 non-null     int64   
 1   G           84 non-null     float64 
 2   Pct         84 non-null     float64 
 3   Yds         84 non-null     float64 
 4   Y/A         84 non-null     float64 
 5   AY/A        84 non-null     float64 
 6   TD          84 non-null     float64 
 7   Int         84 non-null     float64 
 8   Rate        84 non-null     float64 
 9   Att_R       84 non-null     float64 
 10  Yds_R       84 non-null     float64 
 11  Avg_R       84 non-null     float64 
 12  TD_R        84 non-null     float64 
 13  Conf        84 non-null     category
 14  Height      84 non-null     int64   
 15  Weight      84 non-null     float64 
 16  40yd        84 non-null     float64 
 17  Vertical    84 non-null     float64 
 18  Broad Jump  84 non-null     float64 
 19  3Cone     

In [11]:
# Define the column transformer with handle_unknown='ignore'
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Conf'])  # Apply one-hot encoding to the categorical feature
    ],
    remainder='passthrough'  # Leave other features untouched
)

# Define the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])


In [12]:
# Define hyperparameters for tuning
param_grid = {
    'regressor__n_estimators': [245, 250, 255, 265, 270, 275, 280],
    'regressor__max_depth': [12, 13, 14, 15, 16]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Train the model
grid_search.fit(X_train, y1_train)

# Best parameters and estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)

# Make predictions
y1_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y1_test, y1_pred)
print("Mean Squared Error:", mse)

Best Parameters: {'regressor__max_depth': 14, 'regressor__n_estimators': 270}
Mean Squared Error: 1461711.505708061


To evaluate this error, let's compare against a baseline.

In [13]:
mse = mean_squared_error(y1_test, y1_pred)
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

# Baseline model
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y1_train)
dummy_pred = dummy.predict(X_test)
baseline_mse = mean_squared_error(y1_test, dummy_pred)
baseline_rmse = np.sqrt(baseline_mse)
print("Baseline Mean Squared Error:", baseline_mse)
print("Baseline Root Mean Squared Error:", baseline_rmse)

Mean Squared Error: 1461711.505708061
Root Mean Squared Error: 1209.0126160251848
Baseline Mean Squared Error: 1718306.6424069293
Baseline Root Mean Squared Error: 1310.8419593554859


In [14]:
models.append(best_model)

### Predicting Pass Touchdowns

In [15]:
y2 = df['TD_P']
X_train, X_test, y2_train, y2_test = train_test_split(X, y2, test_size=0.2, random_state=42)

In [16]:
# Define hyperparameters for tuning
param_grid = {
    'regressor__n_estimators': [100, 150, 175, 200, 225, 250],
    'regressor__max_depth': [None, 2, 4, 6, 8, 10]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Train the model
grid_search.fit(X_train, y2_train)

# Best parameters and estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)

# Make predictions
y2_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y2_test, y2_pred)
print("Mean Squared Error:", mse)

Best Parameters: {'regressor__max_depth': 2, 'regressor__n_estimators': 225}
Mean Squared Error: 53.77413840383418


In [17]:
mse = mean_squared_error(y2_test, y2_pred)
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

# Baseline model
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y2_train)
dummy_pred = dummy.predict(X_test)
baseline_mse = mean_squared_error(y2_test, dummy_pred)
baseline_rmse = np.sqrt(baseline_mse)
print("Baseline Mean Squared Error:", baseline_mse)
print("Baseline Root Mean Squared Error:", baseline_rmse)

Mean Squared Error: 53.77413840383418
Root Mean Squared Error: 7.333085190002512
Baseline Mean Squared Error: 60.86387640375821
Baseline Root Mean Squared Error: 7.801530388568528


In [18]:
models.append(best_model)

### Predicting QBR

In [19]:
y3 = df['QBR']
X_train, X_test, y3_train, y3_test = train_test_split(X, y3, test_size=0.2, random_state=42)

In [20]:
# Define hyperparameters for tuning
param_grid = {
    'regressor__n_estimators': [400, 450, 500, 550, 600],
    'regressor__max_depth': [None, 10, 20, 30, 40, 50]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Train the model
grid_search.fit(X_train, y3_train)

# Best parameters and estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)

# Make predictions
y3_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y3_test, y3_pred)
print("Mean Squared Error:", mse)

Best Parameters: {'regressor__max_depth': None, 'regressor__n_estimators': 450}
Mean Squared Error: 485.8867023645596


In [21]:
mse = mean_squared_error(y3_test, y3_pred)
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

# Baseline model
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y3_train)
dummy_pred = dummy.predict(X_test)
baseline_mse = mean_squared_error(y3_test, dummy_pred)
baseline_rmse = np.sqrt(baseline_mse)
print("Baseline Mean Squared Error:", baseline_mse)
print("Baseline Root Mean Squared Error:", baseline_rmse)

Mean Squared Error: 485.8867023645596
Root Mean Squared Error: 22.042837892716072
Baseline Mean Squared Error: 466.92322985598776
Baseline Root Mean Squared Error: 21.608406462670676


In [22]:
models.append(best_model)

### Predicting Passer Rating

In [23]:
y4 = df['Rate_P']
X_train, X_test, y4_train, y4_test = train_test_split(X, y4, test_size=0.2, random_state=42)

In [24]:
# Define hyperparameters for tuning
param_grid = {
    'regressor__n_estimators': [20, 30, 40, 50, 60, 70],
    'regressor__max_depth': [None, 2, 4, 6, 8, 10]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Train the model
grid_search.fit(X_train, y4_train)

# Best parameters and estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)

# Make predictions
y4_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y4_test, y4_pred)
print("Mean Squared Error:", mse)

Best Parameters: {'regressor__max_depth': 2, 'regressor__n_estimators': 70}
Mean Squared Error: 601.7693106656069


In [25]:
mse = mean_squared_error(y4_test, y4_pred)
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

# Baseline model
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y4_train)
dummy_pred = dummy.predict(X_test)
baseline_mse = mean_squared_error(y4_test, dummy_pred)
baseline_rmse = np.sqrt(baseline_mse)
print("Baseline Mean Squared Error:", baseline_mse)
print("Baseline Root Mean Squared Error:", baseline_rmse)

Mean Squared Error: 601.7693106656069
Root Mean Squared Error: 24.53098674463803
Baseline Mean Squared Error: 499.7804340020705
Baseline Root Mean Squared Error: 22.35576959091479


In [26]:
models.append(best_model)

### Predicting Completion Percentage

In [27]:
y5 = df['Pct_P']
X_train, X_test, y5_train, y5_test = train_test_split(X, y5, test_size=0.2, random_state=42)

In [28]:
# Define hyperparameters for tuning
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300, 400, 500],
    'regressor__max_depth': [None, 10, 20, 30, 40, 50]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Train the model
grid_search.fit(X_train, y5_train)

# Best parameters and estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)

# Make predictions
y5_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y5_test, y5_pred)
print("Mean Squared Error:", mse)

Best Parameters: {'regressor__max_depth': 40, 'regressor__n_estimators': 100}
Mean Squared Error: 163.71966311764703


In [29]:
mse = mean_squared_error(y5_test, y5_pred)
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

# Baseline model
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y5_train)
dummy_pred = dummy.predict(X_test)
baseline_mse = mean_squared_error(y5_test, dummy_pred)
baseline_rmse = np.sqrt(baseline_mse)
print("Baseline Mean Squared Error:", baseline_mse)
print("Baseline Root Mean Squared Error:", baseline_rmse)

Mean Squared Error: 163.71966311764703
Root Mean Squared Error: 12.795298477083175
Baseline Mean Squared Error: 150.5508010430726
Baseline Root Mean Squared Error: 12.269914467634752


In [30]:
models.append(best_model)

### Predicting Rushing Yards

In [31]:
y6 = df['Yds_R_P']
X_train, X_test, y6_train, y6_test = train_test_split(X, y6, test_size=0.2, random_state=42)

In [32]:
# Define hyperparameters for tuning
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300, 400, 500],
    'regressor__max_depth': [None, 10, 20, 30, 40, 50]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Train the model
grid_search.fit(X_train, y6_train)

# Best parameters and estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)

# Make predictions
y6_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y6_test, y6_pred)
print("Mean Squared Error:", mse)

Best Parameters: {'regressor__max_depth': 30, 'regressor__n_estimators': 100}
Mean Squared Error: 25421.28033529412


In [33]:
mse = mean_squared_error(y6_test, y6_pred)
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

# Baseline model
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y6_train)
dummy_pred = dummy.predict(X_test)
baseline_mse = mean_squared_error(y6_test, dummy_pred)
baseline_rmse = np.sqrt(baseline_mse)
print("Baseline Mean Squared Error:", baseline_mse)
print("Baseline Root Mean Squared Error:", baseline_rmse)

Mean Squared Error: 25421.28033529412
Root Mean Squared Error: 159.44052287700927
Baseline Mean Squared Error: 30416.332079724296
Baseline Root Mean Squared Error: 174.40278690354776


In [34]:
models.append(best_model)

### Predicting Rushing Touchdowns

In [35]:
y7 = df['TD_R_P']
X_train, X_test, y7_train, y7_test = train_test_split(X, y7, test_size=0.2, random_state=42)

In [36]:
# Define hyperparameters for tuning
param_grid = {
    'regressor__n_estimators': [50, 75, 100, 125],
    'regressor__max_depth': [None, 5, 10, 15, 20, 30, 40]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Train the model
grid_search.fit(X_train, y7_train)

# Best parameters and estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)

# Make predictions
y7_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y7_test, y7_pred)
print("Mean Squared Error:", mse)

Best Parameters: {'regressor__max_depth': 15, 'regressor__n_estimators': 50}
Mean Squared Error: 6.829223529411764


In [37]:
mse = mean_squared_error(y7_test, y7_pred)
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

# Baseline model
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y7_train)
dummy_pred = dummy.predict(X_test)
baseline_mse = mean_squared_error(y7_test, dummy_pred)
baseline_rmse = np.sqrt(baseline_mse)
print("Baseline Mean Squared Error:", baseline_mse)
print("Baseline Root Mean Squared Error:", baseline_rmse)

Mean Squared Error: 6.829223529411764
Root Mean Squared Error: 2.6132783107452915
Baseline Mean Squared Error: 7.077535937520475
Baseline Root Mean Squared Error: 2.660363873142258


In [38]:
models.append(best_model)

In [39]:
for i, model in enumerate(models):
    joblib.dump(model, f'model_{i}.joblib')