# Modeling

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyRegressor
import xgboost as xgb
import joblib

In [2]:
# Load the data set from rb.csv (path is relative to the notebook's working
# directory) into a single DataFrame
df = pd.read_csv('rb.csv')

In [3]:
# Function to convert height from 'feet-inches' to inches
def height_to_inches(height):
    """Convert a height written as 'feet-inches' (e.g. '5-11') to total inches.

    Parameters
    ----------
    height : str or number
        A 'feet-inches' string. A value that is already numeric (including
        NaN) is returned unchanged, so applying the conversion to an
        already-converted column is a no-op instead of an AttributeError.

    Returns
    -------
    int or number
        ``feet * 12 + inches`` for string input; the input itself when it is
        already numeric.

    Raises
    ------
    ValueError
        If a string does not consist of exactly two integer parts separated
        by a single '-'.
    """
    # Already converted (or missing): nothing to parse.
    if isinstance(height, (int, float, np.integer, np.floating)):
        return height

    parts = height.split('-')
    if len(parts) != 2:
        raise ValueError(f"expected height as 'feet-inches', got {height!r}")
    feet, inches = parts
    return int(feet) * 12 + int(inches)

# Replace the 'feet-inches' strings in Height with their value in inches,
# element by element, then preview the first rows of the converted column
df['Height'] = df['Height'].map(height_to_inches)
df['Height'].head()

0    70
1    71
2    71
3    72
4    72
Name: Height, dtype: int64

In [4]:
# List every column in the loaded frame before choosing features and targets
df.columns

Index(['Player', 'Tm', 'Age', 'Tgt', 'Rec_P', 'Ctch%', 'Yds_p', 'TD_P', '1D',
       'Succ%', 'Y/Tgt', 'R/G', 'Y/G', 'G_P', 'GS', 'Att', 'Yds_R_P', 'TD_R_P',
       '1D_P', 'Succ%_R', 'Y/A', 'Y/G_R', 'Fmb', 'G', 'Att_R', 'Yds_R',
       'Avg_R', 'TD_R', 'Rec', 'Yds', 'Avg', 'TD', 'Plays_S', 'Yds_S', 'Avg_S',
       'TD_S', 'Conf', 'School', 'Height', 'Weight', '40yd', 'Vertical',
       'Broad Jump', 'Drafted', 'OvrPick'],
      dtype='object')

In [5]:
# Columns left out of the feature matrix. The list includes the five columns
# used as prediction targets further down (Yds_R_P, TD_R_P, Rec_P, Yds_p, TD_P).
non_feature_columns = [
    'Player', 'Tm', 'Tgt', 'Rec_P', 'Ctch%', 'Yds_p', 'TD_P', '1D',
    'Succ%', 'Y/Tgt', 'R/G', 'Y/G', 'G_P', 'GS', 'Att', 'Yds_R_P', 'TD_R_P',
    '1D_P', 'Succ%_R', 'Y/A', 'Y/G_R', 'Fmb', 'Avg_R', 'Avg', 'Plays_S', 'Yds_S', 'Avg_S',
    'TD_S', 'School',
]

# Feature matrix: everything in df that is not in the list above
X = df.drop(columns=non_feature_columns)
X.columns

Index(['Age', 'G', 'Att_R', 'Yds_R', 'TD_R', 'Rec', 'Yds', 'TD', 'Conf',
       'Height', 'Weight', '40yd', 'Vertical', 'Broad Jump', 'Drafted',
       'OvrPick'],
      dtype='object')

In [6]:
# Collects the tuned pipeline for each target, in the order the sections below
# append them; the list is written to disk in the last cell
models = []

### Predicting Rushing Yards

In [7]:
# Target for this section: the Yds_R_P column (excluded from X above)
y1 = df['Yds_R_P']

In [8]:
# Store Conf with pandas' categorical dtype; it is the one column the
# pipeline's ColumnTransformer one-hot encodes
X['Conf'] = X['Conf'].astype('category')

In [9]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition repeatable across runs
X_train, X_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.2, random_state=42)

In [10]:
# Define the column transformer with handle_unknown='ignore'
# (a Conf value not seen during fit is ignored at transform time instead of
# raising an error)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Conf'])  # Apply one-hot encoding to the categorical feature
    ],
    remainder='passthrough'  # Leave other features untouched
)

# Define the pipeline: preprocessing followed by a random forest.
# The forest is seeded so that the grid searches, the reported metrics and the
# saved models are reproducible on Restart & Run All; 42 matches the seed
# already used for the train/test splits. Without a seed every run bootstraps
# and splits differently and the printed numbers cannot be reproduced.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [11]:
# Search space: number of trees and maximum tree depth of the forest step
param_grid = {
    'regressor__n_estimators': [150, 200, 250, 300],
    'regressor__max_depth': [25, 30, 35, 40],
}

# Exhaustive 5-fold cross-validated search over the grid, scored by negative
# MSE and run on all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y1_train)

# Winning parameter combination and the pipeline fitted with it
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)

# Score the tuned pipeline on the held-out rows
y1_pred = best_model.predict(X_test)
mse = mean_squared_error(y1_test, y1_pred)
print("Mean Squared Error:", mse)

Best Parameters: {'regressor__max_depth': 40, 'regressor__n_estimators': 200}
Mean Squared Error: 118354.14456524391


In [12]:
# Test-set error of the tuned model, as MSE and RMSE
mse = mean_squared_error(y1_test, y1_pred)
rmse = np.sqrt(mse)

# Baseline model: always predicts the mean of the training target
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y1_train)
dummy_pred = dummy.predict(X_test)
baseline_mse = mean_squared_error(y1_test, dummy_pred)
baseline_rmse = np.sqrt(baseline_mse)

# Report model and baseline side by side
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Baseline Mean Squared Error:", baseline_mse)
print("Baseline Root Mean Squared Error:", baseline_rmse)

Mean Squared Error: 118354.14456524391
Root Mean Squared Error: 344.02637190373053
Baseline Mean Squared Error: 126353.76780856657
Baseline Root Mean Squared Error: 355.4627516471544


In [13]:
# Keep the tuned Yds_R_P pipeline (first entry in models)
models.append(best_model)

### Predicting Rushing Touchdowns

In [14]:
# Target for this section: the TD_R_P column
y2 = df['TD_R_P']
# Same features, test_size and random_state as the previous split; note that
# X_train / X_test are rebound here
X_train, X_test, y2_train, y2_test = train_test_split(X, y2, test_size=0.2, random_state=42)

In [15]:
# Search space for this target: tree counts from 50 to 250 and depth limits
# including None (no limit)
param_grid = {
    'regressor__n_estimators': [50, 100, 150, 175, 200, 225, 250],
    'regressor__max_depth': [10, 20, 30, None],
}

# 5-fold cross-validated grid search on the shared pipeline, scored by
# negative MSE, using every core
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y2_train)

# Winning parameter combination and the pipeline fitted with it
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)

# Held-out performance of the tuned pipeline
y2_pred = best_model.predict(X_test)
mse = mean_squared_error(y2_test, y2_pred)
print("Mean Squared Error:", mse)

Best Parameters: {'regressor__max_depth': 10, 'regressor__n_estimators': 250}
Mean Squared Error: 8.745196117179631


In [16]:
# Test-set error of the tuned model, as MSE and RMSE
mse = mean_squared_error(y2_test, y2_pred)
rmse = np.sqrt(mse)

# Baseline model: always predicts the mean of the training target
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y2_train)
dummy_pred = dummy.predict(X_test)
baseline_mse = mean_squared_error(y2_test, dummy_pred)
baseline_rmse = np.sqrt(baseline_mse)

# Report model and baseline side by side
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Baseline Mean Squared Error:", baseline_mse)
print("Baseline Root Mean Squared Error:", baseline_rmse)

Mean Squared Error: 8.745196117179631
Root Mean Squared Error: 2.9572277756675476
Baseline Mean Squared Error: 9.821146052593196
Baseline Root Mean Squared Error: 3.1338707779028154


In [17]:
# Keep the tuned TD_R_P pipeline (second entry in models)
models.append(best_model)

### Predicting Receptions

In [18]:
# Target for this section: the Rec_P column
y3 = df['Rec_P']
# Same features, test_size and random_state as the earlier splits; note that
# X_train / X_test are rebound here
X_train, X_test, y3_train, y3_test = train_test_split(X, y3, test_size=0.2, random_state=42)

In [19]:
# Search space for this target: tree counts from 50 to 250 and depth limits
# from 10 to 40 plus None (no limit)
param_grid = {
    'regressor__n_estimators': [50, 100, 150, 175, 200, 225, 250],
    'regressor__max_depth': [10, 20, 30, 40, None],
}

# 5-fold cross-validated grid search on the shared pipeline, scored by
# negative MSE, using every core
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y3_train)

# Winning parameter combination and the pipeline fitted with it
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)

# Held-out performance of the tuned pipeline
y3_pred = best_model.predict(X_test)
mse = mean_squared_error(y3_test, y3_pred)
print("Mean Squared Error:", mse)

Best Parameters: {'regressor__max_depth': 20, 'regressor__n_estimators': 150}
Mean Squared Error: 202.02671653116528


In [20]:
# Test-set error of the tuned model, as MSE and RMSE
mse = mean_squared_error(y3_test, y3_pred)
rmse = np.sqrt(mse)

# Baseline model: always predicts the mean of the training target
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y3_train)
dummy_pred = dummy.predict(X_test)
baseline_mse = mean_squared_error(y3_test, dummy_pred)
baseline_rmse = np.sqrt(baseline_mse)

# Report model and baseline side by side
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Baseline Mean Squared Error:", baseline_mse)
print("Baseline Root Mean Squared Error:", baseline_rmse)

Mean Squared Error: 202.02671653116528
Root Mean Squared Error: 14.213610256763243
Baseline Mean Squared Error: 224.11936926552633
Baseline Root Mean Squared Error: 14.970616863226656


In [21]:
# Keep the tuned Rec_P pipeline (third entry in models)
models.append(best_model)

### Predicting Receiving Yards

In [22]:
# Target for this section: the Yds_p column
y4 = df['Yds_p']
# Same features, test_size and random_state as the earlier splits; note that
# X_train / X_test are rebound here
X_train, X_test, y4_train, y4_test = train_test_split(X, y4, test_size=0.2, random_state=42)

In [23]:
# Search space for this target: tree counts from 50 to 250 and depth limits
# from 10 to 40 plus None (no limit)
param_grid = {
    'regressor__n_estimators': [50, 100, 150, 175, 200, 225, 250],
    'regressor__max_depth': [10, 20, 30, 40, None],
}

# 5-fold cross-validated grid search on the shared pipeline, scored by
# negative MSE, using every core
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y4_train)

# Winning parameter combination and the pipeline fitted with it
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)

# Held-out performance of the tuned pipeline
y4_pred = best_model.predict(X_test)
mse = mean_squared_error(y4_test, y4_pred)
print("Mean Squared Error:", mse)

Best Parameters: {'regressor__max_depth': 10, 'regressor__n_estimators': 200}
Mean Squared Error: 17736.327937047456


In [24]:
# Test-set error of the tuned model, as MSE and RMSE
mse = mean_squared_error(y4_test, y4_pred)
rmse = np.sqrt(mse)

# Baseline model: always predicts the mean of the training target
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y4_train)
dummy_pred = dummy.predict(X_test)
baseline_mse = mean_squared_error(y4_test, dummy_pred)
baseline_rmse = np.sqrt(baseline_mse)

# Report model and baseline side by side
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Baseline Mean Squared Error:", baseline_mse)
print("Baseline Root Mean Squared Error:", baseline_rmse)

Mean Squared Error: 17736.327937047456
Root Mean Squared Error: 133.1778057224531
Baseline Mean Squared Error: 19936.010610099547
Baseline Root Mean Squared Error: 141.19493833030825


In [25]:
# Keep the tuned Yds_p pipeline (fourth entry in models)
models.append(best_model)

### Predicting Receiving Touchdowns

In [26]:
# Target for this section: the TD_P column
y5 = df['TD_P']
# Same features, test_size and random_state as the earlier splits; note that
# X_train / X_test are rebound here
X_train, X_test, y5_train, y5_test = train_test_split(X, y5, test_size=0.2, random_state=42)

In [27]:
# Search space for this target: tree counts from 50 to 250 and depth limits
# from 10 to 40 plus None (no limit)
param_grid = {
    'regressor__n_estimators': [50, 100, 150, 175, 200, 225, 250],
    'regressor__max_depth': [10, 20, 30, 40, None],
}

# 5-fold cross-validated grid search on the shared pipeline, scored by
# negative MSE, using every core
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y5_train)

# Winning parameter combination and the pipeline fitted with it
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print("Best Parameters:", best_params)

# Held-out performance of the tuned pipeline
y5_pred = best_model.predict(X_test)
mse = mean_squared_error(y5_test, y5_pred)
print("Mean Squared Error:", mse)

Best Parameters: {'regressor__max_depth': 10, 'regressor__n_estimators': 225}
Mean Squared Error: 0.8742414377311925


In [28]:
# Test-set error of the tuned model, as MSE and RMSE
mse = mean_squared_error(y5_test, y5_pred)
rmse = np.sqrt(mse)

# Baseline model: always predicts the mean of the training target
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y5_train)
dummy_pred = dummy.predict(X_test)
baseline_mse = mean_squared_error(y5_test, dummy_pred)
baseline_rmse = np.sqrt(baseline_mse)

# Report model and baseline side by side
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Baseline Mean Squared Error:", baseline_mse)
print("Baseline Root Mean Squared Error:", baseline_rmse)

Mean Squared Error: 0.8742414377311925
Root Mean Squared Error: 0.9350087901892647
Baseline Mean Squared Error: 0.8523176894899231
Baseline Root Mean Squared Error: 0.9232105336757825


In [29]:
# Keep the tuned TD_P pipeline (fifth entry in models).
# NOTE(review): in the recorded output this model's test MSE (0.874) is higher
# than the mean-predicting baseline's (0.852), i.e. it does not beat the
# baseline on the held-out rows — confirm that saving it is intended.
models.append(best_model)

In [30]:
# Persist each tuned pipeline as rb_model_<index>.joblib in the working directory.
# When the notebook is run top to bottom the index follows the append order:
# 0 = Yds_R_P, 1 = TD_R_P, 2 = Rec_P, 3 = Yds_p, 4 = TD_P.
for i, fitted_pipeline in enumerate(models):
    output_path = f'rb_model_{i}.joblib'
    joblib.dump(fitted_pipeline, output_path)