# Modeling

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyRegressor
import xgboost as xgb
import joblib

In [2]:
# Read the player data set from wrte.csv into a single DataFrame
df = pd.read_csv('wrte.csv')

In [3]:
def height_to_inches(height):
    """Convert a height written as 'feet-inches' (e.g. '6-5') to total inches.

    Parameters
    ----------
    height : str or number
        A 'feet-inches' string, or a value that is not a string at all
        (a missing value such as NaN, or a height already given in inches).

    Returns
    -------
    int
        ``feet * 12 + inches`` when ``height`` is a string.
        Any non-string input is returned unchanged.

    Raises
    ------
    ValueError
        If a string does not consist of exactly two integer parts
        separated by a single '-'.
    """
    if isinstance(height, str):
        feet, inches = height.split('-')
        return int(feet) * 12 + int(inches)
    # Non-string values are either missing (NaN/None) or already numeric,
    # e.g. when the conversion cell is run a second time on the same column.
    # Passing them through keeps the conversion idempotent instead of failing
    # with AttributeError on `.split`.
    return height

# Swap the Height column for its converted value in inches and preview it
df = df.assign(Height=df['Height'].apply(height_to_inches))
df['Height'].head()

0    77
1    71
2    75
3    74
4    74
Name: Height, dtype: int64

In [4]:
# Show every column name in the loaded frame
df.columns

Index(['Player', 'Tm', 'Age', 'Pos', 'G_P', 'GS', 'Tgt', 'Rec_P', 'Ctch%',
       'Yds_P', 'TD_P', '1D', 'Succ%', 'Y/Tgt', 'R/G', 'Y/G', 'Fmb', 'G',
       'Rec', 'Yds', 'Avg', 'TD', 'Plays_S', 'Yds_S', 'Avg_S', 'TD_S', 'Conf',
       'School', 'Height', 'Weight', '40yd', 'Vertical', 'Broad Jump',
       'Drafted', 'OvrPick'],
      dtype='object')

In [5]:
# Columns left out of the feature matrix. The list includes the three
# prediction targets used later in the notebook (Rec_P, Yds_P, TD_P).
excluded_cols = [
    'Player', 'Tm', 'Pos', 'G_P', 'GS', 'Tgt', 'Rec_P', 'Ctch%', 'Yds_P',
    'TD_P', '1D', 'Succ%', 'Y/Tgt', 'R/G', 'Y/G', 'Fmb', 'Avg', 'Plays_S',
    'Yds_S', 'Avg_S', 'TD_S', 'School',
]
X = df.drop(excluded_cols, axis='columns')
X.columns

Index(['Age', 'G', 'Rec', 'Yds', 'TD', 'Conf', 'Height', 'Weight', '40yd',
       'Vertical', 'Broad Jump', 'Drafted', 'OvrPick'],
      dtype='object')

In [6]:
# Fitted pipelines are collected here, one per target, in training order.
# The position in this list later becomes the index in the saved file name.
models = []

### Predicting Receptions

In [7]:
# Target for this section: the 'Rec_P' column
y1 = df['Rec_P']

In [8]:
# Store the conference label with pandas' categorical dtype
X = X.astype({'Conf': 'category'})

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y1_train, y1_test = train_test_split(
    X,
    y1,
    random_state=42,
    test_size=0.2,
)

In [10]:
# One-hot encode the conference label and pass every other column through to
# the model unchanged. handle_unknown='ignore' makes a conference that was not
# seen during fitting encode as all zeros instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Conf'])
    ],
    remainder='passthrough'
)

# Preprocessing followed by a random forest. The forest is seeded so that the
# grid-search winner and the reported errors are reproducible on
# Restart & Run All (42 matches the seed used for the train/test splits).
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [11]:
# Search space: forest size and maximum tree depth (None puts no cap on depth)
param_grid = dict(
    regressor__n_estimators=[50, 100, 150, 175, 200, 225, 250],
    regressor__max_depth=[10, 20, 30, None],
)

# 5-fold cross-validated search over the grid, scored by negative MSE
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y1_train)

# Winning configuration and the pipeline refit with it
best_params, best_model = grid_search.best_params_, grid_search.best_estimator_
print("Best Parameters:", best_params)

# Score the tuned pipeline on the held-out rows
y1_pred = best_model.predict(X_test)
mse = mean_squared_error(y1_test, y1_pred)
print("Mean Squared Error:", mse)

Best Parameters: {'regressor__max_depth': None, 'regressor__n_estimators': 50}
Mean Squared Error: 282.15217727272733


In [12]:
# Test-set error of the tuned model, plus its square root
mse = mean_squared_error(y1_test, y1_pred)
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

# Reference point: a model that always predicts the training-set mean
dummy = DummyRegressor(strategy="mean").fit(X_train, y1_train)
dummy_pred = dummy.predict(X_test)
baseline_mse = mean_squared_error(y1_test, dummy_pred)
baseline_rmse = np.sqrt(baseline_mse)
print("Baseline Mean Squared Error:", baseline_mse)
print("Baseline Root Mean Squared Error:", baseline_rmse)

Mean Squared Error: 282.15217727272733
Root Mean Squared Error: 16.797386024995895
Baseline Mean Squared Error: 422.10514591942143
Baseline Root Mean Squared Error: 20.545197636416678


In [13]:
# Keep the pipeline tuned in this section. Position in `models` matters: the
# list index is used in the file name when the models are saved, and
# re-running this cell appends a second copy.
models.append(best_model)

### Predicting Receiving Yards

In [14]:
# Target for this section, split with the same test fraction and seed as before
y2 = df['Yds_P']
X_train, X_test, y2_train, y2_test = train_test_split(
    X,
    y2,
    random_state=42,
    test_size=0.2,
)

In [15]:
# Search space: forest size and maximum tree depth (None puts no cap on depth)
param_grid = dict(
    regressor__n_estimators=[50, 100, 150, 175, 200, 225, 250],
    regressor__max_depth=[10, 20, 30, None],
)

# 5-fold cross-validated search over the grid, scored by negative MSE
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y2_train)

# Winning configuration and the pipeline refit with it
best_params, best_model = grid_search.best_params_, grid_search.best_estimator_
print("Best Parameters:", best_params)

# Score the tuned pipeline on the held-out rows
y2_pred = best_model.predict(X_test)
mse = mean_squared_error(y2_test, y2_pred)
print("Mean Squared Error:", mse)

Best Parameters: {'regressor__max_depth': 20, 'regressor__n_estimators': 250}
Mean Squared Error: 46368.166474742255


In [16]:
# Test-set error of the tuned model, plus its square root
mse = mean_squared_error(y2_test, y2_pred)
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

# Reference point: a model that always predicts the training-set mean
dummy = DummyRegressor(strategy="mean").fit(X_train, y2_train)
dummy_pred = dummy.predict(X_test)
baseline_mse = mean_squared_error(y2_test, dummy_pred)
baseline_rmse = np.sqrt(baseline_mse)
print("Baseline Mean Squared Error:", baseline_mse)
print("Baseline Root Mean Squared Error:", baseline_rmse)

Mean Squared Error: 46368.166474742255
Root Mean Squared Error: 215.33268789188105
Baseline Mean Squared Error: 68701.0799651343
Baseline Root Mean Squared Error: 262.1089085955193


In [17]:
# Keep the pipeline tuned in this section. Position in `models` matters: the
# list index is used in the file name when the models are saved, and
# re-running this cell appends a second copy.
models.append(best_model)

### Predicting Receiving Touchdowns

In [18]:
# Target for this section, split with the same test fraction and seed as before
y3 = df['TD_P']
X_train, X_test, y3_train, y3_test = train_test_split(
    X,
    y3,
    random_state=42,
    test_size=0.2,
)

In [19]:
# Search space: forest size and maximum tree depth (None puts no cap on depth)
param_grid = dict(
    regressor__n_estimators=[50, 100, 150, 175, 200, 225, 250],
    regressor__max_depth=[10, 20, 30, 40, None],
)

# 5-fold cross-validated search over the grid, scored by negative MSE
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y3_train)

# Winning configuration and the pipeline refit with it
best_params, best_model = grid_search.best_params_, grid_search.best_estimator_
print("Best Parameters:", best_params)

# Score the tuned pipeline on the held-out rows
y3_pred = best_model.predict(X_test)
mse = mean_squared_error(y3_test, y3_pred)
print("Mean Squared Error:", mse)

Best Parameters: {'regressor__max_depth': None, 'regressor__n_estimators': 250}
Mean Squared Error: 5.284459090909092


In [20]:
# Test-set error of the tuned model, plus its square root
mse = mean_squared_error(y3_test, y3_pred)
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

# Reference point: a model that always predicts the training-set mean
dummy = DummyRegressor(strategy="mean").fit(X_train, y3_train)
dummy_pred = dummy.predict(X_test)
baseline_mse = mean_squared_error(y3_test, dummy_pred)
baseline_rmse = np.sqrt(baseline_mse)
print("Baseline Mean Squared Error:", baseline_mse)
print("Baseline Root Mean Squared Error:", baseline_rmse)

Mean Squared Error: 5.284459090909092
Root Mean Squared Error: 2.2987951389606454
Baseline Mean Squared Error: 6.7041015625
Baseline Root Mean Squared Error: 2.58922798580967


In [21]:
# Keep the pipeline tuned in this section. Position in `models` matters: the
# list index is used in the file name when the models are saved, and
# re-running this cell appends a second copy.
models.append(best_model)

In [22]:
# Persist each fitted pipeline to wrte_model_<i>.joblib, where <i> is its
# position in `models`. When the notebook is run top to bottom that order is
# receptions (0), receiving yards (1), receiving touchdowns (2).
for i, model in enumerate(models):
    joblib.dump(model, f'wrte_model_{i}.joblib')