In [1]:
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder

In [11]:
# Read the player-season table; the path is relative to the notebook's
# working directory.
csv_path = 'data.csv'
df = pd.read_csv(csv_path)

# Preview the first rows.  The preview shows an 'Unnamed: 0' column alongside
# the player, season, stat and VBD columns.
df.head()

Unnamed: 0,Player,Season,Passing Yds,Passing Tds,Rushing Yds,Rushing Tds,Receiving Yds,Receiving Tds,VBD
0,Ron Johnson,1970,0,0,1027,8,48,5,135
1,Gene Washington,1970,0,0,0,0,44,1,33
2,MacArthur Lane,1970,0,0,977,11,32,2,126
3,Warren Wells,1970,0,0,34,0,43,0,112
4,John Brodie,1970,2941,24,29,2,0,1,105


In [12]:
# Give every distinct player name an integer id so the name can be carried as a
# numeric column.
# NOTE(review): the ids are arbitrary codes, not a meaningful ordering, and the
# same player can land in both the train and the test split -- confirm that
# `Player_no` is really wanted as a feature.
labelencoder = LabelEncoder()
df['Player_no'] = labelencoder.fit_transform(df['Player'])

# Features are every column except the raw name and the target ...
X = df.drop(columns=['Player', 'VBD'])
# ... and except 'Unnamed: 0', which is the row index that was saved into the
# CSV, not a player statistic.  Leaving it in lets the model key on row
# position (which follows the file's ordering) instead of on the stats.
# errors='ignore' keeps this cell working if the CSV is loaded or re-exported
# without that column.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')
y = df['VBD']

# Hold out 20% of the rows for the final test-set evaluation; the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Base estimator whose hyperparameters are tuned below; seeded for repeatability.
rf = RandomForestRegressor(random_state=42)

# Sampling distributions for the randomized search.
param_dist = dict(
    n_estimators=randint(100, 500),
    max_depth=randint(3, 20),
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 10),
    # uniform(loc, scale) spans [loc, loc + scale], i.e. 0.5 to 1.0 here
    max_features=uniform(0.5, 0.5),
)

# 50 sampled candidates, each scored by 5-fold CV on negative MSE, using all
# available cores.
search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

# Only the training split is used for tuning.
search.fit(X_train, y_train)

print('Best hyperparameters:', search.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best hyperparameters: {'max_depth': 15, 'max_features': 0.5704621124873813, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 426}


In [14]:
# Take the estimator selected by the search and predict the held-out test split.
best_rf = search.best_estimator_
y_pred = best_rf.predict(X_test)

# Error metrics on the test split; RMSE is derived from MSE.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report each metric to two decimals under a common header; the labels are
# padded so the colons line up.
print('\nTest set performance:')
for label, value in (('MSE ', mse), ('RMSE', rmse), ('MAE ', mae)):
    print(f'  {label}: {value:.2f}')


Test set performance:
  MSE : 69.79
  RMSE: 8.35
  MAE : 2.70


In [15]:
# 5-fold cross-validated RMSE of the selected model, computed on the training
# split only (the test split is not touched here).
cv_scores = cross_val_score(
    best_rf,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# The scorer returns negated MSE per fold; flip the sign before the square root.
cv_rmse = np.sqrt(np.negative(cv_scores))

# Summarise the per-fold RMSE as mean and standard deviation.
fold_mean = cv_rmse.mean()
fold_std = cv_rmse.std()
print(f'5‑fold CV RMSE: {fold_mean:.2f} ± {fold_std:.2f}')

5‑fold CV RMSE: 8.35 ± 0.08


In [16]:
# Refit on entire dataset and assign scores.
#
# `best_rf` is the very object held by `search.best_estimator_`, so calling
# `best_rf.fit(X, y)` would overwrite the train-only model in place; re-running
# the evaluation cells afterwards would then score a model that has already
# seen the test rows.  Fit an unfitted copy with the same hyperparameters
# (including random_state) instead and leave `best_rf` untouched.
final_rf = clone(best_rf).fit(X, y)

# NOTE: these are in-sample predictions -- every row scored here was also used
# to fit `final_rf` -- so they are not an estimate of out-of-sample accuracy.
df['Predicted_Score'] = final_rf.predict(X)

In [18]:
# Columns shown in the ranking: the player, the model's score, and the actual VBD.
ranking_cols = ['Player', 'Predicted_Score', 'VBD']

# Order all rows by predicted score, highest first, and keep the first ten.
top10 = (
    df.loc[:, ranking_cols]
    .sort_values(by='Predicted_Score', ascending=False)
    .iloc[:10]
)

print('\nTop 10 Players by Predicted Score:')
print(top10)


Top 10 Players by Predicted Score:
                    Player  Predicted_Score  VBD
17834  LaDainian Tomlinson       242.352113  266
1922          O.J. Simpson       238.423206  278
17256      Shaun Alexander       216.004695  226
13362        Terrell Davis       214.786385  233
11778         Emmitt Smith       214.286385  221
15531        Priest Holmes       210.401408  220
14447       Marshall Faulk       209.166667  228
16092        Priest Holmes       208.260563  229
2745         Walter Payton       202.284038  209
25437  Christian McCaffrey       196.528951  215
