In [6]:
import pandas as pd
import numpy as np

from google.colab import drive
drive.mount("/content/drive")

from pathlib import Path
DATA_PATH = Path("/content/drive/MyDrive/bbref_drafts/model_data/ncaa_to_nba_model_data.csv")  # change if needed

df = pd.read_csv(DATA_PATH)
print("Rows:", len(df))
print("Columns:", df.columns.tolist())
display(df.head())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Rows: 268
Columns: ['PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'TS%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'VORP_Pctl_InClass']


Unnamed: 0,PTS,TRB,AST,STL,BLK,TOV,TS%,TRB%,AST%,STL%,BLK%,TOV%,USG%,VORP_Pctl_InClass
0,19.1,4.9,7.5,2.0,0.6,4.6,0.562,6.6,34.8,2.9,1.5,21.4,25.7,0.965517
1,22.8,10.3,6.7,1.9,1.0,4.9,0.581,15.9,37.5,3.0,2.9,20.0,33.1,0.413793
2,18.1,12.3,1.5,1.3,3.0,3.6,0.621,16.5,7.9,1.8,7.9,20.0,21.8,0.793103
3,25.8,16.8,1.7,1.7,3.0,3.5,0.579,22.5,9.2,2.4,7.5,13.6,30.5,0.896552
4,15.8,11.1,3.1,0.9,4.2,2.8,0.535,15.6,14.4,1.4,11.0,15.7,22.1,0.448276


In [7]:
from sklearn.model_selection import train_test_split

TARGET = "VORP_Pctl_InClass"
FEATURES = ["PTS","TRB","AST","STL","BLK","TOV","TS%","TRB%","AST%","STL%","BLK%","TOV%","USG%"]

# Coerce to numeric
for c in FEATURES + [TARGET]:
    df[c] = pd.to_numeric(df[c], errors="coerce")

df = df.dropna(subset=FEATURES + [TARGET]).reset_index(drop=True)

X = df[FEATURES]
y = df[TARGET]

# Random split (since DraftYear isn't available here)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

print("Train:", len(X_train), "Test:", len(X_test))


Train: 214 Test: 54


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from scipy.stats import spearmanr

def eval_model(name, y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rho = spearmanr(y_true, y_pred).correlation
    print(f"{name} | MAE: {mae:.4f} | Spearman r: {rho:.4f}")
    return mae, rho

# Baseline: mean predictor
baseline_pred = np.full(len(y_test), y_train.mean())
eval_model("Baseline", y_test, baseline_pred)

# Ridge (scaled)
ridge = Pipeline([
    ("scaler", StandardScaler()),
    ("model", Ridge(alpha=1.0, random_state=42))
])
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
eval_model("Ridge", y_test, ridge_pred)

# Gradient boosting (no scaling needed)
gbr = HistGradientBoostingRegressor(
    learning_rate=0.05,
    max_depth=3,
    max_iter=500,
    random_state=42
)
gbr.fit(X_train, y_train)
gbr_pred = gbr.predict(X_test)
eval_model("HistGradientBoosting", y_test, gbr_pred)


  rho = spearmanr(y_true, y_pred).correlation


Baseline | MAE: 0.2305 | Spearman r: nan
Ridge | MAE: 0.2479 | Spearman r: 0.0683
HistGradientBoosting | MAE: 0.2675 | Spearman r: 0.0226


(0.2675151658714606, np.float64(0.02260468488647651))

In [9]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)

def cv_eval(model, X, y, name):
    maes, rhos = [], []
    for train_idx, test_idx in kf.split(X):
        X_tr, X_te = X.iloc[train_idx], X.iloc[test_idx]
        y_tr, y_te = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_tr, y_tr)
        pred = model.predict(X_te)

        maes.append(mean_absolute_error(y_te, pred))
        rhos.append(spearmanr(y_te, pred).correlation)

    print(f"{name} CV | MAE: {np.mean(maes):.4f} ± {np.std(maes):.4f} | Spearman r: {np.mean(rhos):.4f} ± {np.std(rhos):.4f}")

cv_eval(ridge, X, y, "Ridge")
cv_eval(gbr, X, y, "HistGradientBoosting")


Ridge CV | MAE: 0.2451 ± 0.0165 | Spearman r: 0.2154 ± 0.1001
HistGradientBoosting CV | MAE: 0.2599 ± 0.0056 | Spearman r: 0.1295 ± 0.1342


In [10]:
import pandas as pd
from sklearn.inspection import permutation_importance

# Ridge coefficients
ridge_coefs = pd.Series(
    ridge.named_steps["model"].coef_,
    index=FEATURES
).sort_values(key=lambda s: np.abs(s), ascending=False)

print("Ridge drivers (absolute magnitude):")
display(ridge_coefs)

# Permutation importance (on test set) for boosting
perm = permutation_importance(
    gbr, X_test, y_test,
    n_repeats=25, random_state=42,
    scoring="neg_mean_absolute_error"
)
perm_imp = pd.Series(perm.importances_mean, index=FEATURES).sort_values(ascending=False)

print("Permutation importance (GBR, test set):")
display(perm_imp)


Ridge drivers (absolute magnitude):


Unnamed: 0,0
TOV%,-0.169544
TOV,0.136597
PTS,-0.118859
TS%,0.095293
AST,0.048481
BLK,0.043946
STL,0.038414
TRB%,0.036843
AST%,0.014962
TRB,0.012628


Permutation importance (GBR, test set):


Unnamed: 0,0
AST%,0.058123
STL%,0.049214
TOV%,0.044076
PTS,0.03921
TS%,0.034823
TRB%,0.027941
TRB,0.027164
BLK,0.027012
TOV,0.023994
BLK%,0.022467


In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

final_ridge = Pipeline([
    ("scaler", StandardScaler()),
    ("model", Ridge(alpha=1.0))
])

final_ridge.fit(X, y)
print("Final Ridge trained on full dataset.")


Final Ridge trained on full dataset.


In [12]:
from google.colab import drive
import pandas as pd
import numpy as np

drive.mount("/content/drive")

PROSPECTS_PATH = "/content/drive/MyDrive/bbref_drafts/prospects_2026.csv"

df_prospects = pd.read_csv(PROSPECTS_PATH)

print("Prospects loaded:", len(df_prospects))
display(df_prospects.head())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Prospects loaded: 26


Unnamed: 0,Player,PTS,TRB,AST,STL,BLK,TOV,TS%,TRB%,AST%,STL%,BLK%,TOV%,USG%
0,Darryn Peterson,29.3,5.7,4.2,1.9,1.1,1.5,0.651,7.7,27.4,2.9,3.3,6.3,31.5
1,AJ Dybantsa,27.7,9.2,4.3,1.7,0.6,2.9,0.623,12.5,20.8,2.4,1.7,11.4,30.8
2,Cameron Boozer,30.8,13.5,4.9,2.2,1.2,2.3,0.653,17.9,25.4,3.1,3.4,8.8,31.2
3,Caleb Wilson,26.3,14.0,3.3,2.0,1.6,2.3,0.614,18.7,17.4,2.8,3.8,9.8,29.4
4,Mikel Brown Jr.,24.4,4.4,7.5,1.2,0.1,3.5,0.552,5.8,32.6,1.6,0.4,13.8,29.7


In [13]:
FEATURES = [
    "PTS","TRB","AST","STL","BLK","TOV",
    "TS%","TRB%","AST%","STL%","BLK%","TOV%","USG%"
]

# Ensure numeric
for c in FEATURES:
    df_prospects[c] = pd.to_numeric(df_prospects[c], errors="coerce")

# Drop rows with missing features
df_prospects = df_prospects.dropna(subset=FEATURES).copy()

print("Rows after cleaning:", len(df_prospects))


Rows after cleaning: 26


In [14]:
df_prospects["Predicted_Success"] = final_ridge.predict(df_prospects[FEATURES])

# Convert to 0–100 rating
df_prospects["Rating_0_100"] = (df_prospects["Predicted_Success"] * 100).round(1)

# Rank
df_prospects = df_prospects.sort_values("Rating_0_100", ascending=False).reset_index(drop=True)
df_prospects["Rank"] = np.arange(1, len(df_prospects) + 1)

display(df_prospects[["Rank","Player","Rating_0_100"] + FEATURES])


Unnamed: 0,Rank,Player,Rating_0_100,PTS,TRB,AST,STL,BLK,TOV,TS%,TRB%,AST%,STL%,BLK%,TOV%,USG%
0,1,Yaxel Lendeborg,89.1,23.4,10.3,5.3,2.3,2.0,1.6,0.746,13.5,20.5,3.1,5.2,9.1,20.0
1,2,Hannes Steinbach,72.6,22.6,15.0,3.4,0.9,1.3,2.2,0.693,20.7,17.4,1.3,3.4,11.7,21.9
2,3,Kingston Flemings,70.5,20.4,4.5,6.8,2.8,0.4,2.9,0.692,6.2,33.0,4.2,1.2,16.3,22.2
3,4,Cameron Boozer,69.8,30.8,13.5,4.9,2.2,1.2,2.3,0.653,17.9,25.4,3.1,3.4,8.8,31.2
4,5,Aday Mara,67.9,18.5,14.8,3.6,0.5,4.8,4.3,0.61,19.4,14.0,0.7,12.5,22.0,22.5
5,6,Henri Veesaar,67.1,21.9,11.8,2.2,0.7,1.8,2.1,0.7,15.7,11.5,1.0,4.3,11.9,19.5
6,7,Caleb Wilson,64.7,26.3,14.0,3.3,2.0,1.6,2.3,0.614,18.7,17.4,2.8,3.8,9.8,29.4
7,8,Labaron Philon Jr.,62.8,30.9,5.2,7.7,1.6,0.5,3.9,0.659,6.2,37.6,2.1,1.1,14.2,30.8
8,9,Darryn Peterson,58.7,29.3,5.7,4.2,1.9,1.1,1.5,0.651,7.7,27.4,2.9,3.3,6.3,31.5
9,10,Bennett Stirtz,58.1,19.0,2.9,5.3,1.8,0.2,1.8,0.637,4.8,25.4,2.8,0.6,10.7,22.8


In [16]:
OUT_PATH = "/content/drive/MyDrive/bbref_drafts/top30_big_board.csv"

df_prospects.to_csv(OUT_PATH, index=False)

print("Big board saved to:", OUT_PATH)



Big board saved to: /content/drive/MyDrive/bbref_drafts/top30_big_board.csv
