In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

# Colab-only step: attach Google Drive so the CSV below is reachable under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

# Location of the outcomes CSV on the mounted drive.
OUTCOMES_PATH = Path("/content/drive/MyDrive/bbref_drafts/nba_outcomes_2010_2021.csv")

# Read the table, report how many rows came in, and preview the first few.
df = pd.read_csv(OUTCOMES_PATH)
print("Rows loaded:", df.shape[0])
display(df.head())


Mounted at /content/drive
Rows loaded: 308


Unnamed: 0,DraftYear,DraftPick,PlayerName,NCAA_Team,NBA_Yrs_Career,VORP_Career
0,2010,1,John Wall,Kentucky,11.0,24.1
1,2010,2,Evan Turner,Ohio State,10.0,0.5
2,2010,3,Derrick Favors,Georgia Tech,12.0,14.0
3,2010,4,Wesley Johnson,Syracuse,9.0,1.2
4,2010,5,DeMarcus Cousins,Kentucky,11.0,21.6


In [None]:
# Standardize / coerce types: anything that cannot be parsed as a number becomes NaN.
for numeric_col in ("DraftYear", "DraftPick", "VORP_Career"):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

# Normalize the school name to a stripped string, with "" meaning "no school".
# Missing values are filled with "" *before* the string cast so the filter below
# does not depend on NaN being stringified as "nan": with the pandas string dtype,
# astype(str) keeps missing values missing, and NaN != "" evaluates to True, which
# would let rows without a school slip through the filter.
df["NCAA_Team"] = (
    df["NCAA_Team"]
    .fillna("")
    .astype(str)
    .replace({"nan": ""})  # kept so a literal "nan" string still counts as no school
    .str.strip()
)

# Re-apply filters (safe even if already filtered): picks 1-30, a non-empty
# NCAA team, and draft years 2010-2021. NaN picks/years fail `between` and drop out.
keep_row = (
    df["DraftPick"].between(1, 30)
    & (df["NCAA_Team"] != "")
    & df["DraftYear"].between(2010, 2021)
)
df = df[keep_row].copy()

print("Rows after filters:", len(df))
print("Missing VORP_Career:", df["VORP_Career"].isna().sum())
display(df.head())


Rows after filters: 308
Missing VORP_Career: 0


Unnamed: 0,DraftYear,DraftPick,PlayerName,NCAA_Team,NBA_Yrs_Career,VORP_Career
0,2010,1,John Wall,Kentucky,11.0,24.1
1,2010,2,Evan Turner,Ohio State,10.0,0.5
2,2010,3,Derrick Favors,Georgia Tech,12.0,14.0
3,2010,4,Wesley Johnson,Syracuse,9.0,1.2
4,2010,5,DeMarcus Cousins,Kentucky,11.0,21.6


In [None]:
# Percentile of each player's career VORP among players from the same draft year.
# Values lie between 0 and 1; players with tied VORP share the average of their ranks.
vorp_by_class = df.groupby("DraftYear")["VORP_Career"]
df["VORP_Pctl_InClass"] = vorp_by_class.rank(method="average", ascending=True, pct=True)

# The ranking is ascending, so a larger VORP already yields a larger percentile
# ("higher = better"); the value is therefore used as-is, without inversion.

# Z-score within each draft year (mean 0, std 1)
def zscore(s: pd.Series) -> pd.Series:
    """Standardize ``s`` to zero mean and unit sample standard deviation.

    Args:
        s: Numeric values of one group. NaN entries are skipped when computing
            the mean and standard deviation and remain NaN in the result.

    Returns:
        A Series on ``s.index`` holding ``(s - mean) / std``. The result is all
        NaN (float dtype) when the spread is undefined or effectively zero,
        i.e. fewer than two non-missing values or a constant group, because a
        z-score carries no information there.
    """
    mu = s.mean(skipna=True)
    sd = s.std(skipna=True)  # pandas default ddof=1, i.e. the sample standard deviation
    # Floating-point rounding in the mean can leave a constant group with a tiny
    # non-zero std instead of exactly 0, so compare against a tolerance scaled to
    # the data rather than testing `sd == 0`. The NaN check comes first, so `mu`
    # is only used once at least two valid values are known to exist.
    if np.isnan(sd) or sd <= 1e-12 * max(1.0, abs(mu)):
        return pd.Series(np.nan, index=s.index, dtype=float)
    return (s - mu) / sd

# Standardize career VORP separately for every draft year with the zscore helper.
df["VORP_Z_InClass"] = df["VORP_Career"].groupby(df["DraftYear"]).transform(zscore)

# Optional: 0–100 rating taken straight from the in-class percentile, one decimal place.
pctl_as_percent = df["VORP_Pctl_InClass"] * 100
df["VORP_Rating_0_100"] = pctl_as_percent.round(1)

# Preview the first 20 rows of the identifying columns next to the derived ones.
preview_cols = [
    "DraftYear",
    "DraftPick",
    "PlayerName",
    "VORP_Career",
    "VORP_Pctl_InClass",
    "VORP_Rating_0_100",
]
display(df[preview_cols].head(20))


Unnamed: 0,DraftYear,DraftPick,PlayerName,VORP_Career,VORP_Pctl_InClass,VORP_Rating_0_100
0,2010,1,John Wall,24.1,0.965517,96.6
1,2010,2,Evan Turner,0.5,0.413793,41.4
2,2010,3,Derrick Favors,14.0,0.793103,79.3
3,2010,4,Wesley Johnson,1.2,0.517241,51.7
4,2010,5,DeMarcus Cousins,21.6,0.896552,89.7
5,2010,6,Ekpe Udoh,0.8,0.448276,44.8
6,2010,7,Greg Monroe,15.3,0.827586,82.8
7,2010,8,Al-Farouq Aminu,6.5,0.724138,72.4
8,2010,9,Gordon Hayward,23.0,0.931034,93.1
9,2010,10,Paul George,44.5,1.0,100.0


In [None]:
# Write the table with its derived columns next to the input CSV, leaving out
# the DataFrame index, then report where it was saved.
out_path = OUTCOMES_PATH.with_name("nba_outcomes_2010_2021_normalized.csv")
df.to_csv(out_path, index=False)
print("Saved:", out_path)


Saved: /content/drive/MyDrive/bbref_drafts/nba_outcomes_2010_2021_normalized.csv
