In [None]:
# Data libraries and path handling used by the cells below.
import numpy as np
import pandas as pd
from pathlib import Path
# Colab helper: mounts Google Drive at /content/drive, the root under which
# DRAFT_CSV_DIR is defined.
from google.colab import drive
drive.mount("/content/drive")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder holding the per-year draft exports (one NBA_<year>.csv file each).
DRAFT_CSV_DIR = Path("/content/drive/MyDrive/bbref_drafts")
# Draft classes to process: 2010 through 2021, both ends included.
YEARS = [*range(2010, 2022)]


In [None]:
def read_bbref_csv_robust(fp: Path) -> pd.DataFrame:
    """Read a draft CSV, repairing files whose rows collapsed into one column.

    The file is first read normally. If that yields a single column whose
    header itself contains commas (each line was stored as one quoted cell),
    the file is re-read without a header and every line is parsed again as a
    CSV record. Parsing uses the ``csv`` module rather than a plain
    ``split(",")`` so that quoted fields containing commas
    (e.g. ``"Smith, Jr."``) stay intact.

    Parameters
    ----------
    fp : Path
        Location of the CSV file.

    Returns
    -------
    pd.DataFrame
        The normally parsed frame, or, on the repair path, a frame of strings
        whose column names come from the first line (whitespace-stripped).
        Rows shorter than the widest row are padded with missing values.

    Raises
    ------
    Whatever ``pd.read_csv`` raises (missing file, empty file, parse errors).
    """
    import csv  # local import: only the repair path needs it

    # First attempt: normal read
    df = pd.read_csv(fp)

    collapsed = (
        df.shape[1] == 1
        and isinstance(df.columns[0], str)
        and "," in df.columns[0]
    )
    if not collapsed:
        return df

    # Re-read with no header so the real header line becomes row 0.
    raw = pd.read_csv(fp, header=None)

    # Blank out missing cells first: csv.reader only accepts strings.
    lines = raw[0].where(raw[0].notna(), "").astype(str)

    # Parse every line on its own so an unbalanced quote in one record can
    # never swallow the following records.
    parsed = [next(csv.reader([line]), []) for line in lines]
    split = pd.DataFrame(parsed)

    # First row = actual header
    split.columns = split.iloc[0].astype(str).str.strip()
    return split.iloc[1:].reset_index(drop=True)


In [None]:
def load_and_clean_draft_csv(year: int) -> pd.DataFrame:
    """Load one draft class and keep first-round picks that list a college.

    Reads ``NBA_<year>.csv`` from ``DRAFT_CSV_DIR`` via
    ``read_bbref_csv_robust``, renames the source columns to the notebook's
    standard names, converts the numeric columns, and filters the rows.

    Parameters
    ----------
    year : int
        Draft year; selects the file and is stored in the ``DraftYear`` column.

    Returns
    -------
    pd.DataFrame
        Columns ``DraftYear``, ``DraftPick``, ``PlayerName``, ``NCAA_Team``,
        ``NBA_Yrs_Career``, ``VORP_Career``; one row per pick numbered 1-30
        with a non-empty ``NCAA_Team``. If the file does not exist, a message
        is printed and an empty frame with no columns is returned.

    Raises
    ------
    ValueError
        If the file lacks any of the ``Pk``, ``Player`` or ``College`` columns
        (after renaming).
    """
    fp = DRAFT_CSV_DIR / f"NBA_{year}.csv"
    if not fp.exists():
        print(f"Missing file: NBA_{year}.csv")
        return pd.DataFrame()

    df = read_bbref_csv_robust(fp)

    # Rename columns to standard names
    df = df.rename(columns={
        "Pk": "DraftPick",
        "Player": "PlayerName",
        "College": "NCAA_Team",
        "Yrs": "NBA_Yrs_Career",
        "VORP": "VORP_Career"
    })

    required = {"DraftPick", "PlayerName", "NCAA_Team"}
    if not required.issubset(df.columns):
        raise ValueError(
            f"Unexpected columns in NBA_{year}.csv: {df.columns.tolist()}"
        )

    df["DraftYear"] = year

    # Type conversions; unparseable values become NaN. The two career columns
    # are optional: when absent they are created as all-NaN.
    df["DraftPick"] = pd.to_numeric(df["DraftPick"], errors="coerce")
    df["NBA_Yrs_Career"] = pd.to_numeric(
        df.get("NBA_Yrs_Career", np.nan), errors="coerce"
    )
    df["VORP_Career"] = pd.to_numeric(
        df.get("VORP_Career", np.nan), errors="coerce"
    )

    # First round only (NaN picks fail both comparisons and are dropped)
    df = df[(df["DraftPick"] >= 1) & (df["DraftPick"] <= 30)].copy()

    # NCAA only. Missing values are blanked explicitly through notna() instead
    # of relying on astype(str) producing the literal "nan": None would
    # stringify to "None", and pandas versions whose astype(str) keeps missing
    # values would leave NaN in place -- both would slip past the != "" filter.
    college = df["NCAA_Team"]
    df["NCAA_Team"] = (
        college
        .astype(str)
        .where(college.notna(), "")
        .replace({"nan": ""})  # literal "nan" text is still treated as empty
        .str.strip()
    )
    df = df[df["NCAA_Team"] != ""].copy()

    out_cols = [
        "DraftYear",
        "DraftPick",
        "PlayerName",
        "NCAA_Team",
        "NBA_Yrs_Career",
        "VORP_Career"
    ]

    return df[out_cols].reset_index(drop=True)


In [None]:
# Load every draft class, reporting how many rows each year contributes.
# draft_dfs keeps one entry per year in YEARS (empty frames included).
draft_dfs = []

for year in YEARS:
    df_year = load_and_clean_draft_csv(year)
    print(f"{year}: {len(df_year)} NCAA first-rounders")
    draft_dfs.append(df_year)

# Concatenate only the years that produced rows: empty frames add nothing
# and should not take part in the result's dtype resolution.
loaded_dfs = [df_year for df_year in draft_dfs if not df_year.empty]

# Stop before writing anything if no year yielded data; otherwise an empty
# CSV would silently replace a previously saved result.
if not loaded_dfs:
    raise ValueError(
        f"No draft rows loaded from {DRAFT_CSV_DIR} for years {YEARS}; nothing to save."
    )

df_historical_outcomes = pd.concat(loaded_dfs, ignore_index=True)

print("\nTOTAL ROWS:", len(df_historical_outcomes))
display(df_historical_outcomes.head(20))

# Name the output after the configured year range so it stays accurate when
# YEARS changes (for 2010-2021 this is nba_outcomes_2010_2021.csv).
output_path = DRAFT_CSV_DIR / f"nba_outcomes_{min(YEARS)}_{max(YEARS)}.csv"
df_historical_outcomes.to_csv(output_path, index=False)
print("Saved:", output_path)


2010: 29 NCAA first-rounders
2011: 24 NCAA first-rounders
2012: 29 NCAA first-rounders
2013: 23 NCAA first-rounders
2014: 24 NCAA first-rounders
2015: 26 NCAA first-rounders
2016: 22 NCAA first-rounders
2017: 27 NCAA first-rounders
2018: 27 NCAA first-rounders
2019: 26 NCAA first-rounders
2020: 26 NCAA first-rounders
2021: 25 NCAA first-rounders

TOTAL ROWS: 308


Unnamed: 0,DraftYear,DraftPick,PlayerName,NCAA_Team,NBA_Yrs_Career,VORP_Career
0,2010,1,John Wall,Kentucky,11.0,24.1
1,2010,2,Evan Turner,Ohio State,10.0,0.5
2,2010,3,Derrick Favors,Georgia Tech,12.0,14.0
3,2010,4,Wesley Johnson,Syracuse,9.0,1.2
4,2010,5,DeMarcus Cousins,Kentucky,11.0,21.6
5,2010,6,Ekpe Udoh,Baylor,7.0,0.8
6,2010,7,Greg Monroe,Georgetown,10.0,15.3
7,2010,8,Al-Farouq Aminu,Wake Forest,11.0,6.5
8,2010,9,Gordon Hayward,Butler,14.0,23.0
9,2010,10,Paul George,Fresno State,16.0,44.5


Saved: /content/drive/MyDrive/bbref_drafts/nba_outcomes_2010_2021.csv
