
# NBA Top-150: Stat Distributions & Summary (Plain Notebook)

This notebook:
- Loads a per-game CSV (e.g., `nba_top_150_per_game_24_25.csv`)
- Detects key stat columns (PTS, REB, AST, STL, BLK, TOV, 3PM, FG%, FT%)
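- Computes the **mean, median, and sample standard deviation (SD)**, plus min/max and the 25th/75th percentiles
- Saves a summary CSV (`stat_distributions_summary.csv`) in the output folder
- Plots a histogram per stat with vertical lines at the mean and at mean ± 1 SD, and saves each figure as a PNG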

> Edit the **Configuration** cell below to set your CSV path and output folder.


In [None]:

import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline


In [None]:

# === Configuration ===
# Input: per-game CSV to analyse.  <-- change to your file location
CSV_PATH = "data/processed/nba_top_150_per_game_24_25.csv"

# Number of bins used for every histogram.
BINS = 20

# Output folder for the summary CSV and the figures. It is created up front
# (missing parents included) so later cells can write into it directly.
OUTDIR = Path("outputs")
OUTDIR.mkdir(parents=True, exist_ok=True)


In [None]:

# Read the per-game CSV configured in CSV_PATH into a DataFrame.
df = pd.read_csv(CSV_PATH)
# Echo the path and the DataFrame's shape as a quick sanity check.
print(f"Loaded {CSV_PATH!r} with shape {df.shape}")
# Left as the cell's final expression so the notebook shows a preview of the
# first rows.
df.head()


In [None]:

# Maps each logical stat name (the label used in the summary table and plot
# titles) to the CSV header spellings accepted for it. pick_present_columns
# tries the candidates in list order, ignoring case, and keeps the first one
# that exists in the loaded data.
STAT_NAME_ALIASES = {
    "PTS": ["PTS"],
    "REB": ["REB", "TRB"],
    "AST": ["AST"],
    "STL": ["STL"],
    "BLK": ["BLK"],
    "TOV": ["TOV", "TO"],
    "3PM": ["3PM", "FG3M", "3P"],
    "FG%": ["FG%", "FG_PCT"],
    "FT%": ["FT%", "FT_PCT"],
    # Optional stats: uncomment to include them in the summary and plots.
    # "GP": ["GP"],
    # "MIN": ["MIN"],
}

def pick_present_columns(df: pd.DataFrame, aliases: "dict | None" = None) -> dict:
    """Map each logical stat name to the matching column present in ``df``.

    Header matching ignores case and surrounding whitespace. Column labels
    that are not strings (for example integer labels) are compared through
    their string form instead of raising ``AttributeError``.

    Args:
        df: Table whose column labels are searched.
        aliases: Mapping of logical stat name -> candidate header spellings,
            tried in list order. Defaults to the module-level
            ``STAT_NAME_ALIASES``.

    Returns:
        Dict of logical stat name -> the original column label from ``df``
        for the first candidate that is present. Stats without any matching
        column are omitted, so the result may be empty.
    """
    if aliases is None:
        aliases = STAT_NAME_ALIASES

    def _normalise(label) -> str:
        # str() keeps non-string labels from breaking the lookup.
        return str(label).strip().lower()

    # Normalised header -> original label. If two columns normalise to the
    # same key, the later column wins (same as a plain dict build).
    by_normalised = {_normalise(label): label for label in df.columns}

    actual = {}
    for logical, candidates in aliases.items():
        for cand in candidates:
            key = _normalise(cand)
            if key in by_normalised:
                # Return the label exactly as it appears in df so df[label] works.
                actual[logical] = by_normalised[key]
                break
    return actual

def summarize_stats(df: pd.DataFrame, col_map: dict) -> pd.DataFrame:
    """Build a per-stat table of descriptive statistics.

    Each mapped column is coerced to numeric; values that cannot be parsed
    become NaN and are dropped before any statistic is computed. Columns with
    no numeric values left are skipped.

    Args:
        df: Source table.
        col_map: Mapping of logical stat name -> column label in ``df``.

    Returns:
        DataFrame sorted by ``stat`` with one row per usable stat and the
        columns ``stat, column, count, mean, median, std, min, max, p25,
        p75``. ``std`` is the sample standard deviation (``ddof=1``), so it
        is NaN when only one value is available. If no column yields numeric
        data the result is an empty frame that still carries these columns.
    """
    columns = [
        "stat", "column", "count", "mean", "median",
        "std", "min", "max", "p25", "p75",
    ]
    rows = []
    for logical, col in col_map.items():
        series = pd.to_numeric(df[col], errors="coerce").dropna()
        if series.empty:
            continue
        rows.append({
            "stat": logical,
            "column": col,
            "count": int(series.count()),
            "mean": float(series.mean()),
            "median": float(series.median()),
            "std": float(series.std(ddof=1)),
            "min": float(series.min()),
            "max": float(series.max()),
            "p25": float(series.quantile(0.25)),
            "p75": float(series.quantile(0.75)),
        })
    # Passing the column list keeps the schema stable even when ``rows`` is
    # empty; without it, sorting by "stat" raises KeyError on an empty frame.
    out = pd.DataFrame(rows, columns=columns)
    return out.sort_values("stat").reset_index(drop=True)


In [None]:

# Work out which of the expected stats this CSV actually provides, and stop
# early with a hint if none of the known headers are present.
col_map = pick_present_columns(df)
if not col_map:
    raise RuntimeError("No expected stat columns found. Check your CSV headers or STAT_NAME_ALIASES.")

summary = summarize_stats(df, col_map)
summary_path = OUTDIR / "stat_distributions_summary.csv"

# Console report: one line per stat, framed by blank lines.
print("\n=== Summary (mean, median, ±1 SD) ===\n")
for row in summary.to_dict("records"):
    print(f"{row['stat']:>4}  mean={row['mean']:.3f}  median={row['median']:.3f}  std={row['std']:.3f}  (n={row['count']})")
print()

# Persist the table without the positional index column.
summary.to_csv(summary_path, index=False)
print(f"Saved summary → {summary_path}")

# Left as the cell's final expression so the notebook displays the table.
summary


In [None]:

# Draw one histogram per detected stat, save it as a PNG under
# OUTDIR/figures, and display it.
figs_dir = OUTDIR / "figures"
figs_dir.mkdir(parents=True, exist_ok=True)

for logical, col in col_map.items():
    # Same cleaning as summarize_stats: unparsable entries become NaN and
    # are dropped; a column with nothing numeric left gets no figure.
    series = pd.to_numeric(df[col], errors="coerce").dropna()
    if series.empty:
        continue

    mean = series.mean()
    std = series.std(ddof=1)

    # Explicit figure/axes objects so the figure can be closed afterwards.
    fig, ax = plt.subplots(figsize=(7, 5))
    ax.hist(series, bins=BINS, edgecolor="black")
    ax.axvline(mean, linestyle="--", linewidth=1.5, label=f"Mean = {mean:.3f}")
    # std is NaN for a single observation and 0 for a constant column; the
    # ±1 SD markers would be missing or would overlap the mean line, so skip.
    if np.isfinite(std) and std > 0:
        ax.axvline(mean - std, linestyle=":", linewidth=1.2, label=f"Mean - 1 SD = {mean - std:.3f}")
        ax.axvline(mean + std, linestyle=":", linewidth=1.2, label=f"Mean + 1 SD = {mean + std:.3f}")

    ax.set_title(f"Distribution of {logical} ({col})")
    ax.set_xlabel(logical)
    ax.set_ylabel("Count")
    ax.legend()
    fig.tight_layout()

    # '%' is replaced so names such as "FG%" give a plain file name.
    outpath = figs_dir / f"hist_{logical.replace('%','pct')}.png"
    fig.savefig(outpath, dpi=150)
    plt.show()
    # Release the figure explicitly: on backends where show() does not close
    # it, figures would otherwise pile up in memory across iterations.
    plt.close(fig)

print(f"Saved histograms → {figs_dir}")



## Notes
- Include GP/MIN by uncommenting them in `STAT_NAME_ALIASES`.
- If your CSV headers differ, add aliases.
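- Adjust `BINS` to change histogram granularity: more bins show finer detail but look noisier with only ~150 players, while fewer bins give a smoother shape.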
