
# NBA Top-150: Stat Distributions & Summary

This notebook:
- Loads a per-game CSV (e.g., `nba_top_150_per_game_24_25.csv`)
- Detects key stat columns (PTS, REB, AST, STL, BLK, TOV, 3PM, FG%, FT%)
- Computes **mean, median, ±1 SD**, plus min/max and quartiles
- Saves a summary CSV
- Plots distributions (histograms) with mean and ±1 SD vertical lines

> Tip: Edit the **Configuration** cell below to set your CSV path and output folder.


In [None]:
!pip install caas_jupyter

In [1]:

import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Jupyter display helpers
from caas_jupyter_tools import display_dataframe_to_user

# matplotlib inline in classic Notebook; VS Code auto-handles this as well
%matplotlib inline


ModuleNotFoundError: No module named 'caas_jupyter_tools'

In [6]:

# === Configuration ===
CSV_PATH = "/home/builder/Desktop/NBAstats/nba_top180_per_game_2024-25.csv"   # <-- change to your file location
OUTDIR = Path("outputs")                                     # where to save summary & figures
BINS = 20                                                    # histogram bins
OUTDIR.mkdir(parents=True, exist_ok=True)


In [7]:

df = pd.read_csv(CSV_PATH)
print(f"Loaded {CSV_PATH!r} with shape {df.shape}")
df.head()


Loaded '/home/builder/Desktop/NBAstats/nba_top180_per_game_2024-25.csv' with shape (180, 15)


Unnamed: 0,SEASON,PLAYER_ID,PLAYER_NAME,TEAM_ABBREVIATION,GP,MIN,PTS,REB,AST,STL,BLK,TOV,3PM,FG%,FT%
0,2024-25,1628969,Mikal Bridges,NYK,82,37.0,17.6,3.2,3.7,0.9,0.5,1.6,2.0,0.5,0.814
1,2024-25,1628404,Josh Hart,NYK,77,37.6,13.6,9.6,5.9,1.5,0.4,2.1,1.1,0.525,0.776
2,2024-25,1630162,Anthony Edwards,MIN,79,36.3,27.6,5.7,4.5,1.2,0.6,3.2,4.1,0.447,0.837
3,2024-25,1626164,Devin Booker,PHX,75,37.3,25.6,4.1,7.1,0.9,0.2,2.9,2.4,0.461,0.894
4,2024-25,201935,James Harden,LAC,79,35.3,22.8,5.8,8.7,1.5,0.7,4.3,3.0,0.41,0.874


In [9]:

# Candidate column names we’ll accept for each logical stat
STAT_NAME_ALIASES = {
    "PTS": ["PTS"],
    "REB": ["REB", "TRB"],
    "AST": ["AST"],
    "STL": ["STL"],
    "BLK": ["BLK"],
    "TOV": ["TOV", "TO"],
    "3PM": ["3PM", "FG3M", "3P"],
    "FG%": ["FG%", "FG_PCT"],
    "FT%": ["FT%", "FT_PCT"],
    # Uncomment to include GP/MIN distributions:
    # "GP": ["GP"],
    # "MIN": ["MIN"],
}

def pick_present_columns(df: pd.DataFrame) -> dict:
    """Return mapping {logical_name: actual_col_in_df} for the stats present."""
    actual = {}
    lower_cols = {c.lower(): c for c in df.columns}
    for logical, candidates in STAT_NAME_ALIASES.items():
        for cand in candidates:
            if cand.lower() in lower_cols:
                actual[logical] = lower_cols[cand.lower()]
                break
    return actual

def summarize_stats(df: pd.DataFrame, col_map: dict) -> pd.DataFrame:
    """Compute mean, median, std (1 SD) for each stat, return as tidy DataFrame."""
    rows = []
    for logical, col in col_map.items():
        series = pd.to_numeric(df[col], errors="coerce").dropna()
        if series.empty:
            continue
        rows.append({
            "stat": logical,
            "column": col,
            "count": int(series.count()),
            "mean": float(series.mean()),
            "median": float(series.median()),
            "std": float(series.std(ddof=1)),
            "min": float(series.min()),
            "max": float(series.max()),
            "p25": float(series.quantile(0.25)),
            "p75": float(series.quantile(0.75)),
        })
    out = pd.DataFrame(rows).sort_values("stat").reset_index(drop=True)
    return out


In [10]:

col_map = pick_present_columns(df)
if not col_map:
    raise RuntimeError("No expected stat columns found. Check your CSV headers or STAT_NAME_ALIASES.")

summary = summarize_stats(df, col_map)

print("\n=== Summary (mean, median, ±1 SD) ===\n")
for _, row in summary.iterrows():
    print(f"{row['stat']:>4}  mean={row['mean']:.3f}  median={row['median']:.3f}  std={row['std']:.3f}  (n={row['count']})")
print()

# Save and display summary
summary_path = OUTDIR / "stat_distributions_summary.csv"
summary.to_csv(summary_path, index=False)
print(f"Saved summary → {summary_path}")

display_dataframe_to_user("Stat Distributions Summary", summary)
summary



=== Summary (mean, median, ±1 SD) ===

 3PM  mean=1.827  median=1.750  std=0.888  (n=180)
 AST  mean=3.551  median=3.000  std=2.071  (n=180)
 BLK  mean=0.547  median=0.400  std=0.482  (n=180)
 FG%  mean=0.468  median=0.458  std=0.057  (n=180)
 FT%  mean=0.791  median=0.808  std=0.081  (n=180)
 PTS  mean=15.286  median=13.850  std=5.974  (n=180)
 REB  mean=5.149  median=4.300  std=2.535  (n=180)
 STL  mean=0.982  median=0.900  std=0.349  (n=180)
 TOV  mean=1.746  median=1.600  std=0.856  (n=180)

Saved summary → outputs/stat_distributions_summary.csv


NameError: name 'display_dataframe_to_user' is not defined

In [None]:

figs_dir = OUTDIR / "figures"
figs_dir.mkdir(parents=True, exist_ok=True)

for logical, col in col_map.items():
    series = pd.to_numeric(df[col], errors="coerce").dropna()
    if series.empty:
        continue

    mean = series.mean()
    std = series.std(ddof=1)

    plt.figure(figsize=(7, 5))
    plt.hist(series, bins=BINS, edgecolor="black")
    plt.axvline(mean, linestyle="--", linewidth=1.5, label=f"Mean = {mean:.3f}")
    if np.isfinite(std) and std > 0:
        plt.axvline(mean - std, linestyle=":", linewidth=1.2, label=f"Mean - 1 SD = {mean - std:.3f}")
        plt.axvline(mean + std, linestyle=":", linewidth=1.2, label=f"Mean + 1 SD = {mean + std:.3f}")

    plt.title(f"Distribution of {logical} ({col})")
    plt.xlabel(logical)
    plt.ylabel("Count")
    plt.legend()
    plt.tight_layout()

    outpath = figs_dir / f"hist_{logical.replace('%','pct')}.png"
    plt.savefig(outpath, dpi=150)
    plt.show()

print(f"Saved histograms → {figs_dir}")



## Notes & Tweaks
- To include **GP** and **MIN** in the plots and summary, add them to `STAT_NAME_ALIASES` and re-run.
- If your CSV uses different headers, add them as aliases in `STAT_NAME_ALIASES`.
- Increase `BINS` for smoother histograms on larger samples.
- The summary CSV is saved in your `OUTDIR` for quick export.
