In [None]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup

# ── Configuration ───────────────────────────────────────────────
# Site root; every request path built below is appended to it.
base_url = "https://stats.ncaa.org"

# Academic years to scrape: 2019 through 2023 (the range end is exclusive).
years = range(2019, 2024)

# Individual statistic name -> value sent as `stat_seq` in the rankings URL.
# The names also become the prefix of the CSV file names.
stat_sequences = {
    "Aces Per Set": 532,
    "Assists Per Set": 522,
    "Attacks Per Set": 1124,
    "Blocks Per Set": 523,
    "Digs Per Set": 524,
    "Hitting Percentage": 520,
    "Kills Per Set": 521,
    "Points Per Set": 686,
    "Triple Doubles": 929,
}

# Destination folder for the CSV files; created up front so later writes
# cannot fail on a missing directory.
output_dir = "statCSVs_2019_2023"
os.makedirs(output_dir, exist_ok=True)

# ── Helper: fetch & parse a table given a full href ──────────────
def get_table(href):
    """Download a rankings page and return its rankings table as a DataFrame.

    Args:
        href: Path plus query string of the page; it is appended to the
            module-level ``base_url``.

    Returns:
        A ``pandas.DataFrame`` whose column names are the stripped texts of
        the ``<th>`` cells in the table's ``<thead>`` and whose rows hold the
        stripped texts of the ``<td>`` cells of each ``<tbody>`` row (every
        value is a string). The frame may be empty. ``None`` is returned when
        the page has no ``<table id="rankings_table">`` or that table lacks a
        ``<thead>`` or ``<tbody>``.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` (for
            example when the 10-second timeout expires).
    """
    url = base_url + href
    # Browser-like User-Agent; the page's own URL is sent as the Referer.
    request_headers = {
        "User-Agent": "Mozilla/5.0",
        "Referer": url,
    }
    r = requests.get(url, headers=request_headers, timeout=10)
    soup = BeautifulSoup(r.text, "html.parser")

    tbl = soup.find("table", id="rankings_table")
    if tbl is None:
        return None

    # find() yields None for a missing section; report "no table" instead of
    # failing with AttributeError on the chained find_all() call.
    thead = tbl.find("thead")
    tbody = tbl.find("tbody")
    if thead is None or tbody is None:
        return None

    cols = [th.get_text(strip=True) for th in thead.find_all("th")]
    rows = []
    for tr in tbody.find_all("tr"):
        cells = [td.get_text(strip=True) for td in tr.find_all("td")]
        # Rows without any <td> cell carry no data values; skip them so they
        # do not end up as blank records.
        if cells:
            rows.append(cells)
    return pd.DataFrame(rows, columns=cols)

# ── Step 1: Discover ranking_period for each year ────────────────
# year -> first ranking_period ID in 1..100 whose page yields a non-empty
# table. Years for which no candidate works are left out of the mapping.
period_by_year = {}
for year in years:
    print(f"Finding ranking_period for {year}…", end="")
    for candidate in range(1, 101):
        # Probe with stat_seq 532 ("Aces Per Set" in stat_sequences).
        probe_href = (
            "/rankings/national_ranking"
            f"?academic_year={year}.0"
            "&division=1.0"
            f"&ranking_period={candidate}.0"
            "&sport_code=MVB"
            "&stat_seq=532.0"
        )
        probe = get_table(probe_href)
        if probe is None or probe.empty:
            continue
        period_by_year[year] = candidate
        print(f"  ✔️  period = {candidate}")
        break
    else:
        # The loop finished without a break: no candidate returned data.
        print("  ⚠️  not found")

# ── Step 2: Scrape each stat for each year ────────────────────────
# For every year with a discovered ranking_period, download each statistic in
# stat_sequences and write it to "<Stat_Name>_Individual_<year>.csv" inside
# output_dir.
for year, period in period_by_year.items():
    print(f"\n--- Scraping stats for {year} (period={period}) ---")
    for stat_name, seq in stat_sequences.items():
        href = (f"/rankings/national_ranking?"
                f"academic_year={year}.0&division=1.0&ranking_period={period}.0"
                f"&sport_code=MVB&stat_seq={seq}.0")
        print(f" • {stat_name}", end="")
        # A failed request or an unparsable page is reported and skipped so a
        # single bad stat (e.g. one timeout) does not abort the remaining
        # downloads. Only the fetch is guarded; a failure to write the CSV
        # still surfaces.
        try:
            df = get_table(href)
        except Exception as exc:
            print(f" — error: {exc}")
            continue
        if df is None or df.empty:
            print(" — no data")
            continue
        fname = f"{stat_name.replace(' ', '_')}_Individual_{year}.csv"
        df.to_csv(os.path.join(output_dir, fname), index=False)
        print(" — saved")

print("\nAll done! Check the folder:", output_dir)

In [None]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup

# ── Configuration ───────────────────────────────────────────────
# Site root; every request path built below is appended to it.
base_url = "https://stats.ncaa.org"

# Academic years to scrape: 2019 through 2023 (the range end is exclusive).
years = range(2019, 2024)

# Team statistic name -> value sent as `stat_seq` in the rankings URL.
# The names also become the prefix of the CSV file names.
team_stats = {
    "Aces Per Set": 528,
    "Assists Per Set": 527,
    "Blocks Per Set": 529,
    "Digs Per Set": 533,
    "Hitting Percentage": 525,
    "Kills Per Set": 526,
    "Match W-L Pctg.": 530,
    "Opponent Hitting Percentage": 1140,
    "Team Attacks Per Set": 1125,
}

# Destination folder for the team CSV files; created up front so later writes
# cannot fail on a missing directory.
output_dir = "teamCSVs_2019_2023"
os.makedirs(output_dir, exist_ok=True)

# ── Helper: fetch & parse a table given an href ────────────────
def get_table(href):
    """Fetch the page at ``base_url + href`` and parse its rankings table.

    Args:
        href: Path plus query string of the page, relative to the
            module-level ``base_url``.

    Returns:
        A ``pandas.DataFrame`` built from ``<table id="rankings_table">``:
        column names come from the ``<th>`` cells of its ``<thead>``, and
        each row holds the stripped text of the ``<td>`` cells of one
        ``<tbody>`` row (all values are strings). The frame may be empty.
        ``None`` is returned when the table is absent or has no ``<thead>``
        or ``<tbody>``.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` (for
            example when the 10-second timeout expires).
    """
    url = base_url + href
    # Browser-like User-Agent; the page's own URL is sent as the Referer.
    r = requests.get(url, headers={
        "User-Agent": "Mozilla/5.0",
        "Referer": url,
    }, timeout=10)
    soup = BeautifulSoup(r.text, "html.parser")

    tbl = soup.find("table", id="rankings_table")
    if tbl is None:
        return None

    # A table without <thead> or <tbody> cannot be turned into a frame;
    # find() returns None for the missing part, so check before chaining.
    header_section = tbl.find("thead")
    body_section = tbl.find("tbody")
    if header_section is None or body_section is None:
        return None

    cols = [th.get_text(strip=True) for th in header_section.find_all("th")]
    # Keep only rows that actually contain <td> cells; rows without any would
    # otherwise become blank records.
    parsed_rows = (
        [td.get_text(strip=True) for td in tr.find_all("td")]
        for tr in body_section.find_all("tr")
    )
    rows = [cells for cells in parsed_rows if cells]
    return pd.DataFrame(rows, columns=cols)

# ── Step 1: discover ranking_period for each year ───────────────
# year -> first ranking_period ID in 1..100 whose page yields a non-empty
# table. Years for which no candidate works are left out of the mapping.
period_by_year = {}
for year in years:
    for candidate in range(1, 101):
        # Probe with stat_seq 528 ("Aces Per Set" in team_stats).
        probe_href = (
            "/rankings/national_ranking"
            f"?academic_year={year}.0"
            "&division=1.0"
            f"&ranking_period={candidate}.0"
            "&sport_code=MVB"
            "&stat_seq=528.0"
        )
        probe = get_table(probe_href)
        if probe is None or probe.empty:
            continue
        # First candidate that returns data wins; stop probing this year.
        period_by_year[year] = candidate
        break

# ── Step 2: scrape team stats only ──────────────────────────────
# For every year with a discovered ranking_period, download each statistic in
# team_stats and write it to "<Stat_Name>_Team_<year>.csv" inside output_dir.
for year, period in period_by_year.items():
    print(f"Scraping team stats for {year} (period={period}):")
    for name, seq in team_stats.items():
        stat_href = (
            "/rankings/national_ranking"
            f"?academic_year={year}.0"
            "&division=1.0"
            f"&ranking_period={period}.0"
            "&sport_code=MVB"
            f"&stat_seq={seq}.0"
        )
        # Any failure for one statistic is reported and the loop moves on to
        # the next one.
        try:
            table = get_table(stat_href)
            if table is None or table.empty:
                print(f"  ⚠ {name} (no data)")
                continue
            csv_name = f"{name.replace(' ', '_')}_Team_{year}.csv"
            csv_path = os.path.join(output_dir, csv_name)
            table.to_csv(csv_path, index=False)
            print(f"  ✔ {name}")
        except Exception as err:
            print(f"  ❌ {name}: {err}")

print("Done. CSVs saved in", output_dir)