In [1]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

# Setup
baseEndpoint = "https://stats.ncaa.org"
output_stat_dir = "statCSVs_2019_2023"
output_score_dir = "scoreCSVs_2019_2023"
os.makedirs(output_stat_dir, exist_ok=True)
os.makedirs(output_score_dir, exist_ok=True)
years = range(2019, 2024)

# ------------------- STATS SECTION ------------------- #
def getStatDropdown(year, stat_type='I'):  # 'I' = Individual, 'T' = Team
    example_url = f"https://stats.ncaa.org/rankings/national_ranking?academic_year={year}.0&division=1.0&ranking_period=82.0&sport_code=MVB&stat_seq=532.0"
    response = requests.get(example_url)
    soup = BeautifulSoup(response.text, "html.parser")

    dropdown_id = f"stat_type_{stat_type}_N"
    dropdown_div = soup.find("div", {"aria-labelledby": dropdown_id})
    links = dropdown_div.find_all("a") if dropdown_div else []

    stat_names = [a.text.strip() for a in links if a.get("href", "").startswith("/rankings")]
    stat_hrefs = [a["href"] for a in links if a.get("href", "").startswith("/rankings")]

    return stat_names, stat_hrefs

def getTable(baseEndpoint, href):
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Referer": f"{baseEndpoint}{href}"
    }
    response = requests.get(f"{baseEndpoint}{href}", headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    table = soup.find("table", id="rankings_table")
    if not table:
        return None

    header_cells = table.find("thead").find_all("th")
    headers = [th.get_text(strip=True) for th in header_cells]

    data_rows = []
    for tr in table.find("tbody").find_all("tr"):
        row_data = []
        for td in tr.find_all("td"):
            a = td.find("a")
            text = a.get_text(strip=True) if a else td.get_text(strip=True)
            row_data.append(text)
        data_rows.append(row_data)

    return pd.DataFrame(data_rows, columns=headers)

# Scrape stats for each year and type
for year in years:
    for stat_type, label in zip(['I', 'T'], ['Individual', 'Team']):
        print(f"\nScraping {label} stats for {year}...")
        try:
            stat_names, stat_hrefs = getStatDropdown(year, stat_type)
            for name, href in zip(stat_names, stat_hrefs):
                try:
                    print(f"  → {name}")
                    df = getTable(baseEndpoint, href)
                    if df is not None:
                        fname = f"{name.replace(' ', '_')}_{label}_{year}.csv"
                        df.to_csv(os.path.join(output_stat_dir, fname), index=False)
                except Exception as e:
                    print(f"    ❌ Failed to get {name}: {e}")
        except Exception as e:
            print(f"❌ Failed to fetch dropdown for {label} {year}: {e}")

# ------------------- GAME SCORES SECTION ------------------- #
for year in years:
    yearly_dataframes = []
    print(f"\nScraping match scores for {year}...")

    for month in range(1, 13):
        month_str = str(month).zfill(2)
        url = f"https://data.ncaa.com/casablanca/schedule/volleyball-men/d1/{year}/{month_str}/schedule-all-conf.json"
        try:
            response = requests.get(url)
            if response.status_code != 200:
                continue
            game_dates = [x for x in response.json().get("gameDates", []) if x.get("games", 0) > 0]
        except Exception as e:
            print(f"  ⚠️ Skipping {url} due to error: {e}")
            continue

        for date in tqdm(game_dates, desc=f"{year}-{month_str}"):
            try:
                month_, day_, year_ = date["contest_date"].split("-")
                scoreboard_url = f"https://www.ncaa.com/scoreboard/volleyball-men/d1/{year_}/{month_}/{day_}"
                resp = requests.get(scoreboard_url)
                soup = BeautifulSoup(resp.text, 'html.parser')

                rows = []
                for game in soup.find_all('ul', class_='gamePod-game-teams'):
                    teams = game.find_all('li')
                    if len(teams) == 2:
                        team1 = teams[0].find('span', class_='gamePod-game-team-name').text.strip()
                        score1 = teams[0].find('span', class_='gamePod-game-team-score').text.strip()
                        team2 = teams[1].find('span', class_='gamePod-game-team-name').text.strip()
                        score2 = teams[1].find('span', class_='gamePod-game-team-score').text.strip()
                        rows.append([team1, score1, team2, score2, date["contest_date"]])

                if rows:
                    df = pd.DataFrame(rows, columns=["Team 1", "Team 1 Score", "Team 2", "Team 2 Score", "Date"])
                    yearly_dataframes.append(df)
            except Exception as e:
                print(f"    ❌ Error on {date['contest_date']}: {e}")

    # Save yearly results
    if yearly_dataframes:
        year_df = pd.concat(yearly_dataframes, ignore_index=True)
        out_path = os.path.join(output_score_dir, f"NCAA_Mens_VB_Scores_{year}.csv")
        year_df.to_csv(out_path, index=False)


Scraping Individual stats for 2019...

Scraping Team stats for 2019...

Scraping Individual stats for 2020...

Scraping Team stats for 2020...

Scraping Individual stats for 2021...

Scraping Team stats for 2021...

Scraping Individual stats for 2022...

Scraping Team stats for 2022...

Scraping Individual stats for 2023...

Scraping Team stats for 2023...

Scraping match scores for 2019...


2019-01: 100%|██████████| 27/27 [00:23<00:00,  1.13it/s]
2019-02: 100%|██████████| 24/24 [00:20<00:00,  1.19it/s]
2019-03: 100%|██████████| 26/26 [00:23<00:00,  1.13it/s]
2019-04: 100%|██████████| 13/13 [00:11<00:00,  1.12it/s]
2019-05: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
2019-12: 100%|██████████| 1/1 [00:00<00:00,  1.09it/s]



Scraping match scores for 2020...


2020-01: 100%|██████████| 27/27 [00:23<00:00,  1.14it/s]
2020-02: 100%|██████████| 25/25 [00:22<00:00,  1.11it/s]
2020-03: 100%|██████████| 28/28 [00:24<00:00,  1.14it/s]
2020-04: 100%|██████████| 12/12 [00:10<00:00,  1.18it/s]



Scraping match scores for 2021...


2021-01: 100%|██████████| 10/10 [00:08<00:00,  1.15it/s]
2021-02: 100%|██████████| 23/23 [00:20<00:00,  1.11it/s]
2021-03: 100%|██████████| 28/28 [00:25<00:00,  1.11it/s]
2021-04: 100%|██████████| 16/16 [00:15<00:00,  1.07it/s]
2021-05: 100%|██████████| 4/4 [00:04<00:00,  1.17s/it]



Scraping match scores for 2022...


2022-01: 100%|██████████| 25/25 [00:26<00:00,  1.06s/it]
2022-02: 100%|██████████| 25/25 [00:21<00:00,  1.17it/s]
2022-03: 100%|██████████| 30/30 [00:26<00:00,  1.15it/s]
2022-04: 100%|██████████| 19/19 [00:18<00:00,  1.03it/s]
2022-05: 100%|██████████| 4/4 [00:03<00:00,  1.00it/s]



Scraping match scores for 2023...


2023-01: 100%|██████████| 26/26 [00:22<00:00,  1.15it/s]
2023-02: 100%|██████████| 25/25 [00:21<00:00,  1.16it/s]
2023-03: 100%|██████████| 30/30 [00:25<00:00,  1.20it/s]
2023-04: 100%|██████████| 17/17 [00:25<00:00,  1.52s/it]
2023-05: 100%|██████████| 3/3 [00:03<00:00,  1.10s/it]
