In [1]:
import pandas as pd
from pathlib import Path

Project_Root = Path(".")
Data_Dir = Project_Root / "sports_data"

list(Data_Dir.glob("*.xls*"))

[WindowsPath('sports_data/sportsref_download_2010-2011.xls'),
 WindowsPath('sports_data/sportsref_download_2011-2012.xls'),
 WindowsPath('sports_data/sportsref_download_2012-2013.xls'),
 WindowsPath('sports_data/sportsref_download_2013-2014.xls'),
 WindowsPath('sports_data/sportsref_download_2014-2015.xls'),
 WindowsPath('sports_data/sportsref_download_2015-2016.xls'),
 WindowsPath('sports_data/sportsref_download_2016-2017.xls'),
 WindowsPath('sports_data/sportsref_download_2017-2018.xls'),
 WindowsPath('sports_data/sportsref_download_2018-2019.xls'),
 WindowsPath('sports_data/sportsref_download_2019-2020.xls'),
 WindowsPath('sports_data/sportsref_download_2020-2021.xls'),
 WindowsPath('sports_data/sportsref_download_2021-2022.xls'),
 WindowsPath('sports_data/sportsref_download_2022-2023.xlsx'),
 WindowsPath('sports_data/sportsref_download_2023-2024.xlsx'),
 WindowsPath('sports_data/sportsref_download_2024-2025.xlsx')]

In [5]:
def clean_season(path, season_name):
    """
    load one clean season file from sports_data
    steps:
    1. Read excel with no header 
    2. Use first row as header
    3. Drop completely empty rows
    4. Keep a limited set of stats
    5. Clean school names
    6. Convert numeric columns
    7. Add a season colum 
    """

    #step 1
    raw = pd.read_excel(path, header=None)

    #step 2
    raw.columns = raw.iloc[0]
    df = raw.ilic[1:].reset_index(drop=True)

    #step 3
    df = df.dropna(axis=1, how='all')

    #step 4
    keep_cols = [
        "School",
        "G", "W", "L", "W-L%", "SRS", "SOS",
        "PTS", "Opp PTS",
        "FG%", "3P%", "3PA", "FGA", "FT%",
        "TRB", "AST", "TOV", "STL", "BLK"
    ]

    existing_cols = [col for col in keep_cols if col in df.columns]
    missing_cols = [col for col in keep_cols if col not in df.columns]

    if missing_cols:
        print(f"For {season_name}, missing columns: {missing_cols}")

    df = df[existing_cols.copy()]

    #step 5
    if "School" in df.columns:
        df["School"] = (
            df["School"]
            .astype(str)
            .str.replace(r"\s*NCAA$", "", regex=True)
            .str.strip()
        )

    #step 6
    for col in df.columns:
        if col != "School":
            df[col] = pd.to_numeric(df[col], errors="coerce")

    #step 7
    df["Season"] = season_name

    return df
    

In [8]:
excel_files = sorted(Data_Dir.glob("*xls*"))
excel_files

[WindowsPath('sports_data/sportsref_download_2010-2011.xls'),
 WindowsPath('sports_data/sportsref_download_2011-2012.xls'),
 WindowsPath('sports_data/sportsref_download_2012-2013.xls'),
 WindowsPath('sports_data/sportsref_download_2013-2014.xls'),
 WindowsPath('sports_data/sportsref_download_2014-2015.xls'),
 WindowsPath('sports_data/sportsref_download_2015-2016.xls'),
 WindowsPath('sports_data/sportsref_download_2016-2017.xls'),
 WindowsPath('sports_data/sportsref_download_2017-2018.xls'),
 WindowsPath('sports_data/sportsref_download_2018-2019.xls'),
 WindowsPath('sports_data/sportsref_download_2019-2020.xls'),
 WindowsPath('sports_data/sportsref_download_2020-2021.xls'),
 WindowsPath('sports_data/sportsref_download_2021-2022.xls'),
 WindowsPath('sports_data/sportsref_download_2022-2023.xlsx'),
 WindowsPath('sports_data/sportsref_download_2023-2024.xlsx'),
 WindowsPath('sports_data/sportsref_download_2024-2025.xlsx')]

In [9]:
season_dfs = {}

for file in excel_files:
    name = file.stem
    season_name = name.split("_")[-1] # takes the years at the end of file name

    df_season = clean_season(file, season_name)
    season_dfs[season_name] = df_season

    print("Loaded season:", season_name, "Shape:", df_season.shape)

ValueError: Excel file format cannot be determined, you must specify an engine manually.