In [8]:
!pip install lxml

Collecting lxml
  Downloading lxml-6.0.2-cp313-cp313-win_amd64.whl.metadata (3.7 kB)
Downloading lxml-6.0.2-cp313-cp313-win_amd64.whl (4.0 MB)
   ---------------------------------------- 0.0/4.0 MB ? eta -:--:--
   ---------------------------------------  3.9/4.0 MB 29.0 MB/s eta 0:00:01
   ---------------------------------------- 4.0/4.0 MB 23.5 MB/s  0:00:00
Installing collected packages: lxml
Successfully installed lxml-6.0.2


In [9]:
import pandas as pd
from pathlib import Path

Project_Root = Path(".")
Data_Dir = Project_Root / "sports_data"

excel_files = sorted(Data_Dir.glob("*.xls*"))
excel_files

[WindowsPath('sports_data/sportsref_download_2010-2011.xls'),
 WindowsPath('sports_data/sportsref_download_2011-2012.xls'),
 WindowsPath('sports_data/sportsref_download_2012-2013.xls'),
 WindowsPath('sports_data/sportsref_download_2013-2014.xls'),
 WindowsPath('sports_data/sportsref_download_2014-2015.xls'),
 WindowsPath('sports_data/sportsref_download_2015-2016.xls'),
 WindowsPath('sports_data/sportsref_download_2016-2017.xls'),
 WindowsPath('sports_data/sportsref_download_2017-2018.xls'),
 WindowsPath('sports_data/sportsref_download_2018-2019.xls'),
 WindowsPath('sports_data/sportsref_download_2019-2020.xls'),
 WindowsPath('sports_data/sportsref_download_2020-2021.xls'),
 WindowsPath('sports_data/sportsref_download_2021-2022.xls'),
 WindowsPath('sports_data/sportsref_download_2022-2023.xlsx'),
 WindowsPath('sports_data/sportsref_download_2023-2024.xlsx'),
 WindowsPath('sports_data/sportsref_download_2024-2025.xlsx')]

In [10]:
def clean_season(path, season_name):
    """
    Load and clean one season file from sports_data.
    Handles:
    - .xls files that are actually HTML tables (Sports Reference)
    - .xlsx real Excel files
    """

    ext = path.suffix.lower()

    # ---- Step 1: read the raw table ----
    if ext == ".xls":
        # Many Sports Reference "xls" downloads are really HTML tables.
        # pd.read_html can read them directly.
        tables = pd.read_html(path)
        raw = tables[0]   # first (and usually only) table
    elif ext == ".xlsx":
        # Real Excel file
        raw = pd.read_excel(path, header=0, engine="openpyxl")
    else:
        raise ValueError(f"Unsupported file type: {ext}")

    # At this point, raw already has the header row as columns.
    # ---- Step 2: drop completely empty columns ----
    raw = raw.dropna(axis=1, how="all")

    # ---- Step 3: keep a limited set of stats ----
    keep_cols = [
        "School",
        "G", "W", "L", "W-L%", "SRS", "SOS",
        "PTS", "Opp PTS",
        "FG%", "3P%", "3PA", "FGA", "FT%",
        "TRB", "AST", "TOV", "STL", "BLK"
    ]

    existing_cols = [col for col in keep_cols if col in raw.columns]
    missing_cols = [col for col in keep_cols if col not in raw.columns]

    if missing_cols:
        print(f"For {season_name}, missing columns (skipped): {missing_cols}")

    df = raw[existing_cols].copy()

    # ---- Step 4: clean School names ----
    if "School" in df.columns:
        df["School"] = (
            df["School"]
            .astype(str)
            .str.replace(r"\s*NCAA$", "", regex=True)
            .str.strip()
        )

    # ---- Step 5: convert numeric columns ----
    for col in df.columns:
        if col != "School":
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # ---- Step 6: add Season column ----
    df["Season"] = season_name

    return df


In [13]:
season_dfs = {}

for file in excel_files:
    name = file.stem
    season_name = name.split("_")[-1]  # takes the years at the end of file name

    df_season = clean_season(file, season_name)
    season_dfs[season_name] = df_season

    print("Loaded season:", season_name, "Shape:", df_season.shape)

For 2010-2011, missing columns (skipped): ['School', 'G', 'W', 'L', 'W-L%', 'SRS', 'SOS', 'PTS', 'Opp PTS', 'FG%', '3P%', '3PA', 'FGA', 'FT%', 'TRB', 'AST', 'TOV', 'STL', 'BLK']
Loaded season: 2010-2011 Shape: (345, 1)
For 2011-2012, missing columns (skipped): ['School', 'G', 'W', 'L', 'W-L%', 'SRS', 'SOS', 'PTS', 'Opp PTS', 'FG%', '3P%', '3PA', 'FGA', 'FT%', 'TRB', 'AST', 'TOV', 'STL', 'BLK']
Loaded season: 2011-2012 Shape: (344, 1)
For 2012-2013, missing columns (skipped): ['School', 'G', 'W', 'L', 'W-L%', 'SRS', 'SOS', 'PTS', 'Opp PTS', 'FG%', '3P%', '3PA', 'FGA', 'FT%', 'TRB', 'AST', 'TOV', 'STL', 'BLK']
Loaded season: 2012-2013 Shape: (347, 1)
For 2013-2014, missing columns (skipped): ['School', 'G', 'W', 'L', 'W-L%', 'SRS', 'SOS', 'PTS', 'Opp PTS', 'FG%', '3P%', '3PA', 'FGA', 'FT%', 'TRB', 'AST', 'TOV', 'STL', 'BLK']
Loaded season: 2013-2014 Shape: (351, 1)
For 2014-2015, missing columns (skipped): ['School', 'G', 'W', 'L', 'W-L%', 'SRS', 'SOS', 'PTS', 'Opp PTS', 'FG%', '3P%', '3

In [12]:
!pip install openpyxl


Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl

   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   ---------------------------------------- 2/2 [openpyxl]

Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
