In [12]:
# lxml is an HTML parser that pd.read_html can use; this notebook calls
# pd.read_html on the HTML-backed ".xls" exports.
# %pip installs into the environment of the running kernel, whereas a bare
# `!pip` may resolve to a different Python installation on PATH.
%pip install lxml



In [13]:
import pandas as pd
from pathlib import Path

Project_Root = Path(".")
Data_Dir = Project_Root / "sports_data"

# The only extensions clean_season knows how to parse.
SUPPORTED_SUFFIXES = {".xls", ".xlsx"}


def find_season_files(data_dir):
    """
    Return the sorted season export files found directly inside ``data_dir``.

    The glob ``*.xls*`` alone is too permissive: it also matches Excel's
    temporary lock files (``~$name.xlsx``, created while a workbook is open)
    and other Excel flavours such as ``.xlsm`` / ``.xlsb``. Those would make
    the loading step fail, so they are filtered out here.

    Parameters
    ----------
    data_dir : str or Path
        Directory that holds the downloaded season files.

    Returns
    -------
    list of Path
        Matching ``.xls`` / ``.xlsx`` files in sorted order.
    """
    return sorted(
        candidate
        for candidate in Path(data_dir).glob("*.xls*")
        if candidate.suffix.lower() in SUPPORTED_SUFFIXES
        and not candidate.name.startswith("~$")
    )


excel_files = find_season_files(Data_Dir)
excel_files

[WindowsPath('sports_data/sportsref_download_2010-2011.xls'),
 WindowsPath('sports_data/sportsref_download_2011-2012.xls'),
 WindowsPath('sports_data/sportsref_download_2012-2013.xls'),
 WindowsPath('sports_data/sportsref_download_2013-2014.xls'),
 WindowsPath('sports_data/sportsref_download_2014-2015.xls'),
 WindowsPath('sports_data/sportsref_download_2015-2016.xls'),
 WindowsPath('sports_data/sportsref_download_2016-2017.xls'),
 WindowsPath('sports_data/sportsref_download_2017-2018.xls'),
 WindowsPath('sports_data/sportsref_download_2018-2019.xls'),
 WindowsPath('sports_data/sportsref_download_2019-2020.xls'),
 WindowsPath('sports_data/sportsref_download_2020-2021.xls'),
 WindowsPath('sports_data/sportsref_download_2021-2022.xls'),
 WindowsPath('sports_data/sportsref_download_2022-2023.xlsx'),
 WindowsPath('sports_data/sportsref_download_2023-2024.xlsx'),
 WindowsPath('sports_data/sportsref_download_2024-2025.xlsx')]

In [20]:
def _flatten_column_name(col):
    """Collapse one (top, sub) header entry into a single flat column name."""
    if isinstance(col, tuple):
        top, sub = str(col[0]), str(col[1])
    else:
        top, sub = "", str(col)

    if top.startswith("Unnamed") and sub == "School":
        return "School"
    if top in ("Overall", "Totals"):
        # Overall G/W/L/... and Totals FG/3P/TRB/... keep their short names.
        return sub
    if top == "Points" and sub == "Tm.":
        return "PTS"
    if top == "Points" and sub == "Opp.":
        return "Opp PTS"
    # Conf., Home, Away, spacer columns, etc. get a prefixed name that cannot
    # collide with the short names above, so they are never selected later.
    return f"{top}_{sub}".strip()


def clean_season(path, season_name):
    """
    Load and clean one season file from sports_data.

    Handles:
    - .xls files that are actually HTML tables (Sports Reference)
    - .xlsx real Excel files with two header rows

    Parameters
    ----------
    path : str or Path
        Location of the season export. The suffix decides which reader is used.
    season_name : str
        Label stored in the ``Season`` column of the result (e.g. "2010-2011").

    Returns
    -------
    pandas.DataFrame
        One row per input row with ``School``, whichever of the selected stat
        columns are present (converted to numeric, unparseable values -> NaN),
        and a ``Season`` column.

    Raises
    ------
    ValueError
        If the file suffix is neither ``.xls`` nor ``.xlsx``.
    """
    path = Path(path)  # accept plain strings as well as Path objects
    ext = path.suffix.lower()

    # ---- Step 1: read the raw table with a 2-row header ----
    if ext == ".xls":
        # Sports Reference "xls" exports are HTML tables
        raw = pd.read_html(path, header=[0, 1])[0]
    elif ext == ".xlsx":
        raw = pd.read_excel(path, header=[0, 1], engine="openpyxl")
    else:
        raise ValueError(f"Unsupported file type: {ext}")

    # ---- Step 2: flatten the 2-level column names ----
    raw.columns = [_flatten_column_name(col) for col in raw.columns]

    # ---- Step 3: drop completely empty columns ----
    raw = raw.dropna(axis=1, how="all")

    # ---- Step 4: keep a limited set of stats (now matching real names) ----
    keep_cols = [
        "School",
        "G", "W", "L", "W-L%", "SRS", "SOS",
        "PTS", "Opp PTS",
        "FG", "FGA", "FG%",
        "3P", "3PA", "3P%",
        "FT", "FTA", "FT%",
        "TRB", "AST", "STL", "BLK", "TOV"
    ]

    existing_cols = [col for col in keep_cols if col in raw.columns]
    missing_cols = [col for col in keep_cols if col not in raw.columns]

    if missing_cols:
        print(f"For {season_name}, missing columns (skipped): {missing_cols}")

    # ---- Step 4b: ensure no duplicate column names remain ----
    # Copy *after* slicing so the assignments below write to an independent
    # frame rather than to a slice pandas may flag with SettingWithCopyWarning.
    df = raw[existing_cols]
    df = df.loc[:, ~df.columns.duplicated()].copy()

    # ---- Step 5: clean School names ----
    if "School" in df.columns:
        school = df["School"]
        cleaned = (
            school
            .astype(str)
            .str.replace(r"\s*NCAA$", "", regex=True)
            # Strip only purely numeric parentheticals (seeds like "(1)").
            # Removing every parenthetical would also erase disambiguators
            # such as "Miami (FL)" / "Miami (OH)" and merge distinct schools.
            .str.replace(r"\s*\(\d+\)", "", regex=True)
            .str.strip()
        )
        # Keep genuinely missing names missing instead of the text "nan"/"None".
        df["School"] = cleaned.where(school.notna())

    # ---- Step 6: convert numeric columns ----
    for col in df.columns:
        if col != "School":
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # ---- Step 7: add Season column ----
    df["Season"] = season_name

    return df

In [21]:
# Clean every discovered export and index the resulting frames by season label.
season_dfs = {}

for file in excel_files:
    # Stems look like "sportsref_download_2010-2011": the season label is the
    # piece after the final underscore.
    name = file.stem
    season_name = name.rsplit("_", 1)[-1]

    df_season = season_dfs[season_name] = clean_season(file, season_name)

    print("Loaded season:", season_name, "Shape:", df_season.shape)


Loaded season: 2010-2011 Shape: (345, 24)
Loaded season: 2011-2012 Shape: (344, 24)
Loaded season: 2012-2013 Shape: (347, 24)
Loaded season: 2013-2014 Shape: (351, 24)
Loaded season: 2014-2015 Shape: (351, 24)
Loaded season: 2015-2016 Shape: (351, 24)
Loaded season: 2016-2017 Shape: (351, 24)
Loaded season: 2017-2018 Shape: (351, 24)
Loaded season: 2018-2019 Shape: (353, 24)
Loaded season: 2019-2020 Shape: (353, 24)
Loaded season: 2020-2021 Shape: (347, 24)
Loaded season: 2021-2022 Shape: (358, 24)
Loaded season: 2022-2023 Shape: (363, 24)
Loaded season: 2023-2024 Shape: (363, 24)
Loaded season: 2024-2025 Shape: (364, 24)


In [16]:
!pip install openpyxl




In [17]:
# Inspect the column header pandas infers, with default settings, from the
# first discovered export when it is parsed as HTML.
tables = pd.read_html(excel_files[0])
list(tables[0].columns)

[('Unnamed: 0_level_0', 'Rk'),
 ('Unnamed: 1_level_0', 'School'),
 ('Overall', 'G'),
 ('Overall', 'W'),
 ('Overall', 'L'),
 ('Overall', 'W-L%'),
 ('Overall', 'SRS'),
 ('Overall', 'SOS'),
 ('Unnamed: 8_level_0', 'Unnamed: 8_level_1'),
 ('Conf.', 'W'),
 ('Conf.', 'L'),
 ('Unnamed: 11_level_0', 'Unnamed: 11_level_1'),
 ('Home', 'W'),
 ('Home', 'L'),
 ('Unnamed: 14_level_0', 'Unnamed: 14_level_1'),
 ('Away', 'W'),
 ('Away', 'L'),
 ('Unnamed: 17_level_0', 'Unnamed: 17_level_1'),
 ('Points', 'Tm.'),
 ('Points', 'Opp.'),
 ('Unnamed: 20_level_0', 'Unnamed: 20_level_1'),
 ('Totals', 'MP'),
 ('Totals', 'FG'),
 ('Totals', 'FGA'),
 ('Totals', 'FG%'),
 ('Totals', '3P'),
 ('Totals', '3PA'),
 ('Totals', '3P%'),
 ('Totals', 'FT'),
 ('Totals', 'FTA'),
 ('Totals', 'FT%'),
 ('Totals', 'ORB'),
 ('Totals', 'TRB'),
 ('Totals', 'AST'),
 ('Totals', 'STL'),
 ('Totals', 'BLK'),
 ('Totals', 'TOV'),
 ('Totals', 'PF')]

In [18]:
# Inspect the top header row of the first genuine .xlsx export.
# The file is chosen by suffix rather than by a fixed list position, so this
# cell keeps pointing at a real Excel workbook (which openpyxl can open) even
# when season files are added to or removed from the data folder.
first_xlsx = next(p for p in excel_files if p.suffix.lower() == ".xlsx")
pd.read_excel(first_xlsx, header=0, engine="openpyxl").columns.tolist()

['Unnamed: 0',
 'Unnamed: 1',
 'Overall',
 'Unnamed: 3',
 'Unnamed: 4',
 'Unnamed: 5',
 'Unnamed: 6',
 'Unnamed: 7',
 'Unnamed: 8',
 'Conf.',
 'Unnamed: 10',
 'Unnamed: 11',
 'Home',
 'Unnamed: 13',
 'Unnamed: 14',
 'Away',
 'Unnamed: 16',
 'Unnamed: 17',
 'Points',
 'Unnamed: 19',
 'Unnamed: 20',
 'Totals',
 'Unnamed: 22',
 'Unnamed: 23',
 'Unnamed: 24',
 'Unnamed: 25',
 'Unnamed: 26',
 'Unnamed: 27',
 'Unnamed: 28',
 'Unnamed: 29',
 'Unnamed: 30',
 'Unnamed: 31',
 'Unnamed: 32',
 'Unnamed: 33',
 'Unnamed: 34',
 'Unnamed: 35',
 'Unnamed: 36',
 'Unnamed: 37']

Unnamed: 0,School,G,W,L,W-L%,SRS,SOS,PTS,Opp PTS,FG,...,3P%,FT,FTA,FT%,TRB,AST,STL,BLK,TOV,Season
0,Air Force,29,13,16,0.448,-0.45,2.14,1786,1798,616,...,0.337,362,537,0.674,767,403,194,90,371,2011-2012
1,Akron,34,22,12,0.647,6.33,0.72,2456,2241,868,...,0.37,513,741,0.692,1098,475,247,141,480,2011-2012
2,Alabama,33,21,12,0.636,13.76,7.18,2135,1918,778,...,0.289,443,623,0.711,1066,402,239,137,423,2011-2012
3,Alabama A&M,28,7,21,0.25,-19.33,-10.79,1764,1964,629,...,0.313,359,555,0.647,849,314,178,119,463,2011-2012
4,Alabama State,31,12,19,0.387,-15.79,-9.96,1840,1995,648,...,0.306,354,606,0.584,982,357,236,95,470,2011-2012
