In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob
from pathlib import Path
import re


# directory containing game parquet files (relative to this notebook)
path = Path('..') / 'data' / 'processed' / 'structured'
files = sorted(path.glob('*.parquet'))


def season_key(fname):
    """Return the season label used to register a parquet file.

    The label is the first occurrence in ``fname`` of ``19`` or ``20``
    followed by two digits. When the name contains no such match, the file
    name without its ``.parquet`` suffix is returned instead.
    """
    m = re.search(r'(19|20)\d{2}', fname)
    return m.group(0) if m else fname.rsplit('.parquet', 1)[0]


# load exactly one DataFrame per season (keep the first file encountered for each year)
games = {}  # maps season label (normally 'YYYY') -> DataFrame
if not files:
    # `path` is relative, so a different working directory yields no matches;
    # report it here rather than leaving an unexplained empty mapping behind
    print(f'No parquet files found in {path.resolve()}')
for f in files:
    # read parquet first so any I/O/parquet-engine errors surface immediately,
    # including for a file that is then skipped as a duplicate season
    df = pd.read_parquet(f)
    fname = f.name
    # season_key always returns a str, so no further normalization is needed
    year_str = season_key(fname)
    if year_str in games:
        # the first file per season wins; report the drop instead of hiding it
        print(f'Skipped {f}: season {year_str} already loaded')
        continue
    # register the DataFrame for this season
    games[year_str] = df
    # expose short-name globals: games_YY (e.g. games_16) and games_YYYY;
    # a label that is not four digits is used unchanged for both names
    short = year_str[-2:] if year_str.isdigit() and len(year_str) == 4 else year_str
    globals()[f'games_{year_str}'] = df
    globals()[f'games_{short}'] = df
    print(f'Loaded {f} -> games_{year_str} (alias games_{short}), shape={df.shape}')
# convenience alias: dfs points to the per-season mapping we just built
dfs = games
# use dfs['2017'] or globals()['games_17'] as needed

Loaded ../data/processed/structured/games_2016.parquet -> games_2016 (alias games_16), shape=(832, 196)
Loaded ../data/processed/structured/games_2017.parquet -> games_2017 (alias games_17), shape=(834, 196)
Loaded ../data/processed/structured/games_2018.parquet -> games_2018 (alias games_18), shape=(845, 196)
Loaded ../data/processed/structured/games_2019.parquet -> games_2019 (alias games_19), shape=(848, 196)
Loaded ../data/processed/structured/games_2020.parquet -> games_2020 (alias games_20), shape=(542, 196)
Loaded ../data/processed/structured/games_2021.parquet -> games_2021 (alias games_21), shape=(849, 196)
Loaded ../data/processed/structured/games_2022.parquet -> games_2022 (alias games_22), shape=(854, 196)
Loaded ../data/processed/structured/games_2023.parquet -> games_2023 (alias games_23), shape=(868, 196)
Loaded ../data/processed/structured/games_2024.parquet -> games_2024 (alias games_24), shape=(874, 196)


In [2]:
# Split DataFrame columns into chunks of `chunk_size` columns each
def chunk_df_columns(df, chunk_size=10, prefix='df_chunk'):
    """Split a DataFrame column-wise into consecutive chunks.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are partitioned. It is not modified.
    chunk_size : int, default 10
        Maximum number of columns per chunk; must be at least 1.
    prefix : str, default 'df_chunk'
        Stem of the keys in the returned dict.

    Returns
    -------
    dict
        Maps ``prefix_01``, ``prefix_02``, ... to copies of ``df`` restricted
        to the corresponding run of columns, in original column order. The
        last chunk may hold fewer than ``chunk_size`` columns. A frame with
        no columns yields an empty dict.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be >= 1, got {chunk_size}')
    n_cols = len(df.columns)
    chunks = {}
    for number, start in enumerate(range(0, n_cols, chunk_size), start=1):
        name = f"{prefix}_{number:02d}"
        # slice by position rather than by label: a label lookup returns every
        # column matching each requested name, so duplicated column names would
        # inflate the chunk beyond chunk_size; copy to avoid view warnings
        chunks[name] = df.iloc[:, start:start + chunk_size].copy()
    return chunks

# Pick a sensible default dataframe variable if present (adjust names as needed).
# The df* names are tried first, in their original order, so an explicitly defined
# `df` / `df_a` still wins. The games_* names are the ones the loading cell
# registers and act as fallbacks.
# NOTE: `df` is also the loop variable of the loading cell, so once that cell has
# run, `df` is simply the last file it read.
for candidate in ('df_a', 'df', 'df_16', 'df_2016', 'games_16', 'games_2016'):
    if candidate in globals():
        target_name = candidate
        break
else:
    raise NameError('No target dataframe found. Define `df` or `df_a` or df_YYYY / games_YYYY beforehand.')

# resolve the chosen name and split its columns into groups of 10
target_df = globals()[target_name]
chunks = chunk_df_columns(target_df, chunk_size=10, prefix=f'{target_name}_chunk')
print(f"Target: {target_name}, columns: {len(target_df.columns)}, chunks: {len(chunks)}")
# one line per chunk: key, shape and the column names it holds
for name, cdf in chunks.items():
    print(name, cdf.shape, 'columns->', list(cdf.columns))

# Optionally expose chunked dataframes as globals (uncomment to enable)
# for name, cdf in chunks.items():
#     globals()[name] = cdf
#     print('Created global', name)

Target: df, columns: 196, chunks: 20
df_chunk_01 (874, 10) columns-> ['week', 'home_team', 'away_team', 'home_points', 'away_points', 'home_team_x', 'home_conference', 'home_team_y', 'home_defensiveTDs', 'home_firstDowns']
df_chunk_02 (874, 10) columns-> ['home_fumblesLost', 'home_fumblesRecovered', 'home_interceptionTDs', 'home_interceptionYards', 'home_interceptions', 'home_kickReturnTDs', 'home_kickReturnYards', 'home_kickReturns', 'home_kickingPoints', 'home_netPassingYards']
df_chunk_03 (874, 10) columns-> ['home_passesDeflected', 'home_passesIntercepted', 'home_passingTDs', 'home_puntReturnTDs', 'home_puntReturnYards', 'home_puntReturns', 'home_qbHurries', 'home_rushingAttempts', 'home_rushingTDs', 'home_rushingYards']
df_chunk_04 (874, 10) columns-> ['home_sacks', 'home_tackles', 'home_tacklesForLoss', 'home_totalFumbles', 'home_totalYards', 'home_turnovers', 'home_yardsPerPass', 'home_yardsPerRushAttempt', 'home_team_conference', 'home_off_passingPlays_explosiveness']
df_chunk_