In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob
from pathlib import Path
import re


# Folder holding the per-season game parquet files, resolved relative to this notebook.
path = Path('..', 'data', 'processed', 'structured')
files = sorted(path.glob('*.parquet'))

# Season key -> DataFrame. Only the first file (in sorted order) seen for a season is kept.
games = {}
for f in files:
    # Read before looking at the name, so an unreadable file fails loudly even when it
    # would later be skipped as a duplicate. NOTE: `df` stays bound after the loop, and
    # the chunking cell further down looks for a global called `df`.
    df = pd.read_parquet(f)

    # Season key: the first 19xx/20xx run of digits in the file name, otherwise the
    # file name with its '.parquet' suffix stripped.
    year_match = re.search(r'(19|20)\d{2}', f.name)
    season = year_match.group(0) if year_match else f.name.rsplit('.parquet', 1)[0]

    if season not in games:
        games[season] = df
        # Two-digit alias (e.g. '16') only when the key is a plain 4-digit year.
        alias = season[-2:] if (len(season) == 4 and season.isdigit()) else season
        # Publish the frame as games_YYYY and games_YY in the notebook namespace;
        # both names (and games[season]) refer to the same DataFrame object.
        for suffix in (season, alias):
            globals()[f'games_{suffix}'] = df
        print(f'Loaded {f} -> games_{season} (alias games_{alias}), shape={df.shape}')

# Convenience alias for the mapping: dfs['2017'] and games_17 are the same frame.
dfs = games

Loaded ../data/processed/structured/games_2016.parquet -> games_2016 (alias games_16), shape=(832, 166)
Loaded ../data/processed/structured/games_2017.parquet -> games_2017 (alias games_17), shape=(834, 166)
Loaded ../data/processed/structured/games_2018.parquet -> games_2018 (alias games_18), shape=(845, 166)
Loaded ../data/processed/structured/games_2019.parquet -> games_2019 (alias games_19), shape=(848, 166)
Loaded ../data/processed/structured/games_2020.parquet -> games_2020 (alias games_20), shape=(542, 166)
Loaded ../data/processed/structured/games_2021.parquet -> games_2021 (alias games_21), shape=(849, 166)
Loaded ../data/processed/structured/games_2022.parquet -> games_2022 (alias games_22), shape=(854, 166)
Loaded ../data/processed/structured/games_2023.parquet -> games_2023 (alias games_23), shape=(868, 166)
Loaded ../data/processed/structured/games_2024.parquet -> games_2024 (alias games_24), shape=(874, 166)


Unnamed: 0,game_id,week,home_team,away_team,home_points,away_points,home_defensiveTDs,away_defensiveTDs,home_firstDowns,away_firstDowns,...,home_def_ppa,away_def_ppa,home_def_drives,away_def_drives,home_def_plays,away_def_plays,spread,total,point_diff,favorite
0,401282714,1,Illinois,Nebraska,30,22,1.0,0.0,18.0,19.0,...,0.320845,0.044854,12,12,71,68,7.0,52.0,8,away
1,401286187,1,Fresno State,UConn,45,0,0.0,0.0,25.0,9.0,...,-0.396563,0.290790,14,14,61,77,-28.0,63.5,45,home
2,401309833,1,UCLA,Hawai'i,44,10,1.0,0.0,20.0,16.0,...,-0.048344,0.225559,14,13,74,65,-18.0,66.5,34,home
3,401282049,1,New Mexico State,UTEP,3,30,0.0,0.0,20.0,19.0,...,0.323698,-0.190611,12,11,67,73,10.0,59.5,-27,away
4,401310693,1,San José State,Southern Utah,45,14,0.0,1.0,20.0,17.0,...,-0.051936,0.510368,9,10,63,56,-28.5,57.5,31,home
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
844,401282154,14,Alabama,Georgia,41,24,1.0,0.0,25.0,30.0,...,0.025634,0.377538,13,12,78,70,6.5,48.5,17,away
845,401331163,14,Wake Forest,Pittsburgh,21,45,0.0,1.0,24.0,22.0,...,0.086396,-0.038590,16,16,72,85,3.5,73.0,-24,away
846,401331447,14,Iowa,Michigan,3,42,0.0,0.0,15.0,21.0,...,0.159692,-0.094733,12,13,49,70,-12.0,44.0,-39,home
847,401387250,14,California,USC,24,14,0.0,0.0,13.0,25.0,...,0.086452,0.115814,14,15,130,87,-4.5,57.5,10,home


In [14]:
# Split DataFrame columns into chunks of `chunk_size` columns each, keeping the first N columns in every chunk

def chunk_df_columns(df, chunk_size=15, prefix='df_chunk', preserve_front=6):
    """Split a wide DataFrame into column chunks that share the same leading columns.

    Every chunk starts with the "front" columns: the first `preserve_front` columns of
    `df`, followed by any of home_points, away_points, home_score and away_score that
    exist in `df` and are not already in the front. The remaining ("tail") columns are
    then dealt out in order, `chunk_size` at a time, one group per chunk.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are split. It is not modified; each chunk is a copy.
    chunk_size : int, default 15
        Maximum number of tail columns per chunk. Must be at least 1.
    prefix : str, default 'df_chunk'
        Key prefix; chunks are keyed prefix_01, prefix_02, ...
    preserve_front : int, default 6
        Number of leading columns repeated in every chunk. Values <= 0 repeat none
        (the scoring columns listed above are still repeated when present).

    Returns
    -------
    dict
        Maps chunk name -> DataFrame, in column order. When there are no tail columns
        a single chunk (prefix_01) holding just the front columns is returned.

    Raises
    ------
    ValueError
        If `chunk_size` is smaller than 1. Previously such a value made the chunking
        loop spin forever because the tail offset never advanced past the end.
    """
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size!r}')

    cols = list(df.columns)

    # Context columns repeated at the start of every chunk.
    front = cols[:preserve_front] if preserve_front > 0 else []

    # Scoring columns are always carried along so each chunk can be read next to the result.
    for extra in ('home_points', 'away_points', 'home_score', 'away_score'):
        if extra in cols and extra not in front:
            front.append(extra)

    # Set lookup keeps the tail filter linear in the number of columns.
    front_lookup = set(front)
    tail_cols = [c for c in cols if c not in front_lookup]

    # Nothing beyond the front columns: return them as the only chunk.
    if not tail_cols:
        return {f'{prefix}_01': df.loc[:, front].copy()}

    chunks = {}
    for chunk_index, start in enumerate(range(0, len(tail_cols), chunk_size), start=1):
        # tail_cols already excludes the front, so the two lists never overlap.
        block = tail_cols[start:start + chunk_size]
        chunks[f'{prefix}_{chunk_index:02d}'] = df.loc[:, front + block].copy()

    return chunks

# Choose the frame to chunk: the first of these names that is already defined in the
# notebook namespace wins (order matters).
target_name = next(
    (label for label in ('df_a', 'df', 'df_16', 'df_2016') if label in globals()),
    None,
)
if target_name is None:
    raise NameError('No target dataframe found. Define `df` or `df_a` or df_YYYY beforehand.')

target_df = globals()[target_name]

# Build the chunk mapping without printing anything: ten new columns per chunk, with the
# first five columns of the frame repeated in each one.
chunks = chunk_df_columns(target_df, chunk_size=10, prefix=f'{target_name}_chunk', preserve_front=5)

# Publish every chunk under its own name (e.g. df_chunk_01) for interactive access;
# `chunks` itself is already a notebook-level name.
globals().update(chunks)

# Last expression of the cell: show the whole mapping as the cell result.
chunks

{'df_chunk_01':        game_id  week     home_team       away_team  home_points  away_points  \
 0    401635525     1  Georgia Tech   Florida State         24.0         21.0   
 1    401643697     1    New Mexico   Montana State         31.0         35.0   
 2    401643696     1        Nevada             SMU         24.0         29.0   
 3    401643858     1       Hawai'i  Delaware State         35.0         14.0   
 4    401628458     1       Rutgers          Howard         44.0          7.0   
 ..         ...   ...           ...             ...          ...          ...   
 869  401673469    15         Texas         Georgia         19.0         22.0   
 870  401673470    15     Louisiana        Marshall          3.0         31.0   
 871  401673464    15        Oregon      Penn State         45.0         37.0   
 872  401673463    15           SMU         Clemson         31.0         34.0   
 873  401645383    16          Army            Navy         13.0         31.0   
 
      home_

In [20]:
# Print every column label of the 2016 frame on its own line.
for column_label in games_2016.columns.tolist():
    print(column_label)

game_id
week
home_team
away_team
home_points
away_points
home_defensiveTDs
away_defensiveTDs
home_firstDowns
away_firstDowns
home_fumblesLost
away_fumblesLost
home_fumblesRecovered
away_fumblesRecovered
home_interceptions
away_interceptions
home_kickingPoints
away_kickingPoints
home_netPassingYards
away_netPassingYards
home_passesDeflected
away_passesDeflected
home_passesIntercepted
away_passesIntercepted
home_passingTDs
away_passingTDs
home_qbHurries
away_qbHurries
home_rushingAttempts
away_rushingAttempts
home_rushingTDs
away_rushingTDs
home_rushingYards
away_rushingYards
home_sacks
away_sacks
home_tackles
away_tackles
home_tacklesForLoss
away_tacklesForLoss
home_totalFumbles
away_totalFumbles
home_totalYards
away_totalYards
home_turnovers
away_turnovers
home_yardsPerPass
away_yardsPerPass
home_yardsPerRushAttempt
away_yardsPerRushAttempt
home_off_passingPlays_explosiveness
away_off_passingPlays_explosiveness
home_off_passingPlays_successRate
away_off_passingPlays_successRate
home_of

In [6]:
# Relabel the 'total' column as 'o/u'. This mutates the frame in place, so the other
# names the loader bound to this same object (games_16, games['2016']) see the new label too.
games_2016.rename(mapper={'total': 'o/u'}, axis='columns', inplace=True)

In [7]:
# Inspect the 'spread' column of the 2016 season.
games_2016.loc[:, 'spread']

0     -21.5
1     -27.0
2     -33.0
3     -24.0
4     -38.5
       ... 
827    25.5
828     7.0
829    10.0
830     3.0
831     5.5
Name: spread, Length: 832, dtype: float64

In [8]:
# Inspect the renamed 'o/u' column (requires the rename cell to have run first).
# NOTE(review): the displayed values include 0.0 for several rows; possibly missing
# lines stored as zero. Confirm before using this column in analysis.
games_2016.loc[:, 'o/u']

0      66.0
1       0.0
2       0.0
3       0.0
4      58.5
       ... 
827    54.0
828    57.5
829    58.5
830    47.0
831    46.5
Name: o/u, Length: 832, dtype: float64

In [12]:
# Inspect the 'favorite' column; the displayed values are the strings 'home' / 'away'.
games_2016.loc[:, 'favorite']

0      home
1      home
2      home
3      home
4      home
       ... 
827    away
828    away
829    away
830    away
831    away
Name: favorite, Length: 832, dtype: object