In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path


In [2]:
csv_path = "nfl_game_features_2002_2024.csv"

df = pd.read_csv(csv_path)

In [3]:
TRAIN_SEASONS_START = 2002
TRAIN_SEASONS_END = 2020
TEST_SEASONS_START = 2021
TEST_SEASONS_END = 2024

print("="*60)
print("TRAIN-TEST SPLIT EXECUTION")
print("="*60)

train_mask = (df['season'] >= TRAIN_SEASONS_START) & (df['season'] <= TRAIN_SEASONS_END)
test_mask = (df['season'] >= TEST_SEASONS_START) & (df['season'] <= TEST_SEASONS_END)

train_df = df[train_mask].copy()
test_df = df[test_mask].copy()

total_original = len(df)
total_split = len(train_df) + len(test_df)

print(f"Original dataset size: {total_original:,}")
print(f"Training set size: {len(train_df):,}")
print(f"Testing set size: {len(test_df):,}")
print(f"Total after split: {total_split:,}")


train_seasons = set(train_df['season'].unique())
test_seasons = set(test_df['season'].unique())
overlap = train_seasons.intersection(test_seasons)

if overlap:
    print(f"Season overlap detected: {sorted(overlap)}")

print(f"\nTraining seasons: {sorted(train_seasons)}")
print(f"Testing seasons: {sorted(test_seasons)}")

train_percentage = (len(train_df) / total_original) * 100
test_percentage = (len(test_df) / total_original) * 100

print(f"\nSplit distribution:")
print(f"  Training: {train_percentage:.1f}%")
print(f"  Testing: {test_percentage:.1f}%")

TRAIN-TEST SPLIT EXECUTION
Original dataset size: 11,330
Training set size: 9,706
Testing set size: 1,624
Total after split: 11,330

Training seasons: [np.float64(2002.0), np.float64(2003.0), np.float64(2004.0), np.float64(2005.0), np.float64(2006.0), np.float64(2007.0), np.float64(2008.0), np.float64(2009.0), np.float64(2010.0), np.float64(2011.0), np.float64(2012.0), np.float64(2013.0), np.float64(2014.0), np.float64(2015.0), np.float64(2016.0), np.float64(2017.0), np.float64(2018.0), np.float64(2019.0), np.float64(2020.0)]
Testing seasons: [np.float64(2021.0), np.float64(2022.0), np.float64(2023.0)]

Split distribution:
  Training: 85.7%
  Testing: 14.3%


In [4]:
# Define output filenames
train_filename = "nfl_train_data_2002_2020.csv"
test_filename = "nfl_test_data_2021_2024.csv"


try:
    train_df.to_csv(train_filename, index=False)
    print(f"Seasons for train: {int(train_df['season'].min())}-{int(train_df['season'].max())}")
except Exception as e:
    print(f"Error exporting training data: {e}")

try:
    test_df.to_csv(test_filename, index=False)
    print(f"Seasons for test: {int(test_df['season'].min())}-{int(test_df['season'].max())}")
except Exception as e:
    print(f"Error exporting testing data: {e}")


Seasons for train: 2002-2020
Seasons for test: 2021-2023
