In [4]:
import pandas as pd
# Load the raw stats CSV; later cells read paired `*_home` / `*_away` columns from it.
df = pd.read_csv('nfl_team_stats_2002-2024.csv')

In [None]:

# Remove rows where week is not a number: values that cannot be parsed are
# coerced to NaN by `errors='coerce'` and then filtered out.
# `.copy()` detaches the filtered frame from the original so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
df = df[pd.to_numeric(df['week'], errors='coerce').notna()].copy()
df['week'] = df['week'].astype(int)

In [None]:
# Columns copied unchanged into every per-team row
shared_cols = ['season', 'week', 'date', 'time_et', 'neutral']

# Per-team statistic names without their `_away` / `_home` suffix, in output order
_stat_stems = [
    'score', 'first_downs', 'first_downs_from_passing',
    'first_downs_from_rushing', 'first_downs_from_penalty',
    'third_down_comp', 'third_down_att',
    'fourth_down_comp', 'fourth_down_att',
    'plays', 'drives', 'yards',
    'pass_comp', 'pass_att', 'pass_yards',
    'sacks_num', 'sacks_yards',
    'rush_att', 'rush_yards',
    'pen_num', 'pen_yards',
    'redzone_comp', 'redzone_att',
    'fumbles', 'interceptions',
    'def_st_td', 'possession',
]

# For each side: the team-name column (named after the side itself) followed by
# every statistic column carrying that side's suffix.
team_cols = {
    side: [side] + [f'{stem}_{side}' for stem in _stat_stems]
    for side in ('away', 'home')
}

In [None]:
def create_team_df(df, side):
    """Build the one-row-per-game view of a single side ('away' or 'home').

    The result holds the `shared_cols` columns, then that side's columns from
    `team_cols[side]` with the `_{side}` marker removed from their names (the
    team-name column, which is named exactly `side`, becomes `team`), and
    finally a `location` column filled with `side`.
    """
    marker = f'_{side}'
    side_columns = {
        ('team' if source == side else source.replace(marker, '')): df[source]
        for source in team_cols[side]
    }
    # `assign` works on a copy, so the caller's frame is left untouched.
    return df[shared_cols].assign(**side_columns, location=side)

# Build the away and home views, then stack them (away rows first) under a fresh index
away_df, home_df = (create_team_df(df, side) for side in ('away', 'home'))
team_df = pd.concat([away_df, home_df], ignore_index=True)

In [None]:
# === Step 2: Prepare for rolling averages ===

# Convert possession time to seconds
def time_to_seconds(t):
    """Convert a 'minutes:seconds' string to a total number of seconds.

    Returns None when `t` is missing (as judged by `pd.isna`). The string must
    contain exactly two colon-separated integer parts; anything else raises
    ValueError from the unpacking or the integer conversion.
    """
    if pd.isna(t):
        return None
    minutes, seconds = (int(part) for part in t.split(':'))
    return 60 * minutes + seconds

# Possession expressed in seconds (missing possession values stay missing)
team_df['possession_sec'] = team_df['possession'].apply(time_to_seconds)

# Columns to average: everything that is not an identifier or a possession
# field. `possession_sec` is left out of this list and handled separately.
exclude_cols = ['season', 'week', 'date', 'time_et', 'neutral', 'team', 'location', 'possession']
not_averaged = {*exclude_cols, 'possession_sec'}
numeric_cols = [name for name in team_df.columns if name not in not_averaged]

# Order rows by team, then chronologically within each team
team_df = team_df.sort_values(['team', 'season', 'week'])

In [None]:
# === Step 3: Rolling average function ===

def calc_rolling_averages(team_df):
    """Add trailing 3-row averages for one team's rows.

    Rows are ordered by season and week. For every column in `numeric_cols`,
    plus `possession_sec`, a `<column>_rolling` column is added holding the
    mean of up to three consecutive rows, shifted down by one row so that each
    row only sees the rows before it (the first row therefore gets NaN).
    The helper column `possession_sec` is dropped from the result.
    """
    ordered = team_df.sort_values(by=['season', 'week']).copy()

    tracked = [*numeric_cols, 'possession_sec']
    # One frame-wide rolling pass; `shift(1)` keeps the current row out of its own average.
    trailing = ordered[tracked].rolling(window=3, min_periods=1).mean().shift(1)

    ordered = ordered.assign(**{f'{name}_rolling': trailing[name] for name in tracked})
    return ordered.drop(columns=['possession_sec'])

# Apply rolling averages by team.
# Each team's rows are processed explicitly rather than through
# `groupby(...).apply(...)`: pandas 2.2 deprecated handing the grouping column
# to the applied function, and the `team` column must survive because the
# following cell selects it. Groups are visited in sorted team order and the
# pieces are stacked under a fresh 0..n-1 index.
rolling_df = pd.concat(
    [calc_rolling_averages(team_rows) for _, team_rows in team_df.groupby('team')],
    ignore_index=True,
)


In [None]:
# === Step 4: Filter, Round, and Save ===

# Keep only categorical ID columns + rolling stats
id_cols = ['season', 'week', 'date', 'time_et', 'neutral', 'team', 'location']
rolling_cols = [col for col in rolling_df.columns if col.endswith('_rolling')]

# `.copy()` makes the column subset an independent frame, so the rounding
# assignment below does not raise pandas' SettingWithCopyWarning.
final_df = rolling_df[id_cols + rolling_cols].copy()
final_df[rolling_cols] = final_df[rolling_cols].round(3)

# Save result (without the index column)
final_df.to_csv('nfl_team_3week_rolling_avg_only.csv', index=False)

In [20]:
# Load the original and rolling data
original_df = pd.read_csv('nfl_team_stats_2002-2024.csv')
# Keep only the first nine columns, selected by position.
# NOTE(review): presumably the game identifiers, team names and final scores that
# the merge and `home_win` steps below read — confirm against the CSV header.
original_df = original_df.iloc[:, :9]
rolling_df = pd.read_csv('nfl_team_3week_rolling_avg_only.csv')

# Filter out non-numeric weeks again.
# `.copy()` detaches the filtered frame so the `week` assignment below does not
# raise pandas' SettingWithCopyWarning.
original_df = original_df[pd.to_numeric(original_df['week'], errors='coerce').notna()].copy()
original_df['week'] = original_df['week'].astype(int)


# Prepare rolling data for merging: two renamed copies of the rolling frame in
# which every column except the join keys (season, week, team) gets a side prefix.
rolling_home = rolling_df.rename(
    columns=lambda col: col if col in ('season', 'week', 'team') else 'home_' + col
)
rolling_away = rolling_df.rename(
    columns=lambda col: col if col in ('season', 'week', 'team') else 'away_' + col
)

# Attach the rolling averages twice: first matched on the home team, then on the
# away team. Left joins keep every game row; the helper `team` key that comes
# from the right-hand frame is dropped after each join.
merged = (
    original_df
    .merge(
        rolling_home,
        how='left',
        left_on=['season', 'week', 'home'],
        right_on=['season', 'week', 'team'],
    )
    .drop(columns=['team'])
    .merge(
        rolling_away,
        how='left',
        left_on=['season', 'week', 'away'],
        right_on=['season', 'week', 'team'],
    )
    .drop(columns=['team'])
)


# Target label: 1 when the home score is strictly greater than the away score,
# otherwise 0 (so a tie is labelled 0).
merged['home_win'] = merged['score_home'].gt(merged['score_away']).astype(int)

# Descriptive columns that are not wanted in the output; any that are absent
# from `merged` are simply skipped.
cols_to_drop = [
    'home_date', 'home_time_et', 'home_neutral', 'home_location',
    'away_date', 'away_time_et', 'away_neutral', 'away_location',
    'date', 'time_et', 'neutral'
]
merged = merged.drop(columns=cols_to_drop, errors='ignore')

# Discard the first 16 rows by position and renumber the index.
# NOTE(review): presumably these are the opening games, which have no earlier
# games to average over — confirm that the CSV row order guarantees this.
merged = merged.iloc[16:].reset_index(drop=True)

# Save if needed
merged.to_csv('nfl_games_with_rolling_stats.csv', index=False)


In [45]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def remove_rolling(s):
    """Strip every '_rolling' occurrence from a column name.

    The two rolling score columns, "away_score_rolling" and
    "home_score_rolling", are returned unchanged.
    """
    if s in ("away_score_rolling", "home_score_rolling"):
        return s
    return s.replace('_rolling', '')

# Load the merged game data and shorten the column names in one step
df = pd.read_csv('nfl_games_with_rolling_stats.csv').rename(columns=remove_rolling)

# Features are every column from position 6 up to (not including) the last one;
# the last column is the target.
X = df.iloc[:, 6:-1]
y = df.iloc[:, -1]

# Hold out 20% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [30]:
# Fit a 330-tree random forest on the training split
clf = RandomForestClassifier(
    n_estimators=330,
    max_depth=39,
    max_features='sqrt',
    min_samples_leaf=5,
    min_samples_split=3,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out split
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'Accuracy: {acc:.3f}')

Accuracy: 0.612


In [46]:
# Fit a deliberately small forest (3 trees, depth 5); this rebinds `clf`
clf = RandomForestClassifier(
    random_state=42,
    n_estimators=3,
    max_depth=5,
    max_features='sqrt',
    min_samples_split=3,
    min_samples_leaf=5,
).fit(X_train, y_train)

# Score the held-out split
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'Accuracy: {acc:.3f}')

Accuracy: 0.575


In [47]:
from sklearn.tree import export_text

# Print the split rules of every tree in the forest, one text dump per tree
feature_labels = list(X.columns)
divider = "-" * 80
for i, tree in enumerate(clf.estimators_):
    print(f"Tree {i+1}:")
    tree_rules = export_text(tree, feature_names=feature_labels)
    print(tree_rules)
    print(divider)

Tree 1:
|--- home_rush_att <= 32.83
|   |--- away_yards <= 335.17
|   |   |--- home_first_downs <= 16.17
|   |   |   |--- home_redzone_comp <= 1.50
|   |   |   |   |--- away_interceptions <= 0.17
|   |   |   |   |   |--- class: 0.0
|   |   |   |   |--- away_interceptions >  0.17
|   |   |   |   |   |--- class: 0.0
|   |   |   |--- home_redzone_comp >  1.50
|   |   |   |   |--- away_score_rolling <= 22.67
|   |   |   |   |   |--- class: 1.0
|   |   |   |   |--- away_score_rolling >  22.67
|   |   |   |   |   |--- class: 0.0
|   |   |--- home_first_downs >  16.17
|   |   |   |--- away_yards <= 279.17
|   |   |   |   |--- home_yards <= 323.17
|   |   |   |   |   |--- class: 1.0
|   |   |   |   |--- home_yards >  323.17
|   |   |   |   |   |--- class: 1.0
|   |   |   |--- away_yards >  279.17
|   |   |   |   |--- home_sacks_yards <= 6.83
|   |   |   |   |   |--- class: 1.0
|   |   |   |   |--- home_sacks_yards >  6.83
|   |   |   |   |   |--- class: 1.0
|   |--- away_yards >  335.17
|   | 

In [48]:
import json
from sklearn.tree import _tree

def tree_to_json(tree, feature_names, tree_index):
    """Convert one fitted decision tree into a nested name/children dict.

    Parameters
    ----------
    tree : fitted estimator exposing a `tree_` attribute
    feature_names : sequence indexable by feature position
    tree_index : value embedded in the root node's name (`tree_<index>`)

    Returns
    -------
    dict
        `{"name": "tree_<index>", "children": [<root split>]}`. Split nodes are
        `{"name": "<feature> <= <threshold>", "children": [left, right]}`.
        Leaves are `{"name": "Home win" | "Away win", "value": <int>}`, where
        "Home win" is used when class index 1 has the largest entry in the
        node's value array.
    """
    tree_ = tree.tree_

    def recurse(node):
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_names[tree_.feature[node]]
            threshold = tree_.threshold[node]
            return {
                "name": f"{name} <= {threshold:.2f}",
                "children": [
                    recurse(tree_.children_left[node]),
                    recurse(tree_.children_right[node])
                ]
            }

        value = tree_.value[node][0]
        class_idx = value.argmax()
        return {
            "name": "Home win" if class_idx == 1 else "Away win",
            # Take the leaf size from the tree's weighted sample count rather
            # than summing `tree_.value`: newer scikit-learn releases store
            # per-class fractions there, which would truncate to 0 or 1.
            "value": int(tree_.weighted_n_node_samples[node])
        }

    return {
        "name": f"tree_{tree_index}",
        "children": [recurse(0)]
    }

# One JSON subtree per estimator, numbered from 0
tree_nodes = [
    tree_to_json(tree, feature_names=X.columns, tree_index=i)
    for i, tree in enumerate(clf.estimators_)
]

# Hang all trees under a single "forest" root, wrapped in a one-element list
forest_json = [{"name": "forest", "children": tree_nodes}]

# Write the hierarchy to disk
with open('random_forest.json', 'w') as f:
    json.dump(forest_json, f, indent=2)


In [49]:
def clean_tree(node):
    """Collapse redundant splits in a name/children hierarchy.

    A node without a 'children' key is a leaf and is returned as-is. Otherwise
    the children are cleaned first (so collapses propagate upwards), then:

    1. If the node has exactly two children, both leaves with the same name,
       they are merged into a single leaf. The merged leaf's 'value' is the
       sum of both leaves' values, so no sample count is lost.
    2. If the node is left with a single leaf child, that leaf replaces the
       node itself.

    The input node is modified in place; the returned object is either the
    node or the leaf that replaces it.
    """
    # Base case: leaves are returned untouched
    if 'children' not in node:
        return node

    # Clean bottom-up so that a collapse lower down can trigger one here
    node['children'] = [clean_tree(child) for child in node['children']]
    children = node['children']

    # Rule 1: two same-named leaves -> one leaf carrying the combined value
    if len(children) == 2:
        first, second = children
        both_leaves = 'children' not in first and 'children' not in second
        if both_leaves and first.get('name') == second.get('name'):
            merged_leaf = dict(first)
            if 'value' in first or 'value' in second:
                merged_leaf['value'] = first.get('value', 0) + second.get('value', 0)
            node['children'] = [merged_leaf]

    # Rule 2: a node whose only child is a leaf is replaced by that leaf
    if len(node['children']) == 1:
        only_child = node['children'][0]
        if 'children' not in only_child:
            return only_child

    return node


import json

# Read the saved hierarchy back in
with open('random_forest.json') as f:
    data = json.load(f)

# The file holds a list of root nodes; clean each one
cleaned = list(map(clean_tree, data))

# Overwrite the same file with the cleaned hierarchy
with open('random_forest.json', 'w') as f:
    json.dump(cleaned, f, indent=2)


In [34]:
# How many test-set games to print
num_predictions = 10

# Predict on the test set
y_pred = clf.predict(X_test)

# Test features with the predicted and the actual label alongside
test_results = X_test.assign(
    predicted_home_win=y_pred,
    actual_home_win=y_test.values,
)

# Bring back the game metadata by matching on the row index of `df`
metadata_cols = ['season', 'week', 'home', 'away', 'score_home', 'score_away']
test_results = test_results.merge(df[metadata_cols], left_index=True, right_index=True)

# Print the first few games: matchup, score (away-home), predicted vs. actual
for i in range(num_predictions):
    row = test_results.iloc[i]
    pred, actual = (
        "Home Win" if row[label_col] == 1 else "Away Win"
        for label_col in ('predicted_home_win', 'actual_home_win')
    )
    print(
        f"{int(row['season'])} Week {int(row['week'])}: {row['away']} @ {row['home']}",
        f"Score: {int(row['score_away'])}-{int(row['score_home'])}",
        f"Predicted: {pred} | Actual: {actual}",
        "-" * 40,
        sep="\n",
    )

2018 Week 16: Falcons @ Panthers
Score: 24-10
Predicted: Home Win | Actual: Away Win
----------------------------------------
2009 Week 14: Cardinals @ 49ers
Score: 9-24
Predicted: Away Win | Actual: Home Win
----------------------------------------
2007 Week 9: Ravens @ Steelers
Score: 7-38
Predicted: Home Win | Actual: Home Win
----------------------------------------
2015 Week 11: Bills @ Patriots
Score: 13-20
Predicted: Away Win | Actual: Home Win
----------------------------------------
2017 Week 9: Raiders @ Dolphins
Score: 27-24
Predicted: Away Win | Actual: Away Win
----------------------------------------
2014 Week 8: Rams @ Chiefs
Score: 7-34
Predicted: Home Win | Actual: Home Win
----------------------------------------
2004 Week 3: Bears @ Vikings
Score: 22-27
Predicted: Home Win | Actual: Home Win
----------------------------------------
2009 Week 1: Rams @ Seahawks
Score: 0-28
Predicted: Home Win | Actual: Home Win
----------------------------------------
2020 Week 14: Br