In [472]:
import pandas as pd



In [473]:
# Load new NFL team stats dataset
espn_stats = pd.read_csv("../data/Raw_NFL/nfl_team_stats_2002-2024.csv")

# Load Super Bowl history dataset
superbowl_winners = pd.read_csv("../data/Raw_NFL/superbowl.csv")


In [None]:
# Total number of Passing, Rushing, PF, PA stats per team per season
# Combine home and away stats by flipping away/home roles and stacking
away_stats = espn_stats[['season', 'away', 'pass_yards_away', 'rush_yards_away', 'score_away', 'score_home']].copy()
away_stats.columns = ['season', 'team', 'pass_yards', 'rush_yards', 'points_scored', 'points_allowed']

home_stats = espn_stats[['season', 'home', 'pass_yards_home', 'rush_yards_home', 'score_home', 'score_away']].copy()
home_stats.columns = ['season', 'team', 'pass_yards', 'rush_yards', 'points_scored', 'points_allowed']

# Combine all games
all_team_games = pd.concat([away_stats, home_stats])

# Aggregate to season-level stats
season_stats = all_team_games.groupby(['season', 'team']).agg({
    'pass_yards': 'sum',
    'rush_yards': 'sum',
    'points_scored': 'sum',
    'points_allowed': 'sum'
}).reset_index()



In [475]:
# Filter to only Super Bowl games
sb_games = espn_stats[espn_stats['week'].str.lower() == 'superbowl'].copy()

# Determine winner by comparing score
sb_games['sb_winner'] = sb_games.apply(
    lambda row: row['away'] if row['score_away'] > row['score_home'] else row['home'],
    axis=1
) # If away score is greater, we use the away row as the winner, otherwise the home row

sb_winners_recent = sb_games[['season', 'sb_winner']].copy()
sb_winners_recent.columns = ['season', 'team']

In [476]:
# Extract season and standardize team name
superbowl_winners['season'] = pd.to_datetime(superbowl_winners['Date']).dt.year - 1
superbowl_winners['team'] = superbowl_winners['Winner'].str.strip()

# Only keep 1997–2001 data
sb_winners_old = superbowl_winners[superbowl_winners['season'].between(1997, 2001)][['season', 'team']]
# Combine old and recent Super Bowl winners
combined_sb_winners = pd.concat([sb_winners_recent, sb_winners_old])
combined_sb_winners = combined_sb_winners.sort_values('season').reset_index(drop=True)

# Generate sb_wins_last_5 feature for each team 
def calculate_sb_wins_last_5(season, team):
    # Filter for the last 5 seasons including current
    recent_wins = combined_sb_winners[
        (combined_sb_winners['season'] < season) & 
        (combined_sb_winners['season'] > season - 6) & 
        (combined_sb_winners['team'] == team)
    ]
    return len(recent_wins)
# Apply the function to create the new feature
season_stats['sb_wins_last_5'] = season_stats.apply(
    lambda row: calculate_sb_wins_last_5(row['season'], row['team']),
    axis=1
)



In [477]:
# Create a total yards column
season_stats['total_yards'] = season_stats['pass_yards'] + season_stats['rush_yards']

# Add point differential column
season_stats['point_differential'] = season_stats['points_scored'] - season_stats['points_allowed']

In [478]:
# Add is_sb_winner column to season_stats
season_stats['is_sb_winner'] = season_stats.apply(
	lambda row: 1 if (row['season'], row['team']) in combined_sb_winners.values else 0,
	axis=1
)

# Using past stats, predict is_sb_winner_next_season
season_stats['is_sb_winner_next_season'] = season_stats.groupby('team')['is_sb_winner'].shift(-1)


In [479]:
# Calculate correlation of features with Super Bowl winner status
season_stats.corr(numeric_only=True)['is_sb_winner_next_season'].sort_values(ascending=False)
 

is_sb_winner                1.000000e+00
is_sb_winner_next_season    1.000000e+00
sb_wins_last_5              4.086805e-01
total_yards                 3.043840e-01
point_differential          2.996067e-01
points_scored               2.911758e-01
pass_yards                  2.882693e-01
rush_yards                  1.173738e-01
season                      5.278603e-16
points_allowed             -8.723356e-02
Name: is_sb_winner_next_season, dtype: float64

In [480]:
# 1. Import libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# 2. Prepare features and target variable
feature_cols = ['pass_yards', 'rush_yards', 'total_yards', 'points_scored', 'points_allowed', 'point_differential', 'sb_wins_last_5']
X = season_stats[feature_cols]
y = season_stats['is_sb_winner_next_season']

# 3. Drop 2024 season (since it has no target label for 2025)
X = X[season_stats['season'] < 2024]
y = y[season_stats['season'] < 2024]

#4 Split the data into training and testing sets
# 4. Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# 5. Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# 6. Predict and evaluate
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# 7. Evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))

Accuracy: 0.6595744680851063

Classification Report:
               precision    recall  f1-score   support

         0.0       0.67      0.86      0.75        84
         1.0       0.64      0.37      0.47        57

    accuracy                           0.66       141
   macro avg       0.65      0.61      0.61       141
weighted avg       0.65      0.66      0.64       141


Confusion Matrix:
 [[72 12]
 [36 21]]
ROC AUC Score: 0.745405179615706


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept


In [481]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# 1. Define the Random Forest model
rf_model = RandomForestClassifier(
    n_estimators=100,         # Number of trees
    random_state=42,          # For reproducibility
    class_weight='balanced',  # Helps with imbalance between 0 and 1 classes
    max_depth=None,           # No limit on tree depth
    n_jobs=-1                 # Use all available cores
)

# 2. Fit the model
rf_model.fit(X_train, y_train)

# 3. Make predictions
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# 4. Evaluate the model
# Accuracy of the model
print("🎯 Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf)) 
# Predicting a win and predicting a loss
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred_rf)) 
# True vs False Negatives and Positives
print("\n📉 Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf)) 
# AUC closer to 1 = better. 0.5 = random guessing
print("📈 ROC AUC Score:", roc_auc_score(y_test, y_proba_rf)) 

🎯 Random Forest Accuracy: 0.7092198581560284

📊 Classification Report:
               precision    recall  f1-score   support

         0.0       0.73      0.82      0.77        84
         1.0       0.67      0.54      0.60        57

    accuracy                           0.71       141
   macro avg       0.70      0.68      0.69       141
weighted avg       0.71      0.71      0.70       141


📉 Confusion Matrix:
 [[69 15]
 [26 31]]
📈 ROC AUC Score: 0.768483709273183


In [482]:
import joblib
# Save the trained model
joblib.dump(rf_model, "../model/rf_nfl_superbowl_winner_prediction.pkl")

['../model/rf_nfl_superbowl_winner_prediction.pkl']