**Data Processing**

In [1]:
# Import libraries
import pandas as pd
import nflreadpy as nfl
import nfl_data_py as nfl_data
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [2]:
# Load NFL data for 2025 season
season = 2025

# Game schedule, converted to a pandas DataFrame
schedule = nfl.load_schedules(seasons=[season]).to_pandas()

# Team stats for the same season, converted to a pandas DataFrame
# (no season-type filtering is applied in this cell)
team_stats = nfl.load_team_stats(seasons=[season]).to_pandas()

In [3]:
# Add column for EPA per play
# Pass plays = pass attempts plus sacks; rush plays = carries.
team_stats["pass_plays"] = team_stats["attempts"] + team_stats["sacks_suffered"]
team_stats["rush_plays"] = team_stats["carries"]
team_stats["total_plays"] = team_stats["pass_plays"] + team_stats["rush_plays"]

# NOTE(review): the nflverse data dictionary describes passing_epa / rushing_epa
# as *summed* EPA for the row, not per-play rates — confirm against the dictionary.
# Under that definition the per-play figure is simply total EPA over total plays;
# multiplying each EPA by its play count first (as before) inflates the value.
total_off_epa = team_stats["passing_epa"] + team_stats["rushing_epa"]

# Rows with zero plays get NaN instead of +/-inf from a division by zero.
plays_nonzero = team_stats["total_plays"].where(team_stats["total_plays"] > 0)
team_stats["off_epa_per_play"] = total_off_epa / plays_nonzero

In [4]:
# Add columns points for and against to team_stats
score_cols = ["season", "week", "home_team", "away_team", "home_score", "away_score"]

# One row per game from the home team's point of view ...
home = schedule[score_cols].assign(
    team=lambda g: g["home_team"],
    points_for=lambda g: g["home_score"],
    points_against=lambda g: g["away_score"],
)

# ... and one from the away team's point of view.
away = schedule[score_cols].assign(
    team=lambda g: g["away_team"],
    points_for=lambda g: g["away_score"],
    points_against=lambda g: g["home_score"],
)

# Long format: one row per (season, week, team)
schedule_long = pd.concat([home, away], ignore_index=True)

# Merge into team_stats.
# Any points columns left over from a previous run of this cell are dropped first;
# otherwise re-running would create points_for_x / points_for_y duplicates.
team_stats = (
    team_stats
    .drop(columns=["points_for", "points_against"], errors="ignore")
    .merge(
        schedule_long[["season", "week", "team", "points_for", "points_against"]],
        on=["season", "week", "team"],
        how="left",
    )
)

In [5]:
# Add columns for giveaways, takeaways and turnover differential
team_stats = team_stats.assign(
    # Offensive giveaways: interceptions thrown plus every kind of lost fumble
    giveaways=lambda ts: (
        ts["passing_interceptions"]
        + ts["rushing_fumbles_lost"]
        + ts["receiving_fumbles_lost"]
        + ts["sack_fumbles_lost"]
    ),
    # Defensive takeaways: interceptions plus opponent fumbles recovered
    takeaways=lambda ts: ts["def_interceptions"] + ts["fumble_recovery_opp"],
    # Turnover differential: positive means more takeaways than giveaways
    turnover_diff=lambda ts: ts["takeaways"] - ts["giveaways"],
)

In [6]:
# Web scrape defensive EPA/play from https://sumersports.com/teams/defensive/
import requests
from bs4 import BeautifulSoup

url = "https://sumersports.com/teams/defensive/"

# A timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Locate table (first <table> on the page)
table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> element found at {url}; the page layout may have changed")

# Column names come from every <th> cell in the table
headers = [th.get_text(strip=True) for th in table.find_all('th')]

# Extract rows
rows = []
for tr in table.find_all('tr')[1:]:  # Skip header row
    cols = [td.get_text(strip=True) for td in tr.find_all('td')]
    if cols:  # Check row is not empty
        rows.append(cols)

df = pd.DataFrame(rows, columns=headers)
# Strip a leading rank prefix such as "12. " from the team names
df['team'] = df['Team'].str.replace(r'^\d+\.\s*', '', regex=True)
# Non-numeric cells become NaN rather than raising
df['EPA/Play'] = pd.to_numeric(df['EPA/Play'], errors='coerce')

def_epa_df = df[['team', 'EPA/Play']].copy()


In [7]:
# Web scrape 3rd down conversion % (offense and defense) from https://www.teamrankings.com/
# Keys name the stat; values are the pages the stat is scraped from.
urls = dict(
    third_down='https://www.teamrankings.com/nfl/stat/third-down-conversion-pct',
    opponent_third_down='https://www.teamrankings.com/nfl/stat/opponent-third-down-conversion-pct',
)

# Function to scrape data from a given URL
def scrape_data(url, value_col='2025', timeout=30):
    """Scrape the first HTML table at ``url`` and return team names with one stat column.

    Parameters
    ----------
    url : str
        Page to download.
    value_col : str, default '2025'
        Header of the percentage column to keep (values look like ``"41.2%"``).
    timeout : float, default 30
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'Team'`` (whitespace-stripped) and ``value_col`` converted
        from a percent string to a decimal fraction. Cells that cannot be parsed
        as numbers become NaN.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no ``<table>``.
    KeyError
        If the table lacks a ``'Team'`` or ``value_col`` header.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate table
    table = soup.find('table')
    if table is None:
        raise ValueError(f"No <table> element found at {url}")

    # Extract headers
    headers = [th.get_text(strip=True) for th in table.find_all('th')]

    # Extract rows
    rows = []
    for tr in table.find_all('tr')[1:]:  # Skip header
        cols = [td.get_text(strip=True) for td in tr.find_all('td')]
        if cols:  # Check row is not empty
            rows.append(cols)

    # Create a df
    df = pd.DataFrame(rows, columns=headers)

    missing = [name for name in ('Team', value_col) if name not in df.columns]
    if missing:
        raise KeyError(f"Column(s) {missing} not found in table at {url}; got {list(df.columns)}")

    # Trim whitespace around team names
    df['Team'] = df['Team'].str.strip()
    # "41.2%" -> 0.412; placeholders that are not numbers become NaN instead of raising
    df[value_col] = pd.to_numeric(df[value_col].str.rstrip('%'), errors='coerce') / 100.0
    return df[['Team', value_col]]

# Scrape the offensive and defensive third-down tables
third_down_df, opponent_third_down_df = (
    scrape_data(urls[stat_key]) for stat_key in ('third_down', 'opponent_third_down')
)

# Join the two tables on 'Team'; the shared stat column is suffixed per side
merged_third_down_df = third_down_df.merge(
    opponent_third_down_df,
    on='Team',
    suffixes=('_Offense', '_Defense'),
)


In [8]:
# Add indicator column for winner (1 = home team scored more, 0 = otherwise)
# NOTE(review): ties and rows where either score is NaN compare False and so get 0 —
# confirm whether the schedule can contain games without final scores.
home_scored_more = schedule["home_score"].gt(schedule["away_score"])
schedule = schedule.assign(home_win=home_scored_more.astype(int))[
    ['season', 'week', 'home_team', 'away_team', 'home_score', 'away_score', 'home_win']
]

In [9]:
# Create dataframe with cumulative averages of features
def cumulative_avg(df, up_to_week, stats_cols=None):
    """Average each team's per-game stats over all rows with ``week <= up_to_week``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'team'``, ``'week'`` and every column in ``stats_cols``.
    up_to_week : int
        Last week (inclusive) to include in the average.
    stats_cols : list of str, optional
        Columns to average. Defaults to ``off_epa_per_play``, ``points_for``,
        ``points_against`` and ``turnover_diff``.

    Returns
    -------
    pandas.DataFrame
        One row per team: a ``'team'`` column followed by the mean of each stat
        column, keeping the original column names.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    """
    if stats_cols is None:
        stats_cols = ["off_epa_per_play", "points_for", "points_against", "turnover_diff"]
    else:
        stats_cols = list(stats_cols)

    missing = [name for name in ["team", "week", *stats_cols] if name not in df.columns]
    if missing:
        raise KeyError(f"cumulative_avg: missing column(s) {missing}")

    # filter games up to target week (boolean indexing already returns a new frame)
    through_week = df.loc[df["week"] <= up_to_week]

    # mean per team; column names are kept as-is, so no rename step is needed
    return through_week.groupby("team")[stats_cols].mean().reset_index()

In [11]:
# Create cumulative avg dataframe for weeks 1-3
# NOTE(review): up_to_week is hard-coded to 3, so these per-team averages only ever
# cover weeks 1-3 regardless of which week is modelled later — confirm this is intended.
cumulative_df = cumulative_avg(team_stats, up_to_week=3)

In [12]:
# Convert all team names to abbreviations
# Each abbreviation maps to the name variants that should resolve to it.
TEAM_ABBR = {
    "ARI": ["Arizona Cardinals", "Arizona"],
    "ATL": ["Atlanta Falcons", "Atlanta"],
    "BAL": ["Baltimore Ravens", "Baltimore"],
    "BUF": ["Buffalo Bills", "Buffalo"],
    "CAR": ["Carolina Panthers", "Carolina"],
    "CHI": ["Chicago Bears", "Chicago"],
    "CIN": ["Cincinnati Bengals", "Cincinnati"],
    "CLE": ["Cleveland Browns", "Cleveland"],
    "DAL": ["Dallas Cowboys", "Dallas"],
    "DEN": ["Denver Broncos", "Denver"],
    "DET": ["Detroit Lions", "Detroit"],
    "GB": ["Green Bay Packers", "Green Bay"],
    "HOU": ["Houston Texans", "Houston"],
    "IND": ["Indianapolis Colts", "Indianapolis"],
    "JAX": ["Jacksonville Jaguars", "Jacksonville"],
    "KC": ["Kansas City Chiefs", "Kansas City"],
    "LV": ["Las Vegas Raiders", "Las Vegas"],
    "LAC": ["Los Angeles Chargers", "LA Chargers"],
    "LA": ["Los Angeles Rams", "LA Rams"],
    "MIA": ["Miami Dolphins", "Miami"],
    "MIN": ["Minnesota Vikings", "Minnesota"],
    "NE": ["New England Patriots", "New England"],
    "NO": ["New Orleans Saints", "New Orleans"],
    "NYG": ["New York Giants", "NY Giants"],
    "NYJ": ["New York Jets", "NY Jets"],
    "PHI": ["Philadelphia Eagles", "Philadelphia"],
    "PIT": ["Pittsburgh Steelers", "Pittsburgh"],
    "SF": ["San Francisco 49ers", "San Francisco"],
    "SEA": ["Seattle Seahawks", "Seattle"],
    "TB": ["Tampa Bay Buccaneers", "Tampa Bay"],
    "TEN": ["Tennessee Titans", "Tennessee"],
    "WAS": ["Washington Commanders", "Washington"],
}

# Reverse lookup built once: every name variant -> its abbreviation
NAME_TO_ABBR = dict(
    (variant, code)
    for code, variants in TEAM_ABBR.items()
    for variant in variants
)

# standardize team names in scraped dfs
def _to_team_abbr(names):
    """Map a Series of scraped team names to abbreviations.

    Values that are already valid abbreviations pass through unchanged, so this
    cell can be re-run safely. Names with no known mapping become NaN and are
    reported, because the later inner merges would otherwise drop them silently.
    """
    mapped = names.map(NAME_TO_ABBR)
    already_abbr = names.where(names.isin(list(TEAM_ABBR)))
    result = mapped.where(mapped.notna(), already_abbr)

    unmapped = names[result.isna() & names.notna()].unique().tolist()
    if unmapped:
        print(f"WARNING: no abbreviation known for team name(s): {unmapped}")
    return result


def_epa_df = (
    def_epa_df
    .assign(team=_to_team_abbr(def_epa_df["team"]))
    .rename(columns={"EPA/Play": "def_epa_per_play"})
)

# On a re-run the 'Team' column is already gone and 'team' already holds abbreviations.
if "Team" in merged_third_down_df.columns:
    merged_third_down_df = (
        merged_third_down_df
        .assign(team=_to_team_abbr(merged_third_down_df["Team"]))
        .drop(columns=["Team"])
    )

merged_third_down_df = merged_third_down_df.rename(columns={
    "2025_Offense": "off_third_down_pct",
    "2025_Defense": "def_third_down_pct"
})

In [13]:
# Merge all three DataFrames on 'team' (inner joins: only teams present in every frame survive)
all_stats = pd.merge(cumulative_df, def_epa_df, on="team", how="inner")
all_stats = pd.merge(all_stats, merged_third_down_df, on="team", how="inner")

In [14]:
# Create matchup features, merge with game schedule

# Team stats joined onto each game twice: once for the home side, once for the away side
home_stats = all_stats.add_suffix("_home")
away_stats = all_stats.add_suffix("_away")
games = schedule.merge(home_stats, left_on="home_team", right_on="team_home")
games = games.merge(away_stats, left_on="away_team", right_on="team_away")

# Create difference features: home - away
stats = ["off_epa_per_play", "points_for", "points_against", "turnover_diff", "def_epa_per_play", "off_third_down_pct", "def_third_down_pct"]
diff_features = {
    f'{col}_diff': games[f'{col}_home'] - games[f'{col}_away']
    for col in stats
}

# Add the differences, then drop the redundant join keys
games = games.assign(**diff_features).drop(columns=["team_home", "team_away"])

In [15]:
# Define features and labels
# Features are every column whose name ends in "_diff"; the label is home_win.
features = games.columns[games.columns.str.endswith("_diff")].tolist()
X, y = games[features], games["home_win"]

In [16]:
# Train/test split: weeks before the current week train, the current week is held out

current_week = nfl.get_current_week()

# .copy() makes both frames independent of `games`, so prediction columns can be
# added to test_df later without pandas raising SettingWithCopyWarning.
train_df = games[games["week"] < current_week].copy()
test_df = games[games["week"] == current_week].copy()

X_train, y_train = train_df[features], train_df["home_win"]
X_test, y_test = test_df[features], test_df["home_win"]

**Random Forest Classifier**

In [17]:
# Fit Random Forest model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Build and train the classifier in one step (fit() returns the fitted estimator)
rfc = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
).fit(X_train, y_train)

# Hard predictions for the held-out games
y_pred = rfc.predict(X_test)

# Second probability column, used as the probability that the home team wins
y_proba = rfc.predict_proba(X_test)[:, 1]

In [None]:
# Share of held-out games whose predicted winner matches the recorded result
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Week {current_week} Test Accuracy: {acc:.3f}")

In [None]:
# Attach the Random Forest predictions to the held-out games.
# assign() returns a new frame, which avoids SettingWithCopyWarning when test_df
# is a slice of `games`.
test_df = test_df.assign(
    RFC_pred_home_win=y_pred,
    RFC_pred_home_win_prob=y_proba,
)

import os  # also needed by the export cells further down

# Random-Forest-only view of the predictions
results_df = test_df[[
    "week", "home_team", "away_team", "home_win", "RFC_pred_home_win", "RFC_pred_home_win_prob"
]]

csv_path_games = "game_predictions.csv"

# Nothing is written to csv_path_games here on purpose: the final export cell
# appends the same games to this same file with the XGBoost columns included.
# Writing the 6-column rows here as well duplicated every game and left the CSV
# with rows of differing column counts.

**XGBoost Classifier**

In [None]:
import xgboost as xgb

model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.1,
    eval_metric='logloss',  # set explicitly; per the original note this avoids warnings in recent XGBoost
    random_state=42,        # same seed as the Random Forest so reruns are comparable
)

# Train
model.fit(X_train, y_train)

# Predict: hard labels and the second probability column (home team wins)
y_pred_xg = model.predict(X_test)
probs_xg = model.predict_proba(X_test)[:, 1]

# assign() returns a new frame, avoiding SettingWithCopyWarning on a sliced test_df
test_df = test_df.assign(
    XGB_pred_home_win=y_pred_xg,
    XGB_pred_home_win_prob=probs_xg,
)

# Accuracy
xgb_acc = accuracy_score(y_test, y_pred_xg)
print(f"Week {current_week} Test Accuracy: {xgb_acc:.3f}")

**Exporting Results**

In [22]:
# Export weekly accuracy to CSV
# One-row frame holding this week's accuracy for both models
new_row = pd.DataFrame({
    "week": [current_week],
    "rfc_acc": [acc],
    "xgb_acc": [xgb_acc],
})

csv_path = "weekly_model_accuracy.csv"

# First write creates the file with a header; later writes append rows only
is_new_file = not os.path.exists(csv_path)
new_row.to_csv(
    csv_path,
    mode="w" if is_new_file else "a",
    header=is_new_file,
    index=False,
)

In [29]:
# Export game-by-game predictions to CSV
prediction_cols = [
    "week", "home_team", "away_team", "home_win",
    "RFC_pred_home_win", "RFC_pred_home_win_prob",
    "XGB_pred_home_win", "XGB_pred_home_win_prob",
]
results_df = test_df[prediction_cols]
csv_path_games = "game_predictions.csv"

# Create the file with a header the first time, otherwise append rows only
is_new_file = not os.path.exists(csv_path_games)
results_df.to_csv(
    csv_path_games,
    mode="w" if is_new_file else "a",
    header=is_new_file,
    index=False,
)