In [2]:
# Import libraries
import matplotlib.pyplot as plt
import nfl_data_py as nfl_data
import nflreadpy as nfl
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Load NFL data for the 2025 season
season = 2025
schedule = nfl.load_schedules(seasons=[season])
schedule = schedule.to_pandas()

# Select only regular season games, create target variable.
# Equality is required here: an ordered string comparison (`<= 'REG'`) also
# keeps every game_type code that sorts alphabetically before 'REG'.
games = schedule[schedule['game_type'] == 'REG'].copy()
# 1 if the home team outscored the away team, 0 otherwise.
# NOTE(review): ties and rows whose scores are missing (e.g. games not yet
# played) also evaluate to 0 here - confirm whether they should be dropped.
games["winner"] = (games["home_score"] > games["away_score"]).astype(int)
team_stats = nfl.load_team_stats([season])
team_stats = team_stats.to_pandas()

In [None]:
# Add column for offensive EPA per play
# Pass plays are counted as pass attempts plus sacks taken; rush plays as carries.
team_stats["pass_plays"] = team_stats["attempts"] + team_stats["sacks_suffered"]
team_stats["rush_plays"] = team_stats["carries"]
team_stats["total_plays"] = team_stats["pass_plays"] + team_stats["rush_plays"]
# passing_epa / rushing_epa are summed EPA totals per the nflverse stats data
# dictionary (not per-play rates), so EPA per play is the combined total divided
# by the play count. Weighting the totals by play counts again would inflate them.
# NOTE(review): confirm against the data dictionary for the installed nflreadpy version.
team_stats["off_epa_per_play"] = (
    team_stats["passing_epa"] + team_stats["rushing_epa"]
) / team_stats["total_plays"]

In [None]:
# Add columns points for and against to team_stats
score_cols = ["season", "week", "home_team", "away_team", "home_score", "away_score"]
points_cols = ["points_for", "points_against"]

# One row per game from the home team's perspective
home = schedule[score_cols].copy()
home["team"] = home["home_team"]
home["points_for"] = home["home_score"]
home["points_against"] = home["away_score"]

# One row per game from the away team's perspective
away = schedule[score_cols].copy()
away["team"] = away["away_team"]
away["points_for"] = away["away_score"]
away["points_against"] = away["home_score"]

# Stack both perspectives so every (season, week, team) appears once
schedule_long = pd.concat([home, away], ignore_index=True)

# Merge into team_stats.
# Dropping any existing points columns first keeps this cell idempotent:
# without it, re-running the cell would merge a second time and produce
# points_for_x / points_for_y instead of the expected column names.
# validate="many_to_one" raises if schedule_long ever has duplicate keys,
# which would otherwise silently duplicate team_stats rows.
team_stats = team_stats.drop(columns=points_cols, errors="ignore").merge(
    schedule_long[["season", "week", "team"] + points_cols],
    on=["season", "week", "team"],
    how="left",
    validate="many_to_one",
)

In [25]:
# Turnover differential = defensive takeaways - offensive giveaways

# Giveaways: interceptions thrown plus fumbles lost on rushes, receptions and sacks
giveaways = (
    team_stats["passing_interceptions"]
    .add(team_stats["rushing_fumbles_lost"])
    .add(team_stats["receiving_fumbles_lost"])
    .add(team_stats["sack_fumbles_lost"])
)
team_stats["giveaways"] = giveaways

# Takeaways: interceptions made plus opponent fumbles recovered by this team
takeaways = team_stats["def_interceptions"].add(team_stats["fumble_recovery_opp"])
team_stats["takeaways"] = takeaways

# Positive values mean the team took the ball away more often than it gave it up
team_stats["turnover_diff"] = team_stats["takeaways"].sub(team_stats["giveaways"])

In [43]:
# Web scrape defensive EPA/play from https://sumersports.com/teams/defensive/
# (requests and BeautifulSoup are imported in the notebook's import cell.)
url = "https://sumersports.com/teams/defensive/"

# Bound the wait and fail loudly on an HTTP error instead of parsing an error page
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Find the table containing the defensive stats (first <table> on the page)
table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> element found at {url}")

# Column names come from every <th> cell in the table
headers = [th.get_text(strip=True) for th in table.find_all('th')]

# Extract rows
rows = []
for tr in table.find_all('tr')[1:]:  # Skip the header row
    cols = [td.get_text(strip=True) for td in tr.find_all('td')]
    if cols:  # Ensure the row is not empty
        rows.append(cols)

# Create a DataFrame
df = pd.DataFrame(rows, columns=headers)
# Strip a leading "<number>. " prefix from the team names
df['team'] = df['Team'].str.replace(r'^\d+\.\s*', '', regex=True)
# Values that cannot be parsed as numbers become NaN rather than raising
df['EPA/Play'] = pd.to_numeric(df['EPA/Play'], errors='coerce')

def_epa_df = df[['team', 'EPA/Play']].copy()


In [45]:
# Web scrape 3rd down conversion % (offense and defense) from https://www.teamrankings.com/
# Maps a short stat key to the page that lists that stat.
urls = dict(
    third_down='https://www.teamrankings.com/nfl/stat/third-down-conversion-pct',
    opponent_third_down='https://www.teamrankings.com/nfl/stat/opponent-third-down-conversion-pct',
)

# Function to scrape data from a given URL
def scrape_data(url, season_col='2025', timeout=30):
    """Scrape the first HTML table at ``url`` and return two of its columns.

    Parameters
    ----------
    url : str
        Page to download.
    season_col : str, default '2025'
        Header of the value column to return alongside 'Team'.
    timeout : float, default 30
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns ['Team', season_col]. Cell values are the raw text from the
        page; they are not converted to numbers here.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no <table> element.
    KeyError
        If the table lacks a 'Team' or ``season_col`` column.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of trying to parse an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the stats (first <table> on the page)
    table = soup.find('table')
    if table is None:
        raise ValueError(f"No <table> element found at {url}")

    # Extract headers
    headers = [th.get_text(strip=True) for th in table.find_all('th')]

    # Extract rows
    rows = []
    for tr in table.find_all('tr')[1:]:  # Skip the header row
        cols = [td.get_text(strip=True) for td in tr.find_all('td')]
        if cols:  # Ensure the row is not empty
            rows.append(cols)

    # Create a DataFrame
    df = pd.DataFrame(rows, columns=headers)

    wanted = ['Team', season_col]
    missing = [col for col in wanted if col not in df.columns]
    if missing:
        raise KeyError(f"Columns {missing} not found in table at {url}; got {list(df.columns)}")

    # Clean up the team names (remove surrounding whitespace)
    df['Team'] = df['Team'].str.strip()

    return df[wanted]

# Pull the offensive and the opponent (defensive) third-down tables
third_down_df = scrape_data(urls['third_down'])
opponent_third_down_df = scrape_data(urls['opponent_third_down'])

# Join the two tables on 'Team'; the shared value column is disambiguated
# with the _Offense / _Defense suffixes
merged_df = third_down_df.merge(
    opponent_third_down_df,
    on='Team',
    suffixes=('_Offense', '_Defense'),
)

# Show the combined table
print(merged_df)


             Team 2025_Offense 2025_Defense
0           Miami       54.29%       52.63%
1       Green Bay       53.70%       32.73%
2   San Francisco       48.08%       32.00%
3         Chicago       45.45%       29.27%
4     LA Chargers       43.40%       38.33%
5          Dallas       42.55%       58.18%
6         Buffalo       42.00%       42.00%
7    Philadelphia       41.82%       36.96%
8         Atlanta       41.82%       34.09%
9     New England       41.67%       39.13%
10        Detroit       41.51%       41.67%
11     Cincinnati       41.03%       46.34%
12      Baltimore       40.91%       41.94%
13        LA Rams       40.43%       34.00%
14   Indianapolis       39.58%       40.00%
15    Kansas City       39.29%       34.04%
16        Seattle       39.13%       38.98%
17     Pittsburgh       38.64%       41.82%
18        Arizona       38.46%       34.62%
19       Carolina       38.18%       37.21%
20      Cleveland       37.70%       44.83%
21      Tampa Bay       37.04%  

In [None]:
# Create dataframe with rolling averages of relevant features