In [1]:
import pandas as pd
import numpy as np
import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
print(f"Project Root: {project_root}")
print("Sys Path Before:", sys.path)
if project_root not in sys.path:
    print("Inserting project root to sys.path")
    sys.path.insert(0, project_root)

# Now import
from data_api import SportsDataIO, Yahoo, PFR
import utils
import yahoo_helpers

from dotenv import load_dotenv
load_dotenv()

Project Root: c:\Users\bengu\Documents\Sports Analysis Project\clairvoyent-raven-sports-analysis\src
Sys Path Before: ['C:\\Users\\bengu\\AppData\\Local\\Programs\\Python\\Python310\\python310.zip', 'C:\\Users\\bengu\\AppData\\Local\\Programs\\Python\\Python310\\DLLs', 'C:\\Users\\bengu\\AppData\\Local\\Programs\\Python\\Python310\\lib', 'C:\\Users\\bengu\\AppData\\Local\\Programs\\Python\\Python310', 'c:\\Users\\bengu\\.virtualenvs\\cfeproj-oIABPDjj', '', 'c:\\Users\\bengu\\.virtualenvs\\cfeproj-oIABPDjj\\lib\\site-packages', 'c:\\Users\\bengu\\.virtualenvs\\cfeproj-oIABPDjj\\lib\\site-packages\\win32', 'c:\\Users\\bengu\\.virtualenvs\\cfeproj-oIABPDjj\\lib\\site-packages\\win32\\lib', 'c:\\Users\\bengu\\.virtualenvs\\cfeproj-oIABPDjj\\lib\\site-packages\\Pythonwin']
Inserting project root to sys.path


True

## Import and clean data from nflverse source

In [2]:
NFLVERSE_DATA_PATH = r"C:\Users\bengu\Documents\Sports Analysis Project\clairvoyent-raven-sports-analysis\data\nfl_player_stats_cleaned.csv"

all_players_df = pd.read_csv(NFLVERSE_DATA_PATH)

In [3]:
column_categories = {
    "standard": {
        "player_id",
        "player_name",
        "position",
        "position_group",
        "season",
        "week",
        "season_type",
        "team",
        "opponent_team",
        "pacr",
    },
    "passing": {
        "completions",
        "attempts",
        "passing_yards",
        "passing_tds",
        "passing_interceptions",
        "sacks_suffered",
        "sack_yards_lost",
        "sack_fumbles",
        "sack_fumbles_lost",
        "passing_air_yards",
        "passing_yards_after_catch",
        "passing_first_downs",
        "passing_epa",
        "passing_cpoe",
        "passing_2pt_conversions",
        "carries", # Include rushing stats for QBs
        "rushing_yards",
        "rushing_tds",
        "rushing_fumbles",
        "rushing_fumbles_lost",
        "rushing_first_downs",
        "rushing_epa",
        "rushing_2pt_conversions",
    },
    "rushing_and_receiving": {
        "carries",
        "rushing_yards",
        "rushing_tds",
        "rushing_fumbles",
        "rushing_fumbles_lost",
        "rushing_first_downs",
        "rushing_epa",
        "rushing_2pt_conversions",
        "receptions",
        "targets",
        "receiving_yards",
        "receiving_tds",
        "receiving_fumbles",
        "receiving_fumbles_lost",
        "receiving_air_yards",
        "receiving_yards_after_catch",
        "receiving_first_downs",
        "rushing_epa",
        "rushing_2pt_conversions",
        "receptions",
        "targets",
        "receiving_yards",
        "receiving_tds",
        "receiving_fumbles",
        "receiving_fumbles_lost",
        "receiving_air_yards",
        "receiving_yards_after_catch",
        "receiving_first_downs",
        "receiving_epa",
        "receiving_2pt_conversions",
        "racr",
        "target_share",
        "air_yards_share",
        "wopr",
    },
    "defense": {
        "special_teams_tds",
        "def_tackles_solo",
        "def_tackles_with_assist",
        "def_tackle_assists",
        "def_tackles_for_loss",
        "def_tackles_for_loss_yards",
        "def_fumbles_forced",
        "def_sacks",
        "def_sack_yards",
        "def_qb_hits",
        "def_interceptions",
        "def_interception_yards",
        "def_pass_defended",
        "def_tds",
        "def_fumbles",
        "def_safeties",
        "misc_yards",
        "fumble_recovery_own",
        "fumble_recovery_yards_own",
        "fumble_recovery_opp",
        "fumble_recovery_yards_opp",
        "fumble_recovery_tds",
        "penalties",
    },
    "kicking": {
        "fg_made",
        "fg_att",
        "fg_missed",
        "fg_blocked",
        "fg_long",
        "fg_pct",
        "fg_made_0_19",
        "fg_made_20_29",
        "fg_made_30_39",
        "fg_made_40_49",
        "fg_made_50_59",
        "fg_made_60_",
        "fg_missed_0_19",
        "fg_missed_20_29",
        "fg_missed_30_39",
        "fg_missed_40_49",
        "fg_missed_50_59",
        "fg_missed_60_",
        "fg_made_list",
        "fg_missed_list",
        "fg_blocked_list",
        "fg_made_distance",
        "fg_missed_distance",
        "fg_blocked_distance",
        "pat_made",
        "pat_att",
        "pat_missed",
        "pat_blocked",
        "pat_pct",
        "gwfg_made",
        "gwfg_att",
        "gwfg_missed",
        "gwfg_blocked",
        "gwfg_distance",
    },
    "special_teams": {
        "punt_returns",
        "punt_return_yards",
        "kickoff_returns",
        "kickoff_return_yards",
    },
}

categories_positions = {
    "passing": ["QB"],
    "rushing_and_receiving": ["RB", "WR"],
    # Not available "kicking": ["K"] 
}


In [4]:
# Establish dataframes for each positional group, filter by position value
def filter_by_positional_group(df, category_key_a, category_key_b="standard") -> pd.DataFrame:
    new_df = df.loc[:, list(column_categories[category_key_a] | column_categories[category_key_b])]
    new_df = new_df[new_df["position"].isin(categories_positions[category_key_a])]
    return new_df.reset_index(drop=True)

# Data of interest
passing_df = filter_by_positional_group(all_players_df, "passing")
rushing_and_receiving_df = filter_by_positional_group(all_players_df, "rushing_and_receiving")

# Save in case
defense_df = all_players_df.loc[:, list(column_categories["standard"] | column_categories["defense"])]
kicking_df = all_players_df.loc[:, list(column_categories["standard"] | column_categories["kicking"])]
specials_teams_df = all_players_df.loc[:, list(column_categories["standard"] | column_categories["special_teams"])]

# Give the dataframes each an alias
passing_df.alias = "passing_df"
rushing_and_receiving_df.alias = "rushing_and_receiving_df"
defense_df.alias = "defense_df"
kicking_df.alias = "kicking_df"
specials_teams_df.alias = "specials_teams_df"

In [None]:
for df in [passing_df, rushing_and_receiving_df, defense_df, kicking_df, specials_teams_df]:
    null_values = 0
    print(f"Checking {df.alias} for null values")
    null_counts = df.isna().sum()

    for column, count in zip(null_counts.index, null_counts):
        if count != 0:
            null_values += 1
            print(f"Column: {column} contains {count} null values.")
    
    print(f"{df.alias} contains {null_values} null values.")

df = None

Checking passing_df for null values
passing_df contains 0 null values.
Checking rushing_and_receiving_df for null values
rushing_and_receiving_df contains 0 null values.
Checking defense_df for null values
defense_df contains 0 null values.
Checking kicking_df for null values
kicking_df contains 0 null values.
Checking specials_teams_df for null values
specials_teams_df contains 0 null values.


### Implement and Apply Model

Implement multiple linear regression models to predict multiple statistics for rushers and receivers. 

* Define the targets: rushing_yards, rushing_tds, receiving_yards, receiving_tds

In [7]:
rushing_and_receiving_df.columns

Index(['season_type', 'rushing_tds', 'receiving_epa', 'rushing_yards',
       'rushing_fumbles_lost', 'position_group', 'position',
       'receiving_2pt_conversions', 'team', 'season', 'racr', 'rushing_epa',
       'week', 'opponent_team', 'rushing_fumbles', 'air_yards_share',
       'receiving_tds', 'receiving_first_downs', 'wopr',
       'receiving_yards_after_catch', 'player_id', 'receiving_air_yards',
       'target_share', 'pacr', 'carries', 'targets', 'receiving_yards',
       'receiving_fumbles_lost', 'receiving_fumbles', 'rushing_first_downs',
       'player_name', 'rushing_2pt_conversions', 'receptions'],
      dtype='object')

In [None]:
"""
Collect xs, and ys.
Normalize data using Z-score normalization
(val - mean_val) / standard_deviation

Create 4 buckets, one for each statistical category.

Prefix   | Stat Category
-------------------------
rsh_yd_ -> rushing_yards
rsh_td_ -> rushing_tds
rc_yd_  -> receiving_yards
rc_td_  -> receiving_tds
"""

inputs = {
    "rsh_yd": ["rushing_tds", "rushing_yards", "rushing_fumbles_lost", "racr", "rushing_epa", "rushing_fumbles", "carries", "pacr", "rushing_first_downs", "rushing_2pt_conversions"],
    "rsh_td": ["rushing_tds", "rushing_yards", "rushing_fumbles_lost", "racr", "rushing_epa", "rushing_fumbles", "carries", "pacr", "rushing_first_downs", "rushing_2pt_conversions"],
    "rc_yd" : ["receiving_epa", "receiving_2pt_conversions", "racr", "air_yards_share", "receiving_tds", "receiving_first_downs", "wopr", "receiving_yards_after_catch", "receiving_air_yards", "target_share", "pacr", "targets", "receiving_yards", "receiving_fumbles_lost", "receiving_fumbles_lost", "receptions"],
    "rc_td" : ["receiving_epa", "receiving_2pt_conversions", "racr", "air_yards_share", "receiving_tds", "receiving_first_downs", "wopr", "receiving_yards_after_catch", "receiving_air_yards", "target_share", "pacr", "targets", "receiving_yards", "receiving_fumbles_lost", "receiving_fumbles_lost", "receptions"],
}

# Stat 1 rushing_yards
rsh_yd_X = rushing_and_receiving_df[inputs["rsh_yd"] + ["season", "week", "player_id"]]
# Stat 2 rushing_tds
rsh_td_X = rushing_and_receiving_df[inputs["rsh_td"] + ["season", "week", "player_id"]]
# Stat 3 receiving_yards
rc_yd_X = rushing_and_receiving_df[inputs["rc_yd"] + ["season", "week", "player_id"]]
# Stat 4 receiving_tds
rc_td_X = rushing_and_receiving_df[inputs["rc_td"] + ["season", "week", "player_id"]]

In [26]:
# Collect rolling data by season-week
rsh_yd_X = rsh_yd_X.sort_values(["season", "week", "player_id"])

# Group by season + player_id, then apply rolling mean over 3 weeks
rsh_yd_X[[f"{c}_roll3_shift" for c in inputs["rsh_yd"]]] = (
    rsh_yd_X.groupby(["season", "player_id"])[inputs["rsh_yd"]]
      .transform(lambda x: x.rolling(3, min_periods=1).mean().shift(1))
)


In [None]:
# Normalize the data using Z-Score Standardization (sklearn StandardScaler)
from sklearn.preprocessing import StandardScaler

rsh_yd_input_cols = [col + "_roll3_shift" for col in inputs["rsh_yd"]]

standard_scaler = StandardScaler()
# Normalize the shifted rolling values only
rsh_yd_X[rsh_yd_input_cols] = standard_scaler.fit_transform(rsh_yd_X[rsh_yd_input_cols])


In [38]:
# Create and apply the linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


X = rsh_yd_X.dropna()[rsh_yd_input_cols].reset_index(drop=True)
y = rsh_yd_X.dropna()["rushing_yards"].reset_index(drop=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

reg = LinearRegression().fit(X_train, y_train)

reg.score(X_test, y_test)


0.5834541796570684

In [49]:
import random

test_sample_indices = random.choices(range(len(X_test)), k=10)

X_samples, y_samples = X_test.reset_index(drop=True).loc[test_sample_indices, :], y_test.reset_index(drop=True).loc[test_sample_indices]

In [None]:
preds = reg.predict(X_samples)

In [63]:
for pred, true in zip(preds, y_samples.values):
    diff = pred - true

    print(f"Pred: {pred}, True: {true}, Diff: {diff}")

Pred: 3.858858365850292, True: -6, Diff: 9.858858365850292
Pred: 6.438288878236238, True: 0, Diff: 6.438288878236238
Pred: 1.6895984991781265, True: 0, Diff: 1.6895984991781265
Pred: 14.203571704841565, True: 22, Diff: -7.796428295158435
Pred: 1.7389517302775452, True: 0, Diff: 1.7389517302775452
Pred: 1.7389517302775452, True: 0, Diff: 1.7389517302775452
Pred: 6.594065701418668, True: 13, Diff: -6.405934298581332
Pred: 18.52405045787305, True: 16, Diff: 2.5240504578730487
Pred: 19.196999617307682, True: 5, Diff: 14.196999617307682
Pred: 1.7389517302775452, True: 0, Diff: 1.7389517302775452
