In [2]:
import os
import sys
import numpy as np
import pandas as pd

from dotenv import load_dotenv 
load_dotenv()

# -----------------------------------------------------------------------------
# Project path setup
# -----------------------------------------------------------------------------
# The project root is taken to be the parent of the current working directory;
# it is put on sys.path so the internal modules imported below can be found.
_cwd_parent = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(_cwd_parent)
print(f"Project Root: {project_root}")
print("Sys Path Before:", sys.path)
# Prepend only once so that re-running the cell does not add duplicates.
if not (project_root in sys.path):
    print("Inserting project root to sys.path")
    sys.path.insert(0, project_root)

# Now import internal modules
import utils
from pipelines import linear_regression_pipeline_v1 as pipeline
from data_api import NFLDataPy

# -----------------------------------------------------------------------------
# Constants / Config
# -----------------------------------------------------------------------------
# Lookup tables re-exported from the project's `utils` module.
COLUMN_CATEGORIES = utils.STATISTICAL_COLUMNS_BY_CATEGORY
TARGET_INPUTS = utils.TARGETS_TO_INPUTS

# Window length for rolling features (presumably in games/weeks; confirm against the pipeline).
ROLLING_PERIOD = 4

# Positions considered for each statistical category (kicking not available).
CATEGORIES_POSITIONS = dict(
    passing=["QB"],
    rushing_and_receiving=["RB", "WR", "TE", "QB"],
)

# Location of the cached combined frame; None when the variable is not defined.
COMBINED_DF_PATH = os.environ.get("COMBINED_DATA_FRAME_PATH")

# -----------------------------------------------------------------------------
# Load Persistent DataFrames
# -----------------------------------------------------------------------------
print("Loading base data frames...")
nfl_data = NFLDataPy()
years = [2024]
# NOTE(review): the raw loaders below are disabled, yet later cells pass
# all_players_df / all_teams_df / injuries_df / depth_df to the pipeline.
# Those cells only work if the frames already exist in the kernel.
# all_players_df = nfl_data.load_player_stats(years)
# all_teams_df = nfl_data.load_team_stats(years)
# injuries_df = nfl_data.load_injuries(years)
# depth_df = nfl_data.load_depth_charts(years)

# Fail with an actionable message instead of pandas' generic
# "Invalid file path or buffer object type" when the env var is missing.
if COMBINED_DF_PATH is None:
    raise ValueError(
        "COMBINED_DATA_FRAME_PATH is not set; define it in the environment or "
        "the .env file so the cached combined data frame can be loaded."
    )
combined_df = pd.read_csv(COMBINED_DF_PATH)
print("-" * 40)
print("\n")

Project Root: c:\Users\bengu\Documents\NFL Data Project\clairvoyent-raven-sports-analysis\src
Sys Path Before: ['c:\\Users\\bengu\\Documents\\NFL Data Project\\clairvoyent-raven-sports-analysis\\src', 'C:\\Users\\bengu\\AppData\\Local\\Programs\\Python\\Python310\\python310.zip', 'C:\\Users\\bengu\\AppData\\Local\\Programs\\Python\\Python310\\DLLs', 'C:\\Users\\bengu\\AppData\\Local\\Programs\\Python\\Python310\\lib', 'C:\\Users\\bengu\\AppData\\Local\\Programs\\Python\\Python310', 'c:\\Users\\bengu\\.virtualenvs\\cfeproj-oIABPDjj', '', 'c:\\Users\\bengu\\.virtualenvs\\cfeproj-oIABPDjj\\lib\\site-packages', 'c:\\Users\\bengu\\.virtualenvs\\cfeproj-oIABPDjj\\lib\\site-packages\\win32', 'c:\\Users\\bengu\\.virtualenvs\\cfeproj-oIABPDjj\\lib\\site-packages\\win32\\lib', 'c:\\Users\\bengu\\.virtualenvs\\cfeproj-oIABPDjj\\lib\\site-packages\\Pythonwin']
Project Root: c:\Users\bengu\Documents\NFL Data Project\clairvoyent-raven-sports-analysis\src
Sys Path Before: ['c:\\Users\\bengu\\Document

In [36]:
from collections import defaultdict
# Columns to standardise: every "<prefix> <target>" combination, target-major.
targets = [
    f"{prefix} {target}"
    for target in utils.TARGET_TRANSLATION.values()
    for prefix in ["True", "Projected", "Average", "STD"]
]
descriptive_stats = defaultdict(dict)

# Build the grouping once, before any column is added, and reuse it for every
# statistic instead of re-grouping the frame twice per column.
by_position = combined_df.groupby("position")

for col in targets:
    descriptive_stats[col]["mean"] = by_position[col].mean()
    descriptive_stats[col]["std"] = by_position[col].std()


for col in targets:
    means = descriptive_stats[col]["mean"]                     # Series indexed by position
    stds = descriptive_stats[col]["std"].replace(0, np.nan)    # zero spread -> NaN, avoids div-by-0

    # Broadcast the per-position statistics back onto every row.
    mu = combined_df["position"].map(means)
    sig = combined_df["position"].map(stds)

    z = (combined_df[col] - mu) / sig
    # Rows with an undefined z-score (NaN position, NaN/zero std, NaN value)
    # are reported as 0. fillna does not touch +/-inf values.
    combined_df[f"{col}_z-score"] = z.fillna(0)

In [39]:
# "Unnamed: 0" is presumably the index column written by an earlier to_csv()
# call and read back as data. errors="ignore" keeps this cell re-runnable once
# the column is already gone.
combined_df = combined_df.drop(columns="Unnamed: 0", errors="ignore")
# index=False stops the positional index from being written, so the file no
# longer gains a fresh "Unnamed: 0" column on the next read_csv().
combined_df.to_csv("combined_data_frame.csv", index=False)

In [13]:
# NOTE(review): all_players_df, all_teams_df, injuries_df and depth_df are only
# produced by loaders that are commented out in the first cell; confirm they
# exist in the kernel before running this cell.
print("Running data pipeline...")
target_data_struct, target_input_cols = pipeline.run_pipeline(all_players_df, all_teams_df, injuries_df, depth_df)
print("-" * 40)

# Resolve the saved-weights directory without a machine-specific absolute path:
# prefer SAVED_WEIGHTS_PATH, otherwise fall back to <project_root>/notebooks/models,
# which is the location the previous hard-coded path pointed at.
saved_weights_dir = os.getenv("SAVED_WEIGHTS_PATH") or os.path.join(project_root, "notebooks", "models")
results, trues, predictions = pipeline.test_model(target_data_struct, target_input_cols, saved_weights_dir)

Running data pipeline...
Filtering and merging injuries and depth charts...
----------------------------------------


Generating positional dataframes...
Generated positional dataframes with 32265 rows and 352352 rows
----------------------------------------


Generating target data structure and input columns by statistical category...
----------------------------------------


Engineering features for cumulative and rolling data...
----------------------------------------


----------------------------------------




In [8]:
from functools import reduce

def assemble_combined_df(
    target_data_struct: dict[str, pd.DataFrame],
    trues: dict[str, pd.Series | pd.DataFrame],
    predictions: dict[str, pd.Series | pd.DataFrame],
) -> pd.DataFrame:
    """
    Build one wide table with per-target True/Projected columns.
    - Merges on keys (prefers ['player_id','season','week'] if present, else just 'player_id')
    - Keeps player_name and position (joined from a meta table)
    - Fills NA in metric columns with 0 (does not touch text/meta)
    """
    # Collect metric frames (only keys + 2 metric columns per target)
    metric_frames: list[pd.DataFrame] = []
    # Collect meta (to attach player_name/position once at the end)
    meta_frames: list[pd.DataFrame] = []

    for target, df in target_data_struct.items():
        if target == "def" or df is None or not isinstance(df, pd.DataFrame):
            continue

        # Determine merge keys available in this df
        preferred_keys = ["player_id", "season", "week"]
        keys = [k for k in preferred_keys if k in df.columns]
        if not keys:  # Fallback to player_id only if absolutely necessary
            if "player_id" in df.columns:
                keys = ["player_id"]
            else:
                # Can't merge without an id; skip this target
                print(f"[assemble] Skipping '{target}' (no merge key present).")
                continue

        # Build a small frame with keys and per-target metrics
        tmp = df[keys].copy()

        # Align trues/preds to df rows (works if Series indexed like df or just reindexes)
        true_series = trues.get(target, pd.Series(index=df.index, dtype="float64"))
        pred_series = predictions.get(target, pd.Series(index=df.index, dtype="float64"))

        # If they are DataFrames with a single column, squeeze to Series
        if isinstance(true_series, pd.DataFrame) and true_series.shape[1] == 1:
            true_series = true_series.iloc[:, 0]
        if isinstance(pred_series, pd.DataFrame) and pred_series.shape[1] == 1:
            pred_series = pred_series.iloc[:, 0]

        tmp[f"True {target}"] = pd.Series(true_series).reindex(df.index).to_numpy()
        tmp[f"Projected {target}"] = pd.Series(pred_series).reindex(df.index).to_numpy()

        metric_frames.append(tmp)

        # Stash meta once per df (we’ll dedupe later)
        keep_meta = [c for c in ["player_id", "season", "week", "player_display_name", "position"] if c in df.columns]
        if keep_meta:
            meta_frames.append(df[keep_meta].copy())

    if not metric_frames:
        return pd.DataFrame(columns=["player_id", "season", "week", "player_display_name", "position"])

    # Merge all metric frames with an outer join on the available keys
    def _merge(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
        # Intersect on keys both frames share (at least player_id)
        shared_keys = [k for k in ["player_id", "season", "week"] if k in left.columns and k in right.columns]
        if not shared_keys and "player_id" in left.columns and "player_id" in right.columns:
            shared_keys = ["player_id"]
        return left.merge(right, on=shared_keys, how="outer")

    combined_metrics = reduce(_merge, metric_frames)

    # Build a meta table and join it on keys (prefer season/week if present)
    if meta_frames:
        meta = pd.concat(meta_frames, ignore_index=True).drop_duplicates()
        # Keep only one row per key combo, preferring non-null player_name/position
        key_order = [k for k in ["player_id", "season", "week"] if k in meta.columns]
        if not key_order:
            key_order = ["player_id"]
        meta = (meta
                .sort_values(key_order)  # stable
                .groupby(key_order, as_index=False)
                .agg({
                    "player_display_name": "first" if "player_display_name" in meta.columns else "first",
                    "position": "first" if "position" in meta.columns else "first"
                })
               ) if any(c in meta.columns for c in ["player_display_name", "position"]) else meta

        # Determine merge keys between combined_metrics and meta
        shared_keys = [k for k in ["player_id", "season", "week"] if k in combined_metrics.columns and k in meta.columns]
        if not shared_keys and "player_id" in combined_metrics.columns and "player_id" in meta.columns:
            shared_keys = ["player_id"]

        combined = combined_metrics.merge(meta, on=shared_keys, how="left")
    else:
        combined = combined_metrics

    # Fill NA only in metric columns (leave text/meta alone)
    metric_cols = [c for c in combined.columns if c.startswith("True ") or c.startswith("Projected ")]
    combined[metric_cols] = combined[metric_cols].fillna(0)

    # Optional: order columns
    front = [c for c in ["player_id", "season", "week", "player_display_name", "position"] if c in combined.columns]
    rest = [c for c in combined.columns if c not in front]
    combined = combined[front + rest]

    combined = combined.drop_duplicates(subset=["season", "week", "player_display_name"]).reset_index(drop=True)

    return combined

# Combine the pipeline frames with the true/predicted values into one wide table.
combined_df = assemble_combined_df(target_data_struct, trues, predictions)

In [10]:
COMBINED_DF_PATH = os.getenv("COMBINED_DATA_FRAME_PATH")

# -----------------------------------------------------------------------------
# Load Persistent DataFrames
# -----------------------------------------------------------------------------
# df_cached tells later cells whether combined_df was read from the cache file.
df_cached = False

if COMBINED_DF_PATH:
    try:
        combined_df = pd.read_csv(COMBINED_DF_PATH)
        df_cached = True

    # Only the failures a missing/corrupt cache can produce are handled:
    # OSError covers missing or unreadable files, ValueError covers pandas'
    # empty-data, parser and decoding errors. Anything else (including
    # KeyboardInterrupt) propagates instead of being silently swallowed.
    except (OSError, ValueError) as exc:
        print("Dataframe is not cached or is not cached correctly, or the path is not set.")
        print(f"Reason: {type(exc).__name__}: {exc}")
else:
    # The variable is unset or empty: report it instead of staying silent.
    print("Dataframe is not cached or is not cached correctly, or the path is not set.")

In [11]:
# Show the combined frame as the cell output (the recorded output is a truncated preview).
combined_df

Unnamed: 0,player_id,season,week,player_display_name,position,True rsh_yd,Projected rsh_yd,True rsh_td,Projected rsh_td,True rsh_fmbls,...,True rc,Projected rc,True rc_fmbls,Projected rc_fmbls,True p_yd,Projected p_yd,True p_td,Projected p_td,True intcpt,Projected intcpt
0,00-0023459,2024,1,Aaron Rodgers,QB,-1,9.952670,0,0.071398,0,...,0,2.111171,0.0,0.011189,167.0,224.471242,1.0,1.407832,1.0,0.753970
1,00-0026498,2024,1,Matthew Stafford,QB,0,9.952670,0,0.071398,0,...,0,2.111171,0.0,0.011189,317.0,224.471242,1.0,1.407832,1.0,0.753970
2,00-0027973,2024,1,Andy Dalton,QB,0,9.508136,0,0.066536,0,...,0,1.968089,0.0,0.010656,0.0,168.012698,0.0,0.999061,0.0,0.645074
3,00-0028118,2024,1,Tyrod Taylor,QB,7,9.508136,0,0.066536,0,...,0,1.968089,0.0,0.010656,36.0,168.012698,1.0,0.999061,0.0,0.645074
4,00-0029604,2024,1,Kirk Cousins,QB,0,9.952670,0,0.071398,0,...,0,2.111171,0.0,0.011189,155.0,224.471242,1.0,1.407832,2.0,0.753970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6118,00-0038102,2024,22,Kenny Pickett,QB,-4,7.832998,0,0.108896,0,...,0,0.484024,0.0,0.002753,0.0,115.836555,0.0,0.773218,0.0,0.348616
6119,00-0038519,2024,22,Nikko Remigio,WR,0,-0.357560,0,-0.009402,0,...,0,0.708348,0.0,0.005498,0.0,0.000000,0.0,0.000000,0.0,0.000000
6120,00-0039236,2024,22,Johnny Wilson,WR,0,0.890287,0,0.004050,0,...,0,1.374708,0.0,0.007836,0.0,0.000000,0.0,0.000000,0.0,0.000000
6121,00-0039746,2024,22,Will Shipley,RB,0,20.114752,0,0.191408,0,...,0,0.733050,0.0,0.003348,0.0,0.000000,0.0,0.000000,0.0,0.000000


In [9]:
# Write each target's true and predicted values back onto its own frame as
# "true_<target>" / "pred_<target>" columns; the "def" entry is left untouched.
# The frames are modified in place.
for target, target_df in target_data_struct.items():
    if target == "def":
        continue
    target_df[f"true_{target}"] = trues[target]
    target_df[f"pred_{target}"] = predictions[target]

In [None]:
# app.py
# Run with: shiny run --reload app.py
from shiny import App, render, ui, reactive

# Slice shown in the app: identifying columns plus true/predicted rushing yards.
# Requires the true_/pred_ columns to have been attached to the "rsh_yd" frame.
df = target_data_struct["rsh_yd"][["player_name", "season", "week", "position", "true_rsh_yd", "pred_rsh_yd"]]
# -------------------------------------------------------------------
# UI
# -------------------------------------------------------------------
app_ui = ui.page_fluid(
    # Title without the stray trailing " m".
    ui.h2("2024 NFL Weekly Player Stats with Projections"),
    ui.layout_sidebar(
        ui.sidebar(
            ui.input_selectize(
                "week_filter",
                "Select week(s)",
                # Sort numerically first, then pass strings: select choices are
                # string values, and the input reports the selection as strings.
                choices=[str(week) for week in sorted(df["week"].unique().tolist())],
                multiple=True,
            ),
            ui.input_numeric("min_yards", "Minimum yards", 0),
            ui.input_select(
                "sort_by",
                "Sort by column",
                choices=["true_rsh_yd", "pred_rsh_yd"],
                selected="true_rsh_yd"
            ),
            ui.input_radio_buttons(
                "sort_order",
                "Sort order",
                choices=["Descending", "Ascending"],
                selected="Descending"
            ),
        ),
        ui.output_data_frame("filtered_table")
    )
)

# -------------------------------------------------------------------
# Server
# -------------------------------------------------------------------
def server(input, output, session):
    """Shiny server function: filter and sort `df` from the sidebar inputs and render it.

    Reads the inputs "week_filter", "min_yards", "sort_by" and "sort_order"
    and fills the "filtered_table" output.
    """
    # reactive filter logic
    @reactive.calc
    def filtered_data():
        d = df.copy()

        # # Filter by team(s)
        # if input.team_filter():
        #     d = d[d["team"].isin(input.team_filter())]

        # Filter by week(s). An empty selection means "all weeks". Both sides
        # are compared as strings because the select input reports strings
        # while the column may be numeric.
        selected_weeks = input.week_filter()
        if selected_weeks:
            wanted = [str(week) for week in selected_weeks]
            d = d[d["week"].astype(str).isin(wanted)]

        # Filter by yardage; an emptied numeric box gives no threshold, so skip it.
        min_yards = input.min_yards()
        if min_yards is not None:
            d = d[d["true_rsh_yd"] >= min_yards]

        # Sort
        ascending = input.sort_order() == "Ascending"
        d = d.sort_values(by=input.sort_by(), ascending=ascending)

        return d.reset_index(drop=True)

    # Render table
    @output
    @render.data_frame
    def filtered_table():
        # NOTE(review): confirm that the installed Shiny version's DataGrid
        # accepts a `pagination` argument.
        return render.DataGrid(
            filtered_data(),
            filters=True,  # adds column-level search boxes
            pagination=True
        )

# -------------------------------------------------------------------
# App entrypoint
# -------------------------------------------------------------------
# Create the Shiny application object from the UI definition and the server function.
app = App(app_ui, server)

In [None]:
# NOTE(review): the recorded output of this cell is "app.py not found". The app
# code above lives only in a notebook cell and is never written to app.py, so
# this command has no file to run from the notebook's directory.
!shiny run --reload app.py

Error: .\app.py not found



In [2]:
# ----------------------------------------------------------------------------
# Filter dfs to 2025 season
# ----------------------------------------------------------------------------
# NOTE(review): this whole step is disabled, so the frames are passed to the
# pipeline unfiltered.
# print("Filtering data to current season...")
# # all_players_df = all_players_df[all_players_df["season"] == 2025].reset_index(drop=True)
# # all_teams_df = all_teams_df[all_teams_df["season"] == 2025].reset_index(drop=True)
# # injuries_df = injuries_df[injuries_df["season"] == 2025].reset_index(drop=True)
# # depth_df = depth_df[depth_df["season"] == 2025].reset_index(drop=True)
# print("-" * 40)

# ----------------------------------------------------------------------------
# Apply pipeline to filtered data.
# ----------------------------------------------------------------------------
# NOTE(review): all_players_df, all_teams_df, injuries_df and depth_df are only
# created by loaders that are commented out in the first cell; they must already
# exist in the kernel for this call to work.
print("Running data pipeline...")
target_data_struct, target_input_cols = pipeline.run_pipeline(all_players_df, all_teams_df, injuries_df, depth_df)
print("-" * 40)
# ----------------------------------------------------------------------------
# Apply model
# ----------------------------------------------------------------------------

# Train and validate with season_holdout=2024 (presumably the 2024 season is the
# validation split; confirm in the pipeline module). Rebinds `trues` and
# `predictions`, which other cells also read.
models, model_results, trues, predictions = pipeline.train_and_validate_model(target_data_struct, target_input_cols, season_holdout=2024)
# model_results, trues, predictions = pipeline.test_model(test_data_struct, test_input_cols, rf"{os.getenv('SAVED_WEIGHTS_PATH')}")
# Per-target metrics; the recorded output shows 'validation_rmse' and 'r2' entries.
print(model_results)

# Store the trained models under the relative "models" location
# (presumably relative to the notebook's working directory; confirm in the pipeline module).
pipeline.save_and_store_model_weights(models, "models")

Running data pipeline...
Filtering and merging injuries and depth charts...
----------------------------------------


Generating positional dataframes...
Generated positional dataframes with 444488 rows and 4918211 rows
----------------------------------------


Generating target data structure and input columns by statistical category...
----------------------------------------


Engineering features for cumulative and rolling data...
----------------------------------------


----------------------------------------
{'rsh_yd': {'validation_rmse': '15.2186', 'r2': '0.578'}, 'rsh_td': {'validation_rmse': '0.2664', 'r2': '0.217'}, 'rsh_fmbls': {'validation_rmse': '0.1254', 'r2': '0.114'}, 'rc_yd': {'validation_rmse': '23.0503', 'r2': '0.458'}, 'rc_td': {'validation_rmse': '0.3644', 'r2': '0.165'}, 'rc': {'validation_rmse': '1.6459', 'r2': '0.498'}, 'rc_fmbls': {'validation_rmse': '0.1042', 'r2': '0.057'}, 'p_yd': {'validation_rmse': '84.5783', 'r2': '0.367'}, 'p_td': {'validation_rmse'