In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200)
pd.set_option('display.max_colwidth', 30)

In [3]:
df = pd.read_excel("C:/Users/schne/Desktop/data/nfl_player_stats_cleaned.xlsx")


In [4]:
df_rush = df[~df['position_group'].isin(['QB', 'TE','OL','WR'])].copy()
df_rush = df_rush.drop_duplicates(subset=['player_id', 'season', 'week'])



pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
df_rush_2024 = df_rush[df_rush['season'] == 2024]

# Show the first 50 rows
print(df_rush_2024.head(50))

         player_id   player_name    player_display_name position_group  season  week season_type team opponent_team  completions  attempts  passing_yards  passing_tds  passing_interceptions  \
181951  00-0029892    K.Juszczyk          Kyle Juszczyk             RB    2024     1         REG   SF           NYJ            0         0              0            0                      0   
181959  00-0030578   C.Patterson  Cordarrelle Patterson             RB    2024     1         REG  PIT           ATL            0         0              0            0                      0   
181972  00-0031595      M.Burton         Michael Burton             RB    2024     1         REG  DEN           SEA            0         0              0            0                      0   
181973  00-0031687     R.Mostert         Raheem Mostert             RB    2024     1         REG  MIA           JAX            0         0              0            0                      0   
181979  00-0032104    A.Abdullah   

In [5]:
df.drop(columns=['completions','attempts','passing_yards','passing_interceptions','passing_tds','sacks_suffered','sack_yards_lost','sack_fumbles_lost',	'passing_yards_after_catch',	'passing_2pt_conversions'
                 ], inplace=True)

In [None]:

# -------------------------------
# Parameters
# -------------------------------
target = 'rushing_yards'
stats = ['carries', 'rushing_tds', 'rushing_fumbles_lost', 'rushing_first_downs']
lag_weeks = [1,2,3,4,5,6]
rolling_window = 4
zero_windows = [3,2,1]

# -------------------------------
# 1. Lag & Rolling Features
# -------------------------------
for stat in stats:
    for lag in lag_weeks:
        df_rush[f'{stat}_lag{lag}'] = df_rush.groupby('player_id')[stat].shift(lag)
    
    # Rolling average over last N weeks
    df_rush[f'{stat}_rolling{rolling_window}'] = df_rush.groupby('player_id')[stat] \
                                                      .shift(1) \
                                                      .rolling(rolling_window) \
                                                      .mean() \
                                                      .reset_index(level=0, drop=True)

# -------------------------------
# 2. Zeros features
# -------------------------------
for window in zero_windows:
    df_rush[f'zeros_last{window}_rush'] = df_rush.groupby('player_id')[target] \
        .transform(lambda x: x.shift(1).rolling(window).apply(lambda y: (y==0).sum(), raw=True))
    
    df_rush[f'zeros_last{window}_carries'] = df_rush.groupby('player_id')['carries'] \
        .transform(lambda x: x.shift(1).rolling(window).apply(lambda y: (y==0).sum(), raw=True))

# -------------------------------
# 3. Momentum / Trend features
# -------------------------------
# Difference between recent rolling averages
df_rush['rushing_yards_rolling2_diff'] = df_rush.groupby('player_id')[target] \
    .transform(lambda x: x.shift(1).rolling(2).mean() - x.shift(3).rolling(2).mean())

df_rush['carries_rolling2_diff'] = df_rush.groupby('player_id')['carries'] \
    .transform(lambda x: x.shift(1).rolling(2).mean() - x.shift(3).rolling(2).mean())

# -------------------------------
# 4. Usage / Opportunity features
# -------------------------------
# Player's share of team carries last 3 weeks
df_rush['team_carries_last3'] = df_rush.groupby(['team','week'])['carries'].transform('sum')
df_rush['player_carry_share_last3'] = df_rush.groupby('player_id')['carries'] \
    .transform(lambda x: x.shift(1).rolling(3).sum()) / df_rush.groupby('player_id')['team_carries_last3'] \
    .transform(lambda x: x.shift(1).rolling(3).sum() + 1e-6)  # avoid div by zero

# -------------------------------
# 5. Build extra_features list
# -------------------------------
extra_features = ['depth_team']

# Automatically include all zeros features
extra_features += [col for col in df_rush.columns if col.startswith('zeros_last')]

# Include momentum / trend features
extra_features += ['rushing_yards_rolling2_diff', 'carries_rolling2_diff', 'player_carry_share_last3']

print("Extra features ready for model:", extra_features)


Extra features ready for model: ['depth_team', 'zeros_last3_rush', 'zeros_last3_carries', 'zeros_last2_rush', 'zeros_last2_carries', 'zeros_last1_rush', 'zeros_last1_carries', 'rushing_yards_rolling2_diff', 'carries_rolling2_diff', 'player_carry_share_last3']


In [7]:
'''target = 'rushing_yards'   # whatever you’re predicting

for stat in['carries','rushing_tds','rushing_fumbles_lost','rushing_first_downs']:
   for lag in [1, 2, 3,4,5,6]:  # use however many weeks of history you think matter
      df_rush[f'{stat}_lag{lag}'] =df_rush[stat].shift(lag)
   df_rush[f'{stat}_rolling4'] =df_rush[stat].shift(1).rolling(4).mean()
   df_rush[f'{stat}_rolling4'] = df_rush.groupby('player_id')[stat].shift(1).rolling(4).mean().reset_index(level=0, drop=True)
   
for window in [3, 2, 1]:
    df_rush[f'zeros_last{window}_rush'] = df_rush.groupby('player_id')['rushing_yards'] \
        .transform(lambda x: x.shift(1).rolling(window).apply(lambda y: (y==0).sum(), raw=True))
                                
                                
for window in [3, 2, 1]:
    df_rush[f'zeros_last{window}_carries'] = df_rush.groupby('player_id')['carries'] \
        .transform(lambda x: x.shift(1).rolling(window).apply(lambda y: (y==0).sum(), raw=True))
                                
                                '''

"target = 'rushing_yards'   # whatever you’re predicting\n\nfor stat in['carries','rushing_tds','rushing_fumbles_lost','rushing_first_downs']:\n   for lag in [1, 2, 3,4,5,6]:  # use however many weeks of history you think matter\n      df_rush[f'{stat}_lag{lag}'] =df_rush[stat].shift(lag)\n   df_rush[f'{stat}_rolling4'] =df_rush[stat].shift(1).rolling(4).mean()\n   df_rush[f'{stat}_rolling4'] = df_rush.groupby('player_id')[stat].shift(1).rolling(4).mean().reset_index(level=0, drop=True)\n\nfor window in [3, 2, 1]:\n    df_rush[f'zeros_last{window}_rush'] = df_rush.groupby('player_id')['rushing_yards']         .transform(lambda x: x.shift(1).rolling(window).apply(lambda y: (y==0).sum(), raw=True))\n\n\nfor window in [3, 2, 1]:\n    df_rush[f'zeros_last{window}_carries'] = df_rush.groupby('player_id')['carries']         .transform(lambda x: x.shift(1).rolling(window).apply(lambda y: (y==0).sum(), raw=True))\n\n                                "

In [8]:
df_rush = df_rush.reset_index(drop=True)
train = df_rush[df_rush['season'] < 2024]  # for example
test  = df_rush[df_rush['season'] == 2024]

In [9]:
extra_features = ['depth_team']

# Add all columns that start with 'zeros_last' and end with '_rush'
extra_features += [col for col in df_rush.columns if col.startswith('zeros_last') and col.endswith('_rush')]

extra_features += [col for col in df_rush.columns if col.startswith('zeros_last') and col.endswith('_carries')]

print(extra_features)

['depth_team', 'zeros_last3_rush', 'zeros_last2_rush', 'zeros_last1_rush', 'zeros_last3_carries', 'zeros_last2_carries', 'zeros_last1_carries']


In [10]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score
features = [col for col in df_rush.columns if 'lag' in col or 'rolling' in col]
if extra_features:
    features += extra_features
X_train, y_train = train[features], train[target]
X_test, y_test   = test[features], test[target]

model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
model.fit(X_train, y_train)

preds = model.predict(X_test)

 # Metrics
mae  = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))
r2   = r2_score(y_test, preds)

print(f"Regression — MAE: {mae:.4f} | RMSE: {rmse:.4f} | R²: {r2:.3f}")

Regression — MAE: 19.8685 | RMSE: 28.4891 | R²: 0.427


In [11]:
print(features)

['carries_lag1', 'carries_lag2', 'carries_lag3', 'carries_lag4', 'carries_lag5', 'carries_lag6', 'carries_rolling4', 'rushing_tds_lag1', 'rushing_tds_lag2', 'rushing_tds_lag3', 'rushing_tds_lag4', 'rushing_tds_lag5', 'rushing_tds_lag6', 'rushing_tds_rolling4', 'rushing_fumbles_lost_lag1', 'rushing_fumbles_lost_lag2', 'rushing_fumbles_lost_lag3', 'rushing_fumbles_lost_lag4', 'rushing_fumbles_lost_lag5', 'rushing_fumbles_lost_lag6', 'rushing_fumbles_lost_rolling4', 'rushing_first_downs_lag1', 'rushing_first_downs_lag2', 'rushing_first_downs_lag3', 'rushing_first_downs_lag4', 'rushing_first_downs_lag5', 'rushing_first_downs_lag6', 'rushing_first_downs_rolling4', 'rushing_yards_rolling2_diff', 'carries_rolling2_diff', 'depth_team', 'zeros_last3_rush', 'zeros_last2_rush', 'zeros_last1_rush', 'zeros_last3_carries', 'zeros_last2_carries', 'zeros_last1_carries']


In [12]:
results = test.copy()
results['predicted_' + target] = preds

# Show relevant columns
display_cols = ['player_name', 'season', 'week','carries', target, 'predicted_' + target]
print(results[display_cols].head(10))  # show first 10 rows

# Or just show the full DataFrame
results[display_cols]

       player_name  season  week  carries  rushing_yards  predicted_rushing_yards
36514   K.Juszczyk    2024     1        0              0                 0.692401
36515  C.Patterson    2024     1        4             13                23.883791
36516     M.Burton    2024     1        0              0                 2.055411
36517    R.Mostert    2024     1        6              9                63.441334
36518   A.Abdullah    2024     1        0              0                 4.341139
36519      D.Henry    2024     1       13             46                82.484406
36520        C.Ham    2024     1        0              0                 1.592359
36521    E.Elliott    2024     1       10             40                49.679768
36522      A.Jones    2024     1       14             94                78.784264
36523     S.Perine    2024     1        0              0                17.235479


Unnamed: 0,player_name,season,week,carries,rushing_yards,predicted_rushing_yards
36514,K.Juszczyk,2024,1,0,0,0.692401
36515,C.Patterson,2024,1,4,13,23.883791
36516,M.Burton,2024,1,0,0,2.055411
36517,R.Mostert,2024,1,6,9,63.441334
36518,A.Abdullah,2024,1,0,0,4.341139
36519,D.Henry,2024,1,13,46,82.484406
36520,C.Ham,2024,1,0,0,1.592359
36521,E.Elliott,2024,1,10,40,49.679768
36522,A.Jones,2024,1,14,94,78.784264
36523,S.Perine,2024,1,0,0,17.235479


In [13]:
model, rush_results = run_single_step_xgb(
    df_rush,
    target='rushing_yards',
    season_split=2024,
    extra_features=['depth_team', 'position_group']
)

display(
    rush_results[['player_name', 'player_id', 'position_group', 'season', 'week',
                  'carries', 'rushing_yards', 'depth_team', 'predicted_yards']]
    .head(20)
)

NameError: name 'run_single_step_xgb' is not defined

In [None]:
import matplotlib.pyplot as plt
import random

# Pick 4 random player IDs that exist in your test results
sample_players = random.sample(list(results['player_id'].unique()), 6)

# Create a 2x2 subplot layout
fig, axes = plt.subplots(3, 2, figsize=(12, 8))
axes = axes.flatten()

for i, player_id in enumerate(sample_players):
    player_data = results[results['player_id'] == player_id].sort_values('week')
    
    # Get player name safely (in case of duplicates or missing values)
    player_name = player_data['player_name'].iloc[0] if 'player_name' in player_data.columns else str(player_id)

    axes[i].plot(player_data['week'], player_data['passing_yards'], 
                 label='Actual', marker='o')
    axes[i].plot(player_data['week'], player_data['final_forecast'], 
                 label='Forecast', marker='x')
    
    axes[i].set_title(f'{player_name} — Passing Yard Forecast')
    axes[i].set_xlabel('Week')
    axes[i].set_ylabel('Passing Yards')
    axes[i].legend()
    axes[i].grid(True)

plt.tight_layout()
plt.show()