# Preliminary

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler


2024-08-26 19:49:43.765660: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-26 19:49:43.776901: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-26 19:49:43.780387: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-26 19:49:43.789494: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv('data/player_stats.csv')
df = df.sort_values(by=['player_id', 'season', 'week'])


# EDA

In [3]:
df.dtypes 



player_id                       object
player_name                     object
player_display_name             object
position                        object
position_group                  object
headshot_url                    object
recent_team                     object
season                           int64
week                             int64
season_type                     object
opponent_team                   object
completions                      int64
attempts                         int64
passing_yards                    int64
passing_tds                      int64
interceptions                    int64
sacks                            int64
sack_yards                       int64
sack_fumbles                     int64
sack_fumbles_lost                int64
passing_air_yards                int64
passing_yards_after_catch        int64
passing_first_downs              int64
passing_epa                    float64
passing_2pt_conversions          int64
pacr                     

In [4]:
df

Unnamed: 0,player_id,player_name,player_display_name,position,position_group,headshot_url,recent_team,season,week,season_type,...,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,special_teams_tds,fantasy_points,fantasy_points_ppr,years_in_league,years_played
0,00-0000003,,Abdul-Karim al-Jabbar,RB,RB,,MIA,1999,1,REG,...,0,0.000000,0.052632,,,0,12.7,13.7,1.0,0
1,00-0000003,,Abdul-Karim al-Jabbar,RB,RB,,MIA,1999,2,REG,...,0,0.000000,0.117647,,,0,5.1,8.1,1.0,0
2,00-0000003,,Abdul-Karim al-Jabbar,RB,RB,,MIA,1999,4,REG,...,0,,0.023810,,,0,0.2,0.2,1.0,0
3,00-0000003,,Abdul-Karim al-Jabbar,RB,RB,,CLE,1999,7,REG,...,0,0.000000,0.050000,,,0,3.5,5.5,1.0,0
4,00-0000003,,Abdul-Karim al-Jabbar,RB,RB,,CLE,1999,8,REG,...,0,,,,,0,3.9,3.9,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128868,00-0039165,Z.Charbonnet,Zach Charbonnet,RB,RB,https://static.www.nfl.com/image/private/f_aut...,SEA,2023,14,REG,...,0,-0.666667,0.032258,-0.025532,0.030515,0,4.8,5.8,1.0,0
128869,00-0039165,Z.Charbonnet,Zach Charbonnet,RB,RB,https://static.www.nfl.com/image/private/f_aut...,SEA,2023,15,REG,...,0,0.000000,0.031250,0.012658,0.055736,0,1.6,1.6,1.0,0
128870,00-0039165,Z.Charbonnet,Zach Charbonnet,RB,RB,https://static.www.nfl.com/image/private/f_aut...,SEA,2023,16,REG,...,0,0.307692,0.055556,-0.056034,0.044109,0,-0.4,0.6,1.0,0
128871,00-0039165,Z.Charbonnet,Zach Charbonnet,RB,RB,https://static.www.nfl.com/image/private/f_aut...,SEA,2023,17,REG,...,0,3.000000,0.151515,0.057778,0.267717,0,4.1,9.1,1.0,0


In [5]:
df.isnull().sum(axis=0)

player_id                           0
player_name                     67380
player_display_name                 3
position                           72
position_group                     72
headshot_url                    59092
recent_team                         0
season                              0
week                                0
season_type                         0
opponent_team                       0
completions                         0
attempts                            0
passing_yards                       0
passing_tds                         0
interceptions                       0
sacks                               0
sack_yards                          0
sack_fumbles                        0
sack_fumbles_lost                   0
passing_air_yards                   0
passing_yards_after_catch           0
passing_first_downs                 0
passing_epa                    112406
passing_2pt_conversions             0
pacr                           112846
dakota      

# Feature Engineering


- Trend analysis
- Week-to-week consistency/variability
    - standard deviation of weekly totals
- Rolling averages of last n weeks/years
- Years in league = season - player's first season

In [6]:
# Rolling averages of last n years

# Years in league
# for each unique player id, get the number of years they have been in the league
# Years in league
df['years_played'] = df.groupby('player_id')['season'].rank(method='dense')-1
df['years_played'] = df['years_played'].astype(int)

# Data cleaning

In [7]:
# Drop any rows with season="POST"
df = df[df['season_type'] != 'POST']

# Full df cleaning
df.drop(
    columns=[
        'headshot_url',
        'player_name',
        'player_display_name',
        'recent_team',
        'opponent_team',
        'position',
        'season_type'],
    inplace=True
)
target = 'fantasy_points_ppr'

df = pd.get_dummies(
    df,
    columns=['player_id','season']
)

# QB data cleaning
qb_df = df[df['position_group'] == 'QB'].copy()

qb_df = pd.get_dummies(
    qb_df,
    columns=['position_group']
)


qb_df.drop(
    columns=[
        'receiving_epa',
        'racr',
        'target_share',
        'air_yards_share',
        'wopr'
    ], inplace=True
)
qb_df.dropna(
    subset=[
        'passing_epa',
        'pacr',
        'dakota',
        'rushing_epa'
    ], inplace=True
)


qb_df_target = qb_df[target]
qb_df_features = qb_df.drop(columns=[target])




# RB data cleaning

rb_df = df[df['position_group'] == 'RB'].copy()

rb_df = pd.get_dummies(
    rb_df,
    columns=['position_group']
)

rb_df.drop(
    columns=[
        'passing_epa',
        'pacr',
        'dakota',
        'receiving_epa',
        'rushing_epa',
        'racr',
        'target_share',
        'air_yards_share',
        'wopr'
    ], inplace=True
)

rb_df_target = rb_df[target]
rb_df_features = rb_df.drop(columns=[target])



# WR data cleaning
wr_df = df[df['position_group'] == 'WR'].copy()

wr_df = pd.get_dummies(
    wr_df,
    columns=['position_group']
)

wr_df.drop(
    columns=[
        'passing_epa',
        'pacr',
        'dakota',
        'rushing_epa',
    ], inplace=True
)

wr_df.dropna(
    subset=[
        'receiving_epa',
        'racr',
        'target_share',
        'air_yards_share',
        'wopr'
    ], inplace=True
)


wr_df_target = wr_df[target]
wr_df_features = wr_df.drop(columns=[target])



# TE data cleaning
te_df = df[df['position_group'] == 'TE'].copy()

te_df = pd.get_dummies(
    te_df,
    columns=['position_group']
)

te_df.drop(
    columns=[
        'passing_epa',
        'pacr',
        'dakota',
        'rushing_epa',
    ], inplace=True
)

te_df.dropna(
    subset=[
        'receiving_epa',
        'racr',
        'target_share',
        'air_yards_share',
        'wopr'
    ], inplace=True
)

te_df_target = te_df[target]
te_df_features = te_df.drop(columns=[target])



#sequence_length = 4  # Example: 4 weeks of data
#X_qb, y_qb = create_sequences(np.hstack((qb_features, qb_target.reshape(-1, 1))), sequence_length)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(


# Sequence generation

In [8]:
def create_sequences(features, target, sequence_length):
    X, y = [], []
    for i in range(len(features) - sequence_length):
        X.append(features[i:i+sequence_length])
        y.append(target[i+sequence_length])
    return np.array(X), np.array(y)

In [9]:
sequence_length = 6  # 6 weeks of data, nominally. Note, this will not account for gaps in weeks due to injury, transition between seasons, etc.

qb_df_features_sequences, qb_df_target_sequences = create_sequences(qb_df_features.values, qb_df_target.values, sequence_length)
rb_df_features_sequences, rb_df_target_sequences = create_sequences(rb_df_features.values, rb_df_target.values, sequence_length)
wr_df_features_sequences, wr_df_target_sequences = create_sequences(wr_df_features.values, wr_df_target.values, sequence_length)
te_df_features_sequences, te_df_target_sequences = create_sequences(te_df_features.values, te_df_target.values, sequence_length)

In [None]:
qb_df_features_sequences.shape, qb_df_target_sequences.shape

((11714, 6, 4040), (11714,))

# Train/test time-series split

In [None]:
X_qb = qb_df_features_sequences
y_qb = qb_df_target_sequences

X_rb = rb_df_features_sequences
y_rb = rb_df_target_sequences

X_wr = wr_df_features_sequences
y_wr = wr_df_target_sequences

X_te = te_df_features_sequences
y_te = te_df_target_sequences



tscv = TimeSeriesSplit(n_splits=5)

for train_index, test_index in tscv.split(X_qb):
    X_qb_train, X_qb_test = X_qb.iloc[train_index], X_qb.iloc[test_index]
    y_qb_train, y_qb_test = y_qb.iloc[train_index], y_qb.iloc[test_index]

for train_index, test_index in tscv.split(X_rb):
    X_rb_train, X_rb_test = X_rb.iloc[train_index], X_rb.iloc[test_index]
    y_rb_train, y_rb_test = y_rb.iloc[train_index], y_rb.iloc[test_index]

for train_index, test_index in tscv.split(X_wr):
    X_wr_train, X_wr_test = X_wr.iloc[train_index], X_wr.iloc[test_index]
    y_wr_train, y_wr_test = y_wr.iloc[train_index], y_wr.iloc[test_index]

for train_index, test_index in tscv.split(X_te):
    X_te_train, X_te_test = X_te.iloc[train_index], X_te.iloc[test_index]
    y_te_train, y_te_test = y_te.iloc[train_index], y_te.iloc[test_index]



NameError: name 'qb_df_features_sequences' is not defined

# Normalization

In [None]:
# Apply standardization (zero mean, unit variance)
scaler = StandardScaler()

qb_features_sequences = scaler.fit_transform(qb_features_sequences.reshape(-1, qb_df_features.shape[1])).reshape(qb_features_sequences.shape)
rb_features_sequences = scaler.fit_transform(rb_features_sequences.reshape(-1, rb_df_features.shape[1])).reshape(rb_features_sequences.shape)
wr_features_sequences = scaler.fit_transform(wr_features_sequences.reshape(-1, wr_df_features.shape[1])).reshape(wr_features_sequences.shape)
te_features_sequences = scaler.fit_transform(te_features_sequences.reshape(-1, te_df_features.shape[1])).reshape(te_features_sequences.shape)

# Model building

In [None]:
def create_rnn_model(input_shape):
    model = Sequential()
    model.add(LSTM(units=50, activation='relu', input_shape=input_shape))
    model.add(Dropout(0.2))  # Optional dropout layer for regularization
    model.add(Dense(1))  # Output layer for regression
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Model tuning and cross-validation

# Training

# Testing

# Performance evaluation

#### RMSE
#### MAE

# Prediction