# Preliminary

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler


In [16]:
# Load the weekly player stats and order the rows by player, then season, then week.
sort_keys = ['player_id', 'season', 'week']
df = pd.read_csv('data/player_stats.csv').sort_values(by=sort_keys)


# EDA

In [17]:
# Inspect the dtype of every column in the loaded frame.
df.dtypes 



player_id                       object
player_name                     object
player_display_name             object
position                        object
position_group                  object
headshot_url                    object
recent_team                     object
season                           int64
week                             int64
season_type                     object
opponent_team                   object
completions                      int64
attempts                         int64
passing_yards                    int64
passing_tds                      int64
interceptions                    int64
sacks                            int64
sack_yards                       int64
sack_fumbles                     int64
sack_fumbles_lost                int64
passing_air_yards                int64
passing_yards_after_catch        int64
passing_first_downs              int64
passing_epa                    float64
passing_2pt_conversions          int64
pacr                     

In [18]:
# Display the frame (the rendering is truncated to the first and last rows).
df

Unnamed: 0,player_id,player_name,player_display_name,position,position_group,headshot_url,recent_team,season,week,season_type,...,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,special_teams_tds,fantasy_points,fantasy_points_ppr,years_in_league,years_played
0,00-0000003,,Abdul-Karim al-Jabbar,RB,RB,,MIA,1999,1,REG,...,0,0.000000,0.052632,,,0,12.7,13.7,1.0,0
1,00-0000003,,Abdul-Karim al-Jabbar,RB,RB,,MIA,1999,2,REG,...,0,0.000000,0.117647,,,0,5.1,8.1,1.0,0
2,00-0000003,,Abdul-Karim al-Jabbar,RB,RB,,MIA,1999,4,REG,...,0,,0.023810,,,0,0.2,0.2,1.0,0
3,00-0000003,,Abdul-Karim al-Jabbar,RB,RB,,CLE,1999,7,REG,...,0,0.000000,0.050000,,,0,3.5,5.5,1.0,0
4,00-0000003,,Abdul-Karim al-Jabbar,RB,RB,,CLE,1999,8,REG,...,0,,,,,0,3.9,3.9,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128868,00-0039165,Z.Charbonnet,Zach Charbonnet,RB,RB,https://static.www.nfl.com/image/private/f_aut...,SEA,2023,14,REG,...,0,-0.666667,0.032258,-0.025532,0.030515,0,4.8,5.8,1.0,0
128869,00-0039165,Z.Charbonnet,Zach Charbonnet,RB,RB,https://static.www.nfl.com/image/private/f_aut...,SEA,2023,15,REG,...,0,0.000000,0.031250,0.012658,0.055736,0,1.6,1.6,1.0,0
128870,00-0039165,Z.Charbonnet,Zach Charbonnet,RB,RB,https://static.www.nfl.com/image/private/f_aut...,SEA,2023,16,REG,...,0,0.307692,0.055556,-0.056034,0.044109,0,-0.4,0.6,1.0,0
128871,00-0039165,Z.Charbonnet,Zach Charbonnet,RB,RB,https://static.www.nfl.com/image/private/f_aut...,SEA,2023,17,REG,...,0,3.000000,0.151515,0.057778,0.267717,0,4.1,9.1,1.0,0


In [19]:
# Count the missing values in each column.
df.isnull().sum(axis=0)

player_id                           0
player_name                     67380
player_display_name                 3
position                           72
position_group                     72
headshot_url                    59092
recent_team                         0
season                              0
week                                0
season_type                         0
opponent_team                       0
completions                         0
attempts                            0
passing_yards                       0
passing_tds                         0
interceptions                       0
sacks                               0
sack_yards                          0
sack_fumbles                        0
sack_fumbles_lost                   0
passing_air_yards                   0
passing_yards_after_catch           0
passing_first_downs                 0
passing_epa                    112406
passing_2pt_conversions             0
pacr                           112846
dakota      

# Feature Engineering


- Trend analysis
- Week-to-week consistency/variability
    - standard deviation of weekly totals
- Rolling averages of last n weeks/years
- Years in league = season - player's first season

In [20]:
# TODO: rolling averages of the last n years (not implemented yet).

# Years played: zero-based dense rank of each season among the seasons in which
# the same player_id appears (0 for the player's earliest season in the data,
# 1 for the next distinct season, and so on).
season_rank = df.groupby('player_id')['season'].rank(method='dense')
df['years_played'] = (season_rank - 1).astype(int)

# Data cleaning

In [21]:
# Keep regular-season rows only: drop every row whose season_type is "POST".
# Filtering and dropping in a single non-inplace chain produces a fresh frame,
# which avoids the SettingWithCopyWarning that an in-place drop on a filtered
# slice raises.
df = df[df['season_type'] != 'POST'].drop(
    columns=[
        'headshot_url',
        'player_name',
        'player_display_name',
        'recent_team',
        'opponent_team',
        'position',
        'season_type',
    ]
)
target = 'fantasy_points_ppr'

# One-hot encode player identity and season.
# NOTE(review): this creates one column per distinct player_id, so the feature
# matrices become very wide and very sparse; consider an ordinal id or an
# embedding instead if memory or training time becomes a problem.
df = pd.get_dummies(
    df,
    columns=['player_id', 'season']
)

# Advanced-stat columns that are only populated for some position groups.
passing_cols = ['passing_epa', 'pacr', 'dakota']
receiving_cols = ['receiving_epa', 'racr', 'target_share', 'air_yards_share', 'wopr']


def _position_frame(frame, group, drop_cols, required_cols=()):
    """Return the cleaned rows of ``frame`` that belong to one position group.

    Steps, in order: select rows whose ``position_group`` equals ``group``,
    one-hot encode ``position_group``, drop ``drop_cols``, and finally drop
    rows with a missing value in any of ``required_cols`` (skipped when
    ``required_cols`` is empty).

    Parameters
    ----------
    frame : pandas.DataFrame
        Source frame; it is not modified.
    group : str
        Value of ``position_group`` to keep (for example ``'QB'``).
    drop_cols : sequence of str
        Columns removed from the result.
    required_cols : sequence of str, optional
        Columns that must be non-null in every returned row.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the selected, cleaned rows.
    """
    subset = frame[frame['position_group'] == group].copy()
    subset = pd.get_dummies(subset, columns=['position_group'])
    subset = subset.drop(columns=list(drop_cols))
    if len(required_cols) > 0:
        subset = subset.dropna(subset=list(required_cols))
    return subset


# QB data cleaning: receiving stats are dropped, passing/rushing stats must be present.
qb_df = _position_frame(
    df,
    'QB',
    drop_cols=receiving_cols,
    required_cols=passing_cols + ['rushing_epa'],
)
qb_df_target = qb_df[target]
qb_df_features = qb_df.drop(columns=[target])

# RB data cleaning: every advanced-stat column is dropped, so no row filtering is applied.
rb_df = _position_frame(
    df,
    'RB',
    drop_cols=passing_cols + ['receiving_epa', 'rushing_epa', 'racr',
                              'target_share', 'air_yards_share', 'wopr'],
)
rb_df_target = rb_df[target]
rb_df_features = rb_df.drop(columns=[target])

# WR data cleaning: passing/rushing stats are dropped, receiving stats must be present.
wr_df = _position_frame(
    df,
    'WR',
    drop_cols=passing_cols + ['rushing_epa'],
    required_cols=receiving_cols,
)
wr_df_target = wr_df[target]
wr_df_features = wr_df.drop(columns=[target])

# TE data cleaning: same treatment as WR.
te_df = _position_frame(
    df,
    'TE',
    drop_cols=passing_cols + ['rushing_epa'],
    required_cols=receiving_cols,
)
te_df_target = te_df[target]
te_df_features = te_df.drop(columns=[target])



#sequence_length = 4  # Example: 4 weeks of data
#X_qb, y_qb = create_sequences(np.hstack((qb_features, qb_target.reshape(-1, 1))), sequence_length)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(


# Sequence generation

In [22]:
# Inspect the QB feature frame (the QB rows with the target column removed).
qb_df_features

Unnamed: 0,week,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_yards,sack_fumbles,sack_fumbles_lost,...,season_2015,season_2016,season_2017,season_2018,season_2019,season_2020,season_2021,season_2022,season_2023,position_group_QB
44,1,27,48,355,5,3,1,7,0,0,...,False,False,False,False,False,False,False,False,False,True
45,2,9,21,101,0,2,2,11,0,0,...,False,False,False,False,False,False,False,False,False,True
46,4,14,19,186,2,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
47,5,21,38,177,1,2,1,7,0,0,...,False,False,False,False,False,False,False,False,False,True
48,6,19,31,260,0,1,3,16,0,0,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128850,18,20,26,264,2,0,2,18,0,0,...,False,False,False,False,False,False,False,False,True,True
128853,1,24,37,223,1,1,4,8,0,0,...,False,False,False,False,False,False,False,False,True,True
128854,2,6,10,56,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,True,True
128855,4,11,25,200,2,0,2,4,0,0,...,False,False,False,False,False,False,False,False,True,True


In [23]:
def create_sequences(features, target, sequence_length, groups=None):
    """Build sliding windows of consecutive rows and the target that follows each.

    Window ``i`` holds rows ``i .. i + sequence_length - 1`` of ``features``
    and is paired with ``target[i + sequence_length]``.

    Parameters
    ----------
    features : array-like, shape (n_rows, ...)
        Row-ordered feature values.
    target : array-like, shape (n_rows,)
        Target value for every row.
    sequence_length : int
        Number of consecutive rows per window; must be at least 1.
    groups : array-like of shape (n_rows,), optional
        Group label per row (for example a player id). When given, a window is
        kept only if all of its rows *and* its target row carry the same
        label, so no sample straddles two groups. When omitted, every window
        is kept.

    Returns
    -------
    X : numpy.ndarray, shape (n_windows, sequence_length, ...)
    y : numpy.ndarray, shape (n_windows,)
        When no window can be built, correctly shaped empty arrays are
        returned so that downstream code can still read ``X.shape``.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1, or if ``groups`` does not
        have one label per feature row.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be at least 1")

    features = np.asarray(features)
    target = np.asarray(target)
    if groups is not None:
        groups = np.asarray(groups)
        if len(groups) != len(features):
            raise ValueError("groups must contain one label per feature row")

    X, y = [], []
    for start in range(len(features) - sequence_length):
        stop = start + sequence_length
        # Rows start..stop cover the window plus the target row; skip the
        # sample when they do not all belong to the same group.
        if groups is not None and not np.all(groups[start:stop + 1] == groups[start]):
            continue
        X.append(features[start:stop])
        y.append(target[stop])

    if not X:
        empty_X = np.empty((0, sequence_length) + features.shape[1:], dtype=features.dtype)
        empty_y = np.empty((0,), dtype=target.dtype)
        return empty_X, empty_y
    return np.array(X), np.array(y)

In [24]:
# Each sample is a window of 3 consecutive rows; the target is taken from the
# row that follows the window. Rows are consecutive in the frame, not
# necessarily consecutive weeks: gaps caused by injuries, transitions between
# seasons, etc. are not accounted for.
sequence_length = 3

# Convert the features with an explicit float dtype. The frames mix boolean
# dummy columns with numeric stats, and `.values` on such a frame falls back
# to an object-dtype array, which is slow and memory-hungry.
# NOTE(review): this assumes every remaining feature column is numeric or
# boolean; a leftover string column would raise here.
qb_df_features_sequences, qb_df_target_sequences = create_sequences(
    qb_df_features.to_numpy(dtype=np.float32), qb_df_target.values, sequence_length)
rb_df_features_sequences, rb_df_target_sequences = create_sequences(
    rb_df_features.to_numpy(dtype=np.float32), rb_df_target.values, sequence_length)
wr_df_features_sequences, wr_df_target_sequences = create_sequences(
    wr_df_features.to_numpy(dtype=np.float32), wr_df_target.values, sequence_length)
te_df_features_sequences, te_df_target_sequences = create_sequences(
    te_df_features.to_numpy(dtype=np.float32), te_df_target.values, sequence_length)

In [None]:
# Shapes of the QB feature windows and of their matching targets.
qb_df_features_sequences.shape, qb_df_target_sequences.shape

((11717, 3, 4040), (11717,))

# Train/test time-series split and cross-validation

In [None]:
# Short aliases for the windowed features (X_*) and targets (y_*) of each
# position group.
X_qb, y_qb = qb_df_features_sequences, qb_df_target_sequences
X_rb, y_rb = rb_df_features_sequences, rb_df_target_sequences
X_wr, y_wr = wr_df_features_sequences, wr_df_target_sequences
X_te, y_te = te_df_features_sequences, te_df_target_sequences

# Five-fold splitter that assigns folds by row position.
# NOTE(review): the rows were sorted by player_id before season and week, so
# row order is not purely chronological; confirm this split is the intended one.
tscv = TimeSeriesSplit(n_splits=5)


# Normalization

In [None]:
def normalize_sequences(X_train, X_test):
    """Standardize 3-D window arrays with statistics fitted on the training set.

    Both arrays are flattened to 2-D (one row per window row, one column per
    feature), scaled with a ``StandardScaler`` fitted on the training rows
    only, and reshaped back to their original shapes.

    Parameters
    ----------
    X_train, X_test : numpy.ndarray
        3-D arrays whose last axis indexes the features.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train_normalized, X_test_normalized)``, each with the same shape
        as its input.
    """
    n_features = X_train.shape[2]
    feature_scaler = StandardScaler()

    # Fit on the training rows only, then apply the same scaling to the test rows.
    train_scaled = feature_scaler.fit_transform(X_train.reshape(-1, n_features))
    test_scaled = feature_scaler.transform(X_test.reshape(-1, n_features))

    return train_scaled.reshape(X_train.shape), test_scaled.reshape(X_test.shape)

In [None]:
# Windowed arrays per position group, keyed by the label used in the results.
position_arrays = {
    'qb': (X_qb, y_qb),
    'rb': (X_rb, y_rb),
    'wr': (X_wr, y_wr),
    'te': (X_te, y_te),
}

# One list of normalized train/test folds per position group.
normalized_datasets = {key: [] for key in position_arrays}

for key, (X, y) in position_arrays.items():
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Scale with statistics fitted on this fold's training windows.
        X_train_normalized, X_test_normalized = normalize_sequences(X_train, X_test)

        fold = {
            'X_train': X_train_normalized,
            'X_test': X_test_normalized,
            'y_train': y_train,
            'y_test': y_test
        }
        normalized_datasets[key].append(fold)

# Report the array shapes of every stored fold.
for key, datasets in normalized_datasets.items():
    print(f"Normalized datasets for {key}:")
    for i, dataset in enumerate(datasets):
        print(f"  Split {i + 1}:")
        print(f"    X_train shape: {dataset['X_train'].shape}")
        print(f"    X_test shape: {dataset['X_test'].shape}")
        print(f"    y_train shape: {dataset['y_train'].shape}")
        print(f"    y_test shape: {dataset['y_test'].shape}")

KeyboardInterrupt: 

# Model building

In [None]:
def create_rnn_model(input_shape, units=50, num_lstm_layers=1, activation='tanh', recurrent_activation='sigmoid',
                     use_bias=True, kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal',
                     bias_initializer='zeros', unit_forget_bias=True, kernel_regularizer=None,
                     recurrent_regularizer=None, bias_regularizer=None, activity_regularizer=None,
                     kernel_constraint=None, recurrent_constraint=None, bias_constraint=None,
                     dropout=0.0, recurrent_dropout=0.0, return_sequences=False, return_state=False,
                     go_backwards=False, stateful=False, unroll=False):
    """Build and compile a stacked-LSTM regression model.

    The model is ``num_lstm_layers`` LSTM layers, each followed by a
    ``Dropout(dropout)`` layer, and a final ``Dense(1)`` output. It is
    compiled with the Adam optimizer and mean-squared-error loss.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to the first LSTM layer only.
    units : int
        Number of units in every LSTM layer.
    num_lstm_layers : int
        Number of stacked LSTM layers.
    dropout : float
        Used both as the LSTM input dropout and as the rate of the Dropout
        layer added after each LSTM layer.
    return_sequences : bool
        Kept for backward compatibility and no longer consulted. Every
        non-final LSTM layer always returns full sequences, because the next
        LSTM layer needs a sequence as input; the final LSTM layer never
        does, so the Dense head produces one value per sample.
    return_state : bool
        Forwarded to every LSTM layer.
        NOTE(review): a layer that returns its state has multiple outputs,
        which a Sequential model presumably cannot chain; confirm before
        enabling.
    Other parameters
        Forwarded unchanged to every ``LSTM`` layer.

    Returns
    -------
    A compiled ``Sequential`` model.
    """
    # Settings shared by every LSTM layer in the stack.
    shared_lstm_kwargs = dict(
        units=units,
        activation=activation,
        recurrent_activation=recurrent_activation,
        use_bias=use_bias,
        kernel_initializer=kernel_initializer,
        recurrent_initializer=recurrent_initializer,
        bias_initializer=bias_initializer,
        unit_forget_bias=unit_forget_bias,
        kernel_regularizer=kernel_regularizer,
        recurrent_regularizer=recurrent_regularizer,
        bias_regularizer=bias_regularizer,
        activity_regularizer=activity_regularizer,
        kernel_constraint=kernel_constraint,
        recurrent_constraint=recurrent_constraint,
        bias_constraint=bias_constraint,
        dropout=dropout,
        recurrent_dropout=recurrent_dropout,
        return_state=return_state,
        go_backwards=go_backwards,
        stateful=stateful,
        unroll=unroll,
    )

    model = Sequential()

    for i in range(num_lstm_layers):
        is_last_lstm = i == num_lstm_layers - 1

        layer_kwargs = dict(shared_lstm_kwargs)
        # Only the first layer declares the input shape; the argument is left
        # out entirely for later layers instead of being passed as None.
        if i == 0:
            layer_kwargs['input_shape'] = input_shape

        # Non-final layers must emit the whole sequence for the next LSTM.
        model.add(LSTM(return_sequences=not is_last_lstm, **layer_kwargs))

        # Dropout layer after each LSTM layer, using the same rate.
        model.add(Dropout(dropout))

    model.add(Dense(1))  # Output layer for regression
    model.compile(optimizer='adam', loss='mean_squared_error')

    return model


In [None]:
# run the model on the normalized datasets
rnn_models = {
    'qb': [],
    'rb': [],
    'wr': [],
    'te': []
}

for key, datasets in normalized_datasets.items():
    for i, dataset in enumerate(datasets):
        print(f"Training RNN model for {key} - split {i + 1}...")
        
        # Get the input shape
        input_shape = dataset['X_train'].shape[1:]
        
        # Create the RNN model
        model = create_rnn_model(input_shape, num_lstm_layers=4)
        
        # Train the model
        model.fit(
            dataset['X_train'],
            dataset['y_train'],
            epochs=10,
            batch_size=32,
            validation_data=(dataset['X_test'], dataset['y_test']),
            verbose=1
        )
        
        # Store the model
        rnn_models[key].append(model)

# Model tuning and cross-validation

# Training

# Testing

# Performance evaluation

#### RMSE
#### MAE

# Prediction