# Preliminary

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler


2024-08-26 18:27:37.355838: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-26 18:27:37.368223: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-26 18:27:37.371665: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-26 18:27:37.382000: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv('data/player_stats.csv')


# EDA

In [3]:
df.dtypes 



player_id                       object
player_name                     object
player_display_name             object
position                        object
position_group                  object
headshot_url                    object
recent_team                     object
season                           int64
week                             int64
season_type                     object
opponent_team                   object
completions                      int64
attempts                         int64
passing_yards                    int64
passing_tds                      int64
interceptions                    int64
sacks                            int64
sack_yards                       int64
sack_fumbles                     int64
sack_fumbles_lost                int64
passing_air_yards                int64
passing_yards_after_catch        int64
passing_first_downs              int64
passing_epa                    float64
passing_2pt_conversions          int64
pacr                     

In [4]:
def LoadData(positions, years):
    # Initialize an empty dictionary to store DataFrames by position
    data_dict = {}
    
    for position in positions:
        # Initialize an empty list to collect DataFrames for each year
        df_list = []
        
        for year in years:
            filepath = f'NFL-data-Players/{year}/{position}_season.csv'
            df = pd.read_csv(filepath)
            df['Year'] = year
            df_list.append(df)
        
        # Concatenate all yearly DataFrames into a single DataFrame for the current position
        position_data = pd.concat(df_list, ignore_index=True)
        position_data = position_data.fillna(0)
        
        # Store the DataFrame in the dictionary with position as the key
        data_dict[position] = position_data
    
    return data_dict


In [5]:
df

Unnamed: 0,player_id,player_name,player_display_name,position,position_group,headshot_url,recent_team,season,week,season_type,...,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,special_teams_tds,fantasy_points,fantasy_points_ppr
0,00-0000003,,Abdul-Karim al-Jabbar,RB,RB,,MIA,1999,1,REG,...,0,0.292378,0,0.000000,0.052632,,,0,12.7,13.7
1,00-0000003,,Abdul-Karim al-Jabbar,RB,RB,,MIA,1999,2,REG,...,1,0.377009,0,0.000000,0.117647,,,0,5.1,8.1
2,00-0000003,,Abdul-Karim al-Jabbar,RB,RB,,MIA,1999,4,REG,...,0,-0.699578,0,,0.023810,,,0,0.2,0.2
3,00-0000003,,Abdul-Karim al-Jabbar,RB,RB,,CLE,1999,7,REG,...,0,-0.228454,0,0.000000,0.050000,,,0,3.5,5.5
4,00-0000003,,Abdul-Karim al-Jabbar,RB,RB,,CLE,1999,8,REG,...,0,,0,,,,,0,3.9,3.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128868,00-0039165,Z.Charbonnet,Zach Charbonnet,RB,RB,https://static.www.nfl.com/image/private/f_aut...,SEA,2023,14,REG,...,0,-0.787724,0,-0.666667,0.032258,-0.025532,0.030515,0,4.8,5.8
128869,00-0039165,Z.Charbonnet,Zach Charbonnet,RB,RB,https://static.www.nfl.com/image/private/f_aut...,SEA,2023,15,REG,...,0,-0.442067,0,0.000000,0.031250,0.012658,0.055736,0,1.6,1.6
128870,00-0039165,Z.Charbonnet,Zach Charbonnet,RB,RB,https://static.www.nfl.com/image/private/f_aut...,SEA,2023,16,REG,...,0,-1.961893,0,0.307692,0.055556,-0.056034,0.044109,0,-0.4,0.6
128871,00-0039165,Z.Charbonnet,Zach Charbonnet,RB,RB,https://static.www.nfl.com/image/private/f_aut...,SEA,2023,17,REG,...,2,1.644468,0,3.000000,0.151515,0.057778,0.267717,0,4.1,9.1


In [6]:
df.isnull().sum(axis=0)

player_id                           0
player_name                     67380
player_display_name                 3
position                           72
position_group                     72
headshot_url                    59092
recent_team                         0
season                              0
week                                0
season_type                         0
opponent_team                       0
completions                         0
attempts                            0
passing_yards                       0
passing_tds                         0
interceptions                       0
sacks                               0
sack_yards                          0
sack_fumbles                        0
sack_fumbles_lost                   0
passing_air_yards                   0
passing_yards_after_catch           0
passing_first_downs                 0
passing_epa                    112406
passing_2pt_conversions             0
pacr                           112846
dakota      

# Feature Engineering

In [7]:
# Rolling average of last 4 weeks
# season-to-date stats
# trend analysis
# week to wee

# Data cleaning

In [8]:
# Full df cleaning
df.drop(
    columns=[
        'headshot_url',
        'player_name',
        'player_display_name',
        'recent_team',
        'opponent_team',
        'position'],
    inplace=True
)
# Drop any rows with season="POST"
df = df[df['season_type'] != 'POST']

df = df.sort_values(by=['player_id', 'season', 'week'])
target = 'fantasy_points_ppr'

df = pd.get_dummies(
    df,
    columns=['player_id','season']
)

# QB data cleaning
qb_df = df[df['position_group'] == 'QB'].copy()

qb_df = pd.get_dummies(
    qb_df,
    columns=['position_group']
)


qb_df.drop(
    columns=[
        'receiving_epa',
        'racr',
        'target_share',
        'air_yards_share',
        'wopr'
    ], inplace=True
)
qb_df.dropna(
    subset=[
        'passing_epa',
        'pacr',
        'dakota',
        'rushing_epa'
    ], inplace=True
)


qb_df_target = qb_df[target]
qb_df_features = qb_df.drop(columns=[target])




# RB data cleaning

rb_df = df[df['position_group'] == 'RB'].copy()

rb_df = pd.get_dummies(
    rb_df,
    columns=['position_group']
)

rb_df.drop(
    columns=[
        'passing_epa',
        'pacr',
        'dakota',
        'receiving_epa',
        'rushing_epa',
        'racr',
        'target_share',
        'air_yards_share',
        'wopr'
    ], inplace=True
)

rb_df_target = rb_df[target]
rb_df_features = rb_df.drop(columns=[target])



# WR data cleaning
wr_df = df[df['position_group'] == 'WR'].copy()

wr_df = pd.get_dummies(
    wr_df,
    columns=['position_group']
)

wr_df.drop(
    columns=[
        'passing_epa',
        'pacr',
        'dakota',
        'rushing_epa',
    ], inplace=True
)

wr_df.dropna(
    subset=[
        'receiving_epa',
        'racr',
        'target_share',
        'air_yards_share',
        'wopr'
    ], inplace=True
)


wr_df_target = wr_df[target]
wr_df_features = wr_df.drop(columns=[target])



# TE data cleaning
te_df = df[df['position_group'] == 'TE'].copy()

te_df = pd.get_dummies(
    te_df,
    columns=['position_group']
)

te_df.drop(
    columns=[
        'passing_epa',
        'pacr',
        'dakota',
        'rushing_epa',
    ], inplace=True
)

te_df.dropna(
    subset=[
        'receiving_epa',
        'racr',
        'target_share',
        'air_yards_share',
        'wopr'
    ], inplace=True
)

te_df_target = te_df[target]
te_df_features = te_df.drop(columns=[target])



#sequence_length = 4  # Example: 4 weeks of data
#X_qb, y_qb = create_sequences(np.hstack((qb_features, qb_target.reshape(-1, 1))), sequence_length)

In [9]:
df.columns

Index(['position_group', 'week', 'season_type', 'completions', 'attempts',
       'passing_yards', 'passing_tds', 'interceptions', 'sacks', 'sack_yards',
       ...
       'season_2014', 'season_2015', 'season_2016', 'season_2017',
       'season_2018', 'season_2019', 'season_2020', 'season_2021',
       'season_2022', 'season_2023'],
      dtype='object', length=4045)

In [10]:
qb_df.columns

Index(['week', 'season_type', 'completions', 'attempts', 'passing_yards',
       'passing_tds', 'interceptions', 'sacks', 'sack_yards', 'sack_fumbles',
       ...
       'season_2015', 'season_2016', 'season_2017', 'season_2018',
       'season_2019', 'season_2020', 'season_2021', 'season_2022',
       'season_2023', 'position_group_QB'],
      dtype='object', length=4040)

In [11]:
qb_df_features_encoded.columns

NameError: name 'qb_df_features_encoded' is not defined

# Sequence generation

In [None]:
def create_sequences(features, target, sequence_length):
    X, y = [], []
    for i in range(len(features) - sequence_length):
        X.append(features[i:i+sequence_length])
        y.append(target[i+sequence_length])
    return np.array(X), np.array(y)

In [None]:
def create_rnn_model(input_shape):
    model = Sequential()
    model.add(SimpleRNN(units=50, activation='relu', input_shape=input_shape))
    model.add(Dropout(0.2))  # Optional dropout layer for regularization
    model.add(Dense(1))  # Output layer for regression
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [None]:
qb_df_features_encoded.columns

Index(['recent_team', 'week', 'season_type', 'completions', 'attempts',
       'passing_yards', 'passing_tds', 'interceptions', 'sacks', 'sack_yards',
       ...
       'season_2014', 'season_2015', 'season_2016', 'season_2017',
       'season_2018', 'season_2019', 'season_2020', 'season_2021',
       'season_2022', 'season_2023'],
      dtype='object', length=401)

In [None]:
sequence_length = 4  # Example: 4 weeks of data

qb_features_sequences, qb_target_sequences = create_sequences(qb_df_features.values, qb_df_target.values, sequence_length)
rb_features_sequences, rb_target_sequences = create_sequences(rb_df_features.values, rb_df_target.values, sequence_length)
wr_features_sequences, wr_target_sequences = create_sequences(wr_df_features.values, wr_df_target.values, sequence_length)
te_features_sequences, te_target_sequences = create_sequences(te_df_features.values, te_df_target.values, sequence_length)

# Normalization

In [None]:
# Apply standardization (zero mean, unit variance)
scaler = StandardScaler()

qb_features_sequences = scaler.fit_transform(qb_features_sequences.reshape(-1, qb_df_features.shape[1])).reshape(qb_features_sequences.shape)
rb_features_sequences = scaler.fit_transform(rb_features_sequences.reshape(-1, rb_df_features.shape[1])).reshape(rb_features_sequences.shape)
wr_features_sequences = scaler.fit_transform(wr_features_sequences.reshape(-1, wr_df_features.shape[1])).reshape(wr_features_sequences.shape)
te_features_sequences = scaler.fit_transform(te_features_sequences.reshape(-1, te_df_features.shape[1])).reshape(te_features_sequences.shape)

ValueError: could not convert string to float: 'REG'

# Train/validation/test split

# Model building

# Training

# Testing

# Performance

#### RMSE
#### MAE

# Prediction