### Import packages

Import of the packages that will be needed for the project.  This includes packages for data manipulation, sklearn modules and custom functions.

In [1]:
import pandas as pd
import numpy as np
import pickle

import sklearn
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler, MinMaxScaler, FunctionTransformer, PowerTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from Project_Functions import offensive_contribution, trailing_stats_mean, tier_maker, get_tiers, get_yards, get_contribution, get_touchdowns, LogShift, stats_for_trailing

In [2]:
import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action = 'ignore',
                     category = SettingWithCopyWarning)

### Import Data

Let's import the dataframe that we will be using for modelling

In [3]:
data = pd.read_csv('Data/weekly_data.csv')

data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,GameKey,PlayerID,GameDate,Team,Opponent,HomeOrAway,Number,Name,...,FieldGoalsMade40to49,FieldGoalsMade50Plus,InjuryStatus,MatchString,Season,Week,TeamPoints,OpponentPoints,YardsFor,YardsAgainst
0,0,0,201810122,8283,2018-09-09T13:00:00,TB,NO,AWAY,14,Ryan Fitzpatrick,...,0.0,0.0,,20181TBNO,2018,1,48.0,40.0,529.0,475.0
1,1,1,201810122,18878,2018-09-09T13:00:00,NO,TB,HOME,41,Alvin Kamara,...,0.0,0.0,,20181NOTB,2018,1,40.0,48.0,475.0,529.0
2,2,2,201810122,7242,2018-09-09T13:00:00,NO,TB,HOME,9,Drew Brees,...,0.0,0.0,,20181NOTB,2018,1,40.0,48.0,475.0,529.0
3,3,4,201810129,18082,2018-09-09T16:05:00,KC,LAC,AWAY,10,Tyreek Hill,...,0.0,0.0,,20181KCLAC,2018,1,38.0,28.0,362.0,541.0
4,4,5,201810108,18983,2018-09-09T13:00:00,PIT,CLE,AWAY,30,James Conner,...,0.0,0.0,,20181PITCLE,2018,1,21.0,21.0,472.0,327.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32046 entries, 0 to 32045
Data columns (total 63 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    32046 non-null  int64  
 1   Unnamed: 0.1                  32046 non-null  int64  
 2   GameKey                       32046 non-null  int64  
 3   PlayerID                      32046 non-null  int64  
 4   GameDate                      32046 non-null  object 
 5   Team                          32046 non-null  object 
 6   Opponent                      32046 non-null  object 
 7   HomeOrAway                    32046 non-null  object 
 8   Number                        32046 non-null  int64  
 9   Name                          32046 non-null  object 
 10  Position                      32046 non-null  object 
 11  Started                       32046 non-null  int64  
 12  PassingAttempts               32046 non-null  float64
 13  P

In [5]:
data['Week'] = data['Week'].astype(str)

Before the train test split, we have to calculate the trailing average fantasy points for each observation, as we cannot incorporate this step into the pipeline without causing data leakage.

In [6]:
def trailing_stats(df):
    
    """
    Function to create a new column with a trailing aggregate mean
    as a new feature for prediction.
    
    Inputs:
        - df: The dataframe on which the function will be applied
        - Column: The column on which to apply the function
        - Window: The number of past values to consider when apply the function

        
    Output:
        - An aggregate value
        
    """
    #Access the column names in stats_for_trailing
    global stats_for_trailing
    
    # Get all unique players in the DataFrame
    players = df['Name'].unique().tolist()
    
    # Define a DataFrame to hold our values
    df_out = pd.DataFrame()
    # Loop through the unique players
    for player in players:
        # Create a temporary dataframe for each player
        temp_df = df[(df['Name'] == player) & (df['InjuryStatus'] != 'Out')] 
        # Calculate the n game trailing average for all players.  Set closed parameter to 'left'
        # so that the current value for fantasy points is not included in the calculation.
        # Backfill the two resulting NaN values
        for column in stats_for_trailing:
            temp_df[f'TA3{column}'] = temp_df.loc[:,column].rolling(window = 3, 
                                                              closed = 'left').mean().fillna(method = 'bfill')
            temp_df[f'TA7{column}'] = temp_df.loc[:,column].rolling(window = 7, 
                                                              closed = 'left').mean().fillna(method = 'bfill')
        # Append the temporary dataframe to the output
        df_out = df_out.append(temp_df)
    # Return a dataframe with the values sorted by the original index
    df_out.sort_index(inplace = True)
    return df_out
    

In [7]:
stats_for_trailing = ['FantasyPointsPPR']

In [8]:
# Prepare the trailing average fantasy points column

data = trailing_stats(data)
data[data['Name'] == 'Tom Brady'][['FantasyPointsPPR', 'TA3FantasyPointsPPR', 'TA7FantasyPointsPPR']].head(35)

Unnamed: 0,FantasyPointsPPR,TA3FantasyPointsPPR,TA7FantasyPointsPPR
14,21.28,15.053333,19.305714
506,16.36,15.053333,19.305714
1077,7.52,15.053333,19.305714
1454,18.96,15.053333,19.305714
1860,27.54,14.28,19.305714
2325,21.8,18.006667,19.305714
2758,21.68,22.766667,19.305714
3212,13.76,23.673333,19.305714
3665,15.66,19.08,18.231429
4128,11.76,17.033333,18.131429


### Train Test Split

In [9]:
data.isna().sum().sort_values()

Unnamed: 0                 0
ReceivingLong              0
Fumbles                    0
FumblesLost                0
PuntReturns                0
                       ...  
PlayerID                   0
GameKey                    0
TA3FantasyPointsPPR      240
TA7FantasyPointsPPR      765
InjuryStatus           28586
Length: 65, dtype: int64

In [10]:
# Separate data from the target
# y = data['FantasyPointsPPR']

# Apply a log transform to the target to see how it impacts prediction
y = LogShift(data['FantasyPointsPPR'])
data.drop(columns = ['FantasyPointsPPR'],
            inplace = True)
X = data

In [11]:
# Execute the train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size = 0.2,
                                                    shuffle = False,
                                                   random_state = 13)

In [12]:
X_train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,GameKey,PlayerID,GameDate,Team,Opponent,HomeOrAway,Number,Name,...,InjuryStatus,MatchString,Season,Week,TeamPoints,OpponentPoints,YardsFor,YardsAgainst,TA3FantasyPointsPPR,TA7FantasyPointsPPR
0,0,0,201810122,8283,2018-09-09T13:00:00,TB,NO,AWAY,14,Ryan Fitzpatrick,...,,20181TBNO,2018,1,48.0,40.0,529.0,475.0,32.466667,17.142857
1,1,1,201810122,18878,2018-09-09T13:00:00,NO,TB,HOME,41,Alvin Kamara,...,,20181NOTB,2018,1,40.0,48.0,475.0,529.0,31.666667,26.442857
2,2,2,201810122,7242,2018-09-09T13:00:00,NO,TB,HOME,9,Drew Brees,...,,20181NOTB,2018,1,40.0,48.0,475.0,529.0,29.906667,21.371429
3,3,4,201810129,18082,2018-09-09T16:05:00,KC,LAC,AWAY,10,Tyreek Hill,...,,20181KCLAC,2018,1,38.0,28.0,362.0,541.0,21.3,21.571429
4,4,5,201810108,18983,2018-09-09T13:00:00,PIT,CLE,AWAY,30,James Conner,...,,20181PITCLE,2018,1,21.0,21.0,472.0,327.0,22.733333,25.6


### Feature Engineering

The main features that we will be engineering to predict a player's fantasy output will be the 5-game trailing average of various statistics as well as the binning of players into their respective tiers based on recent performance.

In [13]:
# Define the columns for which we want a 5 game trailing average.

stats_for_trailing = ['TotalTouchdowns','RushingYards','PassingInterceptions','PassingTouchdowns','PassingRating','PassingYards',
                      'PassingCompletionPercentage', 'PassingLong','RushingYards', 'RushingTouchdowns', 'RushingLong',
                      'RushingYardsPerAttempt', 'ReceivingYardsPerReception', 'PuntReturns', 'PuntReturnTouchdowns',
                      'Receptions','ReceivingYards','ReceivingTargets', 'ReceivingTouchdowns', 'ExtraPointsMade', 'FieldGoalsMade',
                      'FieldGoalsMade40to49','FieldGoalsMade50Plus','Fumbles','FumblesLost', 'TeamPoints', 'OpponentPoints', 'YardsFor', 'YardsAgainst']

trailing_stats = []
for col in stats_for_trailing:
    trailing_stats.append('TA7' + col)
    trailing_stats.append('TA3' + col)
trailing_stats.append('TA3FantasyPointsPPR')
trailing_stats.append('TA7FantasyPointsPPR')

In [14]:
# Instantiate the function transformers for the feature engineering pipeline

touchdown_transformer = FunctionTransformer(get_touchdowns) # Get total touchdowns per week per player
yard_transformer = FunctionTransformer(get_yards) # Get total yardage per week per player
trailing_transformer = FunctionTransformer(trailing_stats_mean) # Get the 5 game trailing averages of appropriate statistics
tier_transformer = FunctionTransformer(get_tiers) # Bin players into the appropriate tiers based on recent performance
contribution_transformer = FunctionTransformer(get_contribution) # Calculate the offensive contribution of a given player relative to the team's offense


# Instantiate the pipeline for the necessary transformations

engineering = Pipeline([('touchdown', touchdown_transformer),
                        ('yards', yard_transformer),
                       ('trailing', trailing_transformer),
                       ('tier', tier_transformer),
                       ('contribution', contribution_transformer)])

<br>

### Preprocessing

As shown above, the bulk of the null values fall into one of two categories.  They are either:
* In the InjuryStatus column
    * Here we can impute a value of healthy, as the only values in the injury column are 
* In the TA (trailing average) columns we created
    * No player with a null value played more than 5 games, therefore we cannot calculate the trailing average for them.  We will impute a default value of 0 for these columns, as they represent players who likely did not have much impact.  If they had an impact, they likely would have played in more games.  I will explore imputing the median value as well through a grid search.

In [15]:
# Define the groups of columns for preprocessing steps.

categorical_columns = ['Week',
                       'Team',
                      'Opponent',
                      'PlayerTier',
                      'InjuryStatus']

numerical_columns = trailing_stats


In [16]:
# Create a custom function to generate a log-transformed version of continuous data with a constant 5 added prior to the transform

LogShiftTransformer = FunctionTransformer(LogShift)

In [18]:
# Define the preprocessing steps for categorical features
categorical_transform = Pipeline([('impute_cat',SimpleImputer(strategy = 'constant',
                                                              fill_value = 'Healthy')),
                                 ('one_hot_encoder', OneHotEncoder(handle_unknown = 'ignore'))])

# Define the preprocessing steps for numerical features
numerical_transform = Pipeline([('impute_num', SimpleImputer(strategy = 'mean')),
                               ('scaler', LogShiftTransformer)])

# Instantiate the column transformer object for the preprocessing steps
preprocessing = ColumnTransformer([('num', numerical_transform, numerical_columns),
                                  ('cat', categorical_transform, categorical_columns)])

In [19]:
# Instantiate a pipeline with a linear regression model as a baseline

pipeline = Pipeline([('engineering', engineering),
                    ('prep', preprocessing),
                    ('model', Ridge())])

In [20]:
# Set param grid values, parameters for grid search 

param_grid = {'model__alpha': [1]}

grid_search = GridSearchCV(pipeline, 
                          param_grid = param_grid,
                           scoring = 'neg_mean_squared_error',
                          cv = 5,
                          verbose = 3)

In [21]:
# Fit the grid search to X_train and y_train

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ...................model__alpha=1;, score=-0.073 total time= 1.5min
[CV 2/5] END ...................model__alpha=1;, score=-0.086 total time= 1.6min
[CV 3/5] END ...................model__alpha=1;, score=-0.081 total time= 1.6min
[CV 4/5] END ...................model__alpha=1;, score=-0.082 total time= 1.5min
[CV 5/5] END ...................model__alpha=1;, score=-0.085 total time= 1.6min


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('engineering',
                                        Pipeline(steps=[('touchdown',
                                                         FunctionTransformer(func=<function get_touchdowns at 0x7f99c820b280>)),
                                                        ('yards',
                                                         FunctionTransformer(func=<function get_yards at 0x7f99c820b310>)),
                                                        ('trailing',
                                                         FunctionTransformer(func=<function trailing_stats_mean at 0x7f99c820b040>)),
                                                        ('tier',
                                                         FunctionTransformer(func=<fun...
                                                                          'TA7PuntReturnTouchdowns',
                                                                          'TA3

In [22]:
grid_search.best_score_

-0.08122583016274658

In [23]:
grid_search.best_params_

{'model__alpha': 1}

In [24]:
preds = grid_search.predict(X_test)

In [25]:
mean_squared_error(y_test, preds)

0.08748034965451248

In [26]:
mean_absolute_error(y_test, preds)


0.21900814690557296

In [27]:
r2_score(y_test, preds)

0.475896421780864

Let's dig into the predictions a little bit

In [28]:
df_preds = pd.DataFrame(y_test)
df_preds['Predicted'] = preds
df_preds.sort_values(by = 'Predicted',
                    ascending = False)

Unnamed: 0,FantasyPointsPPR,Predicted
26556,3.622205,3.609118
27197,2.861057,3.585089
26072,3.202746,3.577696
25971,3.376221,3.576842
29252,3.080992,3.536948
...,...,...
29006,2.302585,2.141232
28068,2.302585,2.132144
26958,2.302585,2.122726
29746,2.894806,2.111681


### Save model


In [29]:
import pickle

pickle.dump(grid_search, open('Pickles/log_transform_linear.pickle', 'wb'))

# Time to dig into the results

In [None]:
df_review = X_test
df_review['Predicted'] = df_preds['Predicted']
df_review['Target'] = df_preds['FantasyPointsPPR']
df_review['Error'] = df_review['Target'] - df_review['Predicted']
df_review.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Let's visualize how the prediction error varies by player position.

plt.figure(figsize = (12, 9))
sns.violinplot(x = df_review['Position'],
               y = df_review['Error'])

In [None]:
# Let's apply our engineered features to df_review and keep on digging
df_review = trailing_stats_mean(df_review)
df_review['PlayerTier'] = df_review.apply(lambda x: tier_maker(x['Position'], x['TA7FantasyPointsPPR']), axis = 1)


In [None]:
df_review['AbsoluteError'] = abs(df_review['Error'])
mean_tier_error = df_review[['PlayerTier', 'Error', 'AbsoluteError']].groupby('PlayerTier').mean()
sum_tier_error = df_review[['PlayerTier', 'Error','AbsoluteError']].groupby('PlayerTier').sum()
count_tier_error = df_review[['PlayerTier', 'Error']].groupby('PlayerTier').count()
mean_tier_error['Total Absolute Error'] = sum_tier_error['AbsoluteError']
mean_tier_error['Total Error'] = sum_tier_error['Error']
mean_tier_error['Error Count'] = count_tier_error['Error']
mean_tier_error.rename({'Error': 'Mean Error'}, 
                       axis = 1)