### Import packages

Import of the packages that will be needed for the project.  This includes packages for data manipulation, sklearn modules and custom functions.

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler, FunctionTransformer, PowerTransformer
from sklearn.base import TransformerMixin
from sklearn.svm import SVR



In [2]:
import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action = 'ignore')

In [3]:

!pip install pandas==1.2.0



### Import Data

Let's import the dataframe that we will be using for modelling

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
def offensive_contribution(team_yards, player_yards):
    
    """
    Calculate a percentage for the percentage of team yards that a player contributes to.
    
    Input:  
        - Dataframe to use in the calculation
        
    Output:
        - New dataframe column with the desired contribution score
    """
    contribution = player_yards / team_yards
    if contribution > 1.0:
        return 1.0
    else:
        return contribution     

  #--------------------------------------------------------------------      
      
def get_contribution(df):
    
    """
    Apply offensive_contribution(), taking in the whole dataframe as input.
    """
    
    df['YardageContribution'] = df.apply(lambda x: offensive_contribution(x['YardsFor'],
                                                                         x['TotalYards'],
                                                                         ), axis = 1)
                                                      
    return df

#---------------------------------------------------------
# Define the stats for which we need to calculate trailing averages
stats_for_trailing = ['TotalTouchdowns','RushingYards','PassingInterceptions','PassingTouchdowns','PassingRating','PassingYards',
                      'PassingCompletionPercentage', 'PassingLong','RushingYards', 'RushingTouchdowns', 'RushingLong',
                      'RushingYardsPerAttempt', 'ReceivingYardsPerReception', 'PuntReturns', 'PuntReturnTouchdowns',
                      'Receptions','ReceivingYards','ReceivingTargets', 'ReceivingTouchdowns', 'ExtraPointsMade', 'FieldGoalsMade',
                      'FieldGoalsMade40to49','FieldGoalsMade50Plus','Fumbles','FumblesLost', 'TeamPoints', 'OpponentPoints', 'YardsFor', 'YardsAgainst']


def trailing_stats_mean(df):
    
    """
    Function to create a dataframe with a trailing aggregate mean
    as a new feature for prediction.  Does so for each column in the global
    variable stats_for_trailing
    
    Inputs:
        - df: The dataframe on which the function will be applied
        - Column: The column on which to apply the function
        - Window: The number of past values to consider when apply the function

        
    Output:
        - An aggregate value
        
    """
    #Access the column names in stats_for_trailing
    global stats_for_trailing
    
    # Get all unique players in the DataFrame
    players = df['Name'].unique().tolist()
    
    # Define a DataFrame to hold our values
    df_out = pd.DataFrame()
    # Loop through the unique players
    for player in players:
        # Create a temporary dataframe for each player
        temp_df = df[(df['Name'] == player) & (df['InjuryStatus'] != 'Out')]
        # Calculate the n game trailing average for all players.  Set closed parameter to 'left'
        # so that the current value for fantasy points is not included in the calculation.
        # Backfill the two resulting NaN values
        for column in stats_for_trailing:
            temp_df[f'TA7{column}'] = temp_df[column].fillna(method = 'ffill').rolling(window = 7, 
                                                              closed = 'left').mean().fillna(method = 'bfill')
            temp_df[f'TA3{column}'] = temp_df[column].rolling(window = 3, 
                                                              closed = 'left').mean().fillna(method = 'bfill') 
        # Append the temporary dataframe to the output
        df_out = df_out.append(temp_df)
    # Return a dataframe with the values sorted by the original index
    df_out.sort_index(inplace = True)
    return df_out
    
#---------------------------------------------------------

def trailing_stats_single_column(df, column):
    
    """
    Function to create a new column with a trailing aggregate mean
    as a new feature for prediction.
    
    Inputs:
        - df: The dataframe on which the function will be applied
        - Column: The column on which to apply the function
        - Window: The number of past values to consider when apply the function

        
    Output:
        - An aggregate value
        
    """
    
    # Get all unique players in the DataFrame
    players = df['Name'].unique().tolist()
    
    # Make a dataframe to store the output
    df_out = pd.DataFrame()
    # Loop through the unique players
    for player in players:
        # Create a temporary dataframe for each player
        temp_df = df[(df['Name'] == player) & (df['InjuryStatus'] != 'Out')]
        # Calculate the n game trailing average for all players.  Set closed parameter to 'left'
        # so that the current value for fantasy points is not included in the calculation.
        # Backfill the two resulting NaN values
       
        temp_df[f'TA7{column}'] = temp_df[column].fillna(method='ffill').rolling(window = 7, 
                                                            closed = 'left').mean().fillna(method = 'bfill')
        temp_df[f'TA3{column}'] = temp_df[column].rolling(window = 3, 
                                                              closed = 'left').mean().fillna(method = 'bfill')                                                     
        # Append the temporary dataframe to the output
        df_out = df_out.append(temp_df)
    # Return a dataframe with the values sorted by the original index
    df_out.sort_index(inplace = True)
    return df_out
    
#---------------------------------------------------------

def tier_maker(position, points):
    
    """
    Take in two arguments:
    
     - Position: Column of the dataframe holding the player position
     - Points: Trailing average of fantasy points for a given player
    
    Classify players at every position to a tier based on their recent 
    performance (Trailing average fantasy points). Classifications will work
    as follows.
    
    Running Back:
        -RB1: Trailing average greater than 18pts
        -RB2: Trailing average between 12 and 18 pts
        -RB3: Trailing average between 8 and 12 pts
        -RB4: Trailing average below 8 pts
        
    Wide Receiver:
        -WR1: Trailing average greater than 18pts
        -WR2: Trailing average between 12 and 18 pts
        -WR3: Trailing average between 8 and 12 pts
        -WR4: Trailing average below 8 pts
        
    Tight End:
        -TE1: Trailing average greater than 15pts
        -TE2: Trailing average between 10 and 15 pts
        -TE3: Trailing average below 10 pts
        
    Quarterback:
        -QB1: Trailing average greater than 24pts
        -QB2: Trailing average between 18 and 24pts
        -QB3: Trailing average between 12 and 18pts
        -QB4: Trailing average below 12 pts
    
    Kicker:
        - K1: Trailing average greater than 10 pts
        - K2: Trailing average between 7 and 10 points
        - K3: Trailing average below 7 points
        """

    # Let's make tier assignments for running backs
    # Let's make tier assignments for running backs
    if position == 'RB':
        if points > 16:
            return 'RB1'
        elif 12 < points <= 16:
            return 'RB2'
        elif 8 < points <= 12:
            return 'RB3'
        else:
            return 'RB4'
            
    # Let's make tier assignments for wide receivers
    if position == 'WR':
        if points > 18:
            return 'WR1'
        elif 12 < points <= 18:
            return 'WR2'
        elif 8 < points <= 12:
            return 'WR3'
        else:
            return 'WR4'
            
    # Let's make tier assignments for tight ends
    if position == 'TE':
        if points > 12:
            return 'TE1'
        elif 6 < points <= 12:
            return 'TE2'
        else:
            return 'TE3'
    
    # Let's make tier assignments for quarterbacks
    if position == 'QB':
        if points > 22:
            return 'QB1'
        elif 16 < points <= 22:
            return 'QB2'
        elif 8 < points <= 14:
            return 'QB3'
        else:
            return 'QB4'

    # Let's make tier assignments for kickers
    if position == 'K':
        if points > 10:
            return 'K1'
        elif 4 < points <= 8:
            return 'K2'
        else:
            return 'K3'
    
    
    # If noting is returned before this point, return np.nan
    return np.nan
#---------------------------------------------------------------------

def get_tiers(df):
    
    """
    Apply the tier_maker function to the entire dataframe.
    """
    
    df['PlayerTier'] = df.apply(lambda x: tier_maker(x['Position'], x['TA7FantasyPointsPPR']),
                               axis = 1)
    return df
#---------------------------------------------------------------------

def get_touchdowns(df):
    
    """
    Get the total number of touchdowns for a player in a given week.
    
    Input:
        - Dataframe
    Output:
        - Dataframe with a new column representing total touchdowns"""
    
    TD_sum = df['ReceivingTouchdowns'] + df['RushingTouchdowns'] + df['PassingTouchdowns']
    df['TotalTouchdowns'] = TD_sum
    
    return df

def get_yards(df):
    
    """
    Get the total number of yards for a player in a given week.
    
    Input:
        - Dataframe
    Output:
        - Dataframe with a new column representing total touchdowns"""
    
    yardage_sum = df['ReceivingYards'] + df['RushingYards'] + df['PassingYards']
    df['TotalYards'] = yardage_sum
    
    return df

#---------------------------------------------------------------------

def LogShift(X):
    '''
    Apply a constant of 10 to all continuous variables in the dataframe, then 
    apply a natural logarithm.

    Input:
        - Dataframe of continuous features

    Output:
        - Transformed dataframe
        
    '''
    X_10 = X + 10
    X_log = np.log(X_10)
    
    return X_log

#---------------------------------------------------------------------
from sklearn.base import TransformerMixin
# Build a new transformer class to convert the sparse matrix output of the 
# pipeline to a dense matrix compatible with the model

class DenseTransformer(TransformerMixin):

    def fit(self, X, y = None, **fit_params):
        return self

    def transform(self, X, y = None, **fit_params):
        return X.todense()

#---------------------------------------------------------------------
def apply_defensive_strength(data):
    '''
    Apply the defensive strength of the opposing team to the dataframe.
    '''
    # Import the defensive ranking csv
    def_rank = pd.read_csv('drive/MyDrive/LHL_Final_Project/Data/defensive_ranking.csv')
    # Make merge strings to merge the two dataframes
    def_rank['MergeString'] = def_rank['Team'].astype(str) + def_rank['Season'].astype(str) 
    # Drop duplicate columns
    def_rank.drop(columns = ['Team', 'Season'],
                 inplace = True)
    # Make the merge string on the target dataframe
    data['MergeString'] = data['Team'].astype(str) + data['Season'].astype(str)

    # Merge
    data = data.merge(def_rank,
                     how = 'left',
                     left_on = 'MergeString',
                     right_on = 'MergeString')
    return data

In [6]:
data = pd.read_csv('drive/MyDrive/LHL_Final_Project/Data/weekly_data.csv')
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,GameKey,PlayerID,GameDate,Team,Opponent,HomeOrAway,Number,Name,Position,Started,PassingAttempts,PassingCompletions,PassingYards,PassingCompletionPercentage,PassingYardsPerAttempt,PassingYardsPerCompletion,PassingTouchdowns,PassingInterceptions,PassingRating,PassingLong,PassingSacks,PassingSackYards,RushingAttempts,RushingYards,RushingYardsPerAttempt,RushingTouchdowns,RushingLong,ReceivingTargets,Receptions,ReceivingYards,ReceivingYardsPerReception,ReceivingTouchdowns,ReceivingLong,Fumbles,FumblesLost,PuntReturns,PuntReturnYards,PuntReturnTouchdowns,KickReturns,KickReturnYards,KickReturnTouchdowns,FieldGoalsAttempted,FieldGoalsMade,ExtraPointsMade,TwoPointConversionPasses,TwoPointConversionRuns,TwoPointConversionReceptions,FantasyPointsPPR,FieldGoalsMade0to19,FieldGoalsMade20to29,FieldGoalsMade30to39,FieldGoalsMade40to49,FieldGoalsMade50Plus,InjuryStatus,MatchString,Season,Week,TeamPoints,OpponentPoints,YardsFor,YardsAgainst
0,0,0.0,201810122,8283,2018-09-09T13:00:00,TB,NO,AWAY,14,Ryan Fitzpatrick,QB,1,28.0,21.0,417.0,75.0,14.9,19.9,4.0,0.0,156.25,58.0,0.0,0.0,12.0,36.0,3.0,1.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.28,0.0,0.0,0.0,0.0,0.0,,20181TBNO,2018,1,48.0,40.0,529.0,475.0
1,1,1.0,201810122,18878,2018-09-09T13:00:00,NO,TB,HOME,41,Alvin Kamara,RB,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,29.0,3.6,2.0,10.0,12.0,9.0,112.0,12.4,1.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,43.1,0.0,0.0,0.0,0.0,0.0,,20181NOTB,2018,1,40.0,48.0,475.0,529.0
2,2,2.0,201810122,7242,2018-09-09T13:00:00,NO,TB,HOME,9,Drew Brees,QB,1,45.0,37.0,439.0,82.2,9.8,11.9,3.0,0.0,129.54,35.0,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,31.56,0.0,0.0,0.0,0.0,0.0,,20181NOTB,2018,1,40.0,48.0,475.0,529.0
3,3,4.0,201810129,18082,2018-09-09T16:05:00,KC,LAC,AWAY,10,Tyreek Hill,WR,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,2.0,0.0,7.0,8.0,7.0,169.0,24.1,2.0,58.0,0.0,0.0,2.0,95.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.3,0.0,0.0,0.0,0.0,0.0,,20181KCLAC,2018,1,38.0,28.0,362.0,541.0
4,4,5.0,201810108,18983,2018-09-09T13:00:00,PIT,CLE,AWAY,30,James Conner,RB,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31.0,135.0,4.4,2.0,22.0,6.0,5.0,57.0,11.4,0.0,19.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.2,0.0,0.0,0.0,0.0,0.0,,20181PITCLE,2018,1,21.0,21.0,472.0,327.0


In [7]:
data['Week'] = data['Week'].astype(str)
data.dtypes

Unnamed: 0          int64
Unnamed: 0.1      float64
GameKey             int64
PlayerID            int64
GameDate           object
                   ...   
Week               object
TeamPoints        float64
OpponentPoints    float64
YardsFor          float64
YardsAgainst      float64
Length: 63, dtype: object

Before the train test split, we have to calculate the trailing average fantasy points for each observation, as we cannot incorporate this step into the pipeline without causing data leakage.

In [8]:
def trailing_stats(df):
    
    """
    Function to create a new column with a trailing aggregate mean
    as a new feature for prediction.
    
    Inputs:
        - df: The dataframe on which the function will be applied
        - Column: The column on which to apply the function
        - Window: The number of past values to consider when apply the function

        
    Output:
        - An aggregate value
        
    """
    #Access the column names in stats_for_trailing
    global stats_for_trailing
    
    # Get all unique players in the DataFrame
    players = df['Name'].unique().tolist()
    
    # Define a DataFrame to hold our values
    df_out = pd.DataFrame()
    # Loop through the unique players
    for player in players:
        # Create a temporary dataframe for each player
        temp_df = df[(df['Name'] == player) & (df['InjuryStatus'] != 'Out')] 
        # Calculate the n game trailing average for all players.  Set closed parameter to 'left'
        # so that the current value for fantasy points is not included in the calculation.
        # Backfill the two resulting NaN values
        for column in stats_for_trailing:
            temp_df[f'TA7{column}'] = temp_df[column].rolling(window = 7, closed = 'left').mean().fillna(method = 'bfill')
            temp_df[f'TA3{column}'] = temp_df[column].rolling(window = 3, closed = 'left').mean().fillna(method = 'bfill')
        # Append the temporary dataframe to the output
        df_out = df_out.append(temp_df)
    # Return a dataframe with the values sorted by the original index
    df_out = df_out.sort_index()
    return df_out
    

In [9]:
# Prepare the trailing average fantasy points column (Target)
stats_for_trailing = ['FantasyPointsPPR']
data = trailing_stats(data)
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,GameKey,PlayerID,GameDate,Team,Opponent,HomeOrAway,Number,Name,Position,Started,PassingAttempts,PassingCompletions,PassingYards,PassingCompletionPercentage,PassingYardsPerAttempt,PassingYardsPerCompletion,PassingTouchdowns,PassingInterceptions,PassingRating,PassingLong,PassingSacks,PassingSackYards,RushingAttempts,RushingYards,RushingYardsPerAttempt,RushingTouchdowns,RushingLong,ReceivingTargets,Receptions,ReceivingYards,ReceivingYardsPerReception,ReceivingTouchdowns,ReceivingLong,Fumbles,FumblesLost,PuntReturns,PuntReturnYards,PuntReturnTouchdowns,KickReturns,KickReturnYards,KickReturnTouchdowns,FieldGoalsAttempted,FieldGoalsMade,ExtraPointsMade,TwoPointConversionPasses,TwoPointConversionRuns,TwoPointConversionReceptions,FantasyPointsPPR,FieldGoalsMade0to19,FieldGoalsMade20to29,FieldGoalsMade30to39,FieldGoalsMade40to49,FieldGoalsMade50Plus,InjuryStatus,MatchString,Season,Week,TeamPoints,OpponentPoints,YardsFor,YardsAgainst,TA7FantasyPointsPPR,TA3FantasyPointsPPR
0,0,0.0,201810122,8283,2018-09-09T13:00:00,TB,NO,AWAY,14,Ryan Fitzpatrick,QB,1,28.0,21.0,417.0,75.0,14.9,19.9,4.0,0.0,156.25,58.0,0.0,0.0,12.0,36.0,3.0,1.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.28,0.0,0.0,0.0,0.0,0.0,,20181TBNO,2018,1,48.0,40.0,529.0,475.0,17.142857,32.466667
1,1,1.0,201810122,18878,2018-09-09T13:00:00,NO,TB,HOME,41,Alvin Kamara,RB,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,29.0,3.6,2.0,10.0,12.0,9.0,112.0,12.4,1.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,43.1,0.0,0.0,0.0,0.0,0.0,,20181NOTB,2018,1,40.0,48.0,475.0,529.0,26.442857,31.666667
2,2,2.0,201810122,7242,2018-09-09T13:00:00,NO,TB,HOME,9,Drew Brees,QB,1,45.0,37.0,439.0,82.2,9.8,11.9,3.0,0.0,129.54,35.0,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,31.56,0.0,0.0,0.0,0.0,0.0,,20181NOTB,2018,1,40.0,48.0,475.0,529.0,21.371429,29.906667
3,3,4.0,201810129,18082,2018-09-09T16:05:00,KC,LAC,AWAY,10,Tyreek Hill,WR,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,2.0,0.0,7.0,8.0,7.0,169.0,24.1,2.0,58.0,0.0,0.0,2.0,95.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.3,0.0,0.0,0.0,0.0,0.0,,20181KCLAC,2018,1,38.0,28.0,362.0,541.0,21.571429,21.3
4,4,5.0,201810108,18983,2018-09-09T13:00:00,PIT,CLE,AWAY,30,James Conner,RB,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31.0,135.0,4.4,2.0,22.0,6.0,5.0,57.0,11.4,0.0,19.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.2,0.0,0.0,0.0,0.0,0.0,,20181PITCLE,2018,1,21.0,21.0,472.0,327.0,25.6,22.733333


### Split the data into features and target
No train test split, since we will be training our model on all available data for deployment.

In [10]:
data['LogFantasyPointsPPR'] = LogShift(data['FantasyPointsPPR'])

In [11]:
# Separate data from the target
y = data['LogFantasyPointsPPR']
data.drop(columns = ['FantasyPointsPPR', 'LogFantasyPointsPPR'],
            inplace = True)
X = data

### Feature Engineering

The main features that we will be engineering to predict a player's fantasy output will be the 5-game trailing average of various statistics as well as the binning of players into their respective tiers based on recent performance.

In [12]:
# Define the columns for which we want a 5 game trailing average.

stats_for_trailing = ['TotalTouchdowns','RushingYards','PassingInterceptions','PassingTouchdowns','PassingRating','PassingYards',
                      'PassingCompletionPercentage', 'PassingLong','RushingYards', 'RushingTouchdowns', 'RushingLong',
                      'RushingYardsPerAttempt', 'ReceivingYardsPerReception', 'PuntReturns', 'PuntReturnTouchdowns',
                      'Receptions','ReceivingYards','ReceivingTargets', 'ReceivingTouchdowns', 'ExtraPointsMade', 'FieldGoalsMade',
                      'FieldGoalsMade40to49','FieldGoalsMade50Plus','Fumbles','FumblesLost', 'TeamPoints', 'OpponentPoints', 'YardsFor', 'YardsAgainst']

trailing_stats = []
for col in stats_for_trailing:
    trailing_stats.append('TA3' + col)
    trailing_stats.append('TA7' + col)
trailing_stats.append('TA3FantasyPointsPPR')
trailing_stats.append('TA7FantasyPointsPPR')
trailing_stats.append('DefensiveStrength')

In [13]:
# Instantiate the function transformers for the feature engineering pipeline

touchdown_transformer = FunctionTransformer(get_touchdowns) # Get total touchdowns per week per player
yard_transformer = FunctionTransformer(get_yards) # Get total yardage per week per player
trailing_transformer = FunctionTransformer(trailing_stats_mean) # Get the 5 game trailing averages of appropriate statistics
tier_transformer = FunctionTransformer(get_tiers) # Bin players into the appropriate tiers based on recent performance
contribution_transformer = FunctionTransformer(get_contribution) # Calculate the offensive contribution of a given player relative to the team's offense
defensive_transformer = FunctionTransformer(apply_defensive_strength) # Apply the strength of the opponent's defense to the dataframe


# Instantiate the pipeline for the necessary transformations

engineering = Pipeline([('touchdown', touchdown_transformer),
                        ('yards', yard_transformer),
                       ('trailing', trailing_transformer),
                       ('tier', tier_transformer),
                       ('contribution', contribution_transformer),
                        ('defense', defensive_transformer)]
                       )

<br>

### Preprocessing

As shown above, the bulk of the null values fall into one of two categories.  They are either:
* In the InjuryStatus column
    * Here we can impute a value of healthy, as the only values in the injury column are 
* In the TA (trailing average) columns we created
    * No player with a null value played more than 5 games, therefore we cannot calculate the trailing average for them.  We will impute the mean for these columns.
    

In [14]:
# Define the groups of columns for preprocessing steps.

categorical_columns = ['Week',
                       'Team',
                      'Opponent',
                      'PlayerTier',
                       'Position',
                      'InjuryStatus']
numerical_columns = trailing_stats


In [15]:
# Create a custom function to generate a log-transformed version of continuous data with a constant 5 added prior to the transform.
LogShiftTransformer = FunctionTransformer(LogShift)

In [16]:
# Define the preprocessing steps for categorical features
categorical_transform = Pipeline([('impute_cat',SimpleImputer(strategy = 'constant',
                                                              fill_value = 'Healthy')),
                                 ('one_hot_encoder', OneHotEncoder(handle_unknown = 'ignore'))])

# Define the preprocessing steps for numerical features
numerical_transform = Pipeline([('impute_num', SimpleImputer(strategy = 'mean')),
                               ('scaler', LogShiftTransformer)])

# Instantiate the column transformer object for the preprocessing steps
preprocessing = ColumnTransformer([('num', numerical_transform, numerical_columns),
                                  ('cat', categorical_transform, categorical_columns)])

### Modelling

In [17]:
# Instantiate a pipeline with a linear regression model as a baseline

pipeline = Pipeline([('engineering', engineering),
                    ('prep', preprocessing),
                    ('model', SVR())])

In [18]:
# Set param grid values, parameters for grid search 

param_grid = {'model__kernel': [ 'rbf'],     
              'model__C': [1],
              'model__epsilon': [.1]}

grid_search = GridSearchCV(pipeline, 
                          param_grid = param_grid,
                          cv = 5,
                          verbose = 2,
                          n_jobs = -2)

In [19]:
# Fit the grid search to X_train and y_train

grid_search.fit(X, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('engineering',
                                        Pipeline(steps=[('touchdown',
                                                         FunctionTransformer(func=<function get_touchdowns at 0x7fc64dc6c950>)),
                                                        ('yards',
                                                         FunctionTransformer(func=<function get_yards at 0x7fc64dca3d40>)),
                                                        ('trailing',
                                                         FunctionTransformer(func=<function trailing_stats_mean at 0x7fc64dc6c680>)),
                                                        ('tier',
                                                         FunctionTransformer(func=<fun...
                                                                          'TA7PuntReturnTouchdowns', ...]),
                                                                       

In [20]:
grid_search.best_score_

0.5352059834611294

In [21]:
grid_search.best_params_

{'model__C': 1, 'model__epsilon': 0.1, 'model__kernel': 'rbf'}

In [22]:
# Save pickle of model for deployment
import pickle

pickle.dump(grid_search, open('drive/MyDrive/LHL_Final_Project/Pickles/SVR_Final.pickle', 'wb'))