Let's kick things off by importing the packages I'll be using to manipulate dataframes and do some simple visualizations.

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Import the regular season 2018 data for exploration
df_2018 = pd.read_csv('Data/2018REG.csv')

# Import the regular season 2019 data for exploration
df_2019 = pd.read_csv('Data/2019REG.csv')

# Import the regular season 2020 data for exploration
df_2020 = pd.read_csv('Data/2020REG.csv')

# Import the regular season 2021 data for exploration
df_2021 = pd.read_csv('Data/2021REG.csv')

# Import the game scores data
df_scores = pd.read_csv('Data/game_scores.csv')

In [4]:
# Print the shapes of the dataframes

print('2018: ', df_2018.shape) 
print('2019: ', df_2019.shape) 
print('2020: ', df_2020.shape) 
print('2021: ', df_2021.shape) 

2018:  (26840, 81)
2019:  (29070, 81)
2020:  (29626, 81)
2021:  (16290, 81)


In [5]:
# Append the dataframes together 

data = df_2018.append(df_2019).append(df_2020).append(df_2021)
data.reset_index(inplace = True)
data.head(15)

Unnamed: 0.1,index,Unnamed: 0,GameKey,PlayerID,SeasonType,Season,GameDate,Week,Team,Opponent,...,FieldGoalsMade0to19,FieldGoalsMade20to29,FieldGoalsMade30to39,FieldGoalsMade40to49,FieldGoalsMade50Plus,FantasyPointsDraftKings,InjuryStatus,TeamID,OpponentID,ScoreID
0,0,0,201810122,8283,1,2018,2018-09-09T13:00:00,1,TB,NO,...,0.0,0.0,0.0,0.0,0.0,45.28,,33,22,16660
1,1,1,201810122,18878,1,2018,2018-09-09T13:00:00,1,NO,TB,...,0.0,0.0,0.0,0.0,0.0,46.1,,22,33,16660
2,2,2,201810122,7242,1,2018,2018-09-09T13:00:00,1,NO,TB,...,0.0,0.0,0.0,0.0,0.0,34.56,,22,33,16660
3,3,3,201810110,13022,1,2018,2018-09-09T16:25:00,1,DEN,SEA,...,0.0,0.0,0.0,0.0,0.0,0.0,,10,30,16664
4,4,4,201810129,18082,1,2018,2018-09-09T16:05:00,1,KC,LAC,...,0.0,0.0,0.0,0.0,0.0,45.3,,16,29,16663
5,5,5,201810108,18983,1,2018,2018-09-09T13:00:00,1,PIT,CLE,...,0.0,0.0,0.0,0.0,0.0,38.2,,28,8,16655
6,6,6,201810129,8244,1,2018,2018-09-09T16:05:00,1,LAC,KC,...,0.0,0.0,0.0,0.0,0.0,32.96,,29,16,16663
7,7,7,201810129,18890,1,2018,2018-09-09T16:05:00,1,KC,LAC,...,0.0,0.0,0.0,0.0,0.0,28.34,,16,29,16663
8,8,8,201810122,3943,1,2018,2018-09-09T13:00:00,1,TB,NO,...,0.0,0.0,0.0,0.0,0.0,34.6,Questionable,33,22,16660
9,9,9,201810108,18916,1,2018,2018-09-09T13:00:00,1,PIT,CLE,...,0.0,0.0,0.0,0.0,0.0,0.0,,28,8,16655


<br>
At this point there's just one more step before we're ready to merge our player data with the game data.  We need to create a 'MatchString' for the player data in the same format as the score data so that we can perform our left join on the datasets.



In [6]:
def make_match_string(df):
    
    """
    Make a string that can be used to match team game data to player game data
    
    Input:
    - df: The dataframe to perform the operation on
     
    Output:
    - A string of format '20181TBNO'
        - Where Season-Week-AWAY-HOME is the format
        
    """
    
    if df['HomeOrAway'] == 'HOME':
        return str(df['Season']) + str(df['Week']) + str(df['Opponent']) + str(df['Team'])
    elif df['HomeOrAway'] == 'AWAY':
        return str(df['Season']) + str(df['Week']) + str(df['Team']) + str(df['Opponent'])

In [7]:
# Apply the make_match_string function to the dataset

data['MatchString'] = data.apply(lambda x: make_match_string(x), axis = 1)
data.head(-5)

Unnamed: 0.1,index,Unnamed: 0,GameKey,PlayerID,SeasonType,Season,GameDate,Week,Team,Opponent,...,FieldGoalsMade20to29,FieldGoalsMade30to39,FieldGoalsMade40to49,FieldGoalsMade50Plus,FantasyPointsDraftKings,InjuryStatus,TeamID,OpponentID,ScoreID,MatchString
0,0,0,201810122,8283,1,2018,2018-09-09T13:00:00,1,TB,NO,...,0.0,0.0,0.0,0.0,45.28,,33,22,16660,20181TBNO
1,1,1,201810122,18878,1,2018,2018-09-09T13:00:00,1,NO,TB,...,0.0,0.0,0.0,0.0,46.10,,22,33,16660,20181TBNO
2,2,2,201810122,7242,1,2018,2018-09-09T13:00:00,1,NO,TB,...,0.0,0.0,0.0,0.0,34.56,,22,33,16660,20181TBNO
3,3,3,201810110,13022,1,2018,2018-09-09T16:25:00,1,DEN,SEA,...,0.0,0.0,0.0,0.0,0.00,,10,30,16664,20181SEADEN
4,4,4,201810129,18082,1,2018,2018-09-09T16:05:00,1,KC,LAC,...,0.0,0.0,0.0,0.0,45.30,,16,29,16663,20181KCLAC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101816,16280,1520,202111031,22493,1,2021,2021-11-15T20:15:00,10,SF,LAR,...,0.0,0.0,0.0,0.0,0.00,,31,32,17831,202110LARSF
101817,16281,1521,202111031,22708,1,2021,2021-11-15T20:15:00,10,SF,LAR,...,0.0,0.0,0.0,0.0,0.00,,31,32,17831,202110LARSF
101818,16282,1522,202111031,22782,1,2021,2021-11-15T20:15:00,10,SF,LAR,...,0.0,0.0,0.0,0.0,0.00,,31,32,17831,202110LARSF
101819,16283,1523,202111031,22884,1,2021,2021-11-15T20:15:00,10,SF,LAR,...,0.0,0.0,0.0,0.0,0.00,,31,32,17831,202110LARSF


In [9]:
# Perform a left join on data and df_scores

data = data.merge(df_scores,
                 how = 'left',
                 left_on = 'MatchString',
                 right_on = 'MatchString')
data.columns

Index(['index', 'Unnamed: 0_x', 'GameKey', 'PlayerID', 'SeasonType',
       'Season_x', 'GameDate', 'Week_x', 'Team', 'Opponent',
       ...
       'Unnamed: 7', 'PtsW', 'PtsL', 'YdsW', 'TOW', 'YdsL', 'TOL', 'HomeTeam',
       'AwayTeam', 'Season_y'],
      dtype='object', length=101)

In [None]:
# Get a list of the columns present in the dataset
data.columns

There are a bunch of player positions in the data that we have no interest in for the purposes of fantasy football.  Let's get rid of those, as well as statistics that only apply to those positions.

In [None]:
# Make a list of the positions we are interested in tracking
offensive_positions = ['WR', 'RB', 'TE', 'QB', 'K']

# Filter the dataframe to include only the players at these positions
data = data.loc[data.Position.isin(offensive_positions)]

# Make a list of the columns I am looking to drop

cols_to_drop = ['Unnamed: 0', 'SeasonType', 
       'PositionCategory', 'Played', 'SoloTackles', 'AssistedTackles',
       'TacklesForLoss', 'Sacks', 'SackYards', 'QuarterbackHits',
       'PassesDefended', 'FumblesForced', 'FumblesRecovered',
       'FumbleReturnTouchdowns', 'Interceptions',
       'InterceptionReturnTouchdowns',  'FantasyPoints', 
       'FantasyPosition', 'PlayerGameID', 'ExtraPointsAttempted',
       'FantasyPointsFanDuel', 'FantasyPointsDraftKings', 'TeamID', 'OpponentID',
       'ScoreID']

# Drop unwanted columns from the dataset
data.drop(columns = cols_to_drop,
         inplace = True)

In [None]:
data.head()

### Rushing Yards
Let's explore the relatinship between rushing yards per game and mean fantasy points over the course of the season.  <br>
As seen below, it is obvious that the number of rushing yards a player obtains sets the floor for a minimum number of fantasy points obtained.

In [None]:
# Filter only players at the running back position
running_backs = data[data['Position'] == 'RB'][['Name', 'RushingYards','FantasyPointsPPR']]

# Group all statistics for individual players at the position by mean value
running_backs.groupby('Name').mean().sort_values(by = 'RushingYards', ascending = False).head(10)

In [None]:
# Plot fantasy point production against rushing yards
plt.figure(figsize = (9,6))
sns.scatterplot(x = running_backs['RushingYards'],
        y = running_backs['FantasyPointsPPR'])
plt.xlabel = ('Mean Rushing Yards')
plt.ylabel = ('Mean Fantasy Points (PPR)')
plt.title = ('Impact of Rushing Yards on Fantasy Point Production')

### Touchdowns

Let's explore the comparative impact that a touchdown has on the performance of a player.

In [None]:
# Create a new column for total touchdowns
TD_sum = data['ReceivingTouchdowns'] + data['RushingTouchdowns'] + data['PassingTouchdowns']
data['TotalTouchdowns'] = TD_sum

# Group the data by player and get the averages
touchdowns = data[['Name', 'TotalTouchdowns', 'FantasyPointsPPR', 'Position']]
touchdowns = touchdowns.groupby(['Name', 'Position']).mean().sort_values(by='TotalTouchdowns', ascending = False)
touchdowns.reset_index(inplace = True)
touchdowns.head(10)

In [None]:
# Plot fantasy point production against total touchdowns
plt.figure(figsize = (9,6))
ax = sns.scatterplot(x = touchdowns['TotalTouchdowns'],
                y = touchdowns['FantasyPointsPPR'],
                    hue = touchdowns['Position'])
ax.set(xlabel = 'Mean Touchdowns', 
       ylabel = 'Mean Fantasy Points (PPR)')
plt.title = ('Impact of Touchdowns on Fantasy Point Production')

From the above chart, we can see that the position most dependent on touchdowns is the quarterback, followed in order by the running back, wide receiver, tight end and finally the kicker.

In [None]:
# Let's visualize this same data a little differently using a violin plot

plt.figure(figsize = (9, 6))
ax = sns.violinplot(x = touchdowns['Position'],
                y = touchdowns['FantasyPointsPPR'])

### Create a function to create a trailing average or sum for a particular statistic

Using trailing aggregate statistics will be an important tool for predicting the future performance of players.  <br>
Below I will define a function that will be the basis for many of the transformations I will perform.

In [None]:
def trailing_stats_mean(df, column, window):
    
    """
    Function to create a new column with a trailing aggregate mean
    as a new feature for prediction.
    
    Inputs:
        - df: The dataframe on which the function will be applied
        - Column: The column on which to apply the function
        - Window: The number of past values to consider when apply the function

        
    Output:
        - An aggregate value
        
    """
    # Get all unique players in the DataFrame
    players = df['Name'].unique().tolist()
    
    # Define a DataFrame to hold our values
    df_out = pd.DataFrame()
    # Loop through the unique players
    for player in players:
        # Create a temporary dataframe for each player
        temp_df = df[df['Name'] == player]
        # Calculate the n game trailing average for all players.  Set closed parameter to 'left'
        # so that the current value for fantasy points is not included in the calculation.
        # Backfill the two resulting NaN values
        temp_df[f'TA_{column}'] = temp_df[column].rolling(window = window,
                                                                     closed = 'left').mean().fillna(method = 'bfill')
        # Append the temporary dataframe to the output
        df_out = df_out.append(temp_df)
    # Return a dataframe with the values sorted by the original index
    return df_out.sort_index()
    
    

Create some trailing aggregate stats that can be used as predictive indicators of future player performance.

In [None]:
# Apply the above function to the columns I want to use to create trailing indicators for

trailing_columns = ['FantasyPointsPPR',
                   'TotalTouchdowns',
                   'RushingYards',
                   'PassingInterceptions',
                   'PassingYards',
                   'PassingCompletionPercentage',
                   'Receptions',
                   'ReceivingYards',
                    'ReceivingTargets',
                   'Fumbles',
                   'FumblesLost']
df_rolling = data
for col in trailing_columns:
    df_rolling = trailing_stats_mean(df_rolling, col, 5)

    
df_rolling.head()


Check the NaN values for the new column that we've created.  Turns out that they are rooking players who don't have enough games under their belt to have a 3 point average.  I will have to figure out the best strategy for dealing with these guys.  I can either drop them or I can impute a median value for their performances.

In [None]:

df_rolling[df_rolling['TA_FantasyPointsPPR'].isna()].groupby('Name')['Name'].value_counts().head()

<br> <br>
### Ranking players by tier
In fantasy football, it is common practice to refer to players by their tier level at a given position.  For example, a running back who is expected to average around 18 points in a given week is considered an RB1.  If a player is expected to get 15 points, we refer to them as an RB2 and so on.  Let's explore this idea to see if it carries any weight for trying to make our predictions.  Rather than assigning a player a permanent value, let's use their rolling fantasy points average to assign a tier.  Could explore using a a larger window for this assignment.

In [None]:
def tier_maker(position, points):
    
    """
    Take in two arguments:
    
     - Position: Column of the dataframe holding the player position
     - Points: Trailing average of fantasy points for a given player
    
    Classify players at every position to a tier based on their recent 
    performance (Trailing average fantasy points). Classifications will work
    as follows.
    
    Running Back:
        -RB1: Trailing average greater than 18pts
        -RB2: Trailing average between 12 and 18 pts
        -RB3: Trailing average between 8 and 12 pts
        -RB4: Trailing average below 8 pts
        
    Wide Receiver:
        -WR1: Trailing average greater than 18pts
        -WR2: Trailing average between 12 and 18 pts
        -WR3: Trailing average between 8 and 12 pts
        -WR4: Trailing average below 8 pts
        
    Tight End:
        -TE1: Trailing average greater than 15pts
        -TE2: Trailing average between 10 and 15 pts
        -TE3: Trailing average below 10 pts
        
    Quarterback:
        -QB1: Trailing average greater than 24pts
        -QB2: Trailing average between 18 and 24pts
        -QB3: Trailing average between 12 and 18pts
        -QB4: Trailing average below 12 pts
    
    Kicker:
        - K1: Trailing average greater than 10 pts
        - K2: Trailing average between 7 and 10 points
        - K3: Trailing average below 7 points
        """

    # Let's make tier assignments for running backs
    if position == 'RB':
        if points > 18:
            return 'RB1'
        elif 12 < points <= 18:
            return 'RB2'
        elif 8 < points <= 12:
            return 'RB3'
        else:
            return 'RB4'
            
    # Let's make tier assignments for wide receivers
    if position == 'WR':
        if points > 18:
            return 'WR1'
        elif 12 < points <= 18:
            return 'WR2'
        elif 8 < points <= 12:
            return 'WR3'
        else:
            return 'WR4'
            
    # Let's make tier assignments for tight ends
    if position == 'TE':
        if points > 15:
            return 'TE1'
        elif 10 < points <= 15:
            return 'TE2'
        else:
            return 'TE3'
    
    # Let's make tier assignments for quarterbacks
    if position == 'QB':
        if points > 24:
            return 'QB1'
        elif 16 < points <= 22:
            return 'QB2'
        elif 10 < points <= 16:
            return 'QB3'
        else:
            return 'QB4'

    # Let's make tier assignments for kickers
    if position == 'K':
        if points > 10:
            return 'K1'
        elif 7 < points <= 10:
            return 'K2'
        else:
            return 'K3'
    
    # If noting is returned before this point, return np.nan
    return np.nan

In [None]:
# Apply the tier_maker_df() function to the dataframe

df_rolling['PlayerTier']= df_rolling.apply(lambda x: tier_maker(x['Position'], x['TA_FantasyPointsPPR']), axis = 1)

In [None]:
df_rolling

<br>

### Let's Investigate How Well the Tiers Delineate Players 

In [None]:
# Make a list of unique positions
positions = df_rolling['Position'].unique().tolist()

# Create a dataframe summarizing weekly performance by tier
df_tier = df_rolling[['Week','Position','PlayerTier', 'FantasyPointsPPR']].groupby(['Week','Position', 'PlayerTier']).mean().reset_index()
df_tier

In [None]:
# Create a facet plot of the big 4 positions illustrating the performance differences between 
# the various tiers that have been assigned.

sns.set_theme(style = 'darkgrid')
              
fig, axes = plt.subplots(2, 2, figsize=(12, 12), 
                         sharey=False)
fig.suptitle('Fantasy Point Performance by Player Tier')

# Running Backs
sns.lineplot(ax=axes[0,0], x=df_tier['Week'], 
             y=df_tier[df_tier['Position'] =='RB']['FantasyPointsPPR'],
            hue = df_tier['PlayerTier'],
            legend = False)


axes[0,0].set_title(' Running Backs')

# Wide Receiver
sns.lineplot(ax=axes[0,1], x=df_tier['Week'], 
             y=df_tier[df_tier['Position'] =='WR']['FantasyPointsPPR'],
            hue = df_tier['PlayerTier'],
            legend = False)
axes[0,1].set_title('Wide Receiver')


# Quarterbacks
sns.lineplot(ax=axes[1,0], x=df_tier['Week'], 
             y=df_tier[df_tier['Position'] =='QB']['FantasyPointsPPR'],
            hue = df_tier['PlayerTier'],
            legend = False)
axes[1,0].set_title(' Quarterbacks')


# Tight Ends
sns.lineplot(ax=axes[1,1], x=df_tier['Week'], 
             y=df_tier[df_tier['Position'] =='TE']['FantasyPointsPPR'],
            hue = df_tier['PlayerTier'],
            legend = False)
axes[1,1].set_title(' Tight Ends')

fig.legend(title = 'Tier', 
           labels = df_tier['PlayerTier'].unique().tolist(),
          loc = 'right')

###  Let's correlate our features with Fantasy Points to see how well they might predict the target variable.

In [None]:
corr = df_rolling.corr(min_periods = 100)
c = corr.unstack()
c = c.sort_values(kind = 'quicksort')
c['FantasyPointsPPR'][-30:]