In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib as plt
import datetime as dt

# Display all columns
pd.set_option('display.max_columns', None)

# 1. Importing data from nflfastR

## Player stats from 1999-2021

In [2]:
# Player stats from 1999 - 2020
players = pd.read_csv('https://github.com/nflverse/nflfastR-data/blob/master/data/player_stats.csv.gz?raw=True', compression='gzip', low_memory=False)
# Clean dataset to relevant features
player_cols = ['player_id', 'player_name', 'recent_team', 'season', 'attempts', 'passing_yards', 'passing_tds', 'interceptions', 'passing_epa', 'rushing_yards', 'rushing_tds', 'rushing_epa', 'receiving_yards', 'receiving_tds', 'receiving_epa']
players = players[player_cols]
players = players[players['season'] > 2003]
players = players.groupby(by=['player_id', 'player_name' , 'recent_team', 'season']).sum().reset_index()
players = players[players['attempts'] > 150]
# Combine passing, rushing and receiving yards and EPA.
players['total_yards'] = players['passing_yards'] + players['rushing_yards'] + players['receiving_yards']
players['total_tds'] = players['passing_tds'] + players['rushing_tds'] + players['receiving_tds']
players['total_epa'] = players['passing_epa'] + players['rushing_epa'] + players['receiving_epa']
# Combine passing, rushing and receiving yards
players.drop(columns=['passing_yards', 'rushing_yards', 'receiving_yards', 'passing_tds', 'rushing_tds', 'receiving_tds', 'passing_epa', 'rushing_epa', 'receiving_epa'], inplace=True)
players[['games_played', 'games_won']] = 0
# Reset index
players.reset_index(drop=True, inplace=True)

In [3]:
players.head()

Unnamed: 0,player_id,player_name,recent_team,season,attempts,interceptions,total_yards,total_tds,total_epa,games_played,games_won
0,00-0001361,D.Bledsoe,BUF,2004,450,16,2969,20,-31.915171,0,0
1,00-0001361,D.Bledsoe,DAL,2005,499,17,3689,25,32.794141,0,0
2,00-0001361,D.Bledsoe,DAL,2006,169,8,1192,9,-3.91339,0,0
3,00-0001823,A.Brooks,LV,2006,192,8,1229,3,-49.179365,0,0
4,00-0001823,A.Brooks,NO,2004,542,16,3984,25,18.590598,0,0


## Game stats from 2004-2020

In [4]:
# Add the player's team for that year
def add_teams(year, season_data):
    annex = players[players['season'] == year]
    annex = dict(zip(annex['player_id'], annex['recent_team']))
    season_data['recent_team'] = season_data['passer_player_id'].apply(lambda x: annex[x] if x in annex else ' ')
    season_data = season_data[season_data.recent_team != ' ']
    season_data.reset_index(drop=True, inplace=True)
    return season_data

In [5]:
# Match winning team to the player's team
def add_games_won(season_data):
    for i in range(len(season_data)):
        # If the player's team is the home team and the home team won
        if (season_data.iloc[i,10] == season_data.iloc[i,2]) and (season_data.iloc[i,3] > season_data.iloc[i,5]):
            # Count it as a win
            season_data.iloc[i, 9] = 1
        # Or if the player's team is the away team and the away team won
        elif (season_data.iloc[i,10] == season_data.iloc[i,4]) and (season_data.iloc[i,3] < season_data.iloc[i,5]):
            # Count it as a win
            season_data.iloc[i, 9] = 1
    return season_data

In [6]:
# Save the games won and played by each player in a dictionary
def get_games_dict(season_data):
    new_dict = {}
    id_list = season_data['passer_player_id'].unique().tolist()
    for i in id_list:
        new_dict[i] = {'games_played': season_data.loc[season_data['passer_player_id'] == i, 'games_played'].sum(), 'games_won': season_data.loc[season_data['passer_player_id'] == i, 'games_won'].sum()}
    return new_dict

In [7]:
# Add the games played and won to the dataframe
def update_games(games_dict, year):
    new = players[players['season'] == year]
    id_list = new['player_id'].unique().tolist()
    for i in id_list:
        new.loc[new['player_id'] == i, 'games_played'] = games_dict[i]['games_played']
        new.loc[new['player_id'] == i, 'games_won'] = games_dict[i]['games_won']
    return new

In [67]:
play_cols = ['game_id', 'game_date', 'home_team', 'home_score', 'away_team', 'away_score', 'passer_player_name', 'passer_player_id']
playoff_date = pd.to_datetime('2005-01-02')
# low_memory=False eliminates a warning
season_data = pd.read_csv('https://github.com/nflverse/nflfastR-data/blob/master/data/play_by_play_2004.csv.gz?raw=True', compression='gzip', low_memory=False)
# Clean dataset to relevant features
season_data = season_data[play_cols]
season_data.dropna(axis=0, how='any', inplace=True)
season_data.drop_duplicates(inplace=True)
season_data[['games_played', 'games_won', 'recent_team']] = 1, 0, ' '
# Filter out games after relevant end date
season_data['game_date'] = pd.to_datetime(season_data['game_date'])
season_data = season_data[season_data['game_date'] <= playoff_date]
season_data = add_teams(2004, season_data)
season_data = add_games_won(season_data)
games_dict = get_games_dict(season_data)
new_plays = update_games(games_dict, 2004)
new_plays[['act_MVP', 'pred_MVP']] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_plays[new_plays['player_id'] == '00-0010346']['act_MVP'] = 1


In [72]:
new_plays

Unnamed: 0,player_id,player_name,recent_team,season,attempts,interceptions,total_yards,total_tds,total_epa,games_played,games_won,act_MVP,pred_MVP
0,00-0001361,D.Bledsoe,BUF,2004,450,16,2969,20,-31.915171,16,9,0,0
4,00-0001823,A.Brooks,NO,2004,542,16,3984,25,18.590598,16,8,0,0
6,00-0002110,M.Brunell,WAS,2004,237,6,1256,7,-72.579912,9,3,0,0
9,00-0003292,K.Collins,LV,2004,513,20,3531,21,-9.758616,14,4,0,0
17,00-0003739,D.Culpepper,MIN,2004,623,13,5795,47,182.846454,16,8,0,0
19,00-0004161,J.Delhomme,CAR,2004,533,15,3957,30,59.673043,16,7,0,0
26,00-0005106,B.Favre,GB,2004,573,21,4347,31,118.575554,16,10,0,0
33,00-0005180,J.Fiedler,MIA,2004,190,8,1245,7,-43.878135,8,1,0,0
37,00-0005755,J.Garcia,CLE,2004,252,9,1900,12,-26.055815,11,3,0,0
42,00-0006355,T.Green,KC,2004,556,17,4676,27,147.757509,16,7,0,0


In [76]:
new_plays.loc[new_plays['player_id'] == '00-0010346', 'act_MVP'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [77]:
new_plays

Unnamed: 0,player_id,player_name,recent_team,season,attempts,interceptions,total_yards,total_tds,total_epa,games_played,games_won,act_MVP,pred_MVP
0,00-0001361,D.Bledsoe,BUF,2004,450,16,2969,20,-31.915171,16,9,0,0
4,00-0001823,A.Brooks,NO,2004,542,16,3984,25,18.590598,16,8,0,0
6,00-0002110,M.Brunell,WAS,2004,237,6,1256,7,-72.579912,9,3,0,0
9,00-0003292,K.Collins,LV,2004,513,20,3531,21,-9.758616,14,4,0,0
17,00-0003739,D.Culpepper,MIN,2004,623,13,5795,47,182.846454,16,8,0,0
19,00-0004161,J.Delhomme,CAR,2004,533,15,3957,30,59.673043,16,7,0,0
26,00-0005106,B.Favre,GB,2004,573,21,4347,31,118.575554,16,10,0,0
33,00-0005180,J.Fiedler,MIA,2004,190,8,1245,7,-43.878135,8,1,0,0
37,00-0005755,J.Garcia,CLE,2004,252,9,1900,12,-26.055815,11,3,0,0
42,00-0006355,T.Green,KC,2004,556,17,4676,27,147.757509,16,7,0,0


## Clean season stats

In [113]:
# Play-by-play stats from 2004 - 2020
play_cols = ['game_id', 'game_date', 'home_team', 'home_score', 'away_team', 'away_score', 'passer_player_name', 'passer_player_id']
end_date = {'2004': '2005-01-02', '2005': '2006-01-01', '2006': '2006-12-31', '2007': '2007-12-30', '2008': '2008-12-28', '2009': '2010-01-03', '2010': '2011-01-02', '2011': '2012-01-01', '2012': '2012-12-30', '2013': '2013-12-29', '2014': '2014-12-28', '2015': '2016-01-03', '2016': '2017-01-01', '2017': '2017-12-31', '2018': '2018-12-30', '2019': '2019-12-29', '2020': '2021-01-03'}
plays = []
mvp = {'2004': '00-0010346', '2007': '00-0019596', '2008': '00-0010346', '2009': '00-0010346', '2010': '00-0019596', '2011': '00-0023459', '2013': '00-0010346', '2014': '00-0023459', '2015': '00-0027939', '2016': '00-0026143', '2017': '00-0019596', '2018': '00-0033873', '2019': '00-0034796', '2020': '00-0023459'}
years = list(range(2004,2021))
years = [e for e in years if e not in (2005, 2006, 2012)]

for i in years:
    playoff_date = pd.to_datetime(end_date[str(i)])
    # low_memory=False eliminates a warning
    season_data = pd.read_csv('https://github.com/nflverse/nflfastR-data/blob/master/data/play_by_play_' + str(i) + '.csv.gz?raw=True', compression='gzip', low_memory=False)
    # Clean dataset to relevant features
    season_data = season_data[play_cols]
    season_data.dropna(axis=0, how='any', inplace=True)
    season_data.drop_duplicates(inplace=True)
    season_data[['games_played', 'games_won', 'recent_team']] = 1, 0, ' '
    # Filter out games after relevant end date
    season_data['game_date'] = pd.to_datetime(season_data['game_date'])
    season_data = season_data[season_data['game_date'] <= playoff_date]
    season_data = add_teams(i, season_data)
    season_data = add_games_won(season_data)
    games_dict = get_games_dict(season_data)
    new_plays = update_games(games_dict, i)
    # Add MVP
    new_plays[['act_MVP', 'pred_MVP']] = 0
    new_plays['prob_MVP'] = 0.0
    new_plays.loc[new_plays['player_id'] == mvp[str(i)], 'act_MVP'] = 1
    # Assign rankings
    new_plays['total_tds_rank'] = new_plays['total_tds'].rank(method='max', ascending=False)
    new_plays['total_epa_rank'] = new_plays['total_epa'].rank(method='min', ascending=False)
    new_plays['total_yards_rank'] = new_plays['total_yards'].rank(method='min', ascending=False)
    new_plays['games_won_rank'] = new_plays['games_won'].rank(method='min', ascending=False)
    # Append dataframe to list
    plays.append(new_plays)

# Convert list to dataframe
df = pd.concat(plays)
df.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_plays['prob_MVP'] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value 

# Training the Model

In [115]:
df = df[['player_name','recent_team', 'season', 'prob_MVP', 'pred_MVP', 'act_MVP', 'interceptions', 'total_tds_rank', 'total_epa_rank', 'total_yards_rank', 'games_won_rank', 'games_played']]