In [3]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib as plt
import datetime as dt

# Display all columns
pd.set_option('display.max_columns', None)

# 1. Importing data from nflfastR

## Player stats from 1999-2021

In [4]:
# Player stats from 1999 - 2021
players = pd.read_csv('https://github.com/nflverse/nflfastR-data/blob/master/data/player_stats.csv.gz?raw=True', compression='gzip', low_memory=False)
# Clean dataset to relevant features
player_cols = ['player_id', 'player_name', 'recent_team', 'season', 'attempts', 'passing_yards', 'passing_tds', 'interceptions', 'passing_epa', 'rushing_yards', 'rushing_tds', 'rushing_epa', 'receiving_yards', 'receiving_tds', 'receiving_epa']
players = players[player_cols]
players = players[players['season'] > 2003]
players = players.groupby(by=['player_id', 'player_name' , 'recent_team', 'season']).sum().reset_index()
players = players[players['attempts'] > 150]
# Combine passing, rushing and receiving yards and EPA.
players['total_yards'] = players['passing_yards'] + players['rushing_yards'] + players['receiving_yards']
players['total_tds'] = players['passing_tds'] + players['rushing_tds'] + players['receiving_tds']
players['total_epa'] = players['passing_epa'] + players['rushing_epa'] + players['receiving_epa']
# Combine passing, rushing and receiving yards
players.drop(columns=['passing_yards', 'rushing_yards', 'receiving_yards', 'passing_tds', 'rushing_tds', 'receiving_tds', 'passing_epa', 'rushing_epa', 'receiving_epa'], inplace=True)
players[['games_played', 'games_won']] = 0
# Reset index
players.reset_index(drop=True, inplace=True)

In [5]:
players.head()

Unnamed: 0,player_id,player_name,recent_team,season,attempts,interceptions,total_yards,total_tds,total_epa,games_played,games_won
0,00-0001361,D.Bledsoe,BUF,2004,450,16,2969,20,-31.915171,0,0
1,00-0001361,D.Bledsoe,DAL,2005,499,17,3689,25,32.794141,0,0
2,00-0001361,D.Bledsoe,DAL,2006,169,8,1192,9,-3.91339,0,0
3,00-0001823,A.Brooks,LV,2006,192,8,1229,3,-49.179365,0,0
4,00-0001823,A.Brooks,NO,2004,542,16,3984,25,18.590598,0,0


## Define functions

In [6]:
# Add the player's team for that year
def add_teams(year, season_data):
    annex = players[players['season'] == year]
    annex = dict(zip(annex['player_id'], annex['recent_team']))
    season_data['recent_team'] = season_data['passer_player_id'].apply(lambda x: annex[x] if x in annex else ' ')
    season_data = season_data[season_data.recent_team != ' ']
    season_data.reset_index(drop=True, inplace=True)
    return season_data

In [7]:
# Match winning team to the player's team
def add_games_won(season_data):
    for i in range(len(season_data)):
        # If the player's team is the home team and the home team won
        if (season_data.iloc[i,10] == season_data.iloc[i,2]) and (season_data.iloc[i,3] > season_data.iloc[i,5]):
            # Count it as a win
            season_data.iloc[i, 9] = 1
        # Or if the player's team is the away team and the away team won
        elif (season_data.iloc[i,10] == season_data.iloc[i,4]) and (season_data.iloc[i,3] < season_data.iloc[i,5]):
            # Count it as a win
            season_data.iloc[i, 9] = 1
    return season_data

In [8]:
# Save the games won and played by each player in a dictionary
def get_games_dict(season_data):
    new_dict = {}
    id_list = season_data['passer_player_id'].unique().tolist()
    for i in id_list:
        new_dict[i] = {'games_played': season_data.loc[season_data['passer_player_id'] == i, 'games_played'].sum(), 'games_won': season_data.loc[season_data['passer_player_id'] == i, 'games_won'].sum()}
    return new_dict

In [9]:
# Add the games played and won to the dataframe
def update_games(games_dict, year):
    new = players[players['season'] == year]
    id_list = new['player_id'].unique().tolist()
    for i in id_list:
        new.loc[new['player_id'] == i, 'games_played'] = games_dict[i]['games_played']
        new.loc[new['player_id'] == i, 'games_won'] = games_dict[i]['games_won']
    return new

# 2. Clean season stats

In [157]:
# Play-by-play stats from 2004 - 2020
play_cols = ['game_id', 'game_date', 'home_team', 'home_score', 'away_team', 'away_score', 'passer_player_name', 'passer_player_id']
end_date = {'2004': '2005-01-02', '2005': '2006-01-01', '2006': '2006-12-31', '2007': '2007-12-30', '2008': '2008-12-28', '2009': '2010-01-03', '2010': '2011-01-02', '2011': '2012-01-01', '2012': '2012-12-30', '2013': '2013-12-29', '2014': '2014-12-28', '2015': '2016-01-03', '2016': '2017-01-01', '2017': '2017-12-31', '2018': '2018-12-30', '2019': '2019-12-29', '2020': '2021-01-03', '2021': '2022-01-09'}
plays = []
mvp = {'2004': '00-0010346', '2007': '00-0019596', '2008': '00-0010346', '2009': '00-0010346', '2010': '00-0019596', '2011': '00-0023459', '2013': '00-0010346', '2014': '00-0023459', '2015': '00-0027939', '2016': '00-0026143', '2017': '00-0019596', '2018': '00-0033873', '2019': '00-0034796', '2020': '00-0023459', '2021': '00-0023459'}
years = list(range(2004,2022))
years = [e for e in years if e not in (2005, 2006, 2012)]

for i in years:
    playoff_date = pd.to_datetime(end_date[str(i)])
    # low_memory=False eliminates a warning
    season_data = pd.read_csv('https://github.com/nflverse/nflfastR-data/blob/master/data/play_by_play_' + str(i) + '.csv.gz?raw=True', compression='gzip', low_memory=False)
    # Clean dataset to relevant features
    season_data = season_data[play_cols]
    season_data.dropna(axis=0, how='any', inplace=True)
    season_data.drop_duplicates(inplace=True)
    season_data[['games_played', 'games_won', 'recent_team']] = 1, 0, ' '
    # Filter out games after relevant end date
    season_data['game_date'] = pd.to_datetime(season_data['game_date'])
    season_data = season_data[season_data['game_date'] <= playoff_date]
    season_data = add_teams(i, season_data)
    season_data = add_games_won(season_data)
    games_dict = get_games_dict(season_data)
    new_plays = update_games(games_dict, i)
    # Add MVP
    new_plays[['prob_MVP', 'act_MVP']] = 0
    new_plays.loc[new_plays['player_id'] == mvp[str(i)], 'act_MVP'] = 1
    # Assign rankings
    new_plays = new_plays.groupby(['player_id', 'recent_team', 'season', 'games_played', 'games_won', 'prob_MVP', 'act_MVP'],as_index=False)[['attempts', 'interceptions', 'total_yards', 'total_tds', 'total_epa']].agg('sum')
    new_plays['total_tds_rank'] = new_plays['total_tds'].rank(method='average', ascending=False)
    new_plays['total_epa_rank'] = new_plays['total_epa'].rank(method='average', ascending=False)
    new_plays['total_yards_rank'] = new_plays['total_yards'].rank(method='average', ascending=False)
    new_plays['games_won_rank'] = new_plays['games_won'].rank(method='average', ascending=False)
    # Append dataframe to list
    plays.append(new_plays)

# Convert list to dataframe
df = pd.concat(plays)
df.reset_index(drop=True, inplace=True)

# 3. Training the Model

In [159]:
# Omitting 'games_played'
df = df[['player_id','recent_team', 'season', 'act_MVP', 'interceptions', 'total_tds_rank', 'total_epa_rank', 'total_yards_rank', 'games_won_rank']]

In [160]:
df.shape

(572, 9)

In [161]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from scipy.special import softmax

## Split data

In [162]:
feature_cols = ['interceptions', 'total_tds_rank', 'total_epa_rank', 'total_yards_rank', 'games_won_rank']

# Split into training and test sets
train = df[df['season'] < 2017]
test = df[df['season'] >= 2017]
x_train = train[feature_cols]
y_train = train['act_MVP']
x_test = test[feature_cols]
y_test = test['act_MVP']

In [163]:
# Fit the model
logr = LogisticRegression()
logr.fit(x_train, y_train)

LogisticRegression()

In [164]:
# Calculate the model accruacy
score = logr.score(x_test, y_test)
score

0.9894736842105263

## Gather coefficients

In [165]:
print("Features: ", feature_cols)
print("Coefficients: ", logr.coef_)
print("Intercept: ", logr.intercept_)

Features:  ['interceptions', 'total_tds_rank', 'total_epa_rank', 'total_yards_rank', 'games_won_rank']
Coefficients:  [[-0.31995716  0.14681122 -0.52002776 -0.43299722 -0.92778108]]
Intercept:  [7.94432044]


# 4. Testing the model

In [175]:
new = df[df['season'] == 2004]
new['prob_MVP'] = logr.predict_proba(new[feature_cols]).tolist()
new['prob_MVP'] = new['prob_MVP'].apply(lambda x: x[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['prob_MVP'] = logr.predict_proba(new[feature_cols]).tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['prob_MVP'] = new['prob_MVP'].apply(lambda x: x[1])


In [176]:
pd.set_option("display.precision", 15)

In [177]:
new = new[['player_id', 'recent_team', 'season', 'prob_MVP', 'act_MVP', 'interceptions', 'total_tds_rank', 'total_epa_rank', 'total_yards_rank', 'games_won_rank']]
new.sort_values(by=['prob_MVP'], ascending=False).head(15)

Unnamed: 0,player_id,recent_team,season,prob_MVP,act_MVP,interceptions,total_tds_rank,total_epa_rank,total_yards_rank,games_won_rank
13,00-0011022,PHI,2004,0.302941409287557,0,11,3.0,4.0,3.0,2.5
12,00-0010346,IND,2004,0.300377525840083,1,12,1.0,1.0,2.0,4.0
20,00-0019596,NE,2004,0.030479115681379,0,14,4.0,6.0,8.0,1.0
25,00-0020531,LAC,2004,0.000792303272513,0,8,5.5,5.0,13.0,5.5
4,00-0003739,MIN,2004,7.8235643971e-05,0,13,2.0,2.0,1.0,13.0
6,00-0005106,GB,2004,9.19491134e-06,0,21,5.5,7.0,7.0,7.5
15,00-0013042,DEN,2004,6.720864697e-06,0,21,7.5,9.0,6.0,7.5
36,00-0022924,PIT,2004,3.416693316e-06,0,16,13.5,14.0,18.0,2.5
21,00-0019599,LA,2004,1.008487226e-06,0,16,9.0,8.0,4.0,13.0
9,00-0006355,KC,2004,1.1388491e-07,0,17,10.0,3.0,5.0,17.5
