In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib as plt

# Display all columns
pd.set_option('display.max_columns', None)

# 1. Importing data from nflfastR

## Player stats from 1999-2021

In [224]:
# Player stats from 1999 - 2020
players = pd.read_csv('https://github.com/nflverse/nflfastR-data/blob/master/data/player_stats.csv.gz?raw=True', compression='gzip', low_memory=False)
# Clean dataset to relevant features
player_cols = ['player_id', 'player_name', 'recent_team', 'season', 'attempts', 'passing_yards', 'passing_tds', 'interceptions', 'passing_epa', 'rushing_yards', 'rushing_tds', 'rushing_epa', 'receiving_yards', 'receiving_tds', 'receiving_epa']
players = players[player_cols]
players = players[players['season'] > 2003]
players = players.groupby(by=['player_id', 'player_name' , 'recent_team', 'season']).sum().reset_index()
players = players[players['attempts'] > 150]
# Combine passing, rushing and receiving yards and EPA.
players['total_yards'] = players['passing_yards'] + players['rushing_yards'] + players['receiving_yards']
players['total_tds'] = players['passing_tds'] + players['rushing_tds'] + players['receiving_tds']
players['total_epa'] = players['passing_epa'] + players['rushing_epa'] + players['receiving_epa']
# Combine passing, rushing and receiving yards
players.drop(columns=['passing_yards', 'rushing_yards', 'receiving_yards', 'passing_tds', 'rushing_tds', 'receiving_tds', 'passing_epa', 'rushing_epa', 'receiving_epa'], inplace=True)
players[['games_played', 'games_won']] = 0
# Reset index
players.reset_index(drop=True, inplace=True)

In [241]:
players.head(50)

Unnamed: 0,player_id,player_name,recent_team,season,attempts,interceptions,total_yards,total_tds,total_epa,games_played,games_won
0,00-0001361,D.Bledsoe,BUF,2004,450,16,2969,20,-31.915171,0,0
1,00-0001361,D.Bledsoe,DAL,2005,499,17,3689,25,32.794141,0,0
2,00-0001361,D.Bledsoe,DAL,2006,169,8,1192,9,-3.91339,0,0
3,00-0001823,A.Brooks,LV,2006,192,8,1229,3,-49.179365,0,0
4,00-0001823,A.Brooks,NO,2004,542,16,3984,25,18.590598,0,0
5,00-0001823,A.Brooks,NO,2005,431,17,3163,15,12.182625,0,0
6,00-0002110,M.Brunell,WAS,2004,237,6,1256,7,-72.579912,0,0
7,00-0002110,M.Brunell,WAS,2005,506,11,3462,24,10.083081,0,0
8,00-0002110,M.Brunell,WAS,2006,261,4,1823,8,-1.035508,0,0
9,00-0003292,K.Collins,LV,2004,513,20,3531,21,-9.758616,0,0


## Game stats from 2004-2020

In [226]:
def add_teams(year, season_data):
    annex = players[players['season'] == year]
    annex = dict(zip(annex['player_id'], annex['recent_team']))
    season_data['recent_team'] = season_data['passer_player_id'].apply(lambda x: annex[x] if x in annex else ' ')
    season_data = season_data[season_data.recent_team != ' ']
    season_data.reset_index(drop=True, inplace=True)
    return season_data

In [227]:
def add_games_won(season_data):
    for i in range(len(season_data)):
        # If the player's team is the home team and the home team won
        if (season_data.iloc[i,10] == season_data.iloc[i,2]) and (season_data.iloc[i,3] > season_data.iloc[i,5]):
            # Count it as a win
            season_data.iloc[i, 9] = 1
        # Or if the player's team is the away team and the away team won
        elif (season_data.iloc[i,10] == season_data.iloc[i,4]) and (season_data.iloc[i,3] < season_data.iloc[i,5]):
            # Count it as a win
            season_data.iloc[i, 9] = 1
    return season_data

In [228]:
def get_games_dict(season_data):
    new_dict = {}
    id_list = season_data['passer_player_id'].unique().tolist()
    for i in id_list:
        new_dict[i] = {'games_played': season_data.loc[season_data['passer_player_id'] == i, 'games_played'].sum(), 'games_won': season_data.loc[season_data['passer_player_id'] == i, 'games_won'].sum()}
    return new_dict

In [229]:
def update_games(games_dict, year):
    new = players[players['season'] == year]
    id_list = new['player_id'].unique().tolist()
    for i in id_list:
        new.loc[new['player_id'] == i, 'games_played'] = games_dict[i]['games_played']
        new.loc[new['player_id'] == i, 'games_won'] = games_dict[i]['games_won']
    return new

-------

In [247]:
# play_cols = ['game_id', 'game_date', 'home_team', 'home_score', 'away_team', 'away_score', 'passer_player_name', 'passer_player_id']
# end_date = {'2004': '2005-01-02', '2005': '2006-01-01', '2006': '2006-12-31', '2007': '2007-12-30', '2008': '2008-12-28', '2009': '2010-01-03', '2010': '2011-01-02', '2011': '2012-12-30', '2012': '2019-09-30', '2013': '2013-12-29', '2014': '2014-12-28', '2015': '2015-01-03', '2016': '2017-01-01', '2017': '2017-12-31', '2018': '2018-12-30', '2019': '2019-12-29', '2020': '2021-01-03', '2021': '2022-01-09'}
# season_data = pd.read_csv('https://github.com/nflverse/nflfastR-data/blob/master/data/play_by_play_2015.csv.gz?raw=True', compression='gzip', low_memory=False)
# # Clean dataset to relevant features
# season_data = season_data[play_cols]
# season_data.dropna(axis=0, how='any', inplace=True)
# season_data.drop_duplicates(inplace=True)
# season_data[['games_played', 'games_won', 'recent_team']] = 1, 0, ' '
# season_data = season_data[season_data['game_date'] < end_date[str(2015)]]
# season_data

In [248]:
# Play-by-play stats from 2004 - 2020
play_cols = ['game_id', 'game_date', 'home_team', 'home_score', 'away_team', 'away_score', 'passer_player_name', 'passer_player_id']
# end_date = {'2004': '2005-01-02', '2005': '2006-01-01', '2006': '2006-12-31', '2007': '2007-12-30', '2008': '2008-12-28', '2009': '2010-01-03', '2010': '2011-01-02', '2011': '2012-12-30', '2012': '2019-09-30', '2013': '2013-12-29', '2014': '2014-12-28', '2015': '2015-01-03', '2016': '2017-01-01', '2017': '2017-12-31', '2018': '2018-12-30', '2019': '2019-12-29', '2020': '2021-01-03', '2021': '2022-01-09'}
plays = []
years = list(range(2004,2021))

for i in years:  
    # low_memory=False eliminates a warning
    season_data = pd.read_csv('https://github.com/nflverse/nflfastR-data/blob/master/data/play_by_play_' + str(i) + '.csv.gz?raw=True', compression='gzip', low_memory=False)
    # Clean dataset to relevant features
    season_data = season_data[play_cols]
    season_data.dropna(axis=0, how='any', inplace=True)
    season_data.drop_duplicates(inplace=True)
    season_data[['games_played', 'games_won', 'recent_team']] = 1, 0, ' '
    # Filter out games after relevant end date
    # season_data = season_data[season_data['game_date'] < end_date[str(i)]]
    season_data = add_teams(i, season_data)
    season_data = add_games_won(season_data)
    games_dict = get_games_dict(season_data)
    new_plays = update_games(games_dict, i)
    # Append dataframe to list
    plays.append(new_plays)

# Convert list to dataframe
df = pd.concat(plays)
df.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [250]:
df.tail(50)

Unnamed: 0,player_id,player_name,recent_team,season,attempts,interceptions,total_yards,total_tds,total_epa,games_played,games_won
607,00-0034771,M.Rudolph,PIT,2019,283,9,1807,13,-32.231451,10,5
608,00-0034796,L.Jackson,BAL,2019,460,8,4841,44,189.929095,16,13
609,00-0034855,B.Mayfield,CLE,2019,534,21,3968,25,0.951979,16,6
610,00-0034857,J.Allen,BUF,2019,413,9,3193,22,-18.555112,17,10
611,00-0034869,S.Darnold,NYJ,2019,441,13,3086,21,-24.169827,13,7
612,00-0035040,D.Blough,DET,2019,174,6,1034,5,-40.217407,5,0
613,00-0035228,K.Murray,ARI,2019,542,12,4266,24,1.391961,16,5
614,00-0035232,D.Haskins,WAS,2019,203,7,1466,7,-49.217326,9,2
615,00-0035289,G.Minshew,JAX,2019,194,2,1567,9,-1.284651,14,6
616,00-0035289,G.Minshew II,JAX,2019,276,4,2048,12,-17.622178,14,6
