In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline

# Cleaning NFL stats
* For scraping & generating CSV, see Scraping.ipynb

In [3]:
# Load the NFL stats CSV (path is relative to the notebook's working directory).
df_stats = pd.read_csv(filepath_or_buffer="nfl_stats.csv")

In [4]:
# Treat missing game counts as 0 games.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the `df_stats.games` accessor: that form is chained
# assignment, which emits a warning on pandas 2.x and leaves df_stats
# unchanged when Copy-on-Write is enabled (the default from pandas 3).
df_stats['games'] = df_stats['games'].fillna(0)

In [5]:
# get the first season in which each player appeared in >= 10 games; if a player has no such season, use their first season
# would also like to capture the number of seasons before the first 10-game season

### Determining first-year passer rating
* Find each player's first season with at least 10 games, if one exists
* If there is no 10-game season, fall back to the player's first season
* Also calculate the number of years until the first 10-game season, as a feature
* Calculate passer rating using the NFL formula ([Wikipedia](https://en.wikipedia.org/wiki/Passer_rating))

In [6]:
# Earliest season in which each player appeared in at least 10 games.
df_gt_10 = df_stats.loc[df_stats['games'] >= 10].sort_values(['player', 'year'])
df_first_10 = (
    df_gt_10
    .groupby('player')['year']
    .min()
    .reset_index()
    .rename(columns={'year': 'first_10'})
)


In [7]:
# Earliest season on record for every player, regardless of games played.
df_first = (
    df_stats
    .sort_values(['player', 'year'])
    .groupby('player')['year']
    .min()
    .reset_index()
    .rename(columns={'year': 'first_year'})
)

In [8]:
# Combine each player's first season with their first 10-game season.
# Left join: players without a 10-game season are kept, with first_10 = NaN.
df_years = pd.merge(df_first, df_first_10, how='left', on='player')

In [9]:
# Pick the season to analyse for each player: the first 10-game season when
# one exists, otherwise the first season on record.
# years_to_10 is the gap between that season and the first season, so it is 0
# for players who never had a 10-game season.
# Plain column arithmetic is used instead of row-wise apply(axis=1): it gives
# the same values, is much faster, and also works on an empty frame.
df_years['year'] = df_years['first_10'].fillna(df_years['first_year'])
df_years['years_to_10'] = df_years['year'] - df_years['first_year']
# The helper columns are no longer needed once year / years_to_10 exist.
df_years = df_years.drop(columns=['first_year', 'first_10'])

In [10]:
# Pull the full stat line for the (player, year) pairs listed in df_years.
# Inner join on both keys, so only those seasons survive.
df_nfl_year1 = pd.merge(df_stats, df_years, how='inner', on=['player', 'year'])

In [13]:
# calculate passer rating for 1st year
def passer_rating(comp, att, yds, td, intc):
    """Return the passer rating for a single stat line.

    Parameters
    ----------
    comp : completions
    att : pass attempts
    yds : passing yards
    td : touchdown passes
    intc : interceptions

    Returns
    -------
    0 when ``att == 0``. Otherwise four per-attempt components are each
    limited to the range [0, 2.375], summed, divided by 6 and multiplied
    by 100.
    """
    if att == 0:
        return 0

    def bounded(component):
        # Comparison-based clamp: anything that is not > 0 becomes 0, and
        # anything that is not < 2.375 becomes 2.375 (so NaN ends up as 0).
        if not component > 0:
            return 0
        if not component < 2.375:
            return 2.375
        return component

    completion_part = bounded((comp/att - .3) * 5)
    yardage_part = bounded((yds/att - 3) * .25)
    touchdown_part = bounded(td/att * 20)
    interception_part = bounded(2.375 - (intc/att * 25))

    return (completion_part + yardage_part + touchdown_part + interception_part)/6*100

# Score every row: feed its passing columns, in passer_rating()'s argument
# order (completions, attempts, yards, touchdowns, interceptions).
df_nfl_year1['passer_rating'] = df_nfl_year1.apply(
    lambda row: passer_rating(
        *(row[col] for col in ('pass_cmp', 'pass_att', 'pass_yds', 'pass_td', 'pass_int'))
    ),
    axis=1,
)
    

In [54]:
# Persist df_nfl_year1 to nfl_year1.pkl.
# pickle is already imported in the notebook's first cell, so it is not
# re-imported here (imports are kept together at the top).
with open('nfl_year1.pkl', 'wb') as picklefile:
    pickle.dump(df_nfl_year1, picklefile)

In [14]:
# Preview the first five rows (head() defaults to n=5).
df_nfl_year1.head()