# NFL Exploratory Data Analysis: Basic Statistics

## Dataset Summary

In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
import datetime as dt
import re
%matplotlib inline

In [51]:
df = pd.read_csv('data/raw/basic_stats.csv',parse_dates=['Birthday'])

In [52]:
df.head()

Unnamed: 0,Age,Birth Place,Birthday,College,Current Status,Current Team,Experience,Height (inches),High School,High School Location,Name,Number,Player Id,Position,Weight (lbs),Years Played
0,,"Grand Rapids , MI",1921-05-23,Notre Dame,Retired,,3 Seasons,71.0,,,"Evans, Fred",,fredevans/2513736,,185.0,1946 - 1948
1,,"Dayton , OH",1930-12-21,Dayton,Retired,,1 Season,70.0,,,"Raiff, Jim",,jimraiff/2523700,,235.0,1954 - 1954
2,56.0,"Temple , TX",1960-09-11,Louisiana Tech,Retired,,1 Season,74.0,,,"Fowler, Bobby",,bobbyfowler/2514295,,230.0,1985 - 1985
3,30.0,"New Orleans , LA",1986-09-30,LSU,Retired,,5 Seasons,73.0,,,"Johnson, Quinn",,quinnjohnson/79593,,255.0,2009 - 2013
4,25.0,"Detroit , MI",1992-03-31,Central Michigan,Active,Pittsburgh Steelers,3rd season,77.0,Clintondale HS,"Clinton Twp.,Macomb Co., MI","Walton, L.T.",96.0,l.t.walton/2552444,DE,305.0,


## Cleaning Goals
* position to category
* column names to database table style
* current team to category
* position to category
* name split to first and last name
* player_id split to name and id parts
* years played to time delta
* birthday to datetime
* calculate missing ages where applicable
* status to category
* convert experience to numeric

In [53]:
def norm_cols(cols):
    '''
    converts column naming to traditional database-stle naming convention
    '''
    
    cols = cols.str.replace(' ','_').str.replace('(','').str.replace(')','').str.lower()
    return cols

In [54]:
# apply column renaming to dataframe
df.columns = norm_cols(df.columns)
df.columns

Index(['age', 'birth_place', 'birthday', 'college', 'current_status',
       'current_team', 'experience', 'height_inches', 'high_school',
       'high_school_location', 'name', 'number', 'player_id', 'position',
       'weight_lbs', 'years_played'],
      dtype='object')

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17172 entries, 0 to 17171
Data columns (total 16 columns):
age                     13504 non-null float64
birth_place             14794 non-null object
birthday                16835 non-null datetime64[ns]
college                 17133 non-null object
current_status          17172 non-null object
current_team            3096 non-null object
experience              17133 non-null object
height_inches           17026 non-null float64
high_school             2514 non-null object
high_school_location    2510 non-null object
name                    17172 non-null object
number                  1708 non-null float64
player_id               17172 non-null object
position                3096 non-null object
weight_lbs              17121 non-null float64
years_played            14076 non-null object
dtypes: datetime64[ns](1), float64(4), object(11)
memory usage: 2.1+ MB


In [56]:
# set new id as index
df.set_index('player_id',inplace=True)

In [57]:
df.head()

Unnamed: 0_level_0,age,birth_place,birthday,college,current_status,current_team,experience,height_inches,high_school,high_school_location,name,number,position,weight_lbs,years_played
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
fredevans/2513736,,"Grand Rapids , MI",1921-05-23,Notre Dame,Retired,,3 Seasons,71.0,,,"Evans, Fred",,,185.0,1946 - 1948
jimraiff/2523700,,"Dayton , OH",1930-12-21,Dayton,Retired,,1 Season,70.0,,,"Raiff, Jim",,,235.0,1954 - 1954
bobbyfowler/2514295,56.0,"Temple , TX",1960-09-11,Louisiana Tech,Retired,,1 Season,74.0,,,"Fowler, Bobby",,,230.0,1985 - 1985
quinnjohnson/79593,30.0,"New Orleans , LA",1986-09-30,LSU,Retired,,5 Seasons,73.0,,,"Johnson, Quinn",,,255.0,2009 - 2013
l.t.walton/2552444,25.0,"Detroit , MI",1992-03-31,Central Michigan,Active,Pittsburgh Steelers,3rd season,77.0,Clintondale HS,"Clinton Twp.,Macomb Co., MI","Walton, L.T.",96.0,DE,305.0,


In [58]:
df.sort_index(inplace=True)

In [59]:
df.head()

Unnamed: 0_level_0,age,birth_place,birthday,college,current_status,current_team,experience,height_inches,high_school,high_school_location,name,number,position,weight_lbs,years_played
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
'omarellison/2500540,45.0,"Griffin , GA",1971-10-08,Florida State,Retired,,2 Seasons,73.0,,,"Ellison, 'Omar",,,200.0,1995 - 1996
a'shawnrobinson/2555265,22.0,"Fort Worth , TX",1995-03-21,Alabama,Active,Detroit Lions,2nd season,75.0,Arlington Heights HS,"Fort Worth, TX","Robinson, A'Shawn",91.0,DT,320.0,
a.c.bauer/2509176,,,NaT,Unknown,Retired,,1 Season,74.0,,,"Bauer, A.C.",,,210.0,1923 - 1923
a.j.bouye/2541162,25.0,"Dallas , TX",1991-08-16,Central Florida,Active,Jacksonville Jaguars,5th season,72.0,Tucker HS,GA,"Bouye, A.J.",,CB,191.0,
a.j.cann/2552330,25.0,"Bamberg , SC",1991-10-03,South Carolina,Active,Jacksonville Jaguars,3rd season,75.0,Bamberg-Ehrhardt HS,"Bamberg, SC","Cann, A.J.",60.0,OG,317.0,


In [60]:
df['current_status'] = df['current_status'].astype('category')
df['current_team'] = df['current_team'].astype('category')
df['position'] = df['position'].astype('category')

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17172 entries, 'omarellison/2500540 to zurlontipton/2550407
Data columns (total 15 columns):
age                     13504 non-null float64
birth_place             14794 non-null object
birthday                16835 non-null datetime64[ns]
college                 17133 non-null object
current_status          17172 non-null category
current_team            3096 non-null category
experience              17133 non-null object
height_inches           17026 non-null float64
high_school             2514 non-null object
high_school_location    2510 non-null object
name                    17172 non-null object
number                  1708 non-null float64
position                3096 non-null category
weight_lbs              17121 non-null float64
years_played            14076 non-null object
dtypes: category(3), datetime64[ns](1), float64(4), object(7)
memory usage: 1.8+ MB


In [62]:
def compute_age(birthday):
    ''' computes a a birthday from a given date'''
    
    today = date.today()
    return today.year - birthday.year - ((today.month, today.day) < (birthday.month, birthday.day))

In [63]:
df['computed_age'] = df['birthday'].apply(compute_age)
df.head()

Unnamed: 0_level_0,age,birth_place,birthday,college,current_status,current_team,experience,height_inches,high_school,high_school_location,name,number,position,weight_lbs,years_played,computed_age
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
'omarellison/2500540,45.0,"Griffin , GA",1971-10-08,Florida State,Retired,,2 Seasons,73.0,,,"Ellison, 'Omar",,,200.0,1995 - 1996,47.0
a'shawnrobinson/2555265,22.0,"Fort Worth , TX",1995-03-21,Alabama,Active,Detroit Lions,2nd season,75.0,Arlington Heights HS,"Fort Worth, TX","Robinson, A'Shawn",91.0,DT,320.0,,23.0
a.c.bauer/2509176,,,NaT,Unknown,Retired,,1 Season,74.0,,,"Bauer, A.C.",,,210.0,1923 - 1923,
a.j.bouye/2541162,25.0,"Dallas , TX",1991-08-16,Central Florida,Active,Jacksonville Jaguars,5th season,72.0,Tucker HS,GA,"Bouye, A.J.",,CB,191.0,,27.0
a.j.cann/2552330,25.0,"Bamberg , SC",1991-10-03,South Carolina,Active,Jacksonville Jaguars,3rd season,75.0,Bamberg-Ehrhardt HS,"Bamberg, SC","Cann, A.J.",60.0,OG,317.0,,27.0


In [64]:
df['last_name'],df['first_name'] = df['name'].str.split(', ',1).str
df.head()

Unnamed: 0_level_0,age,birth_place,birthday,college,current_status,current_team,experience,height_inches,high_school,high_school_location,name,number,position,weight_lbs,years_played,computed_age,last_name,first_name
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
'omarellison/2500540,45.0,"Griffin , GA",1971-10-08,Florida State,Retired,,2 Seasons,73.0,,,"Ellison, 'Omar",,,200.0,1995 - 1996,47.0,Ellison,'Omar
a'shawnrobinson/2555265,22.0,"Fort Worth , TX",1995-03-21,Alabama,Active,Detroit Lions,2nd season,75.0,Arlington Heights HS,"Fort Worth, TX","Robinson, A'Shawn",91.0,DT,320.0,,23.0,Robinson,A'Shawn
a.c.bauer/2509176,,,NaT,Unknown,Retired,,1 Season,74.0,,,"Bauer, A.C.",,,210.0,1923 - 1923,,Bauer,A.C.
a.j.bouye/2541162,25.0,"Dallas , TX",1991-08-16,Central Florida,Active,Jacksonville Jaguars,5th season,72.0,Tucker HS,GA,"Bouye, A.J.",,CB,191.0,,27.0,Bouye,A.J.
a.j.cann/2552330,25.0,"Bamberg , SC",1991-10-03,South Carolina,Active,Jacksonville Jaguars,3rd season,75.0,Bamberg-Ehrhardt HS,"Bamberg, SC","Cann, A.J.",60.0,OG,317.0,,27.0,Cann,A.J.


In [65]:
def numeric_experience(experience):
    ''' converts string experience to numeric'''
    
    experience = str(experience)
    
    if experience == 'Rookie':
        return 1
    elif experience == 'nan':
        return np.nan
    else:
        x = re.findall(r'\d+',experience)
        return int(x[0])

In [66]:
df['years_experience'] = df['experience'].apply(numeric_experience)
df.head()

Unnamed: 0_level_0,age,birth_place,birthday,college,current_status,current_team,experience,height_inches,high_school,high_school_location,name,number,position,weight_lbs,years_played,computed_age,last_name,first_name,years_experience
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
'omarellison/2500540,45.0,"Griffin , GA",1971-10-08,Florida State,Retired,,2 Seasons,73.0,,,"Ellison, 'Omar",,,200.0,1995 - 1996,47.0,Ellison,'Omar,2.0
a'shawnrobinson/2555265,22.0,"Fort Worth , TX",1995-03-21,Alabama,Active,Detroit Lions,2nd season,75.0,Arlington Heights HS,"Fort Worth, TX","Robinson, A'Shawn",91.0,DT,320.0,,23.0,Robinson,A'Shawn,2.0
a.c.bauer/2509176,,,NaT,Unknown,Retired,,1 Season,74.0,,,"Bauer, A.C.",,,210.0,1923 - 1923,,Bauer,A.C.,1.0
a.j.bouye/2541162,25.0,"Dallas , TX",1991-08-16,Central Florida,Active,Jacksonville Jaguars,5th season,72.0,Tucker HS,GA,"Bouye, A.J.",,CB,191.0,,27.0,Bouye,A.J.,5.0
a.j.cann/2552330,25.0,"Bamberg , SC",1991-10-03,South Carolina,Active,Jacksonville Jaguars,3rd season,75.0,Bamberg-Ehrhardt HS,"Bamberg, SC","Cann, A.J.",60.0,OG,317.0,,27.0,Cann,A.J.,3.0


In [67]:
df.drop(['age','experience'],axis=1,inplace=True)
df.head()

Unnamed: 0_level_0,birth_place,birthday,college,current_status,current_team,height_inches,high_school,high_school_location,name,number,position,weight_lbs,years_played,computed_age,last_name,first_name,years_experience
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
'omarellison/2500540,"Griffin , GA",1971-10-08,Florida State,Retired,,73.0,,,"Ellison, 'Omar",,,200.0,1995 - 1996,47.0,Ellison,'Omar,2.0
a'shawnrobinson/2555265,"Fort Worth , TX",1995-03-21,Alabama,Active,Detroit Lions,75.0,Arlington Heights HS,"Fort Worth, TX","Robinson, A'Shawn",91.0,DT,320.0,,23.0,Robinson,A'Shawn,2.0
a.c.bauer/2509176,,NaT,Unknown,Retired,,74.0,,,"Bauer, A.C.",,,210.0,1923 - 1923,,Bauer,A.C.,1.0
a.j.bouye/2541162,"Dallas , TX",1991-08-16,Central Florida,Active,Jacksonville Jaguars,72.0,Tucker HS,GA,"Bouye, A.J.",,CB,191.0,,27.0,Bouye,A.J.,5.0
a.j.cann/2552330,"Bamberg , SC",1991-10-03,South Carolina,Active,Jacksonville Jaguars,75.0,Bamberg-Ehrhardt HS,"Bamberg, SC","Cann, A.J.",60.0,OG,317.0,,27.0,Cann,A.J.,3.0


In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17172 entries, 'omarellison/2500540 to zurlontipton/2550407
Data columns (total 17 columns):
birth_place             14794 non-null object
birthday                16835 non-null datetime64[ns]
college                 17133 non-null object
current_status          17172 non-null category
current_team            3096 non-null category
height_inches           17026 non-null float64
high_school             2514 non-null object
high_school_location    2510 non-null object
name                    17172 non-null object
number                  1708 non-null float64
position                3096 non-null category
weight_lbs              17121 non-null float64
years_played            14076 non-null object
computed_age            16835 non-null float64
last_name               17172 non-null object
first_name              17172 non-null object
years_experience        17133 non-null float64
dtypes: category(3), datetime64[ns](1), float64(5), object(8)
memo

In [69]:
# active players
active = df[df['current_status'] == 'Active']
active.head()

Unnamed: 0_level_0,birth_place,birthday,college,current_status,current_team,height_inches,high_school,high_school_location,name,number,position,weight_lbs,years_played,computed_age,last_name,first_name,years_experience
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
a'shawnrobinson/2555265,"Fort Worth , TX",1995-03-21,Alabama,Active,Detroit Lions,75.0,Arlington Heights HS,"Fort Worth, TX","Robinson, A'Shawn",91.0,DT,320.0,,23.0,Robinson,A'Shawn,2.0
a.j.bouye/2541162,"Dallas , TX",1991-08-16,Central Florida,Active,Jacksonville Jaguars,72.0,Tucker HS,GA,"Bouye, A.J.",,CB,191.0,,27.0,Bouye,A.J.,5.0
a.j.cann/2552330,"Bamberg , SC",1991-10-03,South Carolina,Active,Jacksonville Jaguars,75.0,Bamberg-Ehrhardt HS,"Bamberg, SC","Cann, A.J.",60.0,OG,317.0,,27.0,Cann,A.J.,3.0
a.j.derby/2552580,"Iowa City , IA",1991-09-20,Arkansas,Active,Denver Broncos,77.0,Iowa City HS,IA,"Derby, A.J.",83.0,TE,255.0,,27.0,Derby,A.J.,3.0
a.j.francis/2541707,"Washington , DC",1990-05-07,Maryland,Active,Washington Redskins,77.0,Gonzaga HS,"Washington, DC","Francis, A.J.",,DT,330.0,,28.0,Francis,A.J.,2.0


In [70]:
active.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2877 entries, a'shawnrobinson/2555265 to ziggyhood/2507780
Data columns (total 17 columns):
birth_place             2305 non-null object
birthday                2742 non-null datetime64[ns]
college                 2877 non-null object
current_status          2877 non-null category
current_team            2877 non-null category
height_inches           2877 non-null float64
high_school             2303 non-null object
high_school_location    2299 non-null object
name                    2877 non-null object
number                  1519 non-null float64
position                2877 non-null category
weight_lbs              2877 non-null float64
years_played            0 non-null object
computed_age            2742 non-null float64
last_name               2877 non-null object
first_name              2877 non-null object
years_experience        2877 non-null float64
dtypes: category(3), datetime64[ns](1), float64(5), object(8)
memory usage: 348.7+

In [71]:
active.drop('years_played',axis=1,inplace=True)
active.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2877 entries, a'shawnrobinson/2555265 to ziggyhood/2507780
Data columns (total 16 columns):
birth_place             2305 non-null object
birthday                2742 non-null datetime64[ns]
college                 2877 non-null object
current_status          2877 non-null category
current_team            2877 non-null category
height_inches           2877 non-null float64
high_school             2303 non-null object
high_school_location    2299 non-null object
name                    2877 non-null object
number                  1519 non-null float64
position                2877 non-null category
weight_lbs              2877 non-null float64
computed_age            2742 non-null float64
last_name               2877 non-null object
first_name              2877 non-null object
years_experience        2877 non-null float64
dtypes: category(3), datetime64[ns](1), float64(5), object(7)
memory usage: 326.3+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [72]:
# write processed file
active.to_csv('data/processed/basic_stats_active.csv')