In [195]:
import pandas as pd


# Exploratory Data Analysis - Turf Datasets

In [196]:
injuries = pd.read_csv("NFL_Turf/InjuryRecord.csv")  # 105 rows
injuries.head()


Unnamed: 0,PlayerKey,GameID,PlayKey,BodyPart,Surface,DM_M1,DM_M7,DM_M28,DM_M42
0,39873,39873-4,39873-4-32,Knee,Synthetic,1,1,1,1
1,46074,46074-7,46074-7-26,Knee,Natural,1,1,0,0
2,36557,36557-1,36557-1-70,Ankle,Synthetic,1,1,1,1
3,46646,46646-3,46646-3-30,Ankle,Natural,1,0,0,0
4,43532,43532-5,43532-5-69,Ankle,Synthetic,1,1,1,1


In [197]:
# The PlayKey column is the only one that has NaN values
injuries['PlayKey'].isna().sum()

28

In [198]:
# Drop the NaN values, since we won't be able to correlate these with the other tables
injuries = injuries.dropna(subset = ['PlayKey'])

In [199]:
injuries.nunique()

PlayerKey    74
GameID       76
PlayKey      76
BodyPart      3
Surface       2
DM_M1         1
DM_M7         2
DM_M28        2
DM_M42        2
dtype: int64

Note: there is only 1 unique value for DM_M1 - which means that every player on this list was injured for at least 1 day

In [200]:
injuries.dtypes

PlayerKey     int64
GameID       object
PlayKey      object
BodyPart     object
Surface      object
DM_M1         int64
DM_M7         int64
DM_M28        int64
DM_M42        int64
dtype: object

In [201]:
injuries.Surface.unique()

array(['Synthetic', 'Natural'], dtype=object)

Changing the object values of Natural and Synthetic to 0 and 1, respectively

In [202]:
# Creates a function to change the surface values
def surface_code(row):
    surface = row['Surface']
    coded_surface = 0
    if surface == 'Natural':
        coded_surface = 0
    elif surface == 'Synthetic': 
        coded_surface = 1
    
    return coded_surface 

injuries['Coded_Surface'] = injuries.apply(surface_code, axis = 1)
injuries.head()

Unnamed: 0,PlayerKey,GameID,PlayKey,BodyPart,Surface,DM_M1,DM_M7,DM_M28,DM_M42,Coded_Surface
0,39873,39873-4,39873-4-32,Knee,Synthetic,1,1,1,1,1
1,46074,46074-7,46074-7-26,Knee,Natural,1,1,0,0,0
2,36557,36557-1,36557-1-70,Ankle,Synthetic,1,1,1,1,1
3,46646,46646-3,46646-3-30,Ankle,Natural,1,0,0,0,0
4,43532,43532-5,43532-5-69,Ankle,Synthetic,1,1,1,1,1


In [203]:
# Verify that the Coded_Surface worked correctly, then change the Surface Values to the Coded and drop the Coded_Surface column
injuries['Surface'] = injuries['Coded_Surface']
injuries.drop(columns='Coded_Surface', inplace=True)
injuries.head()

Unnamed: 0,PlayerKey,GameID,PlayKey,BodyPart,Surface,DM_M1,DM_M7,DM_M28,DM_M42
0,39873,39873-4,39873-4-32,Knee,1,1,1,1,1
1,46074,46074-7,46074-7-26,Knee,0,1,1,0,0
2,36557,36557-1,36557-1-70,Ankle,1,1,1,1,1
3,46646,46646-3,46646-3-30,Ankle,0,1,0,0,0
4,43532,43532-5,43532-5-69,Ankle,1,1,1,1,1


Note: Anyone whose injury is in the DM_M42 list is also in all of the prior lists, so there will be more of the lower values due the the encoding. Going to change this to a single column with values of 1, 7, 28, and 42

In [204]:
def injury_duration(row):
    injury_duration = 0
    if row["DM_M42"] == 1:
        injury_duration = 42
    else:
        if row["DM_M28"] == 1:
            injury_duration = 28
        else:
            if row["DM_M7"] == 1:
                injury_duration = 7
            else: 
                injury_duration = 1
    
    return injury_duration

# Apply the function to all rows
injuries['Injury_Duration'] = injuries.apply(injury_duration, axis=1)
injuries.head()


Unnamed: 0,PlayerKey,GameID,PlayKey,BodyPart,Surface,DM_M1,DM_M7,DM_M28,DM_M42,Injury_Duration
0,39873,39873-4,39873-4-32,Knee,1,1,1,1,1,42
1,46074,46074-7,46074-7-26,Knee,0,1,1,0,0,7
2,36557,36557-1,36557-1-70,Ankle,1,1,1,1,1,42
3,46646,46646-3,46646-3-30,Ankle,0,1,0,0,0,1
4,43532,43532-5,43532-5-69,Ankle,1,1,1,1,1,42


In [205]:
# Remove the rows for DMs
injuries.drop(columns=['DM_M1', 'DM_M7', 'DM_M28', 'DM_M42'], inplace=True)
injuries.head()

Unnamed: 0,PlayerKey,GameID,PlayKey,BodyPart,Surface,Injury_Duration
0,39873,39873-4,39873-4-32,Knee,1,42
1,46074,46074-7,46074-7-26,Knee,0,7
2,36557,36557-1,36557-1-70,Ankle,1,42
3,46646,46646-3,46646-3-30,Ankle,0,1
4,43532,43532-5,43532-5-69,Ankle,1,42


In [206]:
# The body parts are categorical and will be better using OneHotEncoder - apparently all of the toes injuries fell into the PlayKey = NaN values
injuries.BodyPart.unique()

array(['Knee', 'Ankle', 'Foot'], dtype=object)

In [207]:
# There are 100 individual players that have been injured for at least 1 day 
# Since there are 105 rows, does this mean there were 5 reinjuries?
injuries.PlayerKey.nunique()


74

In [208]:
injuries.PlayerKey.unique()

array([39873, 46074, 36557, 46646, 43532, 41145, 46014, 44860, 44806,
       45962, 46331, 36621, 44492, 43505, 41094, 40474, 39656, 46587,
       46119, 38364, 45966, 35611, 44434, 44489, 44511, 43826, 43518,
       34347, 41943, 41209, 44900, 31070, 38228, 39956, 45950, 43540,
       44440, 44449, 42406, 42637, 46430, 38192, 39678, 39850, 42600,
       42456, 46038, 41113, 47235, 47382, 44421, 42348, 42398, 36559,
       47220, 47813, 35570, 44482, 38876, 46098, 36607, 44542, 45983,
       33474, 47287, 33337, 47307, 43672, 46316, 42418, 46394, 45187,
       42448, 47334], dtype=int64)

In [209]:
# This output only 76 unique plays with only 74 players, so only 2 players were reinjured at different times of the season
injuries.PlayKey.nunique() 

76

In [210]:
# Every GameID and PlayID are unique, meaning that once that
#  particular player was injured during a specific game at a specific play,
#  they didn't return to the field. Since the GameID numbers are not in any 
# chronological order and offer no information other than the PlayKey can, this column can be dropped
injuries.GameID.nunique()

76

Since the PlayerID, GamerID, and PlayKey number are all contained within the PlayKey, the GameID and PlayerID can be dropped. 

In [211]:
injuries.drop(columns=['GameID', 'PlayerKey'], inplace=True)
injuries.head()

Unnamed: 0,PlayKey,BodyPart,Surface,Injury_Duration
0,39873-4-32,Knee,1,42
1,46074-7-26,Knee,0,7
2,36557-1-70,Ankle,1,42
3,46646-3-30,Ankle,0,1
4,43532-5-69,Ankle,1,42


Going to look at the PlayerKey rom the PlayList file

In [212]:
plays = pd.read_csv('NFL_Turf/PlayList.csv')  # 267,000 rows
plays.head()


Unnamed: 0,PlayerKey,GameID,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup
0,26624,26624-1,26624-1-1,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,1,QB,QB
1,26624,26624-1,26624-1-2,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,2,QB,QB
2,26624,26624-1,26624-1-3,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,3,QB,QB
3,26624,26624-1,26624-1-4,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,4,QB,QB
4,26624,26624-1,26624-1-5,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,5,QB,QB


PlayKey will be used as the Key to merge the datasets, so PlayerKey and GameID can once again be removed. Similarly, we already have the FieldType information in the surface column of the other table. 

In [213]:
plays.drop(columns=['PlayerKey', 'GameID', 'FieldType'], inplace=True)
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup
0,26624-1-1,Quarterback,1,1,Outdoor,63,Clear and warm,Pass,1,QB,QB
1,26624-1-2,Quarterback,1,1,Outdoor,63,Clear and warm,Pass,2,QB,QB
2,26624-1-3,Quarterback,1,1,Outdoor,63,Clear and warm,Rush,3,QB,QB
3,26624-1-4,Quarterback,1,1,Outdoor,63,Clear and warm,Rush,4,QB,QB
4,26624-1-5,Quarterback,1,1,Outdoor,63,Clear and warm,Pass,5,QB,QB


In [214]:
plays.nunique()

PlayKey           267005
RosterPosition        10
PlayerDay            215
PlayerGame            32
StadiumType           29
Temperature           79
Weather               63
PlayType              11
PlayerGamePlay       102
Position              23
PositionGroup         10
dtype: int64

In [215]:
objects = plays.dtypes[plays.dtypes == 'object'].index.tolist()
objects

['PlayKey',
 'RosterPosition',
 'StadiumType',
 'Weather',
 'PlayType',
 'Position',
 'PositionGroup']

- PlayKeys now represent all plays, not only those where injuries occurred - these will function to merge the tables
- RosterPosition, Position, and Position Group are all similar and need to be investigated
- Weather - there are 63 unique types of weather.... this is odd. 
- PlayTypes should be encoded, as they are categorical such as pass, rush, kick, ... 
- Stadium Type is also strange with 29 unique types of stadiums. These can likely be grouped in smaller categories.

In [216]:
# No Missing values from this column
plays.RosterPosition.unique()

array(['Quarterback', 'Wide Receiver', 'Linebacker', 'Running Back',
       'Defensive Lineman', 'Tight End', 'Safety', 'Cornerback',
       'Offensive Lineman', 'Kicker'], dtype=object)

In [217]:
plays.Position.unique()

array(['QB', 'Missing Data', 'WR', 'ILB', 'RB', 'DE', 'TE', 'FS', 'CB',
       'G', 'T', 'OLB', 'DT', 'SS', 'MLB', 'C', 'NT', 'DB', 'K', 'LB',
       'S', 'HB', 'P'], dtype=object)

In [218]:
plays.Position[plays.Position == "Missing Data"].value_counts()


Missing Data    45
Name: Position, dtype: int64

- There are 45 Missing values from the Actual position played, which can be set = the Roster Position. 
- Can change the Roster positions to [QB, WR, LB, RB, DL, TE, S, CB, OL, P]
- I'm thinking to fill the missing data in Position to the roster position and then dropping the position group. It will be interesting to see how the roster position differs from the actual position played.  

In [219]:
plays.PositionGroup.unique()

array(['QB', 'Missing Data', 'WR', 'LB', 'RB', 'DL', 'TE', 'DB', 'OL',
       'SPEC'], dtype=object)

In [220]:
plays.PositionGroup[plays.PositionGroup == "Missing Data"].value_counts()

Missing Data    45
Name: PositionGroup, dtype: int64

In [221]:
stadiums = plays.StadiumType.unique().tolist()
stadiums

['Outdoor',
 'Indoors',
 'Oudoor',
 'Outdoors',
 'Open',
 'Closed Dome',
 'Domed, closed',
 nan,
 'Dome',
 'Indoor',
 'Domed',
 'Retr. Roof-Closed',
 'Outdoor Retr Roof-Open',
 'Retractable Roof',
 'Ourdoor',
 'Indoor, Roof Closed',
 'Retr. Roof - Closed',
 'Bowl',
 'Outddors',
 'Retr. Roof-Open',
 'Dome, closed',
 'Indoor, Open Roof',
 'Domed, Open',
 'Domed, open',
 'Heinz Field',
 'Cloudy',
 'Retr. Roof - Open',
 'Retr. Roof Closed',
 'Outdor',
 'Outside']

Most of the values in the stadium types are just spelled wrong. There are at least 7 spellings of outdoor.

In [222]:
plays.StadiumType.isna().sum()

16910

In [223]:
plays.StadiumType.fillna('Outdoor', inplace=True)
plays.StadiumType.isna().sum()


0

In [224]:

dict = {'Outdoor': 'Outdoor',
         'Indoors': 'Indoor',
         'Oudoor': 'Outdoor',
         'Outdoors': 'Outdoor',
         'Open': 'Open Dome',
         'Closed Dome': 'Closed Dome',
         'Domed, closed': 'Closed Dome',
         'Dome': 'Closed Dome',
         'Indoor': 'Indoor',
         'Domed': 'Closed Dome',
         'Retr. Roof-Closed': 'Closed Dome',
         'Outdoor Retr Roof-Open': 'Open Dome',
         'Retractable Roof': 'Open Dome',
         'Ourdoor': 'Outdoor',
         'Indoor, Roof Closed': 'Closed Dome',
         'Retr. Roof - Closed': 'Closed Dome',
         'Bowl': 'Outdoor',
         'Outddors': 'Outdoor',
         'Retr. Roof-Open': 'Open Dome',
         'Dome, closed': 'Closed Dome',
         'Indoor, Open Roof': 'Open Dome',
         'Domed, Open': 'Open Dome',
         'Domed, open': 'Open Dome',
         'Heinz Field': 'Outdoor',
         'Cloudy': 'Outdoor',
         'Retr. Roof - Open': 'Open Dome',
         'Retr. Roof Closed': 'Closed Dome',
         'Outdor': 'Outdoor',
         'Outside': 'Outdoor'}


plays.StadiumType.replace(dict, inplace=True)
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup
0,26624-1-1,Quarterback,1,1,Outdoor,63,Clear and warm,Pass,1,QB,QB
1,26624-1-2,Quarterback,1,1,Outdoor,63,Clear and warm,Pass,2,QB,QB
2,26624-1-3,Quarterback,1,1,Outdoor,63,Clear and warm,Rush,3,QB,QB
3,26624-1-4,Quarterback,1,1,Outdoor,63,Clear and warm,Rush,4,QB,QB
4,26624-1-5,Quarterback,1,1,Outdoor,63,Clear and warm,Pass,5,QB,QB


In [225]:
plays.StadiumType.nunique()

4

In [226]:
weather_list = plays.Weather.unique().tolist()
weather_list

['Clear and warm',
 'Mostly Cloudy',
 'Sunny',
 'Clear',
 'Cloudy',
 'Cloudy, fog started developing in 2nd quarter',
 'Rain',
 'Partly Cloudy',
 'Mostly cloudy',
 'Cloudy and cold',
 'Cloudy and Cool',
 'Rain Chance 40%',
 'Controlled Climate',
 'Sunny and warm',
 'Partly cloudy',
 'Clear and Cool',
 'Clear and cold',
 'Sunny and cold',
 'Indoor',
 nan,
 'Partly Sunny',
 'N/A (Indoors)',
 'Mostly Sunny',
 'Indoors',
 'Clear Skies',
 'Partly sunny',
 'Showers',
 'N/A Indoor',
 'Sunny and clear',
 'Snow',
 'Scattered Showers',
 'Party Cloudy',
 'Clear skies',
 'Rain likely, temps in low 40s.',
 'Hazy',
 'Partly Clouidy',
 'Sunny Skies',
 'Overcast',
 'Cloudy, 50% change of rain',
 'Fair',
 'Light Rain',
 'Partly clear',
 'Mostly Coudy',
 '10% Chance of Rain',
 'Cloudy, chance of rain',
 'Heat Index 95',
 'Sunny, highs to upper 80s',
 'Sun & clouds',
 'Heavy lake effect snow',
 'Mostly sunny',
 'Cloudy, Rain',
 'Sunny, Windy',
 'Mostly Sunny Skies',
 'Rainy',
 '30% Chance of Rain',
 'Clo

In [227]:
weather_dict = {'Clear and warm': 'Clear',
 'Mostly Cloudy': 'Cloudy',
 'Sunny': 'Clear',
 'Clear': 'Clear',
 'Cloudy': 'Cloudy',
 'Cloudy, fog started developing in 2nd quarter': 'Cloudy',
 'Rain': 'Rain',
 'Partly Cloudy': 'Cloudy',
 'Mostly cloudy': 'Cloudy',
 'Cloudy and cold': 'Cloudy',
 'Cloudy and Cool': 'Cloudy',
 'Rain Chance 40%': 'Rain',
 'Controlled Climate': 'Indoor',
 'Sunny and warm': 'Clear',
 'Partly cloudy': 'Cloudy',
 'Clear and Cool': 'Cloudy',
 'Clear and cold': 'Cloudy',
 'Sunny and cold': 'Clear',
 'Indoor': 'Indoor',
 'Partly Sunny': 'Clear',
 'N/A (Indoors)': 'Indoor',
 'Mostly Sunny': 'Clear',
 'Indoors': 'Indoor',
 'Clear Skies': 'Clear',
 'Partly sunny': 'Clear',
 'Showers': 'Rain',
 'N/A Indoor': 'Indoor',
 'Sunny and clear': 'Clear',
 'Snow': 'Snow',
 'Scattered Showers': 'Rain',
 'Party Cloudy': 'Cloudy',
 'Clear skies': 'Clear',
 'Rain likely, temps in low 40s.': 'Rain',
 'Hazy': 'Cloudy',
 'Partly Clouidy': 'Cloudy',
 'Sunny Skies': 'Clear',
 'Overcast': 'Cloudy',
 'Cloudy, 50% change of rain': 'Cloudy',
 'Fair': 'Clear',
 'Light Rain': 'Rain',
 'Partly clear': 'Clear',
 'Mostly Coudy': 'Cloudy',
 '10% Chance of Rain': 'Cloudy',
 'Cloudy, chance of rain': 'Cloudy',
 'Heat Index 95': 'Clear',
 'Sunny, highs to upper 80s': 'Clear',
 'Sun & clouds': 'Cloudy',
 'Heavy lake effect snow': 'Snow',
 'Mostly sunny': 'Clear',
 'Cloudy, Rain': 'Rain',
 'Sunny, Windy': 'Windy',
 'Mostly Sunny Skies': 'Clear',
 'Rainy': 'Rain',
 '30% Chance of Rain': 'Rain',
 'Cloudy, light snow accumulating 1-3"': 'Snow',
 'cloudy': 'Cloudy',
 'Clear and Sunny': 'Clear',
 'Coudy': 'Cloudy',
 'Clear and sunny': 'Clear',
 'Clear to Partly Cloudy': 'Clear',
 'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.': 'Windy',
 'Rain shower': 'Rain',
 'Cold': 'Clear'}

plays.Weather.replace(weather_dict, inplace=True)
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup
0,26624-1-1,Quarterback,1,1,Outdoor,63,Clear,Pass,1,QB,QB
1,26624-1-2,Quarterback,1,1,Outdoor,63,Clear,Pass,2,QB,QB
2,26624-1-3,Quarterback,1,1,Outdoor,63,Clear,Rush,3,QB,QB
3,26624-1-4,Quarterback,1,1,Outdoor,63,Clear,Rush,4,QB,QB
4,26624-1-5,Quarterback,1,1,Outdoor,63,Clear,Pass,5,QB,QB


Assess whether the nan rows are indoor statiums, in which case, change to Indoor, otherwise ???

In [228]:
plays['Weather'].unique()

array(['Clear', 'Cloudy', 'Rain', 'Indoor', nan, 'Snow', 'Windy'],
      dtype=object)

In [229]:
plays.Weather.value_counts()

Cloudy    114115
Clear      96985
Indoor     20276
Rain       14280
Snow        1945
Windy        713
Name: Weather, dtype: int64

In [231]:
plays.Weather.isna().sum()

18691

In [232]:
plays.loc[plays.StadiumType == 'Indoor', 'Weather'] = plays.loc[plays.StadiumType == 'Indoor', 'Weather'].fillna('Indoor')


In [233]:
plays.Weather.value_counts()

Cloudy    114115
Clear      96985
Indoor     27113
Rain       14280
Snow        1945
Windy        713
Name: Weather, dtype: int64

In [234]:
plays.Weather.isna().sum()

11854

For now, drop the remaining columns where weather is unknown. 

In [239]:
plays = plays.loc[plays.Weather.isna() == False]
plays.Weather.isna().sum()

0

So now the NaN values for outdoor games have been removed, and all non-outdoor games were set to indoor.

In [243]:
plays.Weather.nunique()

6

With 6 unique weather values, these data are ready to be encoded. 