In [290]:
import pandas as pd
import numpy as np


# Exploratory Data Analysis - Turf Datasets

## Analyzing the Plays from the PlayList file

The first thing to note is that this list contains all of the plays, including the exact play that will match with the injury list, therefore anything that is on both with the exception of the PlayerKey should be maintained on THIS DF so that we don't lose data on the non-injury columns

In [291]:
plays = pd.read_csv('NFL_Turf/PlayList.csv')  # 267,000 rows
plays.head()

Unnamed: 0,PlayerKey,GameID,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup
0,26624,26624-1,26624-1-1,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,1,QB,QB
1,26624,26624-1,26624-1-2,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,2,QB,QB
2,26624,26624-1,26624-1-3,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,3,QB,QB
3,26624,26624-1,26624-1-4,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,4,QB,QB
4,26624,26624-1,26624-1-5,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,5,QB,QB


PlayKey will be used as the Key to merge the datasets, so PlayerKey and GameID can be removed. While FieldType information is also in the surface column of the injuries table, we need to maintain it here, so we don't lose the data from the columns not containing injuries. 

In [292]:
plays.drop(columns=['PlayerKey', 'GameID'], inplace=True)
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup
0,26624-1-1,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,1,QB,QB
1,26624-1-2,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,2,QB,QB
2,26624-1-3,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,3,QB,QB
3,26624-1-4,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,4,QB,QB
4,26624-1-5,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,5,QB,QB


In [293]:
plays.nunique()

PlayKey           267005
RosterPosition        10
PlayerDay            215
PlayerGame            32
StadiumType           29
FieldType              2
Temperature           79
Weather               63
PlayType              11
PlayerGamePlay       102
Position              23
PositionGroup         10
dtype: int64

In [294]:
objects = plays.dtypes[plays.dtypes == 'object'].index.tolist()
objects

['PlayKey',
 'RosterPosition',
 'StadiumType',
 'FieldType',
 'Weather',
 'PlayType',
 'Position',
 'PositionGroup']

- PlayKeys represent all plays, not only those where injuries occurred - these will function to merge the tables
- FieldType only has 2 values, Natural or Synthetic and can be easily changed to binary values 
- Stadium Type is also strange with 29 unique types of stadiums. These can likely be grouped in smaller categories.
- Weather - there are 63 unique types of weather.... this is odd. 
- RosterPosition, Position, and Position Group are all similar and need to be investigated
- PlayTypes should be encoded, as they are categorical such as pass, rush, kick, ... 


### Change the Field Types to Binary Values

In [295]:
# Creates a function to change the surface values
def surface_code(row):
    surface = row['FieldType']
    coded_surface = 0
    if surface == 'Natural':
        coded_surface = 0
    elif surface == 'Synthetic':
        coded_surface = 1

    return coded_surface

In [296]:
# Create a new column called Coded_Surface that encodes 0 for a natural surface and 1 for a synthetic surface, then verify
plays['Coded_Surface'] = plays.apply(surface_code, axis=1)
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup,Coded_Surface
0,26624-1-1,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,1,QB,QB,1
1,26624-1-2,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,2,QB,QB,1
2,26624-1-3,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,3,QB,QB,1
3,26624-1-4,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,4,QB,QB,1
4,26624-1-5,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,5,QB,QB,1


In [297]:
# The code above worked, now change the FieldType to the coded and remove the redundant column
plays['FieldType'] = plays['Coded_Surface']
plays.drop(columns='Coded_Surface', inplace=True)

In [298]:
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup
0,26624-1-1,Quarterback,1,1,Outdoor,1,63,Clear and warm,Pass,1,QB,QB
1,26624-1-2,Quarterback,1,1,Outdoor,1,63,Clear and warm,Pass,2,QB,QB
2,26624-1-3,Quarterback,1,1,Outdoor,1,63,Clear and warm,Rush,3,QB,QB
3,26624-1-4,Quarterback,1,1,Outdoor,1,63,Clear and warm,Rush,4,QB,QB
4,26624-1-5,Quarterback,1,1,Outdoor,1,63,Clear and warm,Pass,5,QB,QB


### Reduce the Number of Stadium Types to Something Meaningful

It turns out that there are a lot of misspelled stadium types. There are 7 unique spellings of the word 'Outdoor' alone. Also, the people of Pittsburgh seemed pretty confused as to the meaning of Stadium Type, as there are MANY entries listing the stadium type as Heinz Field. 

In [299]:
stadiums = plays.StadiumType.unique().tolist()
stadiums

['Outdoor',
 'Indoors',
 'Oudoor',
 'Outdoors',
 'Open',
 'Closed Dome',
 'Domed, closed',
 nan,
 'Dome',
 'Indoor',
 'Domed',
 'Retr. Roof-Closed',
 'Outdoor Retr Roof-Open',
 'Retractable Roof',
 'Ourdoor',
 'Indoor, Roof Closed',
 'Retr. Roof - Closed',
 'Bowl',
 'Outddors',
 'Retr. Roof-Open',
 'Dome, closed',
 'Indoor, Open Roof',
 'Domed, Open',
 'Domed, open',
 'Heinz Field',
 'Cloudy',
 'Retr. Roof - Open',
 'Retr. Roof Closed',
 'Outdor',
 'Outside']

In [300]:
# How many Stadium Types are missing? 
plays.StadiumType.isna().sum()

16910

In [301]:
# Since most stadiums are outdoor stadiums, for now, just going to change any NaN stadiums to outdoor for now
plays.StadiumType.fillna('Outdoor', inplace=True)
plays.StadiumType.isna().sum()

0

Grouping all stadiums into Outdoor, Indoor, Open Dome, or Closed Dome using a dictionary 

In [302]:
dict = {'Outdoor': 'Outdoor',
        'Indoors': 'Indoor',
        'Oudoor': 'Outdoor',
        'Outdoors': 'Outdoor',
        'Open': 'Open Dome',
        'Closed Dome': 'Closed Dome',
        'Domed, closed': 'Closed Dome',
        'Dome': 'Closed Dome',
        'Indoor': 'Indoor',
        'Domed': 'Closed Dome',
        'Retr. Roof-Closed': 'Closed Dome',
        'Outdoor Retr Roof-Open': 'Open Dome',
        'Retractable Roof': 'Open Dome',
        'Ourdoor': 'Outdoor',
        'Indoor, Roof Closed': 'Closed Dome',
        'Retr. Roof - Closed': 'Closed Dome',
        'Bowl': 'Outdoor',
        'Outddors': 'Outdoor',
        'Retr. Roof-Open': 'Open Dome',
        'Dome, closed': 'Closed Dome',
        'Indoor, Open Roof': 'Open Dome',
        'Domed, Open': 'Open Dome',
        'Domed, open': 'Open Dome',
        'Heinz Field': 'Outdoor',
        'Cloudy': 'Outdoor',
        'Retr. Roof - Open': 'Open Dome',
        'Retr. Roof Closed': 'Closed Dome',
        'Outdor': 'Outdoor',
        'Outside': 'Outdoor'}


plays.StadiumType.replace(dict, inplace=True)
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup
0,26624-1-1,Quarterback,1,1,Outdoor,1,63,Clear and warm,Pass,1,QB,QB
1,26624-1-2,Quarterback,1,1,Outdoor,1,63,Clear and warm,Pass,2,QB,QB
2,26624-1-3,Quarterback,1,1,Outdoor,1,63,Clear and warm,Rush,3,QB,QB
3,26624-1-4,Quarterback,1,1,Outdoor,1,63,Clear and warm,Rush,4,QB,QB
4,26624-1-5,Quarterback,1,1,Outdoor,1,63,Clear and warm,Pass,5,QB,QB


### Dealing with the Weather Situation

There were a lot of different entries meaning the same thing; these were grouped in a dictionary the same way the stadiums were, and can be adjusted if necessary 

In [303]:
weather_dict = {'Clear and warm': 'Clear',
                'Mostly Cloudy': 'Cloudy',
                'Sunny': 'Clear',
                'Clear': 'Clear',
                'Cloudy': 'Cloudy',
                'Cloudy, fog started developing in 2nd quarter': 'Hazy/Fog',
                'Rain': 'Rain',
                'Partly Cloudy': 'Cloudy',
                'Mostly cloudy': 'Cloudy',
                'Cloudy and cold': 'Cloudy',
                'Cloudy and Cool': 'Cloudy',
                'Rain Chance 40%': 'Rain',
                'Controlled Climate': 'Indoor',
                'Sunny and warm': 'Clear',
                'Partly cloudy': 'Cloudy',
                'Clear and Cool': 'Cloudy',
                'Clear and cold': 'Cloudy',
                'Sunny and cold': 'Clear',
                'Indoor': 'Indoor',
                'Partly Sunny': 'Clear',
                'N/A (Indoors)': 'Indoor',
                'Mostly Sunny': 'Clear',
                'Indoors': 'Indoor',
                'Clear Skies': 'Clear',
                'Partly sunny': 'Clear',
                'Showers': 'Rain',
                'N/A Indoor': 'Indoor',
                'Sunny and clear': 'Clear',
                'Snow': 'Snow',
                'Scattered Showers': 'Rain',
                'Party Cloudy': 'Cloudy',
                'Clear skies': 'Clear',
                'Rain likely, temps in low 40s.': 'Rain',
                'Hazy': 'Hazy/Fog',
                'Partly Clouidy': 'Cloudy',
                'Sunny Skies': 'Clear',
                'Overcast': 'Cloudy',
                'Cloudy, 50% change of rain': 'Cloudy',
                'Fair': 'Clear',
                'Light Rain': 'Rain',
                'Partly clear': 'Clear',
                'Mostly Coudy': 'Cloudy',
                '10% Chance of Rain': 'Cloudy',
                'Cloudy, chance of rain': 'Cloudy',
                'Heat Index 95': 'Clear',
                'Sunny, highs to upper 80s': 'Clear',
                'Sun & clouds': 'Cloudy',
                'Heavy lake effect snow': 'Snow',
                'Mostly sunny': 'Clear',
                'Cloudy, Rain': 'Rain',
                'Sunny, Windy': 'Windy',
                'Mostly Sunny Skies': 'Clear',
                'Rainy': 'Rain',
                '30% Chance of Rain': 'Rain',
                'Cloudy, light snow accumulating 1-3"': 'Snow',
                'cloudy': 'Cloudy',
                'Clear and Sunny': 'Clear',
                'Coudy': 'Cloudy',
                'Clear and sunny': 'Clear',
                'Clear to Partly Cloudy': 'Clear',
                'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.': 'Windy',
                'Rain shower': 'Rain',
                'Cold': 'Clear'}

plays.Weather.replace(weather_dict, inplace=True)
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup
0,26624-1-1,Quarterback,1,1,Outdoor,1,63,Clear,Pass,1,QB,QB
1,26624-1-2,Quarterback,1,1,Outdoor,1,63,Clear,Pass,2,QB,QB
2,26624-1-3,Quarterback,1,1,Outdoor,1,63,Clear,Rush,3,QB,QB
3,26624-1-4,Quarterback,1,1,Outdoor,1,63,Clear,Rush,4,QB,QB
4,26624-1-5,Quarterback,1,1,Outdoor,1,63,Clear,Pass,5,QB,QB


Assess whether the nan rows are indoor statiums, in which case, change to Indoor, otherwise remove

In [304]:
plays['Weather'].unique()


array(['Clear', 'Cloudy', 'Hazy/Fog', 'Rain', 'Indoor', nan, 'Snow',
       'Windy'], dtype=object)

In [305]:
plays.Weather.value_counts()


Cloudy      112306
Clear        96985
Indoor       20276
Rain         14280
Snow          1945
Hazy/Fog      1809
Windy          713
Name: Weather, dtype: int64

In [306]:
plays.Weather.isna().sum()

18691

In [307]:
# This line of code identifies from the plays table, where the stadium type is 'Indoor' and then fills NaN values in the 'Weather' column with 'Indoor'.
plays.loc[plays.StadiumType == 'Indoor', 'Weather'] = plays.loc[plays.StadiumType == 'Indoor', 'Weather'].fillna('Indoor')


In [308]:
# This addeda bout 7000 values to the Indoor values
plays.Weather.value_counts()

Cloudy      112306
Clear        96985
Indoor       27113
Rain         14280
Snow          1945
Hazy/Fog      1809
Windy          713
Name: Weather, dtype: int64

In [309]:
# The remaining ~ 12,000 were outdoor with no weather - going to remove these since it's impossible to predict the weather conditions
plays.Weather.isna().sum()

11854

In [310]:
# It's possible to determine the weather on those days if absolutely necessary, this looks like about 4.7% of the data...
plays = plays.loc[plays.Weather.isna() == False]
plays.Weather.isna().sum()

0

In [311]:
# Weather has been reduced from 63 different values to 7
plays.Weather.nunique()

7

Now that the Weather has been reduced to fewer than 10, it is ready to be encoded.

### Addressing the Positions Issue

RosterPositions are similar to the PositionGroups, only not put in abbreviations. Will need to change the Roster Positions into abbreviations first. PositionGroups can be dropped, since they are nearly identical to the Roster and actual positions. 

In [312]:
roster = plays.RosterPosition.unique()
roster

array(['Quarterback', 'Wide Receiver', 'Linebacker', 'Running Back',
       'Defensive Lineman', 'Tight End', 'Safety', 'Cornerback',
       'Offensive Lineman', 'Kicker'], dtype=object)

In [313]:
abbreviations = plays.Position.unique()
abbreviations

array(['QB', 'Missing Data', 'WR', 'ILB', 'RB', 'DE', 'TE', 'FS', 'CB',
       'G', 'T', 'OLB', 'DT', 'SS', 'MLB', 'C', 'NT', 'DB', 'K', 'LB',
       'S', 'HB', 'P'], dtype=object)

Going to change the the positions the same way, using a dictionary

In [314]:
position_dict = {
    'Quarterback': 'QB',
    'Wide Receiver': 'WR',
    'Linebacker': 'LB',
    'Running Back': 'RB',
    'Defensive Lineman': 'DL',
    'Tight End': 'TE',
    'Safety': 'S',
    'Cornerback': 'CB',
    'Offensive Lineman': 'OL',
    'Kicker': 'K'
}

plays.RosterPosition.replace(position_dict, inplace=True)
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup
0,26624-1-1,QB,1,1,Outdoor,1,63,Clear,Pass,1,QB,QB
1,26624-1-2,QB,1,1,Outdoor,1,63,Clear,Pass,2,QB,QB
2,26624-1-3,QB,1,1,Outdoor,1,63,Clear,Rush,3,QB,QB
3,26624-1-4,QB,1,1,Outdoor,1,63,Clear,Rush,4,QB,QB
4,26624-1-5,QB,1,1,Outdoor,1,63,Clear,Pass,5,QB,QB


In [315]:
# Drop the Position Group column
plays = plays.drop(columns='PositionGroup')
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position
0,26624-1-1,QB,1,1,Outdoor,1,63,Clear,Pass,1,QB
1,26624-1-2,QB,1,1,Outdoor,1,63,Clear,Pass,2,QB
2,26624-1-3,QB,1,1,Outdoor,1,63,Clear,Rush,3,QB
3,26624-1-4,QB,1,1,Outdoor,1,63,Clear,Rush,4,QB
4,26624-1-5,QB,1,1,Outdoor,1,63,Clear,Pass,5,QB


In [316]:
plays.Position[plays.Position == "Missing Data"].value_counts()

Missing Data    45
Name: Position, dtype: int64

In [317]:
# This code identifies "Missing Data" from the Position and replaces the missing value with the RosterPosition
plays['Position'] = np.where(plays['Position'] == 'Missing Data', plays['RosterPosition'], plays['Position'])

# Verify that the missing Data values have been replaced
plays.Position[plays.Position == "Missing Data"].value_counts()

Series([], Name: Position, dtype: int64)

In [318]:
plays.Position.unique()

array(['QB', 'WR', 'ILB', 'RB', 'DE', 'TE', 'FS', 'CB', 'G', 'T', 'OLB',
       'LB', 'DT', 'SS', 'MLB', 'C', 'OL', 'NT', 'DL', 'DB', 'K', 'S',
       'HB', 'P'], dtype=object)

## Now To Address the Injuries Dataset

In [319]:
injuries = pd.read_csv("NFL_Turf/InjuryRecord.csv")  # 105 rows
injuries.head()

Unnamed: 0,PlayerKey,GameID,PlayKey,BodyPart,Surface,DM_M1,DM_M7,DM_M28,DM_M42
0,39873,39873-4,39873-4-32,Knee,Synthetic,1,1,1,1
1,46074,46074-7,46074-7-26,Knee,Natural,1,1,0,0
2,36557,36557-1,36557-1-70,Ankle,Synthetic,1,1,1,1
3,46646,46646-3,46646-3-30,Ankle,Natural,1,0,0,0
4,43532,43532-5,43532-5-69,Ankle,Synthetic,1,1,1,1


Evaluate all columns for na values

In [320]:
# The PlayKey column is the only one that has NaN values
injuries['PlayKey'].isna().sum()

28

In [321]:
# Drop the NaN values, since we won't be able to correlate these with the other tables
injuries = injuries.dropna(subset = ['PlayKey'])

In [322]:
injuries.nunique()

PlayerKey    74
GameID       76
PlayKey      76
BodyPart      3
Surface       2
DM_M1         1
DM_M7         2
DM_M28        2
DM_M42        2
dtype: int64

Note: there is only 1 unique value for DM_M1 - which means that every player on this list was injured for at least 1 day

In [323]:
injuries.dtypes

PlayerKey     int64
GameID       object
PlayKey      object
BodyPart     object
Surface      object
DM_M1         int64
DM_M7         int64
DM_M28        int64
DM_M42        int64
dtype: object

The Surface is the same as the Field Type from the other table, so this can be dropped. 
Note: Anyone whose injury is in the DM_M42 list is also in all of the prior lists, so there will be more of the lower values due the the encoding. Going to change this to a single column with values of 1, 7, 28, and 42

### Group the DM columns into a single Injury Duration column

In [324]:
def injury_duration(row):
    injury_duration = 0
    if row["DM_M42"] == 1:
        injury_duration = 42
    else:
        if row["DM_M28"] == 1:
            injury_duration = 28
        else:
            if row["DM_M7"] == 1:
                injury_duration = 7
            else: 
                injury_duration = 1
    
    return injury_duration

# Apply the function to all rows
injuries['Injury_Duration'] = injuries.apply(injury_duration, axis=1)
injuries.head()


Unnamed: 0,PlayerKey,GameID,PlayKey,BodyPart,Surface,DM_M1,DM_M7,DM_M28,DM_M42,Injury_Duration
0,39873,39873-4,39873-4-32,Knee,Synthetic,1,1,1,1,42
1,46074,46074-7,46074-7-26,Knee,Natural,1,1,0,0,7
2,36557,36557-1,36557-1-70,Ankle,Synthetic,1,1,1,1,42
3,46646,46646-3,46646-3-30,Ankle,Natural,1,0,0,0,1
4,43532,43532-5,43532-5-69,Ankle,Synthetic,1,1,1,1,42


In [325]:
# Remove the rows for DMs
injuries.drop(columns=['DM_M1', 'DM_M7', 'DM_M28', 'DM_M42', 'Surface'], inplace=True)
injuries.head()

Unnamed: 0,PlayerKey,GameID,PlayKey,BodyPart,Injury_Duration
0,39873,39873-4,39873-4-32,Knee,42
1,46074,46074-7,46074-7-26,Knee,7
2,36557,36557-1,36557-1-70,Ankle,42
3,46646,46646-3,46646-3-30,Ankle,1
4,43532,43532-5,43532-5-69,Ankle,42


Analyze the BodyPart of injury to verify it's ready for encoding

In [326]:
# The body parts are categorical and will be better using OneHotEncoder - apparently all of the toes injuries fell into the PlayKey = NaN values
injuries.BodyPart.unique()

array(['Knee', 'Ankle', 'Foot'], dtype=object)

In [327]:
# There are 100 individual players that have been injured for at least 1 day 
# Since there are 105 rows, does this mean there were 5 reinjuries?
injuries.PlayerKey.nunique()

74

In [329]:
# This output only 76 unique plays with only 74 players, so only 2 players were reinjured at different times of the season
injuries.PlayKey.nunique()

76

Every GameID and PlayID are unique, meaning that once that
particular player was injured during a specific game at a specific play,
they didn't return to the field. Since the GameID numbers are not in any 
chronological order and offer no information other than the PlayKey can, this column can be dropped

In [330]:
injuries.GameID.nunique()

76

Since the PlayerID, GamerID, and PlayKey number are all contained within the PlayKey, the GameID and PlayerID can be dropped. 

In [331]:
injuries.drop(columns=['GameID', 'PlayerKey'], inplace=True)
injuries.head()

Unnamed: 0,PlayKey,BodyPart,Injury_Duration
0,39873-4-32,Knee,42
1,46074-7-26,Knee,7
2,36557-1-70,Ankle,42
3,46646-3-30,Ankle,1
4,43532-5-69,Ankle,42


## Now Have 2 lists that can be merged

In [332]:
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position
0,26624-1-1,QB,1,1,Outdoor,1,63,Clear,Pass,1,QB
1,26624-1-2,QB,1,1,Outdoor,1,63,Clear,Pass,2,QB
2,26624-1-3,QB,1,1,Outdoor,1,63,Clear,Rush,3,QB
3,26624-1-4,QB,1,1,Outdoor,1,63,Clear,Rush,4,QB
4,26624-1-5,QB,1,1,Outdoor,1,63,Clear,Pass,5,QB


In [333]:
injuries.head()

Unnamed: 0,PlayKey,BodyPart,Injury_Duration
0,39873-4-32,Knee,42
1,46074-7-26,Knee,7
2,36557-1-70,Ankle,42
3,46646-3-30,Ankle,1
4,43532-5-69,Ankle,42


In [336]:
play_injuries = pd.merge(plays, injuries, on='PlayKey', how='outer')

In [337]:
play_injuries.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,BodyPart,Injury_Duration
0,26624-1-1,QB,1.0,1.0,Outdoor,1.0,63.0,Clear,Pass,1.0,QB,,
1,26624-1-2,QB,1.0,1.0,Outdoor,1.0,63.0,Clear,Pass,2.0,QB,,
2,26624-1-3,QB,1.0,1.0,Outdoor,1.0,63.0,Clear,Rush,3.0,QB,,
3,26624-1-4,QB,1.0,1.0,Outdoor,1.0,63.0,Clear,Rush,4.0,QB,,
4,26624-1-5,QB,1.0,1.0,Outdoor,1.0,63.0,Clear,Pass,5.0,QB,,


### Add values for duration and Body Part. Change NaN to None for body part. Change Injury_Duration to 0 for all NaN values