In [1]:
import pandas as pd
import numpy as np

In [2]:
# Import cleaned injury data 
raw_data = pd.read_csv('Resources/new_cleaned_injury_data.csv')

In [3]:
# Drop the unnamed column
raw_data = raw_data.drop("Unnamed: 0", 1)
raw_data

  raw_data = raw_data.drop("Unnamed: 0", 1)


Unnamed: 0,Name,Team,Date,Status,injury_length,Injury,location
0,Jeff Suppan,Brewers,2010-03-30,Placed,15-day,cervical disc pain,
1,Josh Butler,Brewers,2010-03-30,Placed,15-day,elbow impingement,right
2,Alberto Arias,Astros,2010-03-31,Placed,15-day,rotator cuff injury,
3,Lance Berkman,Astros,2010-03-31,Placed,15-day,knee injury (surgery),left
4,Yorman Bazardo,Astros,2010-03-31,Placed,15-day,strained shoulder,right
...,...,...,...,...,...,...,...
16651,Luke Voit,Yankees,2021-09-30,Placed,10-day,knee inflammation (out for season),left
16652,Darin Ruf,Giants,2021-09-30,Activated,10-day,,
16653,Johnny Cueto,Giants,2021-09-30,Activated,10-day,,
16654,Craig Stammen,Padres,2021-09-30,Activated,10-day,,


In [4]:
# Check data types
raw_data.dtypes

Name             object
Team             object
Date             object
Status           object
injury_length    object
Injury           object
location         object
dtype: object

In [5]:
# Convert Date column type to datetime
raw_data['Date'] = pd.to_datetime(raw_data['Date'])

In [6]:
# Sort the dataset by names first, then by date
raw_data = raw_data.sort_values(by = ['Name', 'Date'], ascending = [True, True], na_position = 'first')
raw_data.head(20)

Unnamed: 0,Name,Team,Date,Status,injury_length,Injury,location
7476,,Mariners,2016-06-30,Placed,15-day,fractured hand,right
8916,,Padres,2017-06-23,Activated,10-day,,
13546,,Astros,2020-08-20,Placed,10-day,hamstring injury,right
14543,,White Sox,2021-04-15,Placed,10-day,strained abdominal,left
8844,(Christopher) Brian Johnson,Red Sox,2017-06-15,Placed,10-day,shoulder impingement,left
9035,(Christopher) Brian Johnson,Red Sox,2017-07-06,Activated,10-day,,
10674,(Christopher) Brian Johnson,Red Sox,2018-07-08,Placed,10-day,hip inflammation,left
10747,(Christopher) Brian Johnson,Red Sox,2018-07-15,Activated,10-day,,
11592,(Christopher) Brian Johnson,Red Sox,2019-04-06,Placed,10-day,elbow inflammation,left
12105,(Christopher) Brian Johnson,Red Sox,2019-06-14,Activated,10-day,,


In [7]:
# Drop the rows that have NaN values for the name 
raw_data = raw_data.drop([14543, 13546, 8916, 7476])
raw_data.head(20)

Unnamed: 0,Name,Team,Date,Status,injury_length,Injury,location
8844,(Christopher) Brian Johnson,Red Sox,2017-06-15,Placed,10-day,shoulder impingement,left
9035,(Christopher) Brian Johnson,Red Sox,2017-07-06,Activated,10-day,,
10674,(Christopher) Brian Johnson,Red Sox,2018-07-08,Placed,10-day,hip inflammation,left
10747,(Christopher) Brian Johnson,Red Sox,2018-07-15,Activated,10-day,,
11592,(Christopher) Brian Johnson,Red Sox,2019-04-06,Placed,10-day,elbow inflammation,left
12105,(Christopher) Brian Johnson,Red Sox,2019-06-14,Activated,10-day,,
12226,(Christopher) Brian Johnson,Red Sox,2019-06-29,Placed,10-day,non-baseball medical condition,
12499,(Christopher) Brian Johnson,Red Sox,2019-08-03,Activated,10-day,,
5061,(Darrell) David Carpenter,Braves,2014-06-17,Placed,15-day,strained biceps,right
5156,(Darrell) David Carpenter,Braves,2014-07-02,Activated,15-day,,


In [8]:
# Create a new column to hold number of days on IL/DL
raw_data["time_out"] = ''
raw_data.head(20)

Unnamed: 0,Name,Team,Date,Status,injury_length,Injury,location,time_out
8844,(Christopher) Brian Johnson,Red Sox,2017-06-15,Placed,10-day,shoulder impingement,left,
9035,(Christopher) Brian Johnson,Red Sox,2017-07-06,Activated,10-day,,,
10674,(Christopher) Brian Johnson,Red Sox,2018-07-08,Placed,10-day,hip inflammation,left,
10747,(Christopher) Brian Johnson,Red Sox,2018-07-15,Activated,10-day,,,
11592,(Christopher) Brian Johnson,Red Sox,2019-04-06,Placed,10-day,elbow inflammation,left,
12105,(Christopher) Brian Johnson,Red Sox,2019-06-14,Activated,10-day,,,
12226,(Christopher) Brian Johnson,Red Sox,2019-06-29,Placed,10-day,non-baseball medical condition,,
12499,(Christopher) Brian Johnson,Red Sox,2019-08-03,Activated,10-day,,,
5061,(Darrell) David Carpenter,Braves,2014-06-17,Placed,15-day,strained biceps,right,
5156,(Darrell) David Carpenter,Braves,2014-07-02,Activated,15-day,,,


In [9]:
# Check amount of null values across dataframe
# This is mostly to see how many NaNs are in the injujry length column because we are 
# about to drop them
raw_data.isna().sum()

Name                0
Team                0
Date                0
Status              0
injury_length    1828
Injury           8179
location         9723
time_out            0
dtype: int64

In [10]:
# drop rows with NaN in injury_length column
# These indicated when a player was upgraded from say the 15 to 60 day IL/DL
# Which we do not need to know
date_df = raw_data[raw_data['injury_length'].notna()]
date_df.head(20)

Unnamed: 0,Name,Team,Date,Status,injury_length,Injury,location,time_out
8844,(Christopher) Brian Johnson,Red Sox,2017-06-15,Placed,10-day,shoulder impingement,left,
9035,(Christopher) Brian Johnson,Red Sox,2017-07-06,Activated,10-day,,,
10674,(Christopher) Brian Johnson,Red Sox,2018-07-08,Placed,10-day,hip inflammation,left,
10747,(Christopher) Brian Johnson,Red Sox,2018-07-15,Activated,10-day,,,
11592,(Christopher) Brian Johnson,Red Sox,2019-04-06,Placed,10-day,elbow inflammation,left,
12105,(Christopher) Brian Johnson,Red Sox,2019-06-14,Activated,10-day,,,
12226,(Christopher) Brian Johnson,Red Sox,2019-06-29,Placed,10-day,non-baseball medical condition,,
12499,(Christopher) Brian Johnson,Red Sox,2019-08-03,Activated,10-day,,,
5061,(Darrell) David Carpenter,Braves,2014-06-17,Placed,15-day,strained biceps,right,
5156,(Darrell) David Carpenter,Braves,2014-07-02,Activated,15-day,,,


In [11]:
# calculate the difference in the date column with the value before it
# AKA, calculate how long the player was on the IL/DL
date_df['time_out'] = date_df.Date.diff()
date_df.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  date_df['time_out'] = date_df.Date.diff()


Unnamed: 0,Name,Team,Date,Status,injury_length,Injury,location,time_out
8844,(Christopher) Brian Johnson,Red Sox,2017-06-15,Placed,10-day,shoulder impingement,left,NaT
9035,(Christopher) Brian Johnson,Red Sox,2017-07-06,Activated,10-day,,,21 days
10674,(Christopher) Brian Johnson,Red Sox,2018-07-08,Placed,10-day,hip inflammation,left,367 days
10747,(Christopher) Brian Johnson,Red Sox,2018-07-15,Activated,10-day,,,7 days
11592,(Christopher) Brian Johnson,Red Sox,2019-04-06,Placed,10-day,elbow inflammation,left,265 days
12105,(Christopher) Brian Johnson,Red Sox,2019-06-14,Activated,10-day,,,69 days
12226,(Christopher) Brian Johnson,Red Sox,2019-06-29,Placed,10-day,non-baseball medical condition,,15 days
12499,(Christopher) Brian Johnson,Red Sox,2019-08-03,Activated,10-day,,,35 days
5061,(Darrell) David Carpenter,Braves,2014-06-17,Placed,15-day,strained biceps,right,-1873 days
5156,(Darrell) David Carpenter,Braves,2014-07-02,Activated,15-day,,,15 days


In [12]:
# delete time_out values for rows that have 'Placed' in the Status column
# this is unnecessary info that will mess up our machine learning results if kept
for idx, row in date_df.iterrows():
    if  date_df.loc[idx,'Status'] == 'Placed':
        date_df.loc[idx,'time_out'] = ''

date_df.head(30)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Unnamed: 0,Name,Team,Date,Status,injury_length,Injury,location,time_out
8844,(Christopher) Brian Johnson,Red Sox,2017-06-15,Placed,10-day,shoulder impingement,left,NaT
9035,(Christopher) Brian Johnson,Red Sox,2017-07-06,Activated,10-day,,,21 days
10674,(Christopher) Brian Johnson,Red Sox,2018-07-08,Placed,10-day,hip inflammation,left,NaT
10747,(Christopher) Brian Johnson,Red Sox,2018-07-15,Activated,10-day,,,7 days
11592,(Christopher) Brian Johnson,Red Sox,2019-04-06,Placed,10-day,elbow inflammation,left,NaT
12105,(Christopher) Brian Johnson,Red Sox,2019-06-14,Activated,10-day,,,69 days
12226,(Christopher) Brian Johnson,Red Sox,2019-06-29,Placed,10-day,non-baseball medical condition,,NaT
12499,(Christopher) Brian Johnson,Red Sox,2019-08-03,Activated,10-day,,,35 days
5061,(Darrell) David Carpenter,Braves,2014-06-17,Placed,15-day,strained biceps,right,NaT
5156,(Darrell) David Carpenter,Braves,2014-07-02,Activated,15-day,,,15 days


In [13]:
# Save this dataframe as a new CSV file
date_df.to_csv("cleaned_injury_data_with_dates.csv")