In [1]:
# Import dependencies

import pandas as pd
import datetime as dt
import csv
import os
import numpy as np

In [2]:
# Read the raw player records from Resources/People.csv into a DataFrame

file_path = "Resources/People.csv"
data_df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
data_df.head(n=5)

Unnamed: 0,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,...,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
0,aardsda01,1981.0,12.0,27.0,USA,CO,Denver,,,,...,Aardsma,David Allan,215.0,75.0,R,R,4/6/2004,8/23/2015,aardd001,aardsda01
1,aaronha01,1934.0,2.0,5.0,USA,AL,Mobile,2021.0,1.0,22.0,...,Aaron,Henry Louis,180.0,72.0,R,R,4/13/1954,10/3/1976,aaroh101,aaronha01
2,aaronto01,1939.0,8.0,5.0,USA,AL,Mobile,1984.0,8.0,16.0,...,Aaron,Tommie Lee,190.0,75.0,R,R,4/10/1962,9/26/1971,aarot101,aaronto01
3,aasedo01,1954.0,9.0,8.0,USA,CA,Orange,,,,...,Aase,Donald William,190.0,75.0,R,R,7/26/1977,10/3/1990,aased001,aasedo01
4,abadan01,1972.0,8.0,25.0,USA,FL,Palm Beach,,,,...,Abad,Fausto Andres,184.0,73.0,L,L,9/10/2001,4/13/2006,abada001,abadan01


In [3]:
# Keep only the player name columns and the debut / final game dates.
#
# Bracket selection is used instead of pd.DataFrame(data_df, columns=[...]):
# the constructor reindexes and would silently create an all-NaN column for
# any name that is misspelled or absent, whereas this raises KeyError.
# .copy() makes the result independent of data_df so later column
# assignments do not touch (or warn about) the raw frame.
career_columns = ['nameFirst', 'nameLast', 'debut', 'finalGame']
career_data_df = data_df[career_columns].copy()
career_data_df.head(5)

Unnamed: 0,nameFirst,nameLast,debut,finalGame
0,David,Aardsma,4/6/2004,8/23/2015
1,Hank,Aaron,4/13/1954,10/3/1976
2,Tommie,Aaron,4/10/1962,9/26/1971
3,Don,Aase,7/26/1977,10/3/1990
4,Andy,Abad,9/10/2001,4/13/2006


In [4]:
# Build the player_name column as "<nameFirst> <nameLast>"

career_data_df["player_name"] = career_data_df['nameFirst'].str.cat(
    career_data_df['nameLast'], sep=' '
)
career_data_df.head(5)
                                                  

Unnamed: 0,nameFirst,nameLast,debut,finalGame,player_name
0,David,Aardsma,4/6/2004,8/23/2015,David Aardsma
1,Hank,Aaron,4/13/1954,10/3/1976,Hank Aaron
2,Tommie,Aaron,4/10/1962,9/26/1971,Tommie Aaron
3,Don,Aase,7/26/1977,10/3/1990,Don Aase
4,Andy,Abad,9/10/2001,4/13/2006,Andy Abad


In [5]:
# Remove the separate first/last name columns now that player_name exists

career_data_df = career_data_df.drop(columns=["nameFirst", "nameLast"])
career_data_df.head(5)

Unnamed: 0,debut,finalGame,player_name
0,4/6/2004,8/23/2015,David Aardsma
1,4/13/1954,10/3/1976,Hank Aaron
2,4/10/1962,9/26/1971,Tommie Aaron
3,7/26/1977,10/3/1990,Don Aase
4,9/10/2001,4/13/2006,Andy Abad


In [6]:
# Check the dtype of each column (the date columns are still plain objects here)
career_data_df.dtypes

debut          object
finalGame      object
player_name    object
dtype: object

In [7]:
# Convert the debut and finalGame columns from strings to datetimes
for date_col in ('debut', 'finalGame'):
    career_data_df[date_col] = pd.to_datetime(career_data_df[date_col])
career_data_df.head(5)

Unnamed: 0,debut,finalGame,player_name
0,2004-04-06,2015-08-23,David Aardsma
1,1954-04-13,1976-10-03,Hank Aaron
2,1962-04-10,1971-09-26,Tommie Aaron
3,1977-07-26,1990-10-03,Don Aase
4,2001-09-10,2006-04-13,Andy Abad


In [8]:
# Career length in years: time elapsed between debut and final game,
# measured in mean Gregorian years of 365.2425 days.
#
# The divisor is an explicit fixed-length Timedelta rather than
# np.timedelta64(1, 'Y'): pandas 2.x rejects the 'Y' unit because a year is
# not an unambiguous duration. 365.2425 days is the length NumPy assigns to
# 'Y', so the resulting values are unchanged. Rows with a missing date
# produce NaN.
career_data_df['years_played'] = (
    (career_data_df['finalGame'] - career_data_df['debut'])
    / pd.Timedelta(days=365.2425)
)
career_data_df.head(5)

Unnamed: 0,debut,finalGame,player_name,years_played
0,2004-04-06,2015-08-23,David Aardsma,11.378742
1,1954-04-13,1976-10-03,Hank Aaron,22.475479
2,1962-04-10,1971-09-26,Tommie Aaron,9.462207
3,1977-07-26,1990-10-03,Don Aase,13.188498
4,2001-09-10,2006-04-13,Andy Abad,4.588732


In [23]:
# Keep only players whose debut falls in the half-open window
# [2000-05-30, 2020-01-01); rows with a missing debut are dropped too.
# NOTE(review): both bounds are carried over unchanged from the existing
# filter expression -- confirm 2000-05-30 is the intended lower cutoff.
debut_window_start = pd.Timestamp('2000-05-30')
debut_window_end = pd.Timestamp('2020-01-01')
debut_in_window = (
    career_data_df['debut'].ge(debut_window_start)
    & career_data_df['debut'].lt(debut_window_end)
)
# .copy() so the filtered frame is independent of career_data_df
career_data_filtered_df = career_data_df.loc[debut_in_window].copy()

In [25]:
# Preview the first rows of the filtered frame
career_data_filtered_df.head()


Unnamed: 0,debut,finalGame,player_name,years_played
0,2004-04-06,2015-08-23,David Aardsma,11.378742
4,2001-09-10,2006-04-13,Andy Abad,4.588732
5,2010-07-28,2021-10-01,Fernando Abad,11.178874
22,2006-04-04,2008-09-28,Reggie Abercrombie,2.48602
24,2001-06-25,2005-09-29,Brent Abernathy,4.262921


In [26]:
# Write the filtered career-length data to CSV (row index omitted)

file_path = "Resources/career_length.csv"
career_data_filtered_df.to_csv(path_or_buf=file_path, index=False)