In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
from functools import reduce

# File Paths
hof_path = 'Resources/HallOfFame.csv'
batting_path = 'Resources/Batting.csv'
player_path = 'Resources/People.csv'
awards_path = 'Resources/AwardsPlayers.csv'
allstar_path = 'Resources/AllstarFull.csv'

# Create Dataframes
hof_df = pd.read_csv(hof_path)
batting_df = pd.read_csv(batting_path)
player_df = pd.read_csv(player_path)
awards_df = pd.read_csv(awards_path)
allstar_df = pd.read_csv(allstar_path)


In [2]:
# merge the 5 csv files into one dataframe
merged_df = reduce(lambda x,y: pd.merge(x,y, on='playerID', how='outer'), 
                    [hof_df, batting_df, player_df, awards_df, allstar_df])
merged_df.head()

  


Unnamed: 0,playerID,yearID_x,votedBy,ballots,needed,votes,inducted,category,needed_note,yearID_y,...,lgID_y,tie,notes,yearID_y.1,gameNum,gameID,teamID_y,lgID,GP,startingPos
0,cobbty01,1936.0,BBWAA,226.0,170.0,222.0,Y,Player,,1905.0,...,AL,,RF,,,,,,,
1,cobbty01,1936.0,BBWAA,226.0,170.0,222.0,Y,Player,,1905.0,...,ML,,RF,,,,,,,
2,cobbty01,1936.0,BBWAA,226.0,170.0,222.0,Y,Player,,1905.0,...,AL,,RF,,,,,,,
3,cobbty01,1936.0,BBWAA,226.0,170.0,222.0,Y,Player,,1905.0,...,ML,,RF,,,,,,,
4,cobbty01,1936.0,BBWAA,226.0,170.0,222.0,Y,Player,,1905.0,...,AL,,,,,,,,,


In [3]:
# sort by playerID and drop duplicates and keeping the last playerID
sorted_df = merged_df.sort_values(by=['playerID'], ascending=False)
sorted_df.drop_duplicates(subset=['playerID'], keep='last', inplace = True)

In [4]:
sorted_df.columns

Index(['playerID', 'yearID_x', 'votedBy', 'ballots', 'needed', 'votes',
       'inducted', 'category', 'needed_note', 'yearID_y', 'stint', 'teamID_x',
       'lgID_x', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS',
       'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP', 'birthYear', 'birthMonth',
       'birthDay', 'birthCountry', 'birthState', 'birthCity', 'deathYear',
       'deathMonth', 'deathDay', 'deathCountry', 'deathState', 'deathCity',
       'nameFirst', 'nameLast', 'nameGiven', 'weight', 'height', 'bats',
       'throws', 'debut', 'finalGame', 'retroID', 'bbrefID', 'awardID',
       'yearID_x', 'lgID_y', 'tie', 'notes', 'yearID_y', 'gameNum', 'gameID',
       'teamID_y', 'lgID', 'GP', 'startingPos'],
      dtype='object')

In [5]:
sorted_df.drop(['yearID_x', 'votedBy', 'ballots', 'needed', 'votes', 'category', 
               'needed_note', 'yearID_y', 'stint', 'teamID_x', 'lgID_x', 'deathYear',
              'deathMonth', 'deathDay', 'deathCountry', 'deathState', 'deathCity',
              'nameFirst', 'nameLast', 'nameGiven', 'birthYear', 'birthMonth',
              'birthDay', 'birthCountry', 'birthState', 'birthCity', 
              'retroID', 'bbrefID', 'yearID_x', 'lgID_y', 'tie', 'notes', 'yearID_y',
              'gameNum', 'gameID', 'teamID_y', 'lgID', 'startingPos'], axis = 1, inplace = True)

In [6]:
cleaned_df = sorted_df.reset_index()
cleaned_df['inducted'].value_counts()

N    1113
Y     166
Name: inducted, dtype: int64

In [7]:
cleaned_df['inducted'].fillna('N', inplace=True)
cleaned_df['awardID'].fillna('None', inplace=True)
cleaned_df['GP'].fillna('None', inplace=True)

In [8]:
names = {'playerID':'Player ID','inducted':'Inducted into HOF', 'weight':'Weight', 'height':'Height', 
         'bats':'Batting Hand', 'throws':'Throwing Hand', 'debut':'MLB Debut', 'finalGame':'Final MLB Game',
         'awardID':'Award Name', 'GP':'All Star Games Played'
        }

In [9]:
cleaned_df.rename(columns=names, inplace=True)

In [10]:
cleaned_df.isnull().sum()

index                       0
Player ID                   0
Inducted into HOF           0
G                         204
AB                        204
R                         204
H                         204
2B                        204
3B                        204
HR                        204
RBI                       509
SB                        879
CS                       5642
BB                        204
SO                        925
IBB                      8342
HBP                      1018
SH                       1841
SF                       8255
GIDP                     6138
Weight                    816
Height                    736
Batting Hand             1181
Throwing Hand             977
MLB Debut                 210
Final MLB Game            210
Award Name                  0
All Star Games Played       0
dtype: int64

In [11]:
cleaned_df.notnull().sum()

index                    20370
Player ID                20370
Inducted into HOF        20370
G                        20166
AB                       20166
R                        20166
H                        20166
2B                       20166
3B                       20166
HR                       20166
RBI                      19861
SB                       19491
CS                       14728
BB                       20166
SO                       19445
IBB                      12028
HBP                      19352
SH                       18529
SF                       12115
GIDP                     14232
Weight                   19554
Height                   19634
Batting Hand             19189
Throwing Hand            19393
MLB Debut                20160
Final MLB Game           20160
Award Name               20370
All Star Games Played    20370
dtype: int64

In [12]:
final_df = cleaned_df.dropna()

In [13]:
final_df

Unnamed: 0,index,Player ID,Inducted into HOF,G,AB,R,H,2B,3B,HR,...,SF,GIDP,Weight,Height,Batting Hand,Throwing Hand,MLB Debut,Final MLB Game,Award Name,All Star Games Played
0,2638153,zychto01,N,13.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,190.0,75.0,R,R,2015-09-04,2017-08-19,,
2,2490993,zuverge01,N,2.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,195.0,76.0,R,R,1951-04-21,1959-06-15,,
3,2516491,zuvelpa01,N,21.0,48.0,2.0,4.0,1.0,0.0,0.0,...,0.0,1.0,173.0,72.0,R,R,1982-09-04,1991-05-02,,
4,2497698,zupofr01,N,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,182.0,71.0,L,R,1957-07-01,1961-05-09,,
5,2524976,zupcibo01,N,32.0,88.0,10.0,18.0,4.0,1.0,1.0,...,1.0,2.0,220.0,76.0,R,R,1991-09-07,1994-08-04,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20365,2568629,abadan01,N,5.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,184.0,73.0,L,L,2001-09-10,2006-04-13,,
20366,2510737,aasedo01,N,66.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,190.0,75.0,R,R,1977-07-26,1990-10-03,,1.0
20367,2500741,aaronto01,N,141.0,334.0,54.0,77.0,20.0,2.0,8.0,...,3.0,10.0,190.0,75.0,R,R,1962-04-10,1971-09-26,,
20368,1452395,aaronha01,Y,160.0,606.0,84.0,174.0,33.0,4.0,29.0,...,5.0,21.0,180.0,72.0,R,R,1954-04-13,1976-10-03,TSN All-Star,1.0
