In [1]:
import os            # for file paths
import pandas as pd
from pandas import isnull


In [2]:
# Load Master.csv from the local data/ directory into a DataFrame.
master = pd.read_csv(os.path.join('data','Master.csv'))
# Preview the first rows to check that the file parsed as expected.
master.head()


Unnamed: 0,playerID,coachID,hofID,firstName,lastName,nameNote,nameGiven,nameNick,height,weight,...,birthDay,birthCountry,birthState,birthCity,deathYear,deathMon,deathDay,deathCountry,deathState,deathCity
0,aaltoan01,,,Antti,Aalto,,Antti,,73.0,210.0,...,4.0,Finland,,Lappeenranta,,,,,,
1,abbeybr01,,,Bruce,Abbey,,Bruce,,73.0,185.0,...,18.0,Canada,ON,Toronto,,,,,,
2,abbotge01,,,George,Abbott,,George Henry,Preacher,67.0,153.0,...,3.0,Canada,ON,Synenham,,,,,,
3,abbotre01,,,Reg,Abbott,,Reginald Stewart,,71.0,164.0,...,4.0,Canada,MB,Winnipeg,,,,,,
4,abdelju01,,,Justin,Abdelkader,,,,73.0,195.0,...,25.0,USA,MI,Muskegon,,,,,,


In [3]:
# (rows, columns) of the table as loaded.
master.shape


(7761, 31)

In [4]:
# List every column name in the loaded table.
master.columns


Index(['playerID', 'coachID', 'hofID', 'firstName', 'lastName', 'nameNote',
       'nameGiven', 'nameNick', 'height', 'weight', 'shootCatch', 'legendsID',
       'ihdbID', 'hrefID', 'firstNHL', 'lastNHL', 'firstWHA', 'lastWHA', 'pos',
       'birthYear', 'birthMon', 'birthDay', 'birthCountry', 'birthState',
       'birthCity', 'deathYear', 'deathMon', 'deathDay', 'deathCountry',
       'deathState', 'deathCity'],
      dtype='object')

In [12]:
# Wrapping the expression in parentheses lets the method chain span several
# lines, one step per line. The chain counts how many playerID values are
# missing (True) versus present (False).
(master['playerID']
    .pipe(isnull)
    .value_counts())


False    7520
True      241
Name: playerID, dtype: int64

In [13]:
# The same count written as a direct function call instead of a .pipe() chain:
isnull(master['playerID']).value_counts()


False    7520
True      241
Name: playerID, dtype: int64

In [15]:
# Keep an independent copy of the table as loaded, before any rows or columns
# are dropped, so later cells can compare against it.
master_original = master.copy()


In [16]:
# Drop rows that have no playerID. The result is reassigned instead of using
# inplace=True, matching the other filtering cells in this notebook and
# keeping the step usable inside a method chain.
master = master.dropna(subset=['playerID'])
master.shape


(7520, 31)

In [17]:
# Dropping rows does not change the set of columns.
master.columns


Index(['playerID', 'coachID', 'hofID', 'firstName', 'lastName', 'nameNote',
       'nameGiven', 'nameNick', 'height', 'weight', 'shootCatch', 'legendsID',
       'ihdbID', 'hrefID', 'firstNHL', 'lastNHL', 'firstWHA', 'lastWHA', 'pos',
       'birthYear', 'birthMon', 'birthDay', 'birthCountry', 'birthState',
       'birthCity', 'deathYear', 'deathMon', 'deathDay', 'deathCountry',
       'deathState', 'deathCity'],
      dtype='object')

In [18]:
# Drop rows that have no NHL years at all: how='all' removes a row only when
# BOTH firstNHL and lastNHL are missing; rows with either value are kept.
master = master.dropna(subset=['firstNHL', 'lastNHL'], how='all')
master.shape


(6851, 31)

In [19]:
# Inspect the dtype pandas inferred for each column.
master.dtypes



playerID         object
coachID          object
hofID            object
firstName        object
lastName         object
nameNote         object
nameGiven        object
nameNick         object
height          float64
weight          float64
shootCatch       object
legendsID        object
ihdbID          float64
hrefID           object
firstNHL        float64
lastNHL         float64
firstWHA        float64
lastWHA         float64
pos              object
birthYear       float64
birthMon        float64
birthDay        float64
birthCountry     object
birthState       object
birthCity        object
deathYear       float64
deathMon        float64
deathDay        float64
deathCountry     object
deathState       object
deathCity        object
dtype: object

In [21]:
# Keep only rows whose lastNHL is 1980 or later. A missing lastNHL compares
# as False, so such rows are dropped as well.
master = master.loc[master['lastNHL'] >= 1980]
master.shape

(4627, 31)

In [22]:
# Column names before choosing which ones to keep.
master.columns


Index(['playerID', 'coachID', 'hofID', 'firstName', 'lastName', 'nameNote',
       'nameGiven', 'nameNick', 'height', 'weight', 'shootCatch', 'legendsID',
       'ihdbID', 'hrefID', 'firstNHL', 'lastNHL', 'firstWHA', 'lastWHA', 'pos',
       'birthYear', 'birthMon', 'birthDay', 'birthCountry', 'birthState',
       'birthCity', 'deathYear', 'deathMon', 'deathDay', 'deathCountry',
       'deathState', 'deathCity'],
      dtype='object')

In [23]:
# Columns retained for the rest of the notebook: the player identifier,
# first/last name, position, and the birth date and place fields.
columns_to_keep = [
    'playerID',
    'firstName', 'lastName',
    'pos',
    'birthYear', 'birthMon', 'birthDay',
    'birthCountry', 'birthState', 'birthCity',
]
# Bracket selection with a list returns those columns in list order; the
# result is only displayed here, master itself is not modified.
master[columns_to_keep].head()


Unnamed: 0,playerID,firstName,lastName,pos,birthYear,birthMon,birthDay,birthCountry,birthState,birthCity
0,aaltoan01,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta
4,abdelju01,Justin,Abdelkader,L,1987.0,2.0,25.0,USA,MI,Muskegon
9,abidra01,Ramzi,Abid,L,1980.0,3.0,24.0,Canada,QC,Montreal
11,abrahth01,Thommy,Abrahamsson,D,1947.0,4.0,12.0,Sweden,,Leksand
14,actonke01,Keith,Acton,C,1958.0,4.0,15.0,Canada,ON,Stouffville


In [24]:
# The same column selection expressed with DataFrame.filter and a list of labels.
master.filter(columns_to_keep).head()


Unnamed: 0,playerID,firstName,lastName,pos,birthYear,birthMon,birthDay,birthCountry,birthState,birthCity
0,aaltoan01,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta
4,abdelju01,Justin,Abdelkader,L,1987.0,2.0,25.0,USA,MI,Muskegon
9,abidra01,Ramzi,Abid,L,1980.0,3.0,24.0,Canada,QC,Montreal
11,abrahth01,Thommy,Abrahamsson,D,1947.0,4.0,12.0,Sweden,,Leksand
14,actonke01,Keith,Acton,C,1958.0,4.0,15.0,Canada,ON,Stouffville


In [25]:
# The selections were only displayed, not assigned, so master keeps all its columns.
master.shape


(4627, 31)

In [27]:
# Back up the full-width frame first so the column selection can be undone.
master_modified = master.copy()
# Narrow master to the columns listed in columns_to_keep.
master = master.filter(columns_to_keep)
master.shape


(4627, 10)

In [28]:
# Restore the full-width frame from the backup taken before narrowing.
master = master_modified.copy()
master.shape


(4627, 31)

In [29]:
# Select columns by regular expression: names containing 'playerID' or 'pos',
# names starting with 'birth', or names ending in 'Name'.
master.filter(regex="(playerID|pos|^birth)|(Name$)").columns

Index(['playerID', 'firstName', 'lastName', 'pos', 'birthYear', 'birthMon',
       'birthDay', 'birthCountry', 'birthState', 'birthCity'],
      dtype='object')

In [30]:
# Same alternatives written as a single group; it selects the same columns.
master.filter(regex="(playerID|pos|^birth|Name$)").columns


Index(['playerID', 'firstName', 'lastName', 'pos', 'birthYear', 'birthMon',
       'birthDay', 'birthCountry', 'birthState', 'birthCity'],
      dtype='object')

In [31]:
# Apply the regex-based column selection by assigning the result back to master.
# NOTE(review): 'playerID' and 'pos' are unanchored, so any future column whose
# name merely contains those substrings would also be kept.
master = master.filter(regex="(playerID|pos|^birth|Name$)")
master.shape


(4627, 10)

In [32]:
# Confirm which columns remain after the selection.
master.columns


Index(['playerID', 'firstName', 'lastName', 'pos', 'birthYear', 'birthMon',
       'birthDay', 'birthCountry', 'birthState', 'birthCity'],
      dtype='object')

In [33]:
def mem_mib(df, deep=False):
    """Print the memory footprint of ``df`` in mebibytes (MiB).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to measure. The index is included in the total.
    deep : bool, default False
        Forwarded to ``DataFrame.memory_usage``. With the default, object
        columns are counted by pointer size only; pass ``deep=True`` to also
        count the Python objects (e.g. strings) those columns reference.

    Returns
    -------
    None
        The value is printed as ``"<x.xx> MiB"`` rather than returned.
    """
    total_bytes = df.memory_usage(deep=deep).sum()
    print("{0:.2f} MiB".format(total_bytes / (1024 * 1024)))

# Compare the trimmed frame against the copy of the table as originally loaded.
mem_mib(master)
mem_mib(master_original)


0.39 MiB
1.84 MiB


In [34]:
# Frequency of each position code.
master["pos"].value_counts()


D      1418
C      1037
L       848
R       832
G       463
F        28
L/C       1
Name: pos, dtype: int64

In [35]:
# Preview the column as a Categorical; nothing is assigned, so master is unchanged.
pd.Categorical(master["pos"])


['C', 'L', 'L', 'D', 'C', ..., 'R', 'L', 'L', 'C', 'D']
Length: 4627
Categories (7, object): ['C', 'D', 'F', 'G', 'L', 'L/C', 'R']

In [36]:
def make_categorical(df, col_name):
    """Convert column ``col_name`` of ``df`` to the ``category`` dtype, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated and nothing is returned.
    col_name : str
        Label of an existing column in ``df``.

    Raises
    ------
    KeyError
        If ``col_name`` is not a column of ``df``.
    """
    # Replace the whole column instead of writing through ``df.loc[:, col]``.
    # Since pandas 2.0 a ``.loc[:, col] = values`` assignment first tries to
    # set the values into the existing column, which keeps its old dtype, so
    # the column would silently stay ``object``.
    df[col_name] = df[col_name].astype("category")


In [37]:
# Convert the position and birth country/state columns to categoricals, then
# print the memory footprint again to see the effect.
make_categorical(master, "pos")
make_categorical(master, "birthCountry")
make_categorical(master, "birthState")
mem_mib(master)


0.30 MiB


In [38]:
# Preview the frame after the categorical conversion.
master.head()


Unnamed: 0,playerID,firstName,lastName,pos,birthYear,birthMon,birthDay,birthCountry,birthState,birthCity
0,aaltoan01,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta
4,abdelju01,Justin,Abdelkader,L,1987.0,2.0,25.0,USA,MI,Muskegon
9,abidra01,Ramzi,Abid,L,1980.0,3.0,24.0,Canada,QC,Montreal
11,abrahth01,Thommy,Abrahamsson,D,1947.0,4.0,12.0,Sweden,,Leksand
14,actonke01,Keith,Acton,C,1958.0,4.0,15.0,Canada,ON,Stouffville


In [39]:
# Use playerID as the row index; it moves out of the regular columns.
master = master.set_index('playerID')
master.head()


Unnamed: 0_level_0,firstName,lastName,pos,birthYear,birthMon,birthDay,birthCountry,birthState,birthCity
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
aaltoan01,Antti,Aalto,C,1975.0,3.0,4.0,Finland,,Lappeenranta
abdelju01,Justin,Abdelkader,L,1987.0,2.0,25.0,USA,MI,Muskegon
abidra01,Ramzi,Abid,L,1980.0,3.0,24.0,Canada,QC,Montreal
abrahth01,Thommy,Abrahamsson,D,1947.0,4.0,12.0,Sweden,,Leksand
actonke01,Keith,Acton,C,1958.0,4.0,15.0,Canada,ON,Stouffville


In [41]:
# Persist the cleaned table as a pickle and as CSV under data/modified.
# Create the output directory first: to_pickle/to_csv raise an error when the
# target directory does not exist, which breaks a run on a fresh checkout.
os.makedirs(os.path.join('data', 'modified'), exist_ok=True)
master.to_pickle(os.path.join('data','modified', 'master.pickle'))
master.to_csv(os.path.join('data', 'modified', 'master.csv'))
