In [2]:
import pandas as pd
import numpy as np
from extract import extract_data

# Read the salary CSV through the project's extract_data helper, then wrap
# whatever it returns in a DataFrame.
# NOTE(review): extract_data's return structure is not visible in this notebook — confirm.
data = extract_data('../data/input/Salaries.csv')
df = pd.DataFrame(data=data)

# Preview the first five rows (5 is also head()'s default).
df.head(5)

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
0,1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.0,400184.25,,567595.43,567595.43,2011,,San Francisco,
1,2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966.02,245131.88,137811.38,,538909.28,538909.28,2011,,San Francisco,
2,3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739.13,106088.18,16452.6,,335279.91,335279.91,2011,,San Francisco,
3,4,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916.0,56120.71,198306.9,,332343.61,332343.61,2011,,San Francisco,
4,5,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134401.6,9737.0,182234.59,,326373.19,326373.19,2011,,San Francisco,


In [21]:
# Inspect column dtypes and non-null counts of the freshly loaded frame.
# NOTE(review): this cell does not convert anything; it only prints df.info().
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148654 entries, 0 to 148653
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Id                148654 non-null  object
 1   EmployeeName      148654 non-null  object
 2   JobTitle          148654 non-null  object
 3   BasePay           148654 non-null  object
 4   OvertimePay       148654 non-null  object
 5   OtherPay          148654 non-null  object
 6   Benefits          148654 non-null  object
 7   TotalPay          148654 non-null  object
 8   TotalPayBenefits  148654 non-null  object
 9   Year              148654 non-null  object
 10  Notes             148654 non-null  object
 11  Agency            148654 non-null  object
 12  Status            148654 non-null  object
dtypes: object(13)
memory usage: 14.7+ MB


The `info()` output shows that every column has the `object` dtype, even though many of them hold numeric values. Missing values in this dataset are also represented by empty strings. We can fix both problems at the same time:

In [44]:
# Cast the identifier and year columns to integers in a single astype call.
df[['Id', 'Year']] = df[['Id', 'Year']].astype(int)

# Pay columns become floats. pd.to_numeric is applied column by column with
# errors='coerce', so any value that cannot be parsed as a number turns into NaN;
# the trailing astype(float) guarantees a float dtype for every column.
float_columns = ['BasePay', 'OvertimePay', 'OtherPay', 'Benefits', 'TotalPay', 'TotalPayBenefits']
df[float_columns] = (
    df[float_columns]
    .apply(pd.to_numeric, errors='coerce')
    .astype(float)
)

In [43]:
# Print dtypes and non-null counts again; this cell sits after the numeric
# conversion cell, so it is meant to show the effect of those casts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148654 entries, 0 to 148653
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Id                148654 non-null  int32  
 1   EmployeeName      148654 non-null  object 
 2   JobTitle          148654 non-null  object 
 3   BasePay           148045 non-null  float64
 4   OvertimePay       148650 non-null  float64
 5   OtherPay          148650 non-null  float64
 6   Benefits          112491 non-null  float64
 7   TotalPay          148654 non-null  float64
 8   TotalPayBenefits  148654 non-null  float64
 9   Year              148654 non-null  int32  
 10  Notes             148654 non-null  object 
 11  Agency            148654 non-null  object 
 12  Status            148654 non-null  object 
dtypes: float64(6), int32(2), object(5)
memory usage: 13.6+ MB


In [48]:
# Count missing (NaN/None) entries per column.
df.isna().sum()

Id                      0
EmployeeName            0
JobTitle                0
BasePay               609
OvertimePay             4
OtherPay                4
Benefits            36163
TotalPay                0
TotalPayBenefits        0
Year                    0
Notes                   0
Agency                  0
Status                  0
dtype: int64

In [54]:
# Drop 'Notes' and 'Status': according to the dataset's Kaggle page, 'Notes' is
# 100% missing and 'Status' is 74% missing.
# NOTE(review): df.isnull().sum() reports 0 nulls for both columns, presumably
# because their blanks are stored as empty strings rather than NaN — confirm.
#
# Reassign instead of using inplace=True, and ignore labels that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=['Notes', 'Status'], errors='ignore')