In [32]:
import numpy as np
import pandas as pd

In [14]:
# Read the feature table and the label table from the local data/ folder.
train_values, train_label = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_values.csv', 'data/train_labels.csv')
)

In [15]:
# Join features and labels into a single frame, matching rows on their shared `id`.
df = pd.merge(left=train_values, right=train_label, on='id')

In [16]:
# Sanity check: the join neither lost nor duplicated rows, i.e. all three
# frames have the same length (the set of lengths collapses to one value).
len({len(df), len(train_label), len(train_values)}) == 1

True

In [17]:
# List every column of the merged frame (features plus the `status_group` label).
df.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group', 'status_group'],
      dtype='object')

In [18]:
# Show every row whose `id` already appeared earlier in the frame;
# an empty result means ids are unique.
df.loc[df['id'].duplicated()]

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group


In [19]:
# Count nulls per column, lay the counts out as a two-column table,
# and display only the columns that actually have gaps.
missing = (
    df.isna()
    .sum()
    .rename_axis('column')
    .reset_index(name='number of missing values')
)
missing.loc[missing['number of missing values'] > 0]

Unnamed: 0,column,number of missing values
3,funder,3637
5,installer,3655
8,wpt_name,2
11,subvillage,371
18,public_meeting,3334
20,scheme_management,3878
21,scheme_name,28810
22,permit,3056


### Remove Unwanted Fields

In [None]:
# unneeded columns (fields that don't help predict the target label)
uneeded_col = ['id', 'recorded_by']

# redundant data (fields with information provided in other columns)
redundant_col = ['latitude', 'longitude', 'extraction_type', 'extraction_type_group', 'payment', 'quantity', 'source', 'source_type', 'waterpoint_type']

# Build the combined list with `+`. `list.extend` mutates in place and
# returns None, so assigning its result would leave `unwanted_col` as None
# and make the drop below raise instead of removing anything.
unwanted_col = uneeded_col + redundant_col

# NOTE(review): cells later in the notebook that inspect these columns need
# to read them from `train_values`, since `df` no longer carries them after
# this cell.
df = df.drop(columns=unwanted_col)

### Handle Date Values

In [39]:
# derive age of water pump (i.e., whole years between construction and recording)
#
# `construction_year` must not go through pd.to_datetime: numeric input is
# read as nanoseconds since 1970-01-01, so every year would become a
# timestamp in 1970 and the resulting age would be meaningless. Subtracting
# calendar years also avoids dividing by np.timedelta64(1, 'Y'), a unit that
# recent pandas versions reject as ambiguous.
# assumes construction_year holds a plain year number -- TODO confirm
construction_year = pd.to_numeric(df['construction_year'])

# NOTE(review): a non-positive year cannot be a real construction date, so it
# is treated as missing (age becomes NaN); confirm that 0 is this dataset's
# placeholder for "unknown".
construction_year = construction_year.where(construction_year > 0)

recorded_year = pd.to_datetime(df['date_recorded']).dt.year
df['age'] = recorded_year - construction_year

# delete construction_year and date_recorded columns
df = df.drop(columns=['construction_year', 'date_recorded'])

In [46]:
# Print the number of distinct values in each extraction_* column.
# Read from `train_values` rather than `df`: the "Remove Unwanted Fields" cell
# above targets extraction_type and extraction_type_group for removal from
# `df`, so looking them up there fails on a top-to-bottom run. `train_values`
# is never modified and holds the same rows.
for col in ['extraction_type', 'extraction_type_group', 'extraction_type_class']:
    print(train_values[col].nunique())

18
13
7


In [47]:
# Print the number of distinct values in each payment column.
# Read from `train_values` rather than `df`: the "Remove Unwanted Fields" cell
# above targets `payment` for removal from `df`, so looking it up there fails
# on a top-to-bottom run. `train_values` is never modified and holds the
# same rows.
for col in ['payment', 'payment_type']:
    print(train_values[col].nunique())

7
7


In [48]:
# Print the number of distinct values in each quantity column.
# Read from `train_values` rather than `df`: the "Remove Unwanted Fields" cell
# above targets `quantity` for removal from `df`, so looking it up there fails
# on a top-to-bottom run. `train_values` is never modified and holds the
# same rows.
for col in ['quantity', 'quantity_group']:
    print(train_values[col].nunique())

5
5


In [50]:
# Print the number of distinct values in each source column.
# Read from `train_values` rather than `df`: the "Remove Unwanted Fields" cell
# above targets `source` and `source_type` for removal from `df`, so looking
# them up there fails on a top-to-bottom run. `train_values` is never
# modified and holds the same rows.
for col in ['source', 'source_type', 'source_class']:
    print(train_values[col].nunique())

10
7
3


In [51]:
# Print the number of distinct values in each waterpoint column.
# Read from `train_values` rather than `df`: the "Remove Unwanted Fields" cell
# above targets `waterpoint_type` for removal from `df`, so looking it up
# there fails on a top-to-bottom run. `train_values` is never modified and
# holds the same rows.
for col in ['waterpoint_type', 'waterpoint_type_group']:
    print(train_values[col].nunique())

7
6
