In [72]:
%matplotlib inline
import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.tree import DecisionTreeClassifier

# Two places the CSV files can be read from: a relative local directory
# and the raw files hosted on GitHub. `source` picks the one used below.
LOCAL = '../data/tanzania/'
WEB = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Tree-Ensembles/master/data/tanzania/'
source = LOCAL

# Training set = features joined to labels. No `on=` is given, so pandas
# joins on whatever column names the two files have in common.
train = pd.read_csv(source + 'train_features.csv').merge(
    pd.read_csv(source + 'train_labels.csv')
)

# Test features and the sample submission file
test = pd.read_csv(source + 'test_features.csv')
sample_submission = pd.read_csv(source + 'sample_submission.csv')

In [73]:
# Hold out 20% of the labelled rows as a validation set. The split is
# stratified on status_group and seeded so it is reproducible.
# NOTE: `train` is overwritten with its own 80% subset, so re-running this
# cell without reloading the data splits the already-reduced frame again.
train, val = train_test_split(
    train,
    train_size=0.80,
    test_size=0.20,
    stratify=train['status_group'],
    random_state=42,
)

print(train.shape, val.shape, test.shape)

(47520, 41) (11880, 41) (14358, 40)


In [74]:
def clean_data(X):
    """Return a cleaned copy of a raw feature frame.

    Steps, in order:

    1. Lowercase every string value in the text (object / string dtype)
       columns. Non-string values in those columns (booleans, NaN, ...)
       are left exactly as they are.
    2. Treat latitude == -2e-08 and zeros in the numeric columns listed in
       ``clean_features`` as missing, then impute them with the mean of the
       row's (region, district_code) group, falling back to the region mean
       and finally to the column mean.
    3. Fill missing ``scheme_management`` with 'unknown' and fold the
       low-count categories into 'other'.
    4. Parse ``date_recorded`` into a datetime column.

    All means are computed from ``X`` itself, so every frame passed in is
    imputed with its own statistics.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``latitude``, ``longitude``, ``gps_height``,
        ``population``, ``amount_tsh``, ``construction_year``, ``region``,
        ``district_code``, ``scheme_management`` and ``date_recorded``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns; ``X`` is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    X = X.copy()

    def _lower_if_str(value):
        # Only real strings are lowercased; anything else passes through.
        return value.lower() if isinstance(value, str) else value

    # Lowercase text columns element-wise. The `.str` accessor is avoided on
    # purpose: it turns non-string entries into NaN and refuses columns whose
    # values are not strings (e.g. booleans stored with object dtype).
    for feat in X.columns:
        column = X[feat]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            X[feat] = column.map(_lower_if_str)

    # -2e-08 is treated as a missing latitude
    X['latitude'] = X['latitude'].replace(-2.000000e-08, np.nan)

    clean_features = [
        'gps_height',
        'population',
        'amount_tsh',
        'construction_year',
        'latitude',
        'longitude'
    ]

    for feat in clean_features:
        # Zeros are treated as missing values
        X[feat] = X[feat].replace(0, np.nan)

        # The three fills run sequentially on purpose: each fallback mean is
        # computed after the previous, more specific fill has been applied.

        # 1) mean of the (region, district_code) group
        X[feat] = X[feat].fillna(X.groupby(['region', 'district_code'])[feat].transform('mean'))

        # 2) mean of the region, for rows whose district group had no value
        X[feat] = X[feat].fillna(X.groupby(['region'])[feat].transform('mean'))

        # 3) overall column mean as the last resort
        X[feat] = X[feat].fillna(X[feat].mean())

    # Missing scheme_management becomes 'unknown'; low-count categories are
    # folded into 'other'. The target must be lowercase like every other
    # value, otherwise the folded rows would form a separate 'Other' category.
    X['scheme_management'] = X['scheme_management'].fillna('unknown')
    X['scheme_management'] = X['scheme_management'].replace({
        'swc': 'other',
        'trust': 'other',
        'none': 'other',
        'company': 'other'
    })

    # make date_recorded datetime type
    X['date_recorded'] = pd.to_datetime(X['date_recorded'])

    return X

In [75]:
# Clean each split with the same function; the trailing tuple is displayed
# as the cell output.
train, val, test = map(clean_data, (train, val, test))

train.shape, val.shape, test.shape

((47520, 41), (11880, 41), (14358, 40))

In [77]:
def feature_eng(X, minor_threshold=7):
    """Return a copy of X with engineered features appended.

    New columns, in the order they are added:

    * ``month``, ``year`` - taken from ``date_recorded``.
    * ``pump_age`` - latest ``year`` present in X minus ``construction_year``.
      The reference year is the maximum found in X, so the value depends on
      which rows X contains.
    * ``hot_dry``, ``cool_dry``, ``light_rain``, ``heavy_rain`` - boolean
      season flags derived from ``month``.
    * ``dwe_installer``, ``gov_installer``, ``gov_funder`` - exact-match flags.
    * ``one_time_*``, ``minor_*``, ``major_*`` for ``installer`` and
      ``funder`` - flags based on how often the row's value occurs in X:
      exactly once, at most ``minor_threshold`` times (this includes the
      one-time values), or more than ``minor_threshold`` times. Rows with a
      missing installer/funder get False for all three.
    * ``amount_per_person`` - ``amount_tsh / population``.
    * ``gps_per_person`` - ``gps_height / amount_tsh`` (despite its name the
      denominator is the amount, not the population).

    Parameters
    ----------
    X : pandas.DataFrame
        Needs a datetime ``date_recorded`` column plus ``construction_year``,
        ``installer``, ``funder``, ``amount_tsh``, ``population`` and
        ``gps_height``. The string comparisons use lowercase literals, so
        installer/funder are expected to be lowercased already.
    minor_threshold : int, default 7
        Largest occurrence count that still counts as "minor".

    Returns
    -------
    pandas.DataFrame
        A new frame; ``X`` is not modified.
    """
    X = X.copy()

    def _add_frequency_flags(col):
        # Count each value once, then look the count up per row. Missing
        # values are not counted, map to NaN, and compare False everywhere.
        occurrences = X[col].map(X[col].value_counts())
        X['one_time_' + col] = occurrences == 1
        X['minor_' + col] = occurrences <= minor_threshold
        X['major_' + col] = occurrences > minor_threshold

    # month and year of the recording date
    recorded = X['date_recorded'].dt
    X['month'] = recorded.month
    X['year'] = recorded.year

    # pump age relative to the latest recording year in this frame
    X['pump_age'] = X['year'].max() - X['construction_year']

    # seasons based on month
    month = X['month']
    X['hot_dry'] = (month == 12) | (month < 3)      # Dec. - Feb.
    X['cool_dry'] = (month > 5) & (month < 11)      # Jun. - Oct.
    X['light_rain'] = (month == 3) | (month == 11)  # Mar. & Nov.
    X['heavy_rain'] = (month == 4) | (month == 5)   # Apr. & May

    # installer features
    X['dwe_installer'] = X['installer'] == 'dwe'
    X['gov_installer'] = X['installer'] == 'government'
    _add_frequency_flags('installer')

    # funder features
    X['gov_funder'] = X['funder'] == 'government of tanzania'
    _add_frequency_flags('funder')

    # amount per person
    X['amount_per_person'] = X['amount_tsh'] / X['population']

    # gps height / amount (column name kept as-is for downstream code)
    X['gps_per_person'] = X['gps_height'] / X['amount_tsh']

    return X

In [78]:
# Add the engineered columns to each split; the trailing tuple is displayed
# as the cell output.
train, val, test = (feature_eng(frame) for frame in (train, val, test))

train.shape, val.shape, test.shape

((47520, 59), (11880, 59), (14358, 58))

In [79]:
def drop_cols(X):
    """Return a copy of X without the columns listed as unneeded.

    Removes ``recorded_by``, ``id`` and ``quantity_group``. All three must be
    present: a missing one makes ``DataFrame.drop`` raise ``KeyError``.
    ``X`` itself is left unchanged.
    """
    unneeded = ['recorded_by', 'id', 'quantity_group']
    return X.copy().drop(columns=unneeded)

In [80]:
# Drop the unneeded columns from each split. Running this cell a second time
# raises KeyError, because the columns are already gone by then.
train, val, test = map(drop_cols, (train, val, test))

train.shape, val.shape, test.shape

((47520, 56), (11880, 56), (14358, 55))