In [66]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTENC, BorderlineSMOTE, SMOTE

import category_encoders as ce

# Load Data

In [127]:
source = '../data/tanzania/'

# Merge train features and train labels
train = pd.merge(pd.read_csv(source + 'train_features.csv'),
                 pd.read_csv(source + 'train_labels.csv'))

# read test and sample submission
test = pd.read_csv(source + 'test_features.csv')
sample_sub = pd.read_csv(source + 'sample_submission.csv')

train.shape, test.shape

((59400, 41), (14358, 40))

## Look at distribution of target labels

In [88]:
train['status_group'].value_counts(normalize=True)

functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64

## split the data into train and valuate sets

In [128]:
train, val = train_test_split(train, 
                              test_size=0.2, 
                              stratify=train['status_group'],
                             random_state=42)

train.shape, val.shape, test.shape

((47520, 41), (11880, 41), (14358, 40))

## Clean data and engineer features

In [129]:
def clean_data (X):
    X = X.copy()
    
    # convert all strings to lowercase
    cat_features = X.select_dtypes('object').columns.tolist()
    for feat in cat_features:
        X[feat] = X[feat].str.lower()
        
    # Replace -2.00000e-08 with np.nan
    X['latitude'] = X['latitude'].replace(-2.000000e-08, np.nan)
    
    clean_features = [
        'gps_height',
        'population',
        'amount_tsh',
        'construction_year',
        'latitude',
        'longitude'
    ]
    
    for feat in clean_features:
        # Replace values=0.0 with np.nan
        X[feat] = X[feat].replace(0, np.nan)
        
        # fill nan with mean using region and district code
        X[feat] = X[feat].fillna(X.groupby(['region', 'district_code'])[feat].transform('mean'))
        
        # fill nan with mean of region if district code is missing
        X[feat] = X[feat].fillna(X.groupby(['region'])[feat].transform('mean'))
        
        # fill nan with general mean if no region or district code
        X[feat] = X[feat].fillna(X[feat].mean())
        
    # fillna of scheme_management with 'unknow' and combine low count values into 'other'
    X['scheme_management'] = X['scheme_management'].fillna('unknown')
    X['scheme_management'] = X['scheme_management'].replace({
        'swc':'Other',
        'trust':'Other',
        'none':'Other',
        'company': 'Other'
    })
    
    # make date_recorded datetime type
    X['date_recorded'] = pd.to_datetime(X['date_recorded'])
        
    return X

In [130]:
def feature_eng(X):
    X = X.copy()
    
    # create month and year features from date_recorded
    X['month'] = X['date_recorded'].dt.month
    X['year'] = X['date_recorded'].dt.year
    
    # create a pump age feature
    X['pump_age'] = (X['year'].max() - X['construction_year'])
    
    # create seasons based on month
    X['hot_dry'] = (X['month'] == 12) | (X['month'] < 3)     # Dec. - Feb.
    X['cool_dry'] = (X['month'] > 5) & (X['month'] < 11)     # Jun. - Oct
    X['light_rain'] = (X['month'] == 3) | (X['month'] == 11) # Mar. & Nov.
    X['heavy_rain'] = (X['month'] == 4) | (X['month'] == 5)  # Apr. & May
    
    # create installer features
    X['dwe_installer'] = (X['installer'] == 'dwe')
    X['gov_installer'] = (X['installer'] == 'government')
     
    one_time_installer = X['installer'].value_counts()[X['installer'].value_counts() == 1]
    X['one_time_installer'] = X['installer'].isin(one_time_installer.index)
    
    minor_installer = X['installer'].value_counts()[X['installer'].value_counts() <= 7]
    X['minor_installer'] = X['installer'].isin(minor_installer.index)
    
    major_installer = X['installer'].value_counts()[X['installer'].value_counts() > 7]
    X['major_installer'] = X['installer'].isin(major_installer.index)
    
    # create funder features
    X['gov_funder'] = (X['funder'] == 'government of tanzania')
     
    one_time_funder = X['funder'].value_counts()[X['funder'].value_counts() == 1]
    X['one_time_funder'] = X['funder'].isin(one_time_funder.index)
    
    minor_funder = X['funder'].value_counts()[X['funder'].value_counts() <= 7]
    X['minor_funder'] = X['funder'].isin(minor_funder.index)
    
    major_funder = X['funder'].value_counts()[X['funder'].value_counts() > 7]
    X['major_funder'] = X['funder'].isin(major_funder.index)
    
    # amount per person 
    X['amount_per_person'] = (X['amount_tsh'] / X['population'])
    
    # gps height / ampunt
    X['gps_per_person'] = X['gps_height'] / X['amount_tsh']
    
    return X

In [131]:
def drop_cols(X):
    X = X.copy()
    
    # drop unneeded cols
    drop_cols = [
        'recorded_by',
        'id',
        'quantity_group',
        'public_meeting',
        'permit',
        'date_recorded'
    ]
    
    X = X.drop(columns=drop_cols)
    
    return X

### clean data using function above

In [132]:
train = clean_data(train)
val = clean_data(val)
test = clean_data(test)

train.shape, val.shape, test.shape

((47520, 41), (11880, 41), (14358, 40))

### feature engineering with function above

In [133]:
train = feature_eng(train)
val = feature_eng(val)
test = feature_eng(test)

train.shape, val.shape, test.shape

((47520, 59), (11880, 59), (14358, 58))

### drop columns that are not needed

In [134]:
train = drop_cols(train)
val = drop_cols(val)
test = drop_cols(test)

train.shape, val.shape, test.shape

((47520, 53), (11880, 53), (14358, 52))

## Split data into X features and Y target for train and val

In [135]:
# The status_group column is the target
target = 'status_group'

# Get a dataframe with all train columns except the target
train_features = train.drop(columns=[target])

# Get a list of the numeric features
numeric_features = train_features.select_dtypes(include='number').columns.tolist()

# Get a series with the cardinality of the nonnumeric features
cardinality = train_features.select_dtypes(exclude='number').nunique()

# Get a list of all categorical features with cardinality <= 50
categorical_features = cardinality[cardinality <= 1000].index.tolist()

# Combine the lists 
features = numeric_features + categorical_features

In [159]:
X_train = train[features]
y_train = train[target]

X_val = val[features]
y_val = val[target]

X_test = test[features]

X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape

((47520, 46), (47520,), (11880, 46), (11880,), (14358, 46))

## try SMOTE to over sample minority class before running model

In [160]:
encoder = ce.OneHotEncoder(use_cat_names=True)

X_train = encoder.fit_transform(X_train)
X_val = encoder.transform(X_val)
X_test = encoder.transform(X_test)

In [162]:
X_train.shape, X_val.shape, X_test.shape

((47520, 312), (11880, 312), (14358, 312))

In [163]:
# oversample using SMOTE to boost needs repair class
smote = SMOTE('minority')

X_train_sm, y_train_sm = smote.fit_sample(X_train, y_train)

X_train_sm.shape, y_train_sm.shape

((69873, 312), (69873,))

In [164]:
pd.Series(y_train_sm).value_counts(normalize=True)

functional                 0.369342
functional needs repair    0.369342
non functional             0.261317
dtype: float64

In [165]:
clf_rf = RandomForestClassifier(n_estimators=500,
                           random_state=42, 
                           n_jobs=-1)

clf_rf.fit(X_train_sm, y_train_sm)
clf_rf.score(X_val, y_val)

y_pred = clf_rf.predict(X_test)

In [166]:
confusion_matrix(y_val, y_pred)

ValueError: Found input variables with inconsistent numbers of samples: [11880, 14358]