In [133]:
# Imports: data handling, plotting, categorical encoding and scikit-learn modelling tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import pandas_profiling
import seaborn as sns; sns.set()
import category_encoders as ce
from statistics import mode
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
%matplotlib inline

In [134]:
# Read the competition CSVs from the shared data directory
Location = "../data/tanzania/"

# Training features are joined to their labels on whatever columns the two files share
train = pd.read_csv(Location + 'train_features.csv').merge(
    pd.read_csv(Location + 'train_labels.csv')
)
test = pd.read_csv(Location + 'test_features.csv')
sample_submission = pd.read_csv(Location + 'sample_submission.csv')

In [135]:
# Hold out a stratified 20% of the labelled rows as a validation set.
# NOTE: this rebinds `train`, so re-running only this cell splits the already-reduced frame again.
train, val = train_test_split(
    train,
    train_size=0.80,
    test_size=0.20,
    stratify=train['status_group'],
    random_state=42,
)

# Shapes of the three frames after the split
train.shape, val.shape, test.shape

((47520, 41), (11880, 41), (14358, 40))

In [136]:
# Preview ten random training rows; the fixed seed keeps the preview stable across re-runs
train.sample(10, random_state=42)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
6692,41501,0.0,2011-07-24,Ridep,0,RIDEP,32.95133,-2.719656,Kwa Buge,0,...,milky,milky,enough,enough,shallow well,shallow well,groundwater,other,other,non functional
32683,45511,0.0,2011-07-05,Tanzakesho,0,DWE,33.062487,-9.312613,Simbeye,0,...,soft,good,enough,enough,spring,spring,groundwater,other,other,functional
27501,53086,500.0,2004-01-07,Kkkt,1611,Villagers,34.900561,-8.873813,Kwa Barnabasi Kilumile,0,...,soft,good,enough,enough,hand dtw,borehole,groundwater,hand pump,hand pump,functional
38576,16654,0.0,2012-10-15,Dwe,0,DWE,33.092835,-4.025903,Ubada,0,...,unknown,unknown,unknown,unknown,spring,spring,groundwater,communal standpipe,communal standpipe,non functional
51239,55809,0.0,2011-03-03,Roman,2323,Commu,34.252534,-9.269225,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
22710,32107,500.0,2013-03-24,Government Of Tanzania,1852,DWE,36.647174,-3.253586,Kwa Loitai,0,...,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe,functional
37108,27354,0.0,2011-03-13,Rc,1942,RC,34.499608,-9.247328,Daudi Mbilinyi,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
52966,73077,100.0,2013-02-15,Snv,1032,UMOJA DRILLING CONTRACTOR,35.877711,-10.244966,Kwa Mtupa,0,...,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,functional
53017,42242,0.0,2011-03-28,Government Of Tanzania,1286,DWE,38.355593,-4.901969,Kwa Mzee Hope,65,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
53275,37936,0.0,2013-03-26,Losaa-kia Water Supply,1289,Losaa-Kia water supp,37.111317,-3.188091,Kwa Aletaulo Munuo,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional


In [137]:
# Feature engineering shared by the train, validation and test frames
def organize(X, fill_values=None):
    """Clean and engineer features the same way for train, validate and test sets.

    The input frame is copied first, so the caller's frame is never modified.

    Steps
    -----
    1. Replace the tiny latitude marker (-2e-08) with 0, then treat zeros in
       longitude, latitude, population and construction_year as missing and
       impute them.
    2. Parse ``date_recorded`` to datetime and derive ``year_recorded``.
    3. Drop ``quantity_group`` (a duplicate of ``quantity``) when present.
    4. Fill remaining gaps in the non-numeric, non-datetime columns with 'MISSING'.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature frame containing at least the columns touched above.
    fill_values : mapping or pandas.Series, optional
        Imputation value per column name for the zero-as-missing columns,
        e.g. statistics computed on the training frame so that validation and
        test data are imputed consistently. Columns not listed (or all columns
        when this is None, the default) fall back to the mean of that column
        within ``X`` itself.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy with the extra ``year_recorded`` column.
    """
    X = X.copy()
    fill_values = {} if fill_values is None else fill_values

    # Latitude uses -2e-08 instead of 0 as its "unknown" marker; normalise it
    # to 0 so it is handled together with the other zero placeholders below.
    X['latitude'] = X['latitude'].replace(-2e-08, 0)

    # Columns in which a literal 0 stands for "not recorded"
    zero_as_missing = ['longitude', 'latitude', 'population', 'construction_year']
    for col in zero_as_missing:
        X[col] = X[col].replace(0, np.nan)
        # Prefer a caller-supplied statistic; otherwise use this frame's own mean
        fill = fill_values[col] if col in fill_values else X[col].mean()
        X[col] = X[col].fillna(fill)

    # `infer_datetime_format` is deprecated in pandas 2.x (inference is the default)
    X['date_recorded'] = pd.to_datetime(X['date_recorded'])
    X['year_recorded'] = X['date_recorded'].dt.year

    # errors='ignore' keeps the function safe to re-apply to an already-organized frame
    X = X.drop(columns='quantity_group', errors='ignore')

    # Only label-like columns get the 'MISSING' placeholder; the datetime column
    # is excluded so a string fill value is never pushed into it.
    cats = X.select_dtypes(exclude=['number', 'datetime']).columns
    for col in cats:
        X[col] = X[col].fillna('MISSING')

    return X

# Run the same clean-up over every split; each frame is processed on its own.
# NOTE(review): zero placeholders are therefore imputed with each frame's own column
# means rather than the training means - confirm that is intended.
train, val, test = map(organize, (train, val, test))

In [138]:
# Preview twenty random organized rows; the fixed seed keeps the preview stable across re-runs
train.sample(20, random_state=42)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,year_recorded
54389,50506,500.0,2013-08-02,Danida,448,DANIDA,34.841433,-11.330271,Kwa Mzee Zuru,0,...,soft,good,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional,2013
8044,61827,1000.0,2011-07-28,Il,1688,CONS,31.243547,-7.858653,Kwa Mwalimu,0,...,soft,good,insufficient,river,river/lake,surface,communal standpipe,communal standpipe,functional,2011
56068,59173,0.0,2011-07-10,Hesawa,0,DWE,31.424816,-1.202875,Kwasaid,0,...,salty,salty,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,non functional,2011
4784,17163,50.0,2011-03-16,Private Individual,98,WU,38.426893,-6.645277,Sefu Mgambo,0,...,soft,good,enough,river,river/lake,surface,communal standpipe,communal standpipe,non functional,2011
56659,24487,20.0,2011-03-17,World Bank,458,World,37.829495,-6.774188,Kwa Rajabu Chedi,0,...,soft,good,enough,river,river/lake,surface,communal standpipe,communal standpipe,functional,2011
53884,35195,0.0,2013-03-12,0,-15,0,39.350549,-6.839205,Kwa Mzee Peter Kabekege,0,...,soft,good,dry,machine dbh,borehole,groundwater,other,other,non functional,2013
54544,46762,1000.0,2013-03-27,World Vision,964,World Vision,37.875679,-4.290554,Kwa Grayson,0,...,soft,good,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional,2013
48451,72357,30.0,2013-01-11,MISSING,1601,MISSING,34.878513,-4.670221,Amana Primary,0,...,soft,good,insufficient,machine dbh,borehole,groundwater,communal standpipe,communal standpipe,functional,2013
28695,42994,0.0,2013-01-18,Wateraid,0,SEMA,32.782117,-4.381812,Mbutu Kati,0,...,salty,salty,enough,machine dbh,borehole,groundwater,hand pump,hand pump,functional,2013
9378,11457,0.0,2011-08-12,Kkkt,0,KKKT,32.995464,-8.929842,Mpito A,0,...,soft,good,seasonal,shallow well,shallow well,groundwater,other,other,non functional,2011


In [139]:
# Column we are trying to predict
target = 'status_group'

# Candidate predictors: every training column except the target and the row id
trainfeat = train.drop(columns=[target, 'id'])

# Numeric predictors
numfeat = list(trainfeat.select_dtypes(include='number').columns)

# Number of distinct values in each non-numeric predictor
cardinality = trainfeat.select_dtypes(exclude='number').nunique()

# Non-numeric predictors with at most 50 distinct values
catfeat = [col for col, n_unique in cardinality.items() if n_unique <= 50]

# Numeric predictors first, then the low-cardinality categoricals
feats = [*numfeat, *catfeat]

In [140]:
# Distinct-value counts of the non-numeric features, smallest first
cardinality.sort_values()

recorded_by                  1
public_meeting               3
source_class                 3
permit                       3
quantity                     5
management_group             5
quality_group                6
waterpoint_type_group        6
payment_type                 7
payment                      7
source_type                  7
waterpoint_type              7
extraction_type_class        7
water_quality                8
basin                        9
source                      10
management                  12
scheme_management           13
extraction_type_group       13
extraction_type             18
region                      21
lga                        124
date_recorded              349
funder                    1717
installer                 1930
ward                      2082
scheme_name               2564
subvillage               17232
wpt_name                 30661
dtype: int64

In [141]:
# Non-numeric features that passed the cardinality <= 50 cut-off
catfeat

['basin',
 'region',
 'public_meeting',
 'recorded_by',
 'scheme_management',
 'permit',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group']

In [142]:
# Combined list: numeric features followed by the low-cardinality categoricals
feats

['amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'num_private',
 'region_code',
 'district_code',
 'population',
 'construction_year',
 'year_recorded',
 'basin',
 'region',
 'public_meeting',
 'recorded_by',
 'scheme_management',
 'permit',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group']

In [143]:
# Total number of candidate features in the combined list
len(feats)

31

In [144]:
# First modelling pass: one categorical feature plus the two coordinate columns
features = ['source_type', 'latitude','longitude']

In [145]:
# Build the feature matrices and target vectors for the chosen `features`
xtrain, xval, xtest = (frame[features] for frame in (train, val, test))
ytrain, yval = train[target], val[target]

# One-hot encode: the encoder is fitted on train only, then reused on val and test
encoder = ce.OneHotEncoder(use_cat_names=True)
xtrain_encoded = encoder.fit_transform(xtrain)
xval_encoded, xtest_encoded = map(encoder.transform, (xval, xtest))

# Scale: the scaler is likewise fitted on train only, then reused on val and test
scaler = RobustScaler()
xtrain_scaled = scaler.fit_transform(xtrain_encoded)
xval_scaled, xtest_scaled = map(scaler.transform, (xval_encoded, xtest_encoded))

In [146]:
# Fit a logistic regression as the reference point for the later models
lr = LogisticRegression(solver='lbfgs', multi_class='auto', n_jobs=-1).fit(xtrain_scaled, ytrain)
print(
    'Logistic Regression Model/Baseline',
    f'Train Acc: {lr.score(xtrain_scaled, ytrain)}',
    f'Val Acc: {lr.score(xval_scaled, yval)}',
    sep='\n',
)

Logistic Regression Model/Baseline
Train Acc: 0.5451388888888888
Val Acc: 0.5454545454545454


In [147]:
# Decision tree with no depth limit, trained on the same scaled features
dt = DecisionTreeClassifier(random_state=42).fit(xtrain_scaled, ytrain)
print(
    'Decision Tree Model',
    f'Train Acc: {dt.score(xtrain_scaled, ytrain)}',
    f'Val Acc: {dt.score(xval_scaled, yval)}',
    sep='\n',
)

Decision Tree Model
Train Acc: 0.9856271043771043
Val Acc: 0.6557239057239057


In [148]:
# The unbounded tree beats the baseline but clearly overfits (train accuracy far above
# validation accuracy), so refit the same classifier with its depth capped at 10.
maxdtdepth = 10
dt = DecisionTreeClassifier(random_state=42, max_depth=maxdtdepth).fit(xtrain_scaled, ytrain)
print(
    f'Decision Tree Model: Max Depth = {maxdtdepth}',
    f'Train Acc: {dt.score(xtrain_scaled, ytrain)}',
    f'Val Acc: {dt.score(xval_scaled, yval)}',
    sep='\n',
)

Decision Tree Model: Max Depth = 10
Train Acc: 0.6760101010101011
Val Acc: 0.6407407407407407


In [182]:
# Second pass: a richer feature set pushed through the same encode -> scale -> fit steps,
# with the tree depth still capped at 10 to limit overfitting.
features = [
    'source_type', 'waterpoint_type', 'extraction_type_group', 'quantity',
    'population', 'construction_year', 'latitude', 'longitude',
]

# Feature matrices and target vectors
xtrain, xval, xtest = (frame[features] for frame in (train, val, test))
ytrain, yval = train[target], val[target]

# One-hot encode: fit on train only, reuse on val and test
encoder = ce.OneHotEncoder(use_cat_names=True)
xtrain_encoded = encoder.fit_transform(xtrain)
xval_encoded, xtest_encoded = map(encoder.transform, (xval, xtest))

# Scale: fit on train only, reuse on val and test
scaler = RobustScaler()
xtrain_scaled = scaler.fit_transform(xtrain_encoded)
xval_scaled, xtest_scaled = map(scaler.transform, (xval_encoded, xtest_encoded))

maxdtdepth = 10

dt = DecisionTreeClassifier(random_state=42, max_depth=maxdtdepth).fit(xtrain_scaled, ytrain)
print(
    f'Decision Tree Model: Max Depth = {maxdtdepth}',
    f'Train Acc: {dt.score(xtrain_scaled, ytrain)}',
    f'Val Acc: {dt.score(xval_scaled, yval)}',
    sep='\n',
)

Decision Tree Model: Max Depth = 10
Train Acc: 0.7529250841750842
Val Acc: 0.744023569023569


In [183]:
# Validation accuracy looks reasonable, so predict on the test features and write the submission.
ypred = dt.predict(xtest_scaled)

# NOTE(review): predictions are attached by position, which presumes the sample
# submission rows are in the same order as the test rows - confirm.
submission = sample_submission.assign(status_group=ypred)
submission.to_csv('dtsubmission-03.csv', index=False)