In [3]:
# First let's import
# Let's do our imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import pandas_profiling
import seaborn as sns; sns.set()
import category_encoders as ce
from statistics import mode
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
%matplotlib inline

In [4]:
# Let's get our data
Location = "../data/tanzania/"

train = pd.merge(pd.read_csv(Location + 'train_features.csv'),
                 pd.read_csv(Location + 'train_labels.csv'))
test = pd.read_csv(Location + 'test_features.csv')
sample_submission = pd.read_csv(Location + 'sample_submission.csv')

In [5]:
# Now Let's do our test(val)-train split

train, val = train_test_split(train, train_size=0.80, test_size=0.20, random_state=42, 
                              stratify=train['status_group'])

train.shape, val.shape, test.shape

((47520, 41), (11880, 41), (14358, 40))

In [6]:
train.sample(10)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
8410,68292,0.0,2011-03-29,World Bank,0,DWE,35.423135,-5.221289,Singida-Road,0,...,salty abandoned,salty,enough,enough,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,functional
53834,68099,0.0,2011-07-23,Hifab,0,Hesawa,33.407894,-3.108186,Nyawishi B,0,...,soft,good,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump,functional
8742,5571,0.0,2013-02-24,Villagers,1296,Villagers,35.344379,-9.83211,Kwa Mama Aivoni,0,...,soft,good,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe,functional
59278,62508,20.0,2011-03-13,Private Individual,180,WU,38.347827,-6.636405,Mama Msoma,0,...,soft,good,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe,non functional
58778,48871,0.0,2012-10-18,Dwe,0,DWE,33.030925,-4.116356,Lulunguti,0,...,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other,non functional
47679,46186,0.0,2013-03-18,Usaid/wfp,1950,Active MKM,35.567338,-2.168811,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
52225,558,0.0,2013-03-01,World Vision,0,World vision,33.731347,-3.284633,Mwamagulya,0,...,soft,good,seasonal,seasonal,shallow well,shallow well,groundwater,hand pump,hand pump,functional
12654,61477,0.0,2012-12-12,Tabora Municipal Council,0,MWE,32.941909,-5.069374,Kwa Mzee Haruna,0,...,salty,salty,seasonal,seasonal,shallow well,shallow well,groundwater,hand pump,hand pump,functional
42019,6879,500.0,2011-03-08,Government Of Tanzania,1615,DWE,38.237237,-4.482704,Kwa Mzee Kadema,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
14659,34261,0.0,2013-03-14,Lawatefuka Water Supply,1188,Lawatefuka water sup,37.103075,-3.213831,Kwa Joel Mlay,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional


In [7]:
# We're familiar with this data
# First let's engineer some of our features
def organize(X):
    """Function will organize the features of train, validate and test sets in the same way"""
    X = X.copy()
    
    # We'll start with latitude and it's tiny values
    # First, we have to replace these values with 0
    X['latitude'] = X['latitude'].replace(-2e-08, 0)
    
    # Now those values are the same as the incorrect null values in some of our other features
    colsw0s = ['longitude', 'latitude', 'population','construction_year'] #also construction year, but we'll get to that
    
    # We'll turn those 0's into np.nan the replace them with the mean of their columns
    for col in colsw0s:
        X[col] = X[col].replace(0, np.nan)
        X[col] = X[col].fillna(X[col].mean())
    
    # For construction_year we'll replace with the mode
#     X['construction_year'] = X['construction_year'].replace(0, np.nan)
#     XnoNan = X['construction_year'].dropna()
#     md = XnoNan.mode()
#     X['construction_year'] = X['construction_year'].fillna(md)
    
    # Next we'll convert date_recorded to datetime
    X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format=True)
    
    # And we'll extract year_recorded also
    X['year_recorded'] = X['date_recorded'].dt.year
    
    # And we'll drop the column quantity_group as it is a duplicate of quantity
    X = X.drop(columns='quantity_group')
    
    # And we'll fill the missing values for categorical features with 'MISSING'
    cats = X.select_dtypes(exclude='number').columns
    for col in cats:
        X[col] = X[col].fillna('MISSING')
        
    return X

train = organize(train)
val = organize(val)
test = organize(test)

In [8]:
train.sample(20)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,year_recorded
53353,49957,0.0,2011-03-14,Tardo,1761,Tardo,38.258535,-4.661225,Kwa Mzee Kaniki,0,...,soft,good,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional,2011
21224,51616,500.0,2013-03-22,World Vision,521,World Vision,37.997015,-4.605674,Msikitini,0,...,soft,good,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional,2013
53661,53527,50.0,2011-03-14,Islamic Found,136,WU,38.342418,-6.610515,Msikitini,0,...,soft,good,enough,river,river/lake,surface,communal standpipe,communal standpipe,functional,2011
48443,41068,20.0,2013-02-22,Jica,1543,Jica,35.550224,-4.337431,Mq 5,0,...,soft,good,dry,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional,2013
3418,73444,0.0,2013-02-23,District Council,1506,District council,37.60198,-3.210541,Kwa Baltazari,0,...,soft,good,enough,spring,spring,groundwater,communal standpipe,communal standpipe,non functional,2013
55839,43404,0.0,2013-02-15,Okutu Village Community,1371,Gerald Mila,36.871971,-4.046898,Ndovu,0,...,unknown,unknown,unknown,machine dbh,borehole,groundwater,other,other,non functional,2013
23692,52454,0.0,2011-07-14,Danida,0,Central government,33.860824,-9.613851,Kwa Sala Kakobe,0,...,soft,good,dry,spring,spring,groundwater,communal standpipe,communal standpipe,non functional,2011
37859,18794,0.0,2011-02-19,Sao H,1858,Sao,35.215674,-8.410828,none,0,...,soft,good,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional,2011
49913,39086,0.0,2011-07-11,Hesawa,0,DWE,31.120843,-1.745701,Kikoni,0,...,soft,good,enough,shallow well,shallow well,groundwater,other,other,non functional,2011
35075,4581,0.0,2012-11-30,Serikali,1211,Serikali,34.465225,-1.466683,Kwa Mwita Goni,0,...,soft,good,enough,machine dbh,borehole,groundwater,hand pump,hand pump,functional,2012


In [30]:
train['gps_height'].value_counts()

 0       16305
-15         48
 1290       45
-20         43
-14         43
-16         42
-13         42
-18         40
 280        39
 1269       37
 320        37
 1304       37
 1538       37
 303        36
 1295       36
 1303       36
 1264       36
-8          36
 338        36
 1286       36
 1359       35
 1342       35
 1240       35
 1613       35
 1335       35
 1319       35
 1602       35
 1401       35
 1288       35
-27         34
         ...  
 2292        1
 2388        1
 2420        1
 2484        1
 2295        1
 2417        1
 2385        1
 2353        1
 2319        1
 2285        1
 2413        1
 2509        1
 2220        1
 621         1
 653         1
 2159        1
 2255        1
 2287        1
 2351        1
 2321        1
 622         1
 1998        1
 2126        1
 2254        1
 2286        1
 2318        1
 591         1
 2193        1
 2289        1
 2031        1
Name: gps_height, Length: 2401, dtype: int64

In [9]:
# Now we can work with our features
# We'll set the target first
target = 'status_group'

# Now we'll set a df with all train features except for our target and the id column
trainfeat = train.drop(columns=[target, 'id'])

# We'll also separate the numeric features into one list...
numfeat = trainfeat.select_dtypes(include='number').columns.tolist()

# And we'll get the cardinality of the non-numeric features...
cardinality = trainfeat.select_dtypes(exclude='number').nunique()

# And then we'll get the list for categorical features with cardinality <= 50
catfeat = cardinality[cardinality <= 50].index.tolist()

# Finally we'll combine those lists
feats = numfeat + catfeat

In [10]:
# Now let's take a look at cardinality
cardinality.sort_values()

recorded_by                  1
public_meeting               3
source_class                 3
permit                       3
quantity                     5
management_group             5
quality_group                6
waterpoint_type_group        6
payment_type                 7
payment                      7
source_type                  7
waterpoint_type              7
extraction_type_class        7
water_quality                8
basin                        9
source                      10
management                  12
scheme_management           13
extraction_type_group       13
extraction_type             18
region                      21
lga                        124
date_recorded              349
funder                    1717
installer                 1930
ward                      2082
scheme_name               2564
subvillage               17232
wpt_name                 30661
dtype: int64

In [11]:
# and also catfeat
catfeat

['basin',
 'region',
 'public_meeting',
 'recorded_by',
 'scheme_management',
 'permit',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group']

In [12]:
# and finally feats
feats

['amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'num_private',
 'region_code',
 'district_code',
 'population',
 'construction_year',
 'year_recorded',
 'basin',
 'region',
 'public_meeting',
 'recorded_by',
 'scheme_management',
 'permit',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group']

In [13]:
len(feats)

31

In [14]:
# Now, let's choose some features to use for our decision tree
features = ['source_type', 'latitude','longitude']

In [15]:
# Now we can encode, then scale our features
# First we arrange our data into X features matrix and y target vector
xtrain = train[features]
ytrain = train[target]
xval = val[features]
yval = val[target]
xtest = test[features]

# Reminder-Encoder: fit_transform on train, transform on val & test
encoder = ce.OneHotEncoder(use_cat_names=True)
xtrain_encoded = encoder.fit_transform(xtrain)
xval_encoded = encoder.transform(xval)
xtest_encoded = encoder.transform(xtest)

# Reminder-Scaler: fit_transform on train, transform on val & test
scaler = RobustScaler()
xtrain_scaled = scaler.fit_transform(xtrain_encoded)
xval_scaled = scaler.transform(xval_encoded)
xtest_scaled = scaler.transform(xtest_encoded)

In [16]:
# We'll use logistic regression to get a baseline
lr = LogisticRegression(solver='lbfgs', multi_class='auto', n_jobs=-1)
lr.fit(xtrain_scaled, ytrain)
print('Logistic Regression Model/Baseline')
print(f'Train Acc: {lr.score(xtrain_scaled, ytrain)}')
print(f'Val Acc: {lr.score(xval_scaled, yval)}')

Logistic Regression Model/Baseline
Train Acc: 0.5451388888888888
Val Acc: 0.5454545454545454


In [17]:
# Now let's try to use a decision tree Classifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(xtrain_scaled, ytrain)
print('Decision Tree Model')
print(f'Train Acc: {dt.score(xtrain_scaled, ytrain)}')
print(f'Val Acc: {dt.score(xval_scaled, yval)}')

Decision Tree Model
Train Acc: 0.9856271043771043
Val Acc: 0.6557239057239057


In [18]:
# Big improvement from our baseline, but with some obvious overfitting,
# Now let's try to use the same Decision tree Classifier, but with max_depth=10
maxdtdepth=10
dt = DecisionTreeClassifier(max_depth=maxdtdepth, random_state=42)
dt.fit(xtrain_scaled, ytrain)
print(f'Decision Tree Model: Max Depth = {maxdtdepth}')
print(f'Train Acc: {dt.score(xtrain_scaled, ytrain)}')
print(f'Val Acc: {dt.score(xval_scaled, yval)}')

Decision Tree Model: Max Depth = 10
Train Acc: 0.6760101010101011
Val Acc: 0.6407407407407407


In [29]:
# We'll keep our max_depth at 10 to avoid overfitting
# Let's pick some different features and try to get it all done in one workflow
features = ['source_type', 'waterpoint_type', 'extraction_type_group', 'quantity','population', 
            'day_recorded', 'construction_year', 'latitude','longitude']

xtrain = train[features]
ytrain = train[target]
xval = val[features]
yval = val[target]
xtest = test[features]

# Reminder-Encoder: fit_transform on train, transform on val & test
encoder = ce.OneHotEncoder(use_cat_names=True)
xtrain_encoded = encoder.fit_transform(xtrain)
xval_encoded = encoder.transform(xval)
xtest_encoded = encoder.transform(xtest)

# # Reminder-Scaler: fit_transform on train, transform on val & test
# scaler = RobustScaler()
# xtrain_scaled = scaler.fit_transform(xtrain_encoded)
# xval_scaled = scaler.transform(xval_encoded)
# xtest_scaled = scaler.transform(xtest_encoded)

maxdtdepth=10

dt = DecisionTreeClassifier(max_depth=maxdtdepth, random_state=42)
dt.fit(xtrain_encoded, ytrain)
print(f'Decision Tree Model: Max Depth = {maxdtdepth}')
print(f'Train Acc: {dt.score(xtrain_encoded, ytrain)}')
print(f'Val Acc: {dt.score(xval_encoded, yval)}')

KeyError: "['day_recorded'] not in index"

In [22]:
# Ok, we have a pretty good validation score, so let's predict on xtest and submit
ypred = dt.predict(xtest_scaled)

submission = sample_submission.copy()
submission['status_group'] = ypred
submission.to_csv('dtsubmission-03.csv', index=False)