In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
pd.options.display.max_columns = 60
pd.options.display.max_colwidth = 100

In [3]:
training_data = pd.read_csv('data/training_data.csv')
training_labels = pd.read_csv('data/training_labels.csv')

print('%i records in training data set' % (len(training_data)))
print('%i records in training labels data set' % (len(training_labels)))

59400 records in training data set
59400 records in training labels data set


In [4]:
all_data = pd.merge(training_data, training_labels, on='id', how='outer')
print('%i records in combined data set' % (len(all_data)))
all_data.head(2)

59400 records in combined data set


Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [5]:
all_data['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [6]:
status_group_codes = {
    "non functional": 0,
    "functional needs repair": 1,
    "functional": 2
}

all_data["status_group_code"] = all_data['status_group'].map(status_group_codes)

In [7]:
all_data.isna().any()

id                       False
amount_tsh               False
date_recorded            False
funder                    True
gps_height               False
installer                 True
longitude                False
latitude                 False
wpt_name                 False
num_private              False
basin                    False
subvillage                True
region                   False
region_code              False
district_code            False
lga                      False
ward                     False
population               False
public_meeting            True
recorded_by              False
scheme_management         True
scheme_name               True
permit                    True
construction_year        False
extraction_type          False
extraction_type_group    False
extraction_type_class    False
management               False
management_group         False
payment                  False
payment_type             False
water_quality            False
quality_

In [8]:
all_data.dropna(subset=['permit', 'public_meeting'], inplace=True)
print(
    '%i records in data set after dropping rows with NaN values in the permit and public_meeting columns' 
    % (len(all_data))
)

53281 records in data set after dropping rows with NaN values in the permit and public_meeting columns


In [9]:
all_data.isna().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    1176
gps_height                   0
installer                 1186
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 370
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting               0
recorded_by                  0
scheme_management         3440
scheme_name              24858
permit                       0
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [10]:
nan_columns = all_data.columns[all_data.isna().any()].tolist()

for column in nan_columns:
    all_data[column] = all_data[column].fillna('Unknown')

all_data.isna().any()

id                       False
amount_tsh               False
date_recorded            False
funder                   False
gps_height               False
installer                False
longitude                False
latitude                 False
wpt_name                 False
num_private              False
basin                    False
subvillage               False
region                   False
region_code              False
district_code            False
lga                      False
ward                     False
population               False
public_meeting           False
recorded_by              False
scheme_management        False
scheme_name              False
permit                   False
construction_year        False
extraction_type          False
extraction_type_group    False
extraction_type_class    False
management               False
management_group         False
payment                  False
payment_type             False
water_quality            False
quality_

In [11]:
all_data.isna().sum()

id                       0
amount_tsh               0
date_recorded            0
funder                   0
gps_height               0
installer                0
longitude                0
latitude                 0
wpt_name                 0
num_private              0
basin                    0
subvillage               0
region                   0
region_code              0
district_code            0
lga                      0
ward                     0
population               0
public_meeting           0
recorded_by              0
scheme_management        0
scheme_name              0
permit                   0
construction_year        0
extraction_type          0
extraction_type_group    0
extraction_type_class    0
management               0
management_group         0
payment                  0
payment_type             0
water_quality            0
quality_group            0
quantity                 0
quantity_group           0
source                   0
source_type              0
s

In [12]:
string_features = [
    'funder',
    'installer', 
    'basin', 
    'scheme_management', 
    'permit', 
    'extraction_type', 
    'extraction_type_group', 
    'extraction_type_class', 
    'management', 
    'payment', 
    'payment_type', 
    'water_quality', 
    'quantity', 
    'source', 
    'source_type', 
    'source_class', 
    'waterpoint_type'
]

In [13]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for item in string_features:
    all_data[item] = le.fit_transform(all_data[item])

all_data.head(2)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,status_group_code
0,69572,6000.0,2011-03-14,1235,1390,1357,34.938093,-9.856322,none,0,1,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,8,Roman,0,1999,3,1,0,7,user-group,2,0,6,good,1,enough,8,6,0,1,communal standpipe,functional,2
2,34310,25.0,2013-02-25,764,686,1847,37.460664,-3.821329,Kwa Mahundi,0,5,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,8,Nyumba ya mungu pipe scheme,1,2009,3,1,0,7,user-group,4,5,6,good,1,enough,0,1,1,2,communal standpipe,functional,2


In [14]:
features_for_analysis = [
    'amount_tsh',
    'funder',
    'gps_height', 
    'installer', 
    'basin', 
    'region_code', 
    'district_code', 
    'population', 
    'scheme_management', 
    'permit', 
    'construction_year',
    'extraction_type', 
    'extraction_type_group', 
    'extraction_type_class', 
    'management', 
    'payment', 
    'payment_type', 
    'water_quality', 
    'quantity', 
    'source', 
    'source_type', 
    'source_class', 
    'waterpoint_type'
]
features = all_data[features_for_analysis]
status_group = all_data['status_group_code']

features.head(2)

Unnamed: 0,amount_tsh,funder,gps_height,installer,basin,region_code,district_code,population,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,payment,payment_type,water_quality,quantity,source,source_type,source_class,waterpoint_type
0,6000.0,1235,1390,1357,1,11,5,109,8,0,1999,3,1,0,7,2,0,6,1,8,6,0,1
2,25.0,764,686,1847,5,21,4,250,8,1,2009,3,1,0,7,4,5,6,1,0,1,1,2


3     24339
8      7265
9      5641
14     3972
15     3491
7      2516
4      2261
0      1530
6      1334
12      278
13      208
17      104
1        90
5        88
11       84
16       46
2        32
10        2
Name: extraction_type, dtype: int64

In [15]:
from sklearn.model_selection import train_test_split
train_data, test_data, train_labels, test_labels = train_test_split(features, status_group, test_size=0.25, random_state=25)

In [16]:
# import relevant libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

In [20]:
# create list to store data about the performance of each model
results = []

# create list of models
models = [
#     SVC(),
#     DecisionTreeClassifier(),
    KNeighborsClassifier(),
    RandomForestClassifier(),
#     GradientBoostingClassifier(),
#     GaussianNB()
]

# iterate through list of models and run cross validation 10 times and store accuracy values
for model in models:
    result = {}
    classifier_name = model.__class__.__name__
    kfold = KFold(n_splits=10, random_state=25)
    cross_val_results = cross_val_score(model, train_data, train_labels, cv=kfold, scoring='accuracy')
    result['classifier_name'] = classifier_name
    result['classifier_results'] = cross_val_results
    results_message = "Accuracy of %s: %0.2f (+/- %0.2f)" % (classifier_name, cross_val_results.mean(), cross_val_results.std())
    print(results_message)
    results.append(cross_val_results)

Accuracy of KNeighborsClassifier: 0.69 (+/- 0.01)
Accuracy of RandomForestClassifier: 0.78 (+/- 0.01)


In [24]:
# create Random Forest Classifier and fit model with training data and labels
rf_classifier = RandomForestClassifier()
rf_classifier.fit(train_data, train_labels)

# generate predictions using test data
rf_classifier_predictions = rf_classifier.predict(test_data)
rf_classifier_accuracy_score = accuracy_score(test_labels, rf_classifier_predictions)

# print accuracy score
print('The accuracy score of the original Random Forest Classifier: %f' % (rf_classifier_accuracy_score))

The accuracy score of the original Random Forest Classifier: 0.782749


In [26]:
feature_labels = list(all_data.columns.values)

from sklearn.feature_selection import SelectFromModel

# for feature in zip(feature_labels, rf_classifier.feature_importances_):
#     print(feature)

sfm = SelectFromModel(rf_classifier, threshold=0.05)

# Train the selector
sfm.fit(train_data, train_labels)

for feature_list_index in sfm.get_support(indices=True):
    print(feature_labels[feature_list_index])
    

amount_tsh
date_recorded
funder
latitude
basin
public_meeting
permit
