In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from scipy.spatial.distance import euclidean
# Presumably intended as the notebook-wide random seed.
# NOTE(review): no visible cell references SEED; the train/test split passes
# random_state=42 directly -- confirm which value is meant to be authoritative.
SEED = 10

In [2]:
# Read the feature table and the label table from their separate CSV files.
dfX = pd.read_csv('../data/training_set_values.csv')
dfy = pd.read_csv('../data/training_set_labels.csv')

# Build the working frame as one pipeline; the step order matters because
# the NaN-column pass is evaluated on the de-duplicated rows.
# NOTE(review): concat(axis=1) pairs rows by index position, so it assumes both
# CSVs list their rows in the same order -- confirm, or join on a shared key.
df = (
    pd.concat([dfX, dfy.status_group], axis=1)
    # keep only the first of any rows that agree on every column except id
    .pipe(lambda frame: frame[~frame.duplicated(subset=frame.columns.difference(['id']))])
    # discard every column that contains at least one missing value
    .dropna(axis='columns')
    # remove the id column and the columns with problematic zero values
    .drop(columns=['id', 'num_private', 'construction_year', 'population'])
    # store the region / district codes as pandas 'string' values.
    # NOTE(review): 'string' is an extension dtype, not object; the later
    # high-cardinality filter selects include=['object'] only -- confirm these
    # two columns end up handled as categoricals as intended.
    .astype({'region_code': 'string', 'district_code': 'string'})
    # remove lat/long and elevation (which might still prove to be useful)
    .drop(columns=['longitude', 'latitude', 'gps_height'])
    # remove the date column
    .drop(columns=['date_recorded'])
)
# show row and column counts
df.shape

(59364, 26)

In [3]:
# Detach the target column: pop() returns status_group and removes it from df in one step.
labels = df.pop('status_group')

In [4]:
# Encode the status_group as 1s ('functional') and 0s ('non functional' or
# 'functional needs repair').
#
# The guard makes this cell safe to re-run: once `labels` already holds integer
# codes, comparing them with 'functional' again would silently turn every label
# into 0, so the encoding is applied only while the labels are still raw text.
if not pd.api.types.is_integer_dtype(labels):
    # Vectorised comparison instead of a row-wise apply(lambda ...); anything
    # that is not exactly 'functional' (missing values included) becomes 0.
    labels = labels.eq('functional').astype('int64')

In [5]:
# Step 1: names of the object-dtype (categorical) columns.
object_cols = df.select_dtypes(include=['object']).columns
# Step 2: number of distinct values in each of those columns.
distinct_counts = df[object_cols].nunique()
# Step 3: the ones with more than 10 distinct values are removed from df.
too_many_levels = distinct_counts.index[distinct_counts > 10].tolist()
df.drop(columns=too_many_levels, inplace=True)
# show rows and columns
df.shape

(59364, 18)

In [6]:
# Expand the columns pandas treats as categorical into indicator columns;
# which dtypes are expanded is decided by get_dummies' defaults.
one_hot_df = pd.get_dummies(data=df)
# report (rows, columns) of the encoded frame
one_hot_df.shape

(59364, 96)

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as a test set; the fixed random_state keeps the
# split repeatable between runs.
# NOTE(review): random_state is hard-coded to 42 and is independent of the
# SEED constant defined in the first cell.
X_train, X_test, y_train, y_test = train_test_split(
    one_hot_df,
    labels,
    test_size=0.25,
    random_state=42,
)

In [8]:
# Standardisation utility from scikit-learn
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, so the test rows are transformed
# with the training statistics rather than their own.
scaler = StandardScaler().fit(X_train)

# Apply the fitted scaling to both partitions
scaled_data_train = scaler.transform(X_train)
scaled_data_test = scaler.transform(X_test)

# Put the scaled training array back into a labelled DataFrame for inspection
# (column labels come from the one-hot frame; the row index is a fresh 0..n-1 range).
scaled_df_train = pd.DataFrame(data=scaled_data_train, columns=one_hot_df.columns)
scaled_df_train.head()

Unnamed: 0,amount_tsh,region_code,district_code,basin_Internal,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,...,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other
0,-0.117103,-0.187294,-0.37623,-0.388626,-0.305955,4.824567,-0.348184,-0.453905,-0.42003,-0.396665,...,-0.010598,-0.645358,-0.114484,-0.348832,-0.041892,0.846277,-0.010598,-0.645358,-0.114484,-0.348832
1,-0.117103,-0.073704,-0.067536,-0.388626,-0.305955,-0.207272,2.872046,-0.453905,-0.42003,-0.396665,...,-0.010598,-0.645358,-0.114484,-0.348832,-0.041892,0.846277,-0.010598,-0.645358,-0.114484,-0.348832
2,-0.105906,0.323859,-0.170434,-0.388626,-0.305955,-0.207272,-0.348184,-0.453905,2.38078,-0.396665,...,-0.010598,-0.645358,-0.114484,-0.348832,-0.041892,0.846277,-0.010598,-0.645358,-0.114484,-0.348832
3,-0.117103,0.153474,-0.273332,-0.388626,-0.305955,-0.207272,-0.348184,2.203104,-0.42003,-0.396665,...,-0.010598,-0.645358,-0.114484,2.866708,-0.041892,-1.181646,-0.010598,-0.645358,-0.114484,2.866708
4,-0.117103,0.153474,-0.479129,-0.388626,-0.305955,-0.207272,-0.348184,2.203104,-0.42003,-0.396665,...,-0.010598,-0.645358,-0.114484,-0.348832,-0.041892,0.846277,-0.010598,-0.645358,-0.114484,-0.348832


In [9]:
# k-nearest-neighbours classifier from scikit-learn
from sklearn.neighbors import KNeighborsClassifier

# Build the classifier with its default settings and fit it on the scaled
# training features in a single expression (fit returns the estimator itself).
clf = KNeighborsClassifier().fit(scaled_data_train, y_train)

# Class predictions for the scaled test features
test_preds = clf.predict(scaled_data_test)

In [10]:
# Import the necessary functions
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

In [11]:
# Complete the function
def print_metrics(labels, preds):
    """Print precision, recall, accuracy and F1 for a set of predictions.

    Each score is computed by the matching scikit-learn metric function,
    called as ``metric(labels, preds)``, and printed on its own line in the
    order precision, recall, accuracy, F1.

    Parameters
    ----------
    labels : true target values, passed as the first argument to each metric.
    preds : predicted target values, passed as the second argument.

    Returns
    -------
    None. The scores are only written to standard output.
    """
    # (output template, metric function) pairs, in print order
    report = (
        ("Precision Score: {}", precision_score),
        ("Recall Score: {}", recall_score),
        ("Accuracy Score: {}", accuracy_score),
        ("F1 Score: {}", f1_score),
    )
    for template, metric in report:
        print(template.format(metric(labels, preds)))
    
# Report the scores of the fitted classifier's predictions against the held-out test labels.
print_metrics(y_test, test_preds)

Precision Score: 0.7683384110616358
Recall Score: 0.8167663178873941
Accuracy Score: 0.7676706421400176
F1 Score: 0.7918125830213742
