# Load data

In [2]:
import pandas as pd

training_labels_df = pd.read_csv('../data/water_pump/training_labels.csv', header=0)
training_values_df = pd.read_csv('../data/water_pump/training.csv', header=0)

#training_labels_df = pd.read_csv(training_labels_file)
#training_values_df = pd.read_csv(training_values_file)

#training_values_df['status_group'] = training_labels_df['status_group']

# Run classification algorithms

In [40]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from IPython.display import display
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

y_df = training_labels_df['status_group']
X_df = training_values_df[['construction_year','population','gps_height','longitude','latitude']] 

cat_list = ['quantity','district_code','scheme_management','extraction_type_group','water_quality','basin']

for cat_feature in cat_list:
    temp_df = pd.get_dummies(training_values_df[cat_feature],prefix=cat_feature,prefix_sep='_')
    X_df = X_df.join(temp_df)

# cleaning
construction_mean = X_df[X_df.construction_year>0].mean()
X_df['construction_year'].replace(0,construction_mean)
pop_mean = X_df[X_df.population>0].mean()
X_df['population'].replace(0,pop_mean)
gps_height_mean = X_df[X_df.gps_height>0].mean()
X_df['gps_height'].replace(0,gps_height_mean)


# split training/test
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.25)

# run classification algorithms
log_regr = LogisticRegression()
y_predict_log_regr = log_regr.fit(X_train,y_train).predict(X_test)
print "Accuracy with logistic regression = " + str(accuracy_score(y_test,y_predict_log_regr))

naive_b = GaussianNB()
y_predict_naive_b = naive_b.fit(X_train,y_train).predict(X_test)
print "Accuracy with Naive Bayes = " + str(accuracy_score(y_test,y_predict_naive_b))

d_tree = DecisionTreeClassifier()
y_predict_d_tree = d_tree.fit(X_train,y_train).predict(X_test)
print "Accuracy with Decision Tree = " + str(accuracy_score(y_test,y_predict_d_tree))

r_forest = RandomForestClassifier(n_estimators=100)
y_predict_r_forest = r_forest.fit(X_train,y_train).predict(X_test)
print "Accuracy with Random Forest = " + str(accuracy_score(y_test,y_predict_r_forest))

# commenting out for now since it takes too long
#svc = SVC()
#y_predict_svc = svc.fit(X_train,y_train).predict(X_test)
#print "Accuracy with SVM = " + str(accuracy_score(y_test,y_predict_svc))

Accuracy with logistic regression = 0.706060606061
Accuracy with Naive Bayes = 0.495084175084
Accuracy with Decision Tree = 0.74303030303
Accuracy with Random Forest = 0.792323232323
