In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, plot_confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV

For more information about features:
https://www.drivendata.org/competitions/7/pump-it-up-data-mining-the-water-table/page/25/

In [133]:
# amount_tsh - Total static head (amount water available to waterpoint)
# date_recorded - The date the row was entered - do not include
# funder - Who funded the well
# gps_height - Altitude of the well
# installer - Organization that installed the well
# longitude - GPS coordinate - do not include
# latitude - GPS coordinate - do not include
# wpt_name - Name of the waterpoint if there is one
# num_private - (no description provided)
# basin - Geographic water basin
# subvillage - Geographic location
# region - Geographic location
# region_code - Geographic location (coded)
# district_code - Geographic location (coded)
# lga - Geographic location
# ward - Geographic location
# population - Population around the well
# public_meeting - True/False - ?
# recorded_by - Group entering this row of data
# scheme_management - Who operates the waterpoint
# scheme_name - Who operates the waterpoint
# permit - If the waterpoint is permitted
# construction_year - Year the waterpoint was constructed
# extraction_type - The kind of extraction the waterpoint uses
# extraction_type_group - The kind of extraction the waterpoint uses
# extraction_type_class - The kind of extraction the waterpoint uses
# management - How the waterpoint is managed
# management_group - How the waterpoint is managed
# payment - What the water costs
# payment_type - What the water costs
# water_quality - The quality of the water
# quality_group - The quality of the water
# quantity - The quantity of water
# quantity_group - The quantity of water
# source - The source of the water
# source_type - The source of the water
# source_class - The source of the water
# waterpoint_type - The kind of waterpoint
# waterpoint_type_group - The kind of waterpoint

## Import Data

In [150]:
# Load the feature table, using the 'id' column as the row index.
X = pd.read_csv(
    "../../references/training_set_values.csv",
    index_col="id",
)

In [151]:
# Load the label table indexed by 'id' and keep only the target column as a Series.
y = pd.read_csv(
    "../../references/training_set_labels.csv",
    index_col="id",
).loc[:, "status_group"]

In [152]:
# Count how many columns of X have each dtype.
X.dtypes.value_counts()

object     30
int64       6
float64     3
dtype: int64

In [153]:
# X.info()

In [154]:
# Columns grouped by the reason they are excluded from the baseline model.
# The grouping follows the variable names; whether each column really
# contains NA values is not verified in this cell.
columns_with_na = [
    "permit",
    "scheme_name",
    "scheme_management",
    "public_meeting",
    "subvillage",
    "installer",
    "funder",
]
columns_with_specific_location_and_names = [
    "ward",
    "wpt_name",
]

#### Drop the date column, the columns with NA values, and the location/name identifier columns for now

In [155]:
# Drop the columns listed in columns_with_na.
# Rebind X instead of mutating it in place, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
X = X.drop(columns=columns_with_na, errors="ignore")

In [156]:
# Drop the columns listed in columns_with_specific_location_and_names.
# Rebind X instead of mutating it in place, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
X = X.drop(columns=columns_with_specific_location_and_names, errors="ignore")

In [157]:
# Drop the 'date_recorded' column.
# Rebind X instead of mutating it in place, and ignore the column if it is
# already gone, so that re-running this cell does not raise a KeyError.
X = X.drop(columns="date_recorded", errors="ignore")

#### Train Test Split

In [165]:
# train_test_split pairs the rows of X and y by position, so both must list the
# same ids in the same order; fail loudly rather than train on shuffled labels.
assert X.index.equals(y.index), "X and y are not aligned on the 'id' index"

# Hold out 25% of the rows for testing.
# random_state fixes the shuffle so the split (and every score computed from it)
# is reproducible after a kernel restart.
# stratify=y keeps the class proportions of the target the same in both
# partitions (useful if the status_group classes are imbalanced -- TODO confirm).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.25,
    random_state=42,
    stratify=y,
)

#### Split Data into continuous and categorical

In [166]:
# Partition each split's columns by dtype: object columns go to the *_cat
# frames, every other dtype goes to the *_cont frames.
X_train_cat, X_train_cont = (
    X_train.select_dtypes(include='object'),
    X_train.select_dtypes(exclude='object'),
)
X_test_cat, X_test_cont = (
    X_test.select_dtypes(include='object'),
    X_test.select_dtypes(exclude='object'),
)

#### Simple Imputer to fill null numeric values

In [167]:
# Fit the imputer (default settings) on the training numeric columns only, then
# apply the same fitted imputer to both splits. The results are wrapped back
# into DataFrames that keep the original row index and column names.
si = SimpleImputer()
si.fit(X_train_cont)
X_train_imp = pd.DataFrame(
    data=si.transform(X_train_cont),
    index=X_train_cont.index,
    columns=X_train_cont.columns,
)
X_test_imp = pd.DataFrame(
    data=si.transform(X_test_cont),
    index=X_test_cont.index,
    columns=X_test_cont.columns,
)

#### Standard Scaler to scale numeric values

In [168]:
# Fit the scaler on the imputed training data only, then transform both splits
# with it. Row index and column names are carried over onto the scaled frames.
ss = StandardScaler()
ss.fit(X_train_imp)
X_train_sc = pd.DataFrame(
    data=ss.transform(X_train_imp),
    index=X_train_imp.index,
    columns=X_train_imp.columns,
)
X_test_sc = pd.DataFrame(
    data=ss.transform(X_test_imp),
    index=X_test_imp.index,
    columns=X_test_imp.columns,
)

#### OneHotEncoder for categorical columns

In [169]:
# X_train_cat.info()

In [172]:
# One-hot encode the object columns. The encoder is fitted on the training split
# only and then applied to both splits; drop='if_binary' and sparse=False are
# the original settings.
# NOTE(review): handle_unknown is left at its default, so a category that only
# appears in the test split would presumably make transform() fail -- confirm.
ohe = OneHotEncoder(drop = 'if_binary', sparse = False)
ohe.fit(X_train_cat)

# Train and test frames come from the same X, so they share the same object
# columns and one list of encoded feature names serves both.
ohe_feature_names = ohe.get_feature_names(X_train_cat.columns)

X_train_ohe = pd.DataFrame(
    data=ohe.transform(X_train_cat),
    columns=ohe_feature_names,
    index=X_train_cat.index,
)
X_test_ohe = pd.DataFrame(
    data=ohe.transform(X_test_cat),
    columns=ohe_feature_names,
    index=X_test_cat.index,
)

#### Join numeric and object DataFrames back together after preprocessing

In [173]:
# Recombine the scaled numeric columns with the one-hot encoded columns for each
# split, matching rows on the shared index (left join, the DataFrame.join default).
X_train_fin, X_test_fin = (
    scaled.join(encoded, how="left")
    for scaled, encoded in (
        (X_train_sc, X_train_ohe),
        (X_test_sc, X_test_ohe),
    )
)

#### Create a Baseline Model

In [176]:
# Baseline: logistic regression with max_iter=1000, fitted on the preprocessed
# training features. fit() returns the estimator itself, so it can be chained.
lr = LogisticRegression(max_iter=1000).fit(X_train_fin, y_train)

# Mean accuracy on the training data (displayed as the cell's output).
lr.score(X=X_train_fin, y=y_train)

LogisticRegression(max_iter=1000)

#### Score model on training data

In [180]:
# Score on our training data
y_pred = lr.predict(X_train_fin)
print('Scores for our training data')
print('Accuracy Score: ' + str(accuracy_score(y_train, y_pred)))

Scores for our training data
Accuracy Score: 0.7495847362514029


In [None]:
# 5-fold cross-validation of the baseline model on the training split.
# NOTE(review): X_train_fin was imputed, scaled and encoded using the whole
# training split, so each fold's validation rows already influenced the
# preprocessing; wrapping those steps in a Pipeline would avoid that.
val = cross_val_score(
    estimator=lr,
    X=X_train_fin,
    y=y_train,
    cv=5,
)
val

#### Score model on testing data

In [186]:
# Mean accuracy of the fitted baseline model on the held-out test split.
lr.score(X=X_test_fin, y=y_test)

0.7482828282828283