# Pump it Up: Data Mining the Water Table

### Can you predict which water pumps are faulty?
Using data from Taarifa and the Tanzanian Ministry of Water, can you predict which pumps are functional, which need some repairs, and which don't work at all? This is an intermediate-level practice competition. Predict one of these three classes based on a number of variables about what kind of pump is operating, when it was installed, and how it is managed. A smart understanding of which waterpoints will fail can improve maintenance operations and ensure that clean, potable water is available to communities across Tanzania.

Competition:
https://www.drivendata.org/competitions/7/

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

import os
import sys

import numpy as np
import pandas as pd

import sklearn as sk
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [None]:
# Paths to the training and test CSV files.
TRAIN_PATH = 'data/train_clean.csv'
TEST_PATH = 'data/test_clean.csv'

# Load both files, using the `id` column as the row index.
train_df = pd.read_csv(TRAIN_PATH, index_col='id')
test_df = pd.read_csv(TEST_PATH, index_col='id')

In [None]:
# Copy the test features so later preprocessing leaves test_df untouched
# (its index is still needed when the submission file is written).
test_data = test_df.copy()

# pop() removes the target column from train_df, so the copy taken
# afterwards contains the feature columns only.
train_labels = train_df.pop('status_group').copy()
train_data = train_df.copy()

In [None]:
def print_shapes():
    """Print the shapes of the global train_data, train_labels and test_data."""
    report = [
        'train_data shape: {}'.format(train_data.shape),
        'train_labels shape: {}'.format(train_labels.shape),
        'test_data shape: {}'.format(test_data.shape),
    ]
    for line in report:
        print(line)


print_shapes()

## Normalization of numerical attributes

In [None]:
# all numeric columns to float
def int_columns_to_float(df: pd.DataFrame) -> None:
    """Convert every integer-typed column of *df* to float, in place.

    The check uses ``pd.api.types.is_integer_dtype`` instead of comparing the
    dtype with the builtin ``int``: that comparison only matches the platform's
    default integer width, so it can miss int64 columns on platforms where the
    default is 32-bit and always misses int8/int16/int32/unsigned columns.

    Args:
        df: Frame to modify. Non-integer columns (float, bool, object, ...)
            are left untouched.

    Returns:
        None. The frame is mutated in place.
    """
    for cname in df.columns:
        if pd.api.types.is_integer_dtype(df[cname].dtype):
            df[cname] = df[cname].astype(float)

# Apply the same in-place integer -> float conversion to both feature frames.
for _frame in (train_data, test_data):
    int_columns_to_float(_frame)

In [None]:
def numeric_columns(df: pd.DataFrame):
    """Return the names of the numeric columns of *df*, in column order.

    A column counts as numeric when ``pd.api.types.is_numeric_dtype`` accepts
    its dtype (integers, floats, booleans). The previous ``dtype != object``
    test also let category, datetime and dedicated string dtypes through,
    which cannot be standardised by a numeric scaler.

    Args:
        df: Frame whose columns are inspected; it is not modified.

    Returns:
        A list of column labels.
    """
    return [
        cname
        for cname in df.columns
        if pd.api.types.is_numeric_dtype(df[cname].dtype)
    ]
    
# Columns to standardise, taken from the training frame.
numeric_cols = numeric_columns(train_data)

# Fit the scaler on the training rows only, then apply the same fitted
# transformation to the test rows.
scaler = StandardScaler()
train_data[numeric_cols] = scaler.fit_transform(train_data[numeric_cols])
test_data[numeric_cols] = scaler.transform(test_data[numeric_cols])

## Encode categorical attributes
- Ordered categories: integer codes (`LabelEncoder`)
- Unordered categories: one-hot columns (`LabelBinarizer` / `pd.get_dummies`; `get_dummies` is used below)

In [None]:
# One-hot encode the training features.
train_data = pd.get_dummies(train_data)
# Number of feature columns after encoding (displayed as the cell output).
len(train_data.columns)

In [None]:
# One-hot encode the test features, then align them to the training columns.
# Encoding the two frames independently can yield different column sets or
# order; reindexing drops dummy columns that train never had and zero-fills
# the ones the test set is missing, so both frames share one feature layout.
test_data = pd.get_dummies(test_data).reindex(columns=train_data.columns,
                                              fill_value=0)
test_data.columns.size

In [None]:
# Show the shapes again now that both frames have been one-hot encoded.
print_shapes()

In [None]:
# Convert labels and features to NumPy arrays. `.values` is used because
# `as_matrix()` was deprecated and then removed in pandas 1.0, while
# `.values` is available on both old and current pandas releases.
train_labels = train_labels.values
train_data = train_data.values
test_data = test_data.values

## Model selection

#### Simple train/test split

In [None]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train_labels,
    test_size=0.2,
    random_state=0,
)

## Train for submission

In [None]:
# For the submission, use every labelled row as the training set.
X_train = train_data
y_train = train_labels

## Predictions for submission

In [None]:
# NOTE(review): `model` is not defined in any cell visible here; it has to be
# created and fitted before this cell can run.
prediction = model.predict(test_data)
# Histogram of the predicted values.
plt.hist(prediction)

In [None]:
def save_submission(predictions, test, path='pump_predictions.csv'):
    """Write a submission CSV with columns ``id`` and ``status_group``.

    Args:
        predictions: One predicted class code per row of *test*. The codes
            0, 1 and 2 are replaced by their label strings; any other value
            is written unchanged.
        test: Frame whose index supplies the ``id`` column.
        path: Destination file. Defaults to ``'pump_predictions.csv'``, the
            location that was previously hard-coded.

    Returns:
        None. The only effect is the file written to *path*.
    """
    code_to_label = {
        0: 'non functional',
        1: 'functional needs repair',
        2: 'functional',
    }

    submit = pd.DataFrame(data={'id': test.index, 'status_group': predictions})
    submit['status_group'] = submit['status_group'].replace(code_to_label)

    # index=False: the ids are already a regular column, so the frame's own
    # positional index must not be written as an extra column.
    submit.to_csv(path, index=False)
    
# Write the submission file; ids are taken from test_df's index.
save_submission(prediction, test_df)