# Project 3 - Water Pump Classification

# Setup

In [None]:
from __future__ import print_function

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from os import system

import matplotlib.pyplot as plt
%matplotlib inline

from __future__ import division
pd.set_option('display.width',5000)

In [None]:
import patsy

from sklearn import linear_model as lm
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation was removed in scikit-learn 0.20. model_selection
# (already required by this notebook) provides the same train_test_split, so
# it is bound under the legacy name that later cells still reference.
from sklearn import model_selection as cross_validation
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# system('say I am Done!')

# Data Import - Training Data

This data comes from the DrivenData competition "Pump it Up: Data Mining the Water Table":

https://www.drivendata.org/competitions/7/pump-it-up-data-mining-the-water-table/page/23/


In [None]:
# 'Values' provided for each pump location - features

df_values = pd.read_csv('/Users/amycurneen/ds/metis/metisgh/Metis-Curneen/3 - Water Pumps/Data Downloads/Training set values.csv')
df_values.sample(1)

In [None]:
# 'Labels' provided for each pump location - classification - what I am predicting

df_labels = pd.read_csv('/Users/amycurneen/ds/metis/metisgh/Metis-Curneen/3 - Water Pumps/Data Downloads/Training set labels.csv')
df_labels.sample(1)

# Data Import - Challenge Data

In [None]:
# 'Values' provided for each competition pump location - features

df_test_values = pd.read_csv('/Users/amycurneen/ds/metis/metisgh/Metis-Curneen/3 - Water Pumps/Data Downloads/Test set values.csv')
df_test_values.sample(1)

In [None]:
# 'Labels' I will provide for each pump location - classification

df_sub = pd.read_csv('/Users/amycurneen/ds/metis/metisgh/Metis-Curneen/3 - Water Pumps/Data Downloads/SubmissionFormat.csv')
df_sub = df_sub.drop('status_group', axis = 1)
df_sub.sample(1)

# Feature Analysis

My goal is to predict the operating condition of a waterpoint for each record in the dataset. I was provided the following set of information about the waterpoints:

* amount_tsh - Total static head (amount water available to waterpoint)
    * 98 unique
* date_recorded - The date the row was entered
    * 365 unique
    * year - month - day
* funder - Who funded the well
    * 1897 unique
    * look at top ones?
* installer - Organization that installed the well
    * 2145 unique
    * DWE is main one - 10x closest other, 17k
* wpt_name - Name of the waterpoint if there is one
    * 37400 unique
    * look at top ones?
* num_private - (NO PROVIDED DESC)
    * 65 unique
    * USELESS FEATURE
* population - Population around the well
    * 1049 unique
    * a lot are zero
* public_meeting - True/False
    * 2 unique
* recorded_by - Group entering this row of data
    * 1 unique
    * all the same - USELESS FEATURE
* scheme_management - Who operates the waterpoint
    * 12 unique
* scheme_name - Who operates the waterpoint
    * 2696 unique
    * USELESS FEATURE
* permit - If the waterpoint is permitted
    * 2 unique
* construction_year - Year the waterpoint was constructed
    * 55 unique
    * third are 0 - USELESS FEATURE
    

* Geography
    * gps_height - Altitude of the well
        * numerical
    * longitude - GPS coordinate
        * numerical
    * latitude - GPS coordinate
        * numerical
    * basin - Geographic water basin
        * 9 unique
    * subvillage - Geographic location
        * 19287 unique
    * region - Geographic location
        * 21 unique
    * region_code - Geographic location (coded)
        * 27 unique
    * district_code - Geographic location (coded)
        * 20 unique
    * lga - Geographic location
        * 125 unique
    * ward - Geographic location
        * 2092 unique


* Extraction
    * extraction_type - The kind of extraction the waterpoint uses
        * 18 unique
        * Most descriptive of extraction
    * extraction_type_group - The kind of extraction the waterpoint uses
        * 13 unique
        * Parent of extraction_type
    * extraction_type_class - The kind of extraction the waterpoint uses
        * 7 unique
        * Parent of extraction_type_group


* Overhead
    * management - How the waterpoint is managed
        * 12 unique
    * management_group - How the waterpoint is managed
        * 5 unique
    * payment - What the water costs
        * 7 unique
        * same as payment type
    * payment_type - What the water costs
        * 7 unique
        * same as payment


* Water
    * water_quality - The quality of the water 
        * 3 unique
        * Subset of quality_group
    * quality_group - The quality of the water
        * 6 unique
        * Parent group of water_quality
    * quantity - The quantity of water
        * 5 unique
        * Same as quantity_group
    * quantity_group - The quantity of water
        * 5 unique
        * Same as quantity
    * source - The source of the water
        * 10 unique
    * source_type - The source of the water
        * 7 unique
        * Subset of source
    * source_class - The source of the water
        * 3 unique
        * Subset of source_type
    * waterpoint_type - The kind of waterpoint
        * 6 unique
        * Parent of waterpoint_type_group
    * waterpoint_type_group - The kind of waterpoint
        * 7 unique
        * Subset of waterpoint_type

In [None]:
# Distinct values of the `region` column, in sorted order.
myregions = sorted(df_values.region.unique())

In [None]:
df_values.region.value_counts()

In [None]:
# Distinct values of the `payment` column, in sorted order.
payment = sorted(df_values.payment.unique())

In [None]:
payment

# Sorting features

In [None]:
total = list(df_values.columns)

In [None]:
useless = ['id','date_recorded','num_private','recorded_by','scheme_name','construction_year','subvillage','ward',
          'payment_type','quantity_group','wpt_name']

In [None]:
subsets_to_go = ['quality_group','extraction_type_group','extraction_type','source','source_type', 
                 'waterpoint_type_group','management']

In [None]:
numerical = ['amount_tsh','population','latitude','longitude','gps_height']

In [None]:
non_numerical = list(set(total) - set(useless) - set(subsets_to_go) - set(numerical))

In [None]:
my_features = numerical+non_numerical

In [None]:
# Collapse rarely seen funders into a single 'other' bucket: a funder keeps
# its own name only if it appears in more than 800 training rows.

funder_counts = df_values.funder.value_counts().to_dict()
map_funder = {
    name: name if count > 800 else 'other'
    for name, count in funder_counts.items()
}

In [None]:
# Assign the result back to the column: replace(inplace=True) on a selected
# column may operate on a temporary copy and leave df_values unchanged.
df_values['funder'] = df_values['funder'].replace(map_funder)

In [None]:
# Collapse rarely seen installers into a single 'other' bucket: an installer
# keeps its own name only if it appears in more than 800 training rows.

installer_counts = df_values.installer.value_counts().to_dict()
map_installer = {
    name: name if count > 800 else 'other'
    for name, count in installer_counts.items()
}

In [None]:
# Assign the result back to the column: replace(inplace=True) on a selected
# column may operate on a temporary copy and leave df_values unchanged.
df_values['installer'] = df_values['installer'].replace(map_installer)

## Nans

In [None]:
df_values = df_values.fillna('other')

# Set up challenge data

In [None]:
funder_list = list(set(map_funder.values()))
funder_list.remove('other')

In [None]:
installer_list = list(set(map_installer.values()))
installer_list.remove('other')

In [None]:
# Any funder in the challenge data that is not in funder_list becomes 'other'.
rare_funder = ~df_test_values.funder.isin(funder_list)
df_test_values.loc[rare_funder, 'funder'] = 'other'

In [None]:
# Any installer in the challenge data that is not in installer_list becomes
# 'other'. The check must use installer_list: comparing installers against
# funder_list would bucket installers by the wrong whitelist.
rare_installer = ~df_test_values.installer.isin(installer_list)
df_test_values.loc[rare_installer, 'installer'] = 'other'

In [None]:
df_test_values = df_test_values.fillna('other')

In [None]:
df_test = df_test_values[my_features]

In [None]:
# NOTE(review): `my_features2` is not defined anywhere in the visible notebook,
# so this cell raises NameError unless it is created elsewhere - TODO confirm
# or remove this cell.
df_test2 = df_test_values[my_features2]

# Create selected feature DataFrame

## Randomize

In [None]:
# Combine df_labels with df_values, then shuffle the rows.
# No `on=` is given, so pandas joins on the columns the two frames have in common.

df = df_labels.merge(df_values, how='left')
df = df.sample(frac=1).reset_index(drop=True)

In [None]:
my_features.insert(0,'status_group')

In [None]:
df_features = df[my_features]

In [None]:
map_status_group = {'functional':0,'functional needs repair':1,'non functional':2}

In [None]:
# Encode the target labels as integer classes 0, 1, 2.
# The mapped column is assigned back explicitly: replace(inplace=True) on a
# selected column may operate on a temporary copy and leave df_features unchanged.
df_features['status_group'] = df_features['status_group'].map(map_status_group)

## Create X and Y

In [None]:
y = df_features.status_group
X = df_features.drop('status_group', axis=1)

In [None]:
X.info()

In [None]:
X['region_code'] = X['region_code'].astype(object)
X['district_code'] = X['district_code'].astype(object)

## Test train split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y)

In [None]:
X_train.head()

# Catboost for limited features

In [None]:
from catboost import CatBoostRegressor, CatBoostClassifier, Pool

In [None]:
# Columns whose dtype is neither float nor int are passed to CatBoost as
# categorical features. The builtin float/int are used because the
# np.float / np.int aliases were deprecated in NumPy 1.20 and removed in 1.24.
categorical_features_indices = np.where((X_train.dtypes != float) & (X_train.dtypes != int))[0]

train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
model=CatBoostClassifier(depth=10, learning_rate=0.3,loss_function='MultiClass')

# The held-out split is used as the evaluation set during fitting.
model.fit(train_pool,eval_set=(X_test, y_test),plot=True)

In [None]:
# model2=CatBoostClassifier(depth=10, learning_rate=0.3,loss_function='Recall')

# model2.fit(train_pool,plot=True)

In [None]:
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

In [None]:
model.save_model('catboost_model.dump')

In [None]:
#model.load_model('catboost_model.dump') 

# Create decision list

In [None]:
answers = list(model.predict(df_test))

In [None]:
# Each prediction is indexed at [0] and converted to a plain int class id.
answer_better = [int(prediction[0]) for prediction in answers]

In [None]:
df_sub['status_group'] = answer_better

In [None]:
inv_map_status_group = {v: k for k, v in map_status_group.items()}

In [None]:
# Turn the predicted class ids 0, 1, 2 back into their status_group labels.
# The mapped column is assigned back explicitly: replace(inplace=True) on a
# selected column may operate on a temporary copy and leave df_sub unchanged.
df_sub['status_group'] = df_sub['status_group'].map(inv_map_status_group)

In [None]:
df_sub['status_group'].value_counts()

In [None]:
df_sub.to_csv('./Submissions/decisionTree5_10_215pm.csv',index=False)

# Appendix

## A

In [None]:
columns = list(feature_sel.columns)
feature_sel[columns[5:]] = feature_sel[columns[5:]].astype(int)
feature_sel[columns[0]] = feature_sel[columns[0]].astype(int)

In [None]:
# Shuffle the rows, then separate the target column from the features.
feature_sel = feature_sel.sample(frac=1).reset_index(drop=True)
y = feature_sel.status_group
X = feature_sel.drop('status_group', axis=1)

# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# test_size=0.84 keeps only 16% of the rows for training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.84)

In [None]:
X_train.head(1)

## Adjust competition data

In [None]:
df_test_values = df_test_values[['gps_height', 'longitude', 'latitude', 'basin',
                  'extraction_type_class','payment','quantity','water_quality', 'source_class',
                  'waterpoint_type_group','population','public_meeting','scheme_management']]

In [None]:
# Rearrange columns: the four numeric columns first, then the categorical
# ones. The previous `cols = df_test_values.columns.tolist()` was overwritten
# immediately and has been dropped.
cols = ['gps_height',
             'longitude',
             'latitude',
             'population',
             'basin',
             'extraction_type_class',
             'payment',
             'quantity',
             'water_quality',
             'source_class',
             'waterpoint_type_group',
             'public_meeting',
             'scheme_management']
df_test_values = df_test_values[cols]

In [None]:
a = [0] * len(df_test_values['gps_height'])

In [None]:
df_test_values = pd.get_dummies(df_test_values,columns=list(df_test_values.columns[4:]))
df_test_values.sample(5)

In [None]:
columns = list(feature_sel.columns)
feature_sel[columns[5:]] = feature_sel[columns[5:]].astype(int)
feature_sel[columns[0]] = feature_sel[columns[0]].astype(int)

In [None]:
df_test_values['scheme_management_None'] = a
df_test_values.head(10)

In [None]:
list(set(X_train.columns)-set(df_test_values.columns))

##  B

In [None]:
from __future__ import print_function

import numpy as np
import pandas as pd

from IPython.display import Image

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import svm

from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
pd.Series(y_train).value_counts()

## Principal Component Analysis

In [None]:
pca = PCA(n_components=2)
pca.fit(X_train)

In [None]:
pcafeatures_train = pca.transform(X_train)

In [None]:
from itertools import cycle

In [None]:
# def plot_PCA_2D(data, target, target_names):
#     colors = cycle(['r','g','b'])
#     target_ids = range(len(target_names))
#     plt.figure()
#     for i, c, label in zip(target_ids, colors, target_names):
#         plt.scatter(data[target == i, 0], data[target == i, 1],
#                    c=c, label=label)
#     plt.legend()

In [None]:
# plot_PCA_2D(pcafeatures_train, target=y_train, target_names=digits.target_names)

## Fitting Linear and RBF SVM Models

In [None]:
# fit an SVM with an RBF kernel (kernel='rbf'); probability estimates are disabled
model_svm = svm.SVC(kernel='rbf',probability=False,cache_size=2000)
model_svm.fit(X_train, y_train)

In [None]:
# predict out of sample
y_pred = model_svm.predict(X_test)

In [None]:
# check accuracy
accuracy_score(y_test,y_pred)

In [None]:
# confusion matrix
confusion_matrix(y_test,y_pred)

In [None]:
# fit rbf model
# model_svm2 = svm.SVC(kernel='rbf', gamma = 0.001)
# model_svm2.fit(X_train, y_train)

In [None]:
# predict out of sample
# NOTE(review): model_svm2 is only created in the commented-out cell above, so
# this line raises NameError as written - TODO re-enable that fit or drop the
# cells that depend on it.
y_pred2 = model_svm2.predict(X_test)

In [None]:
# check accuracy
accuracy_score(y_test,y_pred2)

In [None]:
# confusion matrix
confusion_matrix(y_test,y_pred2)

## C

Extra code from class to utilize

In [None]:
df.age=df.age.fillna(df.age.mean())

In [None]:
# patsy is imported as a module (`import patsy`), so dmatrices must be
# qualified; the bare name is never bound in this notebook.
y,X=patsy.dmatrices('survived~ pclass +age+sibsp+parch+fare',data=df,return_type='dataframe')

In [None]:
# Generate a confusion matrix plot: 

def plot_confusion_matrix(cm,title='Confusion matrix', cmap=plt.cm.Reds):
    """Draw a confusion matrix as a colour-mapped image with pyplot.

    Parameters
    ----------
    cm : matrix handed to ``plt.imshow``.
    title : text used as the plot title.
    cmap : colormap handed to ``plt.imshow``.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # tight_layout only accounts for what exists when it is called, so it
    # runs after the axis labels have been added.
    plt.tight_layout()

#Could be a typical function for classifying:

def train_score(classifier,x,y):
    """Fit ``classifier`` on an 80/20 split of ``(x, y)`` and report the results.

    Prints the train and test accuracy, the test-set confusion matrix and a
    precision/recall pair derived from that matrix, then draws the matrix in
    a new figure.

    Parameters
    ----------
    classifier : object exposing ``fit``, ``score`` and ``predict``.
    x, y : features and target handed to ``train_test_split``.

    Returns
    -------
    None
    """
    # sklearn.cross_validation was removed in scikit-learn 0.20; use the
    # train_test_split that the notebook imports from sklearn.model_selection.
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=1234)
    ytrain = np.ravel(ytrain)
    clf = classifier.fit(xtrain, ytrain)

    # accuracy for test & train:
    train_acc = clf.score(xtrain, ytrain)
    test_acc = clf.score(xtest, ytest)
    print("Training Data Accuracy: %0.2f" %(train_acc))
    print("Test Data Accuracy:     %0.2f" %(test_acc))

    y_pred = clf.predict(xtest)

    # Computed once and reused for both the printout and the plot.
    conf = confusion_matrix(ytest, y_pred)
    print(conf)

    print('\n')
    # NOTE(review): these ratios read only the top-left 2x2 corner of the
    # matrix and are centred on conf[0, 0], i.e. the first class label -
    # confirm that this is the class of interest before relying on them.
    print("Precision:              %0.2f" %(conf[0, 0] / (conf[0, 0] + conf[1, 0])))
    print("Recall:                 %0.2f"% (conf[0, 0] / (conf[0, 0] + conf[0, 1])))

    plt.figure()
    plot_confusion_matrix(conf)

In [None]:
log_clf=LogisticRegression()
train_score(log_clf,X,y)

In [None]:
# What about ROC ? 

from sklearn.metrics import roc_curve, auc

# sklearn.cross_validation was removed in scikit-learn 0.20; use the
# train_test_split that the notebook imports from sklearn.model_selection.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1234)
log = LogisticRegression()
log.fit(xtrain,np.ravel(ytrain))
# Column 1 of predict_proba is used as the score for the ROC curve.
y_score=log.predict_proba(xtest)[:,1]

fpr, tpr,_ = roc_curve(ytest, y_score)
roc_auc = auc(fpr, tpr)

plt.figure()
# Plotting our Baseline..
plt.plot([0,1],[0,1])
plt.plot(fpr,tpr)
plt.xlabel('FPR')
plt.ylabel('TPR')

In [None]:
tpr

####  Cost Benefit Example: 

We can also optimize our models based on specific costs associated with our classification errors; here we will use specific dollar amounts as weights.

Let's say we were developing a classification model for aircraft delay prediction.  For this example, let's assume that a true positive would 
lead to a cost savings of 2160 dollars, a false negative would cost us 2900 dollars, and a false positive would cost 750 dollars.  

cb = np.array([[2160, -750.0], [-2900, 0]])  

Expected_Value = #TPs(2160) - #FNs(2900) -#FPs(750)  