# Project 3 - Water Pump Clasificaiton

# Setup

In [73]:
from __future__ import print_function

In [74]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

from __future__ import division
pd.set_option('display.width',5000)

In [75]:
import patsy

from sklearn import linear_model as lm
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [92]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [170]:
# from os import system
# system('say I am Done!')

# Data Import - Training Data

This data is part of the Data Driven Competition

https://www.drivendata.org/competitions/7/pump-it-up-data-mining-the-water-table/page/23/


In [53]:
# 'Values' provided for each pump location - features

df_values = pd.read_csv('/Users/amycurneen/ds/metis/metisgh/Metis-Curneen/3 - Water Pumps/Data Downloads/Training set values.csv')
df_values.sample(1)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
50141,19161,0.0,2013-02-24,Tasaf,698,DWE,36.288768,-11.210622,Shuleni,0,...,never pay,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump


In [54]:
# 'Labels' provided for each pump location - clasificaiton - what I am predicting

df_labels = pd.read_csv('/Users/amycurneen/ds/metis/metisgh/Metis-Curneen/3 - Water Pumps/Data Downloads/Training set labels.csv')
df_labels.sample(1)

Unnamed: 0,id,status_group
41824,28256,functional


# Data Import - Challenge Data

In [134]:
# 'Values' provided for each competition pump location - features

df_test_values = pd.read_csv('/Users/amycurneen/ds/metis/metisgh/Metis-Curneen/3 - Water Pumps/Data Downloads/Test set values.csv')
df_test_values.sample(1)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
5150,71766,0.0,2013-02-22,,1374,,34.94291,-4.849501,Mwamba Primary,0,...,unknown,unknown,unknown,dry,dry,machine dbh,borehole,groundwater,communal standpipe,communal standpipe


In [135]:
# 'Labels' I will provide for each pump location - clasificaiton

df_sub = pd.read_csv('/Users/amycurneen/ds/metis/metisgh/Metis-Curneen/3 - Water Pumps/Data Downloads/SubmissionFormat.csv')
df_sub = df_sub.drop('status_group', axis = 1)
df_sub.sample(1)

Unnamed: 0,id
849,63779


# Feature Analysis

My goal is to predict the operating condition of a waterpoint for each record in the dataset. I was provided the following set of information about the waterpoints:

* amount_tsh - Total static head (amount water available to waterpoint)
    * 98 unique
* date_recorded - The date the row was entered
    * 365 unique
    * year - month - day
* funder - Who funded the well
    * 1897 unique
    * look at top ones?
* installer - Organization that installed the well
    * 2145 unique
    * DWE is main one - 10x closest other, 17k
* wpt_name - Name of the waterpoint if there is one
    * 37400 unique
    * look at top ones?
* num_private - (NO PROVIDED DESC)
    * 65 unique
    * USELESS FEATURE
* population - Population around the well
    * 1049 unique
    * a lot are zero
* public_meeting - True/False
    * 2 unique
* recorded_by - Group entering this row of data
    * 1 unique
    * all the same - USELESS FEATURE
* scheme_management - Who operates the waterpoint
    * 12 unique
* scheme_name - Who operates the waterpoint
    * 2696 unique
    * USELESS FEATURE
* permit - If the waterpoint is permitted
    * 2 unique
* construction_year - Year the waterpoint was constructed
    * 55 unique
    * third are 0 - USELESS FEATURE
    

* Geography
    * gps_height - Altitude of the well
        * numerical
    * longitude - GPS coordinate
        * numerical
    * latitude - GPS coordinate
        * numerical
    * basin - Geographic water basin
        * 9 unique
    * subvillage - Geographic location
        * 19287 unique
    * region - Geographic location
        * 21 unique
    * region_code - Geographic location (coded)
        * 27 unique
    * district_code - Geographic location (coded)
        * 20 unique
    * lga - Geographic location
        * 125 unique
    * ward - Geographic location
        * 2092 unique


* Extraction
    * extraction_type - The kind of extraction the waterpoint uses
        * 18 unique
        * Most descriptive of extraction
    * extraction_type_group - The kind of extraction the waterpoint uses
        * 13 unique
        * Parent of extraction_type
    * extraction_type_class - The kind of extraction the waterpoint uses
        * 7 unique
        * Parent of extraction_type_group


* Overhead
    * management - How the waterpoint is managed
        * 12 unique
    * management_group - How the waterpoint is managed
        * 5 unique
    * payment - What the water costs
        * 7 unique
        * same as payment type
    * payment_type - What the water costs
        * 7 unique
        * same as payment


* Water
    * water_quality - The quality of the water 
        * 3 unique
        * Subset of quality_group
    * quality_group - The quality of the water
        * 6 unique
        * Parent group of water_quality
    * quantity - The quantity of water
        * 5 unique
        * Same as quantity_group
    * quantity_group - The quantity of water
        * 5 unique
        * Same as quantity
    * source - The source of the water
        * 10 unique
    * source_type - The source of the water
        * 7 unique
        * Subset of source
    * source_class - The source of the water
        * 3 unique
        * Subset of source_type
    * waterpoint_type - The kind of waterpoint
        * 6 unique
        * Parent of waterpoint_type_group
    * waterpoint_type_group - The kind of waterpoint
        * 7 unique
        * Subset of waterpoint_type

# Sorting features

In [57]:
total = list(df_values.columns)

In [58]:
useless = ['id','date_recorded','num_private','recorded_by','scheme_name','construction_year','subvillage','ward',
          'payment_type','quantity_group','wpt_name']

In [59]:
subsets_to_go = ['quality_group','extraction_type_group','extraction_type','source','source_type', 
                 'waterpoint_type_group','management']

In [60]:
numerical = ['amount_tsh','population','latitude','longitude','gps_height']

In [61]:
non_numerical = list(set(total) - set(useless) - set(subsets_to_go) - set(numerical))

In [79]:
my_features = numerical+non_numerical

In [63]:
# get rid of features that dont show often

map_funder = df_values.funder.value_counts().to_dict()
for i in range(len(map_funder.keys())):
    keys = list(map_funder.keys())
    a = keys[i]
    if map_funder[a] > 800:
        map_funder[a] = a
    else:
        map_funder[a] = 'nan'

In [64]:
df_values['funder'].replace(map_funder, inplace=True)

In [65]:
# get rid of features that dont show often

map_installer = df_values.installer.value_counts().to_dict()
for i in range(len(map_installer.keys())):
    keys = list(map_installer.keys())
    a = keys[i]
    if map_installer[a] > 800:
        map_installer[a] = a
    else:
        map_installer[a] = 'nan'    

In [66]:
df_values['installer'].replace(map_installer, inplace=True)

# Create selected feature DataFrame

## Randomize

In [123]:
# randomize data
# df_lables, df_values - combine and shuffle this data

df = pd.merge(df_labels,df_values,how = 'left')
df = df.sample(frac=1).reset_index(drop=True)

In [124]:
my_features.insert(0,'status_group')

In [125]:
df_features = df[my_features]

In [126]:
# create dummy features

dummy_start = 6
df_features = pd.get_dummies(df_features,columns=list(df_features.columns[dummy_start:]))
df_features.sample(3)

Unnamed: 0,status_group,status_group.1,amount_tsh,population,latitude,longitude,gps_height_-90,gps_height_-63,gps_height_-59,gps_height_-57,...,installer_nan,basin_Internal,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu
47853,functional,functional,0.0,0,-2.64088,30.482392,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
22628,functional,functional,0.0,0,-2e-08,0.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
19830,non functional,non functional,0.0,0,-2.527239,32.483631,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [20]:
map_status_group = {'functional':0,'functional needs repair':1,'non functional':2}

In [85]:
# turn y into 3 class 0,1,2
df_features['status_group'].replace(map_status_group, inplace=True)

In [86]:
columns = list(df_features.columns)
df_features[columns[dummy_start:]] = df_features[columns[dummy_start:]].astype(int)
df_features[columns[0]] = df_features[columns[0]].astype(int)

## Create X and Y

In [89]:
y = df_features.status_group
X = df_features.drop('status_group', axis=1)

## Test train split

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create selected feature on comp data

In [140]:
my_features_comp = my_features[1:]

In [141]:
df_test_values = df_test_values[my_features_comp]

In [142]:
df_test_values['funder'].replace(map_funder, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [144]:
df_test_values['installer'].replace(map_installer, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [145]:
# create dummy features

dummy_start_comp = dummy_start - 1
df_test_values = pd.get_dummies(df_test_values,columns=list(df_test_values.columns[dummy_start_comp:]))
df_test_values.sample(3)

Unnamed: 0,amount_tsh,population,latitude,longitude,gps_height,region_code_1,region_code_2,region_code_3,region_code_4,region_code_5,...,installer_wachina,basin_Internal,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu
5054,500.0,80,-8.92863,35.104521,1272,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1536,0.0,1,-10.765262,38.624369,369,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9436,4000.0,450,-8.955581,34.602664,1449,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [146]:
columns = list(df_test_values.columns)
df_test_values[columns[dummy_start_comp:]] = df_test_values[columns[dummy_start_comp:]].astype(int)

## Missing features

In [151]:
miss = list(set(X_train.columns)-set(df_test_values.columns))
for item in miss:
    a = [0] * len(df_test_values['population'])
    df_test_values[item] = a

In [155]:
b = list(X_train.columns)
df_test_values_new = df_test_values[b]

# Create selected feature DataFrame

## Randomize

In [67]:
# randomize data
# df_lables, df_values - combine and shuffle this data

df = pd.merge(df_labels,df_values,how = 'left')
df = df.sample(frac=1).reset_index(drop=True)

In [80]:
my_features.insert(0,'status_group')

In [82]:
df_features = df[my_features]

In [84]:
# create dummy features

dummy_start = 6
df_features = pd.get_dummies(df_features,columns=list(df_features.columns[dummy_start:]))
df_features.sample(3)

Unnamed: 0,status_group,amount_tsh,population,latitude,longitude,gps_height,region_code_1,region_code_2,region_code_3,region_code_4,...,installer_nan,basin_Internal,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu
58753,functional,500.0,80,-8.829995,34.931789,1589,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
24759,functional,0.0,0,-3.278277,31.909012,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
11447,functional,0.0,250,-7.620373,37.014316,301,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [20]:
map_status_group = {'functional':0,'functional needs repair':1,'non functional':2}

In [85]:
# turn y into 3 class 0,1,2
df_features['status_group'].replace(map_status_group, inplace=True)

In [86]:
columns = list(df_features.columns)
df_features[columns[dummy_start:]] = df_features[columns[dummy_start:]].astype(int)
df_features[columns[0]] = df_features[columns[0]].astype(int)

## Create X and Y

In [89]:
y = df_features.status_group
X = df_features.drop('status_group', axis=1)

## Test train split

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Decision Tree

## Create DT on test-train-split

In [24]:
y = feature_sel.status_group
X = feature_sel.drop('status_group', axis=1)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [26]:
decisiontree = DecisionTreeClassifier(random_state=0)
decisiontree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [27]:
decisiontree.score(X_test,y_test)

0.7430976430976431

## DT on competition datsa

In [115]:
df_test_values = df_test_values[['gps_height', 'longitude', 'latitude', 'population','basin',
                  'extraction_type_class','payment','quantity','water_quality', 'source_class',
                  'waterpoint_type_group','public_meeting','scheme_management']]

In [116]:
df_test_values

Unnamed: 0,gps_height,longitude,latitude,population,basin,extraction_type_class,payment,quantity,water_quality,source_class,waterpoint_type_group,public_meeting,scheme_management
0,1996,35.290799,-4.059696e+00,321,Internal,other,never pay,seasonal,soft,surface,other,True,Parastatal
1,1569,36.656709,-3.309214e+00,300,Pangani,gravity,never pay,insufficient,soft,groundwater,communal standpipe,True,VWC
2,1567,34.767863,-5.004344e+00,500,Internal,other,never pay,insufficient,soft,surface,other,True,VWC
3,267,38.058046,-9.418672e+00,250,Ruvuma / Southern Coast,other,unknown,dry,soft,groundwater,other,,VWC
4,1260,35.006123,-1.095041e+01,60,Ruvuma / Southern Coast,gravity,pay monthly,enough,soft,groundwater,communal standpipe,,Water Board
5,1685,36.685279,-3.302420e+00,200,Pangani,gravity,never pay,enough,soft,groundwater,communal standpipe,True,VWC
6,550,36.398041,-7.541382e+00,600,Rufiji,handpump,never pay,enough,salty,groundwater,hand pump,True,VWC
7,234,39.607420,-1.089379e+01,1,Ruvuma / Southern Coast,submersible,never pay,dry,soft,groundwater,communal standpipe,True,Water Board
8,584,39.262951,-1.082359e+01,40,Ruvuma / Southern Coast,gravity,pay per bucket,insufficient,soft,groundwater,communal standpipe,True,VWC
9,1083,37.096108,-3.251754e+00,1,Pangani,gravity,pay monthly,enough,soft,groundwater,communal standpipe,True,Water Board


In [29]:
# rearange columns
# cols = df_test_values.columns.tolist()
# cols = ['gps_height',
#              'longitude',
#              'latitude',
#              'population',
#              'basin',
#              'extraction_type_class',
#              'payment',
#              'quantity',
#              'water_quality',
#              'source_class',
#              'waterpoint_type_group',
#              'public_meeting',
#              'scheme_management']
# df_test_values = df_test_values[cols]

In [117]:
df_test_values = pd.get_dummies(df_test_values,columns=list(df_test_values.columns[4:]))
df_test_values.sample(3)

Unnamed: 0,gps_height,longitude,latitude,population,basin_Internal,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,...,scheme_management_Other,scheme_management_Parastatal,scheme_management_Private operator,scheme_management_SWC,scheme_management_Trust,scheme_management_VWC,scheme_management_WUA,scheme_management_WUG,scheme_management_Water Board,scheme_management_Water authority
4280,0,34.224404,-8.802241,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
173,0,33.378153,-2.792809,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
5569,1738,35.374811,-8.205277,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [118]:
a = [0] * len(df_test_values['gps_height'])
df_test_values['scheme_management_None'] = a
df_test_values.sample(3)

Unnamed: 0,gps_height,longitude,latitude,population,basin_Internal,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,...,scheme_management_Parastatal,scheme_management_Private operator,scheme_management_SWC,scheme_management_Trust,scheme_management_VWC,scheme_management_WUA,scheme_management_WUG,scheme_management_Water Board,scheme_management_Water authority,scheme_management_None
5146,-17,39.45638,-8.847157,230,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14047,0,32.771707,-4.868387,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
6760,1006,29.725329,-4.755255,211,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [119]:
columns = list(df_test_values.columns)
df_test_values[columns[5:]] = df_test_values[columns[5:]].astype(int)
df_test_values[columns[0]] = df_test_values[columns[0]].astype(int)

## Apply to competition data

In [None]:
df_test_labels = decisiontree.predict(df_test_values)

In [None]:
df_test_labels.shape

In [None]:
df_sub['status_group'] = df_test_labels

In [None]:
df_sub['status_group'] = df_sub['status_group'].astype(str)

for i in range(len(df_sub.status_group)):
    if df_sub.status_group[i] == '3':
        df_sub.at[i, 'status_group'] = 'functional'
    elif df_sub.status_group[i] == '2':
        df_sub.at[i, 'status_group'] = 'functional needs repair'
    else:
        df_sub.at[i, 'status_group'] = 'non functional'
df_sub.sample(3)

In [None]:
df_sub.status_group.value_counts()

In [None]:
df_sub.to_csv('./Submissions/decisionTree.csv',index=False)

## Ranking

<table>
  <tr>
    <th>Best</th>
    <th>Current Rank</th>
    <th>Competitors</th>
    <th>Time</th>
  </tr>
  <tr>
    <td>0.6449</td>
    <td>1309</td>
    <td>5059</td>
    <td>May 7, 2018, 11:50 p.m.</td>
  </tr>
</table>

## Adding depths to tree

In [35]:
from sklearn.cross_validation import cross_val_score

depth = []
for i in range(3,20):
    clf = DecisionTreeClassifier(max_depth=i)
    # Perform 7-fold cross validation 
    scores = cross_val_score(estimator=clf, X=X, y=y, cv=7, n_jobs=4)
    depth.append((i,scores.mean()))
print(depth)

[(3, 0.6933672940058565), (4, 0.7014481277774461), (5, 0.7074076640671273), (6, 0.7138723251449507), (7, 0.7196972711208949), (8, 0.7248148250084826), (9, 0.7298822743078776), (10, 0.7353201608784905), (11, 0.7413808625996199), (12, 0.7455050707863776), (13, 0.7500506362634685), (14, 0.7530640564635427), (15, 0.7560100336891841), (16, 0.7591920404217628), (17, 0.76151517870262), (18, 0.7620879944308843), (19, 0.7628117864963532)]


## Gridsearchcv?

In [42]:
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth':range(3,20)}
clf = GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=4)
clf.fit(X_train, y_train)
tree_model = clf.best_estimator_
print (clf.best_score_, clf.best_params_) 

0.7481481481481481 {'max_depth': 19}


In [44]:
clf.score(X_test,y_test)

0.7593434343434343

In [45]:
df_test_labels3 = clf.predict(df_test_values)

In [46]:
df_sub['status_group'] = df_test_labels3

In [47]:
df_sub['status_group'] = df_sub['status_group'].astype(str)

for i in range(len(df_sub.status_group)):
    if df_sub.status_group[i] == '3':
        df_sub.at[i, 'status_group'] = 'functional'
    elif df_sub.status_group[i] == '2':
        df_sub.at[i, 'status_group'] = 'functional needs repair'
    else:
        df_sub.at[i, 'status_group'] = 'non functional'
df_sub.sample(3)

Unnamed: 0,id,status_group
4544,65748,functional
1462,56896,non functional
9760,54331,functional


In [48]:
df_sub.status_group.value_counts()

functional                 8548
non functional             4428
functional needs repair    1874
Name: status_group, dtype: int64

In [49]:
df_sub.to_csv('./Submissions/decisionTree5_8_2pm.csv',index=False)

## Ranking

<table>
  <tr>
    <th>Best</th>
    <th>Current Rank</th>
    <th>Competitors</th>
    <th>Time</th>
  </tr>
  <tr>
    <td>0.6527</td>
    <td>Not Best</td>
    <td>1277</td>
    <td>May 8, 2018, 9:04 p.m.</td>
  </tr>
</table>

## Random forest

In [95]:
from sklearn.ensemble import RandomForestClassifier

In [96]:
clf = RandomForestClassifier(max_depth = 4,random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [113]:
importance = clf.feature_importances_
features = list(df_features.columns)

a = list(zip(features,importance))

feature_of_importances = []
for i in range(len(a)):
    if a[i][1] != 0:
        feature_of_importances.append(a[i])
print(len(feature_of_importances),feature_of_importances)

74 [('status_group', 0.095068002872861), ('amount_tsh', 0.0007553448403511924), ('population', 0.0064418546827212765), ('latitude', 0.0323111380422475), ('longitude', 0.04219760610264458), ('region_code_1', 0.0017100980871774484), ('region_code_2', 5.7430196425321216e-05), ('region_code_4', 0.00013791932982497817), ('region_code_5', 0.00017989386400874523), ('region_code_10', 0.028369332993668), ('region_code_15', 0.00789521734778333), ('region_code_16', 0.012186131796517778), ('region_code_18', 0.0001193427834499857), ('region_code_99', 0.003002785824599587), ('lga_Bahi', 0.004287103858994933), ('lga_Dodoma Urban', 0.002344103668995936), ('lga_Karatu', 0.000734018289777858), ('lga_Kilolo', 0.0007509250711387261), ('lga_Kisarawe', 0.0002468792607916525), ('lga_Kongwa', 0.000232934543586482), ('lga_Meatu', 0.00023799909997696992), ('lga_Morogoro Urban', 0.00030603280779806225), ('lga_Musoma Rural', 0.0010853905987366322), ('lga_Pangani', 0.0007479406581947224), ('lga_Rufiji', 0.00426123

In [158]:
df_test_labels3 = clf.predict(df_test_values_new)

In [159]:
df_sub['status_group'] = df_test_labels3

In [None]:
inv_map_status_group = {v: k for k, v in map_status_group.items()}

In [163]:
# turn y into 3 class 0,1,2
df_sub['status_group'].replace(inv_map_status_group, inplace=True)

In [164]:
df_sub.status_group.value_counts()

functional        11875
non functional     2975
Name: status_group, dtype: int64

In [165]:
#df_sub.to_csv('./Submissions/decisionTree59_1pm.csv',index=False)

## Ranking

<table>
  <tr>
    <th>Best</th>
    <th>Current Rank</th>
    <th>Competitors</th>
    <th>Time</th>
  </tr>
  <tr>
    <td>0.6988</td>
    <td>1276</td>
    <td>5059</td>
    <td>May 8, 2018, 12:13 a.m.</td>
  </tr>
</table>

## Gridsearchcv?

In [166]:
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth':range(3,40)}
clf = GridSearchCV(RandomForestClassifier(), parameters, n_jobs=-1)
clf.fit(X_train, y_train)
tree_model = clf.best_estimator_
print (clf.best_score_, clf.best_params_) 

0.7877314814814815 {'max_depth': 23}


In [167]:
clf.score(X_test,y_test)

0.7957070707070707

In [171]:
df_test_labels4 = clf.predict(df_test_values_new)

In [173]:
prob = clf.predict_proba(df_test_values_new)

In [179]:
df_sub['status_group'] = df_test_labels4

In [180]:
df_sub['status_group'].value_counts()

0    9302
2    5029
1     519
Name: status_group, dtype: int64

In [181]:
for i in range(len(prob)):
    if prob[i].max() <= 0.54:
        df_sub.at[i,'status_group'] = 2

In [182]:
df_sub['status_group'].value_counts()

0    8435
2    6055
1     360
Name: status_group, dtype: int64

In [183]:
# turn y into 3 class 0,1,2
df_sub['status_group'].replace(inv_map_status_group, inplace=True)

In [184]:
df_sub.status_group.value_counts()

functional                 8435
non functional             6055
functional needs repair     360
Name: status_group, dtype: int64

In [185]:
df_sub.to_csv('./Submissions/decisionTree5_9_110pm.csv',index=False)

# Appendix

## A

In [None]:
columns = list(feature_sel.columns)
feature_sel[columns[5:]] = feature_sel[columns[5:]].astype(int)
feature_sel[columns[0]] = feature_sel[columns[0]].astype(int)

In [None]:
feature_sel = feature_sel.sample(frac=1).reset_index(drop=True)
y = feature_sel.status_group
X = feature_sel.drop('status_group', axis=1)

from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.84)

In [None]:
X_train.head(1)

## Adjust competition data

In [None]:
df_test_values = df_test_values[['gps_height', 'longitude', 'latitude', 'basin',
                  'extraction_type_class','payment','quantity','water_quality', 'source_class',
                  'waterpoint_type_group','population','public_meeting','scheme_management']]

In [None]:
# rearange columns
cols = df_test_values.columns.tolist()
cols = ['gps_height',
             'longitude',
             'latitude',
             'population',
             'basin',
             'extraction_type_class',
             'payment',
             'quantity',
             'water_quality',
             'source_class',
             'waterpoint_type_group',
             'public_meeting',
             'scheme_management']
df_test_values = df_test_values[cols]

In [None]:
a = [0] * len(df_test_values['gps_height'])

In [None]:
df_test_values = pd.get_dummies(df_test_values,columns=list(df_test_values.columns[4:]))
df_test_values.sample(5)

In [None]:
columns = list(feature_sel.columns)
feature_sel[columns[5:]] = feature_sel[columns[5:]].astype(int)
feature_sel[columns[0]] = feature_sel[columns[0]].astype(int)

In [None]:
df_test_values['scheme_management_None'] = a
df_test_values.head(10)

In [None]:
list(set(X_train.columns)-set(df_test_values.columns))

##  B

In [None]:
from __future__ import print_function

import numpy as np
import pandas as pd

from IPython.display import Image

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn import svm

from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
pd.Series(y_train).value_counts()

## Principle Component Analysis

In [None]:
pca = PCA(n_components=2)
pca.fit(X_train)

In [None]:
pcafeatures_train = pca.transform(X_train)

In [None]:
from itertools import cycle

In [None]:
# def plot_PCA_2D(data, target, target_names):
#     colors = cycle(['r','g','b'])
#     target_ids = range(len(target_names))
#     plt.figure()
#     for i, c, label in zip(target_ids, colors, target_names):
#         plt.scatter(data[target == i, 0], data[target == i, 1],
#                    c=c, label=label)
#     plt.legend()

In [None]:
# plot_PCA_2D(pcafeatures_train, target=y_train, target_names=digits.target_names)

## Fitting Linear and RBF SVM Models

In [None]:
# fit linear model
model_svm = svm.SVC(kernel='rbf',probability=False,cache_size=2000)
model_svm.fit(X_train, y_train)

In [None]:
# predict out of sample
y_pred = model_svm.predict(X_test)

In [None]:
# check accuracy
accuracy_score(y_test,y_pred)

In [None]:
# confusion matrix
confusion_matrix(y_test,y_pred)

In [None]:
# fit rbf model
# model_svm2 = svm.SVC(kernel='rbf', gamma = 0.001)
# model_svm2.fit(X_train, y_train)

In [None]:
# predict out of sample
y_pred2 = model_svm2.predict(X_test)

In [None]:
# check accuracy
accuracy_score(y_test,y_pred2)

In [None]:
# confusion matrix
confusion_matrix(y_test,y_pred2)

## C

Extra code from class to utilize

In [None]:
df.age=df.age.fillna(df.age.mean())

In [None]:
y,X=dmatrices('survived~ pclass +age+sibsp+parch+fare',data=df,return_type='dataframe')

In [None]:
# Generate a confusion matrix plot: 

def plot_confusion_matrix(cm,title='Confusion matrix', cmap=plt.cm.Reds):
    plt.imshow(cm, interpolation='nearest',cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

#Could be a typical function for classifying:

def train_score(classifier,x,y):
    xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(x, y, test_size=0.2, random_state=1234)
    ytrain=np.ravel(ytrain)
    clf = classifier.fit(xtrain, ytrain)
    # accuracy for test & train:
    train_acc=clf.score(xtrain, ytrain)
    test_acc=clf.score(xtest,ytest)
    print("Training Data Accuracy: %0.2f" %(train_acc))
    print("Test Data Accuracy:     %0.2f" %(test_acc))
    
    y_true = ytest
    y_pred = clf.predict(xtest)


    conf = confusion_matrix(y_true, y_pred)
    print(conf)

    print ('\n')
    print ("Precision:              %0.2f" %(conf[0, 0] / (conf[0, 0] + conf[1, 0])))
    print ("Recall:                 %0.2f"% (conf[0, 0] / (conf[0, 0] + conf[0, 1])))
    
    cm=confusion_matrix(y_true, y_pred, labels=None)
    
    plt.figure()
    plot_confusion_matrix(cm)

In [None]:
log_clf=LogisticRegression()
train_score(log_clf,X,y)

In [None]:
# What about ROC ? 

from sklearn.metrics import roc_curve, auc

xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(X, y, test_size=0.2, random_state=1234)
log = LogisticRegression()
log.fit(xtrain,np.ravel(ytrain))
y_score=log.predict_proba(xtest)[:,1]

fpr, tpr,_ = roc_curve(ytest, y_score)
roc_auc = auc(fpr, tpr)

plt.figure()
# Plotting our Baseline..
plt.plot([0,1],[0,1])
plt.plot(fpr,tpr)
plt.xlabel('FPR')
plt.ylabel('TPR')

In [None]:
tpr

####  Cost Benefit Example: 

We can also optimize our models based on specific costs associated with our classification errors; here we will use specific dollar amounts as weights.

Let's say we were developing a classification model for Aircraft Delay prediction.  For this example let's assume that a true positive would 
lead to a cost savings of 2160 dollars, a false negative would cost us 2900 dollars a false positive would cost 750 dollars.  

cb = np.array([[2160, -750.0], [-2900, 0]])  

Expected_Value = #TPs(2160) - #FNs(2900) -#FPs(750)  