In [63]:
# Numpy for working with Arrays
import numpy as np
# Pandas for working with data tables
import pandas as pd
# SciPy implements many different numerical algorithms
import scipy as sp
import scipy.stats as stats
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
# Module for plotting
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# Module for pretty plotting
# import seaborn as sns
# Module for linear regression
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import cross_validation
from sklearn import preprocessing


In [2]:
### load data from csv files
water_values = pd.read_csv('./data/train_set_values.csv')
water_labels = pd.read_csv('./data/train_set_labels.csv')

In [3]:
# list categorical features to be turned into dummy variables:
cat_features = ['region_code', 'district_code', 'basin', 'region', 'public_meeting', \
                'scheme_management', 'permit', 'extraction_type', 'extraction_type_group', 'extraction_type_class', \
                'management', 'payment', 'water_quality', 'quantity', \
                'source', 'waterpoint_type', 'date_recorded', 'recorded_by', 'funder', 'installer', \
               'lga', 'ward', 'scheme_name', 'management_group', 'wpt_name', 'subvillage', \
               'payment_type', 'quality_group', 'quantity_group', 'source_class', 'source_type', 'waterpoint_type_group']

# make dataframe of just the categorical features
water_cat = water_values[cat_features]

# make dataframe of just numeric (basically, the rest of the columns)
water_num = water_values[list(set(water_values.columns) - set(cat_features))]



In [4]:
# prepare test data for model prediction
test = pd.read_csv('./test_set_values.csv')

# make test dataframe of just the categorical features
test_cat = test[cat_features]

# make test dataframe of just numeric (basically, the rest of the columns)
test_num = test[list(set(test.columns) - set(cat_features))]

# replace 0s with NaN
for col in ['num_private', 'amount_tsh', 'population', 'construction_year', 'gps_height']:
    test_num.loc[:,col].replace(0, np.nan, inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [5]:
# concatenate training and test datasets (for encoding piece)
all_cat = pd.concat([water_cat, test_cat])
all_num = pd.concat([water_num, test_num])

# impute 'missing' for categorical NaNs
all_cat.loc[:,'permit'][all_cat.loc[:,'permit'].isnull() == True] = 'missing'
all_cat.loc[:,'public_meeting'][all_cat.loc[:,'public_meeting'].isnull() == True] = 'missing'


#### I keep getting the pandas "value trying to be set on a copy error", even though I am using .loc[:,] to properly index
#### No idea why ####

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [6]:
# convert status labels to numeric
water_labels['status_group'] = water_labels.status_group.factorize()[0]

In [7]:
# replace 0s with NaN
for col in ['num_private', 'amount_tsh', 'population', 'construction_year', 'gps_height']:
    all_num.loc[:,col].replace(0, np.nan, inplace = True)

In [8]:
# preprocessing steps

# encode labels to floats
le = preprocessing.LabelEncoder()

# make dummy variables
enc = preprocessing.OneHotEncoder(handle_unknown = 'ignore')


In [9]:
# labels to floats, row by row (le doesn't work across df's)
for col in cat_features:
    all_cat.loc[:,col] = le.fit_transform(all_cat.loc[:,col])

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


In [10]:
# make dummy variables
all_cat = enc.fit_transform(all_cat[cat_features])

In [11]:
water_num.columns

Index([u'gps_height', u'longitude', u'latitude', u'amount_tsh', u'num_private',
       u'construction_year', u'id', u'population'],
      dtype='object')

In [12]:
# impute data
imp = preprocessing.Imputer(missing_values='NaN', strategy='most_frequent', axis=0, verbose=0, copy=True)
all_num = imp.fit_transform(all_num)

In [13]:
# standardize numeric variables
all_num = preprocessing.scale(all_num)

In [14]:
# split back into training/test now that all dummy variables have been accounted for
water_cat = all_cat[:59400,:]
water_cat.shape

(59400, 77664)

In [15]:
# split back into training/test now that all dummy variables have been accounted for
test_cat = all_cat[59400:,:]
test_cat.shape

(14850, 77664)

In [16]:
all_cat.shape

(74250, 77664)

In [17]:
all_num.shape

(74250L, 8L)

In [19]:
test_num = all_num[59400:,:]
test_num.shape

(14850L, 8L)

In [22]:
# convert to sparse matrix
water_num = csr_matrix(water_num)

In [49]:
# concatenate 
X = hstack([water_cat, water_num])

In [50]:
# flatten label column into a 1-D array called y
y = np.ravel(water_labels['status_group'])

In [28]:
# split into training/test sets for crossval
xtrain, xtest, ytrain, ytest = \
cross_validation.train_test_split(X, y, test_size=0.2, random_state=11)

In [29]:
# define random forest classifier
clf = RandomForestClassifier(n_estimators=100, n_jobs=2)

In [30]:
# fit model on training
clf.fit(xtrain, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [31]:
# predict on test
yhats = clf.predict(xtest)

In [32]:
# compute accuracy on test prediction
np.mean(yhats == ytest)

0.81052188552188553

In [34]:
clf.score(xtrain, ytrain)

0.99997895622895627

In [52]:
# define adaboost
ada = AdaBoostClassifier(clf, n_estimators=100,
    learning_rate=1,
    algorithm="SAMME")

In [53]:
# fit adaboost
ada.fit(X, y)

AdaBoostClassifier(algorithm='SAMME',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          learning_rate=1, n_estimators=100, random_state=None)

In [54]:
# test prediction on test (if fit to entire set, expect 1.0)
ada_yhats = ada.predict(xtest)

In [55]:
np.mean(ada_yhats == ytest)

1.0

In [56]:
ada.score(xtrain, ytrain)

1.0

In [57]:
# convert (real) test set to sparse matrix
test_num = csr_matrix(test_num)

# concatenate 
Xtest = hstack([test_cat, test_num])

In [58]:
# run prediction on (real) test
y_hat = ada.predict(Xtest)
y_hat

array([1, 0, 0, ..., 0, 0, 1])

In [59]:
len(y_hat)

14850

In [60]:
len(test)

14850

In [61]:
# submission

test['status_group'] = y_hat

map_dict = {0: 'functional', 1: 'non functional', 2: 'functional needs repair'}
test['status_group'] = test['status_group'].map(map_dict)
test_submit = test[['id','status_group']]
test_submit.to_csv('submit_adaboost.csv', index = False)

0                 non functional
1                     functional
2                     functional
3                 non functional
4                     functional
5                     functional
6                     functional
7                 non functional
8                 non functional
9                     functional
10                    functional
11                non functional
12                non functional
13                non functional
14                    functional
15                    functional
16                    functional
17                    functional
18                    functional
19                non functional
20                    functional
21                    functional
22                non functional
23                non functional
24                    functional
25                    functional
26                non functional
27                non functional
28       functional needs repair
29                    functional
          

In [None]:
######## below code added later to produce files for ensemble. will not run properly if executed #####

In [66]:
test['ada_predict'] = y_hat
pred_ada = test[['id', 'ada_predict']]
pred_ada.to_csv('pred_ada.csv', index = False)

In [64]:
# make prediction array on water_cat to use in ensemble:
y_ensemble = ada.predict(X)

In [65]:
water_values['ada_predict'] = y_ensemble
ens_ada = water_values[['id', 'ada_predict']]
ens_ada.to_csv('ens_ada.csv', index = False)


In [None]:
##### random code chunks below this line #####

In [None]:
###############################################

In [None]:
# received error that test dataset does not have as many columns as training (3dummy columns missing)
# this is to find out which ones
mask = np.in1d(X.columns, Xtest.columns)
print np.where(~mask)[0]



In [None]:
# wenhong's script for sorting columns by # of uniques
water_unique = water_cat.apply(lambda x: len(x.unique()))
print water_unique.sort_values()