In [1]:
# Numpy for working with Arrays
import numpy as np
# Pandas for working with data tables
import pandas as pd
# SciPy implements many different numerical algorithms
import scipy as sp
import scipy.stats as stats
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
# Module for plotting
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# Module for pretty plotting
# import seaborn as sns
# Module for linear regression
import statsmodels.api as sm
import statsmodels.formula.api as smf

from patsy import dmatrices
from sklearn import linear_model, datasets
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import cross_validation
from sklearn import preprocessing





In [2]:
### load data from csv files
water_values = pd.read_csv('./data/train_set_values.csv')
water_labels = pd.read_csv('./data/train_set_labels.csv')

#water_values.drop(['wpt_name', 'subvillage'], axis=1, inplace=True)

In [3]:
# list categorical features to be turned into dummy variables:
cat_features = ['region_code', 'district_code', 'basin', 'region', 'public_meeting', \
                'scheme_management', 'permit', 'extraction_type', 'extraction_type_group', 'extraction_type_class', \
                'management', 'payment', 'water_quality', 'quantity', \
                'source', 'waterpoint_type', 'date_recorded', 'recorded_by', 'funder', 'installer', \
               'lga', 'ward', 'scheme_name', 'management_group', 'wpt_name', 'subvillage', \
               'payment_type', 'quality_group', 'quantity_group', 'source_class', 'source_type', 'waterpoint_type_group']

# make dataframe of just the categorical features
water_cat = water_values[cat_features]

# make dataframe of just numeric (basically, the rest of the columns)
water_num = water_values[list(set(water_values.columns) - set(cat_features))]



In [4]:
# prepare test data for model prediction
test = pd.read_csv('./test_set_values.csv')

#test['status_group'] = np.random.choice(range(1, 3), test.shape[0])
#test.drop(['wpt_name', 'subvillage'], axis=1, inplace=True)

# make dataframe of just the categorical features
test_cat = test[cat_features]

# make dataframe of just numeric (basically, the rest of the columns)
test_num = test[list(set(test.columns) - set(cat_features))]

# replace 0s with NaN
for col in ['num_private', 'amount_tsh', 'population', 'construction_year', 'gps_height']:
    test_num.loc[:,col].replace(0, np.nan, inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [5]:
all_cat = pd.concat([water_cat, test_cat])
all_num = pd.concat([water_num, test_num])

In [6]:
# convert status labels to numeric
water_labels['status_group'] = water_labels.status_group.factorize()[0]

In [7]:
# replace 0s with NaN
for col in ['num_private', 'amount_tsh', 'population', 'construction_year', 'gps_height']:
    all_num.loc[:,col].replace(0, np.nan, inplace = True)

In [8]:
# preprocessing steps
# encode labels to floats
le = preprocessing.LabelEncoder()

# make dummy variables
enc = preprocessing.OneHotEncoder(handle_unknown = 'ignore')


In [9]:
for col in cat_features:
    all_cat.loc[:,col] = le.fit_transform(all_cat.loc[:,col])

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


In [10]:
all_cat = enc.fit_transform(all_cat[cat_features])

In [11]:
water_num.columns

Index([u'gps_height', u'longitude', u'latitude', u'amount_tsh', u'num_private',
       u'construction_year', u'id', u'population'],
      dtype='object')

In [12]:
# impute data
imp = preprocessing.Imputer(missing_values='NaN', strategy='most_frequent', axis=0, verbose=0, copy=True)
all_num = imp.fit_transform(all_num)

In [13]:
# standardize numeric variables
all_num = preprocessing.scale(all_num)

In [14]:
water_cat = all_cat[:59400,:]
water_cat.shape

(59400, 92484)

In [15]:
test_cat = all_cat[59400:,:]
test_cat.shape

(14850, 92484)

In [16]:
all_cat.shape

(74250, 92484)

In [17]:
all_num.shape

(74250L, 8L)

In [18]:
water_num = all_num[:59400,:]
water_num.shape

(59400L, 8L)

In [19]:
test_num = all_num[59400:,:]
test_num.shape

(14850L, 8L)

In [20]:
# convert to sparse matrix
water_num = csr_matrix(water_num)

In [21]:
# concatenate 
X = hstack([water_cat, water_num])

In [22]:
# flatten label column into a 1-D array called y
y = np.ravel(water_labels['status_group'])

In [23]:
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression(penalty = 'l2')
model = model.fit(X, y)

# check the accuracy on the training set
model.score(X, y)

0.85156565656565653

In [24]:
# create prediction accuracy function to be used in cross val
def predict_accuracy(model, x,y):
    y_hat = model.predict(x)
    return np.mean(y_hat == y)

# run cross validation
score = cross_validation.cross_val_score(model, X, y, scoring = predict_accuracy).mean()

score

0.7852525252525252

In [27]:
# create train, test sets for X,Y to cross-validate, k=1 (just to verify above cross-val)
xtrain, xtest, ytrain, ytest = \
cross_validation.train_test_split(X, y, test_size=0.2, random_state=11)

# verify shapes
print xtrain.shape, ytrain.shape
print xtest.shape, ytest.shape

# predict labels for test set
yhat = model.predict(xtest)

# calculate accuracy
np.mean(yhat == ytest)

(47520, 92492) (47520L,)
(11880, 92492) (11880L,)


0.85260942760942759

In [None]:
'''
# prepare test data for model prediction
test = pd.read_csv('./test_set_values.csv')

#test['status_group'] = np.random.choice(range(1, 3), test.shape[0])
test.drop(['id', 'wpt_name', 'subvillage'], axis=1, inplace=True)

# make dataframe of just the categorical features
test_cat = test[cat_features]

# make dataframe of just numeric (basically, the rest of the columns)
test_num = test[list(set(test.columns) - set(cat_features))]

# replace 0s with NaN
for col in ['num_private', 'amount_tsh', 'population', 'construction_year', 'gps_height']:
    test_num.loc[:,col].replace(0, np.nan, inplace = True)

# encode categorical features w/ floats
for col in cat_features:
    test_cat.loc[:,col] = le.transform(test_cat.loc[:,col])

# make dummy variables
test_cat = enc.transform(test_cat[cat_features])

# impute data
test_num = imp.transform(test_num)

# standardize numeric variables
test_num = preprocessing.scale(test_num)
'''


In [30]:
# convert to sparse matrix
test_num = csr_matrix(test_num)

# concatenate 
Xtest = hstack([test_cat, test_num])

In [31]:
y_hat = model.predict(Xtest)
y_hat

array([1, 0, 1, ..., 0, 0, 1])

In [32]:
len(y_hat)

14850

In [33]:
len(test)

14850

In [34]:
# submission

test['status_group'] = y_hat

map_dict = {0: 'functional', 1: 'non functional', 2: 'functional needs repair'}
test['status_group'] = test['status_group'].map(map_dict)
test['status_group']

0                 non functional
1                     functional
2                 non functional
3                 non functional
4                     functional
5                     functional
6                 non functional
7                 non functional
8                 non functional
9                     functional
10                    functional
11                non functional
12                non functional
13                non functional
14                    functional
15                    functional
16                    functional
17                    functional
18                    functional
19                non functional
20                non functional
21                    functional
22                non functional
23                non functional
24                    functional
25                    functional
26                non functional
27                non functional
28       functional needs repair
29                non functional
          

In [35]:
test_logreg = test[['id', 'status_group']]
test_logreg.to_csv('test_logreg_l1.csv', index = False)

In [None]:
##### random code chunks below this line #####

In [None]:
###############################################

In [70]:
a = pd.read_csv('./test_logreg.csv')
b = pd.read_csv('./test_logreg2.csv')


In [None]:
# received error that test dataset does not have as many columns as training (3dummy columns missing)
# this is to find out which ones
mask = np.in1d(X.columns, Xtest.columns)
print np.where(~mask)[0]



In [None]:
water_unique = water_cat.apply(lambda x: len(x.unique()))
print water_unique.sort_values()