In [None]:
# Numpy for working with Arrays
import numpy as np
# Pandas for working with data tables
import pandas as pd
# SciPy implements many different numerical algorithms
import scipy as sp
import scipy.stats as stats
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
# Module for plotting
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# Module for pretty plotting
# import seaborn as sns
# Module for linear regression
import statsmodels.api as sm
import statsmodels.formula.api as smf
# the bayes stuff
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import cross_validation
from sklearn import preprocessing

In [None]:
# Read the training features and the matching training labels from CSV.
water_values, water_labels = map(
    pd.read_csv,
    ('./data/train_set_values.csv', './data/train_set_labels.csv'),
)


In [None]:
# Columns treated as categorical; each one is label-encoded and then expanded
# into dummy (one-hot) columns further down.
cat_features = [
    'region_code', 'district_code', 'basin', 'region',
    'public_meeting', 'scheme_management', 'permit', 'extraction_type',
    'extraction_type_group', 'extraction_type_class', 'management', 'payment',
    'water_quality', 'quantity', 'source', 'waterpoint_type',
    'date_recorded', 'recorded_by', 'funder', 'installer',
    'lga', 'ward', 'scheme_name', 'management_group',
    'wpt_name', 'subvillage', 'payment_type', 'quality_group',
    'quantity_group', 'source_class', 'source_type', 'waterpoint_type_group',
]

# Training frame restricted to the categorical columns, in the order listed above.
water_cat = water_values.loc[:, cat_features]

# make dataframe of just numeric (basically, the rest of the columns)
#water_num = water_values[list(set(water_values.columns) - set(cat_features))]



In [None]:
# Load the rows to predict on.
# NOTE(review): this path has no './data/' prefix, unlike the training CSVs --
# confirm the test file really lives next to the notebook.
test = pd.read_csv('./test_set_values.csv')

# Same categorical columns (and column order) as the training frame.
test_cat = test.loc[:, cat_features]

# make dataframe of just numeric (basically, the rest of the columns)
#test_num = test[list(set(test.columns) - set(cat_features))]

# replace 0s with NaN
#for col in ['num_private', 'amount_tsh', 'population', 'construction_year', 'gps_height']:
#    test_num.loc[:,col].replace(0, np.nan, inplace = True)


In [None]:
# Stack the training rows on top of the test rows so the dummy-variable
# encoding is fitted on every category that occurs in either dataset.
# Otherwise the two sets could end up with a different number of feature
# columns and clf.predict would fail on the test set.
frames = [water_cat, test_cat]
all_cat = pd.concat(frames, axis=0)

#all_num = pd.concat([water_num, test_num])

In [None]:
# Replace the status labels with integer codes (numbered in order of first
# appearance); the original label values are kept alongside in status_names.
status_codes, status_names = pd.factorize(water_labels['status_group'])
water_labels['status_group'] = status_codes

In [None]:
# Preprocessing helpers; both are fitted in later cells.
#   le  -- maps the distinct values of a column to integer codes
#   enc -- expands those integer codes into dummy (one-hot) columns;
#          handle_unknown='ignore' makes transform skip unseen categories
#          instead of raising
le, enc = (
    preprocessing.LabelEncoder(),
    preprocessing.OneHotEncoder(handle_unknown='ignore'),
)


In [None]:
# Label-encode every categorical column in place. The single encoder is
# re-fitted per column, so the codes are only meaningful within a column.
# NOTE(review): columns are handed to the encoder as-is, including any missing
# values -- confirm that is the intended treatment of NaN.
for feature in cat_features:
    encoded = le.fit_transform(all_cat.loc[:, feature])
    all_cat.loc[:, feature] = encoded

In [None]:
# One-hot encode the integer-coded columns. all_cat is rebound from the
# DataFrame to the encoder's output here, so this cell cannot simply be
# re-run without re-running the cells above it.
categorical_codes = all_cat[cat_features]
all_cat = enc.fit_transform(categorical_codes)

In [None]:
#water_num.columns

In [None]:
# impute numeric data
#imp = preprocessing.Imputer(missing_values='NaN', strategy='most_frequent', axis=0, verbose=0, copy=True)
#all_num = imp.fit_transform(all_num)

In [None]:
# standardize numeric variables
# all_num = preprocessing.scale(all_num)

In [None]:
# split back into training and test sets
# The combined matrix was built with the training rows first, followed by the
# test rows, so the first len(water_values) rows are the training rows.
# Deriving the boundary from the data avoids a hard-coded row count that
# would silently mis-split if the input files change.
n_train_rows = len(water_values)
water_cat = all_cat[:n_train_rows, :]
water_cat.shape

In [None]:
# Everything after the training rows belongs to the test set. The boundary is
# taken from the training frame's length rather than a hard-coded row count,
# so it stays correct if the input files change.
test_cat = all_cat[len(water_values):, :]
test_cat.shape

In [None]:
# Shape of the combined one-hot matrix (training rows followed by test rows).
all_cat.shape

In [None]:
#all_num.shape

In [None]:
#water_num = all_num[:59400,:]
#water_num.shape

In [None]:
#test_num = all_num[59400:,:]
#test_num.shape

In [None]:
# convert to sparse matrix
#water_num = csr_matrix(water_num)

In [None]:
# Feature matrix: only the one-hot categorical block is used for now; the
# hstack with the numeric features is disabled below.
#X = hstack([water_cat, water_num])#.toarray()
X = water_cat

In [None]:
# Target vector: the integer status codes as a flat 1-D array called y.
status_column = water_labels['status_group']
y = np.asarray(status_column).ravel()

In [None]:
# Distinct label codes present in y.
np.unique(y)

In [None]:
# Make sure X is in CSR (compressed sparse row) format before it is sliced
# by row into chunks in the next cell.
X = X.tocsr()

In [None]:
# Split the training rows into consecutive chunks of at most 10000 rows so
# that each chunk can be converted to a dense array on its own.
# The boundaries are contiguous half-open ranges: every chunk starts exactly
# where the previous one stopped, so no row is dropped between chunks (the
# earlier 10001/20001/... start offsets skipped one row per boundary).
chunk_bounds = [(0, 10000), (10000, 20000), (20000, 30000),
                (30000, 40000), (40000, 50000), (50000, None)]

Xlist = [X[start:stop, :] for start, stop in chunk_bounds]
ylist = [y[start:stop] for start, stop in chunk_bounds]

# Every row must land in exactly one chunk, and each feature chunk must have
# a label chunk of the same length.
assert sum(chunk.shape[0] for chunk in Xlist) == X.shape[0]
assert [chunk.shape[0] for chunk in Xlist] == [len(labels) for labels in ylist]

# Individual names are kept because later cells refer to X1/X2 and y1/y2.
X1, X2, X3, X4, X5, X6 = Xlist
y1, y2, y3, y4, y5, y6 = ylist

In [None]:
# Multinomial naive Bayes classifier with default settings; it is trained
# incrementally, chunk by chunk, in the next cell.
clf = MultinomialNB()

In [None]:
# Train incrementally: densify one chunk at a time and feed it to partial_fit,
# so the full matrix never has to exist as a dense array in memory.
# The loop variables are deliberately NOT named x / y: reusing `y` here would
# overwrite the full label array with the last chunk's labels.
# NOTE(review): MultinomialNB is documented as accepting sparse input, so the
# dense conversion may be unnecessary -- confirm before simplifying.
# partial_fit accumulates, so re-running this cell without re-creating clf
# feeds the same chunks to the model a second time.
for X_chunk, y_chunk in zip(Xlist, ylist):
    X_dense = X_chunk.toarray()
    print(type(X_dense))
    clf.partial_fit(X_dense, y_chunk, classes=[0, 1, 2])
    del X_dense  # drop the dense copy before the next chunk is densified




In [None]:
# check the accuracy on the training set (first chunk only)
# The dense copy lives in a temporary, so X1 itself stays a sparse chunk and
# this cell can be re-run; the temporary is deleted afterwards to free memory.
X1_dense = X1.toarray()
print(clf.score(X1_dense, y1))
del X1_dense

In [None]:
# Convert chunk 2 to a dense array for the split below. X2 is rebound from
# the sparse slice to the dense array, so re-running this cell on its own
# fails (the dense array has no toarray method).
X2 = X2.toarray()

In [None]:
# Number of labels in chunk 2.
y2.shape

In [None]:
# Carve a 20% hold-out out of chunk 2 and score the fitted model on it.
# NOTE(review): X2/y2 are one of the chunks in Xlist/ylist that were passed to
# clf.partial_fit, so the model has already seen these rows; the figure below
# is an in-sample accuracy, not a cross-validated estimate.
split = train_test_split(X2, y2, random_state=101, test_size=0.2)
xtrain, xtest, ytrain, ytest = split

# sanity-check the shapes of both halves
print('%s %s' % (xtrain.shape, ytrain.shape))
print('%s %s' % (xtest.shape, ytest.shape))

# predicted labels for the held-out rows
yhat = clf.predict(xtest)

# fraction of held-out rows predicted correctly
(yhat == ytest).mean()

In [None]:
# Drop the references to the chunk arrays so their memory can be reclaimed.
X1 = X2 = ''