## 2 - Naive Bayes

### Normal Naive Bayes

In [140]:
from sklearn.naive_bayes import GaussianNB 
from sklearn import cross_validation
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from time import time
import numpy as np
import pandas as pd

In [141]:
# Read the preprocessed training and test tables from the local input folder.
train_data, test_data = map(
    pd.read_csv,
    ('./input/train_prep.csv', './input/test_prep.csv'),
)

In [142]:
# Positional split of the training table: every column but the last is a
# feature, the last column is the target. `.iloc` is used instead of `.ix`,
# which is deprecated and no longer available in pandas 1.0+.
param = train_data.iloc[:, :-1]  # all rows, all but the last column
res = train_data.iloc[:, -1]  # all rows, only the last column
# Drop the ID column from the test data. The axis is passed by keyword because
# newer pandas rejects a positional axis, and errors='ignore' keeps this cell
# re-runnable after ID has already been removed.
test_data = test_data.drop('ID', axis=1, errors='ignore')

In [143]:
# Baseline estimator: Gaussian Naive Bayes with its default settings.
model = GaussianNB()
# 5-fold cross-validation on the training features/target; the returned score
# array is this cell's displayed value. A variant of this call using
# scoring="neg_log_loss" was previously left commented out here.
cross_validation.cross_val_score(estimator=model, X=param, y=res, cv=5)

array([ 0.62303665,  0.61791659,  0.61347053,  0.61478017,  0.6184506 ])

In [144]:
# create output file
def create_output_file(data, file_name):
    """Write one-hot encoded class predictions to ``./output/Sub-<file_name>``.

    Parameters
    ----------
    data : iterable of int
        Predicted class codes. A code ``c`` sets column ``c - 1`` of the
        outcome columns to 1, so codes are expected to be 1..5.
        NOTE(review): codes below 1 are not rejected; they wrap around through
        Python's negative list indexing. Confirm the label encoding upstream.
    file_name : str
        Name appended to the ``./output/Sub-`` prefix.

    Raises
    ------
    IndexError
        If a code is larger than the number of outcome columns.

    The CSV has an ``ID`` column counting from 1, followed by one 0/1 column
    per outcome. The output directory is created when it does not exist yet.
    """
    import os  # local import so the function does not depend on other cells

    outcome_columns = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
    n_outcomes = len(outcome_columns)

    rows = []
    for pred in data:
        row = [0] * n_outcomes
        row[pred - 1] = 1  # class codes are 1-based, list positions 0-based
        rows.append(row)

    output = pd.DataFrame(rows, columns=outcome_columns)
    output.index += 1  # IDs in the written file start at 1, not 0

    out_path = './output/Sub-' + file_name
    out_dir = os.path.dirname(out_path)
    # to_csv does not create missing folders; make sure the target exists.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    output.to_csv(out_path, index_label='ID')

In [145]:
# Fit the baseline model on the full training set, predict a class for every
# test row, and export the result through create_output_file.
predictions = model.fit(param, res).predict(test_data)
create_output_file(predictions, 'Gaussian-NB.csv')

### Naive Bayes with Pipeline and GridSearchCV

In [164]:
# Two-step pipeline: SelectKBest feature selection scored with f_classif,
# followed by a Gaussian Naive Bayes classifier.
pipeline = Pipeline(steps=[
    ('featureSelection', SelectKBest(f_classif)),
    ('classifier', GaussianNB()),
])
# Candidate values for the number of features kept by the selection step.
params = {"featureSelection__k": [3, 4]}
# Grid search over the candidates, scored by negative log loss; n_jobs=-1 is
# passed through to let scikit-learn parallelise the search.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    n_jobs=-1,
    scoring='neg_log_loss',
)

In [165]:
# Run the grid search on the training data and predict one outcome per test
# row with the fitted search object. predict_proba (the probability of each
# outcome) was the commented-out alternative to predict here.
predictions = grid_search.fit(param, res).predict(test_data)
create_output_file(predictions, 'Gaussian-NB-grid-search.csv')