## 5 - K-Nearest Neighbors

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import cross_validation
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.grid_search import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from scipy.stats import randint as sp_randint
from time import time
from operator import itemgetter
import numpy as np
import pandas as pd



In [3]:
# Load the preprocessed training and test tables.
train_data = pd.read_csv('./input/train_prep.csv')
test_data = pd.read_csv('./input/test_prep.csv')

# `.ix` is deprecated (and removed in pandas 1.0). Both slices below are purely
# positional, so `.iloc` is the explicit, version-independent equivalent.
param = train_data.iloc[:, :-1]  # features: all rows, every column except the last
res = train_data.iloc[:, -1]  # target: all rows, last column only
test_data = test_data.drop('ID', axis=1)  # drop ID column from test data

In [4]:
def create_output_file(data, file_name):
    """Write one-hot encoded class predictions to a submission CSV.

    Every prediction is a 1-based class label selecting which of the five
    outcome columns receives a 1; the remaining columns of that row stay 0.
    Rows are numbered from 1 in an ``ID`` column and the result is written to
    ``./output/Sub-<file_name>``. The ``./output`` directory is created when
    it does not exist yet.

    Parameters
    ----------
    data : iterable of int
        Predicted class labels, each within ``1..5``.
    file_name : str
        Suffix appended to ``./output/Sub-`` to build the output path.

    Raises
    ------
    ValueError
        If a label lies outside ``1..5``. Without this check a label of 0 or
        below would wrap around through negative list indexing and silently
        mark the wrong outcome column.
    """
    import os  # local import: keeps this cell independent of the import cell

    columns = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
    output_dir = './output'

    rows = []
    for pred in data:
        if not 1 <= pred <= len(columns):
            raise ValueError(
                'class label must be between 1 and {0}, got {1!r}'.format(len(columns), pred))
        row = [0] * len(columns)
        row[pred - 1] = 1
        rows.append(row)

    output = pd.DataFrame(rows, columns=columns)
    output.index += 1  # submission IDs start at 1, not 0

    # to_csv does not create missing parent directories.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    output.to_csv(output_dir + '/Sub-' + file_name, index_label='ID')

In [8]:
# Baseline: an untuned KNN classifier, scored with the library's default
# cross-validation settings (no `cv` or `scoring` is passed).
model = KNeighborsClassifier()
cross_validation.cross_val_score(estimator=model, X=param, y=res)

array([ 0.57405745,  0.58170595,  0.59784439])

In [11]:
# Fit the baseline on the full training set, predict the test set, and write
# the predictions out as a submission file.
pred = model.fit(param, res).predict(test_data)
create_output_file(pred, 'KNN.csv')

### With Pipeline and RandomizedSearchCV

In [12]:
def report(grid_scores, n_top=3):
    """Print a ranked summary of the best-scoring search candidates.

    Entries are ordered by the value at position 1 of each entry, highest
    first, and the leading ``n_top`` entries are printed with their rank,
    mean validation score, standard deviation of the per-fold scores and
    parameter setting.

    Parameters
    ----------
    grid_scores : iterable
        Score records that are indexable (item 1 is the sort key) and expose
        ``mean_validation_score``, ``cv_validation_scores`` and
        ``parameters`` attributes.
    n_top : int, optional
        Number of top entries to print (default 3).
    """
    ranked = sorted(grid_scores, key=lambda entry: entry[1], reverse=True)
    for rank, entry in enumerate(ranked[:n_top], start=1):
        mean_score = entry.mean_validation_score
        fold_spread = np.std(entry.cv_validation_scores)
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(mean_score, fold_spread))
        print("Parameters: {0}".format(entry.parameters))
        print("")

In [14]:
# Two-stage pipeline: univariate feature selection (ANOVA F-test) feeding a
# KNN classifier. The step names are the prefixes used in the search space.
pipeline_steps = [
    ('featureSelection', SelectKBest(f_classif)),
    ('clf', KNeighborsClassifier()),
]
pipeline = Pipeline(steps=pipeline_steps)

# Search space for the randomized search. Lists are sampled uniformly;
# `sp_randint(low, high)` draws integers from low up to, but excluding, high.
params = {}
# Feature-selection stage: number of features to keep.
params["featureSelection__k"] = [2, 3, 4]
# Classifier stage: neighbourhood size, vote weighting, index structure,
# tree leaf size and Minkowski power (1 = Manhattan, 2 = Euclidean).
params["clf__n_neighbors"] = sp_randint(5, 30)
params["clf__weights"] = ["uniform", "distance"]
params["clf__algorithm"] = ["auto", "ball_tree", "kd_tree", "brute"]
params["clf__leaf_size"] = sp_randint(10, 40)
params["clf__p"] = [1, 2]

In [16]:
# Fixed seed so the sampled parameter settings, and therefore the ranking
# printed by report(), are the same on every Restart & Run All.
RANDOM_SEED = 42

# NOTE(review): the saved output of this cell shows negative mean validation
# scores, which the default classifier scorer would not produce. Presumably
# the cell was last run with a loss-based `scoring=` argument that is no
# longer in the code; confirm and restore it if the ranking should match.
rand_search = RandomizedSearchCV(pipeline, params, n_iter=50, random_state=RANDOM_SEED)
rand_search.fit(param, res)
report(rand_search.grid_scores_)

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample

Model with rank: 1
Mean validation score: -1.414 (std: 0.086)
Parameters: {'clf__algorithm': 'kd_tree', 'featureSelection__k': 2, 'clf__p': 1, 'clf__weights': 'uniform', 'clf__n_neighbors': 29, 'clf__leaf_size': 24}

Model with rank: 2
Mean validation score: -1.440 (std: 0.076)
Parameters: {'clf__algorithm': 'kd_tree', 'featureSelection__k': 4, 'clf__p': 1, 'clf__weights': 'uniform', 'clf__n_neighbors': 26, 'clf__leaf_size': 28}

Model with rank: 3
Mean validation score: -1.486 (std: 0.071)
Parameters: {'clf__algorithm': 'kd_tree', 'featureSelection__k': 2, 'clf__p': 2, 'clf__weights': 'distance', 'clf__n_neighbors': 29, 'clf__leaf_size': 15}



In [19]:
# Preview the class-probability estimates of the best found pipeline for the
# first five test rows.
test_proba = rand_search.predict_proba(test_data)
test_proba[:5]

array([[ 0.10344828,  0.        ,  0.13793103,  0.10344828,  0.65517241],
       [ 0.48275862,  0.        ,  0.10344828,  0.31034483,  0.10344828],
       [ 0.5862069 ,  0.        ,  0.        ,  0.13793103,  0.27586207],
       [ 0.03448276,  0.03448276,  0.03448276,  0.03448276,  0.86206897],
       [ 0.4137931 ,  0.        ,  0.06896552,  0.24137931,  0.27586207]])

In [20]:
# Predict with the tuned pipeline and write *those* predictions. Passing
# `pred` here would re-export the baseline KNN output under the tuned
# model's file name.
predictions = rand_search.predict(test_data)
create_output_file(predictions, 'KNN-randomized-search.csv')