In [1]:
import copy
import itertools
import os
import pickle
import time

import numpy as np
import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Root folder for all experiment artefacts and the sub-folder that receives
# the KNeighborsClassifier runs.
experiments_dir = 'experiments'
app_knn_dir = os.path.join(experiments_dir, 'app_knn')

# makedirs(..., exist_ok=True) creates both levels in one call and, unlike an
# exists()-then-mkdir() pair, does not fail if the directory shows up between
# the check and the creation.
os.makedirs(app_knn_dir, exist_ok=True)

# Content of the marker file; it is rewritten every time this cell runs.
msg = \
    'WARNING this file is generated automatically. Do not edit it.\n\n' \
    'This directory contains experiment with scikit-learn KNeighborsClassifier'\
    '\n'

with open(os.path.join(app_knn_dir, 'note.txt'), 'w') as fp:
    fp.write(msg)

In [3]:
# The file is read without a header row, so pandas labels the columns with
# consecutive integers.
train_file = 'data/adult.data.clean'
df = pd.read_csv(filepath_or_buffer=train_file, header=None)

In [4]:
# Show the first five rows as a quick look at what was loaded.
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0


In [5]:
# Columns 0-13 become the model inputs, column 14 the labels.
X, Y = df.values[:, :14], df.values[:, 14]

# Passed as random_state to the train/test split.
seed = 7

In [6]:
# Fraction of the rows held out for evaluation.
test_size = 0.33

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

In [7]:
def cm_score(cm):
    """
    Mean per-class ratio of correct predictions, taken from a confusion matrix.

    For every row of ``cm`` the diagonal entry is divided by the row total and
    the resulting ratios are averaged with equal weight per row.

    Rows that sum to zero (for example a label that only occurs among the
    predictions) have no defined ratio. They are left out of the average
    instead of turning the whole score into NaN.

    @param cm: square confusion matrix (array-like), one row per true label
    @return: mean of the per-row ratios, or NaN when no row contains samples
    """
    cm = np.asarray(cm, dtype=float)
    support = cm.sum(axis=1)
    has_samples = support > 0

    if not has_samples.any():
        # Nothing to average: report "undefined" without triggering the
        # divide-by-zero RuntimeWarning.
        return np.nan

    return np.mean(np.diag(cm)[has_samples] / support[has_samples])

In [8]:
class GridSearcher:
    """
    Exhaustive hyper-parameter search over a grid for a single estimator.

    Every combination of the supplied option values is applied through
    ``set_params``, fitted on the training split and scored with ``cm_score``
    on the confusion matrix of the test split. The best combination seen so
    far is kept on the instance, so consecutive ``search`` calls accumulate.

    NOTE(review): candidates are ranked on X_test / y_test, so the reported
    best score is likely optimistic - consider a separate validation split.
    """

    def __init__(self, estimator, data):
        """
        @param estimator: object offering set_params / fit / predict; search()
            reconfigures and refits this very object
        @param data: dict with following fields: X_train, y_train, X_test, y_test
        """
        self._estimator = estimator
        self._data = data
        self._best_score = 0
        self._best_params = {}

    def search(self, opts):
        """
        Evaluate every combination in ``opts`` and return the best one so far.

        @param opts: dict mapping a parameter name to an iterable of candidate
            values; an empty dict evaluates the estimator once as configured
        @return: tuple (best_params, best_score); best_params is a copy, and it
            stays {} with score 0 while no combination has scored above 0
        """
        # Split names and candidate lists without zip(*...): that form cannot
        # be unpacked into two names when opts is empty.
        keys = list(opts)
        values = [opts[key] for key in keys]

        # product() of zero iterables yields a single empty combination.
        for combination in itertools.product(*values):
            experiment = dict(zip(keys, combination))

            self._estimator.set_params(**experiment)
            self._estimator.fit(self._data['X_train'], self._data['y_train'])

            y_pred = self._estimator.predict(self._data['X_test'])
            cm = confusion_matrix(self._data['y_test'], y_pred)

            score_cm = cm_score(cm)

            # Strict comparison: ties keep the earlier combination, and a NaN
            # score is never selected.
            if score_cm > self._best_score:
                self._best_score = score_cm
                self._best_params = experiment

        # Hand out a copy so callers that extend the dict cannot corrupt the
        # stored best parameters.
        return dict(self._best_params), self._best_score

In [9]:
# Candidate values per hyper-parameter; every combination gets evaluated.
param_grid = dict(
    n_neighbors=list(range(1, 10)),
    algorithm=['ball_tree', 'kd_tree', 'brute'],
)

# Train/test arrays bundled under the keys GridSearcher looks up.
data = dict(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

knn = KNeighborsClassifier()
gs = GridSearcher(knn, data)

# `opts` ends up holding the winning parameter combination.
opts, score = gs.search(param_grid)

print('best score', score)
print('best params', opts)

best score 0.6300942359819933
best params {'n_neighbors': 1, 'algorithm': 'brute'}


In [10]:
def run_experiment(opts, data):
    """
    Train a KNeighborsClassifier with the given parameters, score it on the
    test split and persist the model together with its metrics.

    A directory named after the finish timestamp is created below
    ``app_knn_dir``; it receives the pickled model (``model.knn``) and a
    one-row CSV with the parameters, both scores and the two timestamps.

    @param opts: dict of KNeighborsClassifier parameters; it is not modified
    @param data: dict with following fields: X_train, y_train, X_test, y_test
    @return: one-row DataFrame holding the parameters plus accuracy,
        cm_accuracy, time_start and time_end
    @raise FileExistsError: if a directory for the same finish second exists
    """
    timestamp_format = "%Y-%m-%d-%H-%M-%S"
    experiment_start = time.strftime(timestamp_format, time.gmtime())

    knn = KNeighborsClassifier()
    knn.set_params(**opts)
    knn.fit(data['X_train'], data['y_train'])

    # Predict once and derive both metrics from the same predictions;
    # knn.score() would repeat the whole prediction pass.
    y_pred = knn.predict(data['X_test'])
    cm = confusion_matrix(data['y_test'], y_pred)

    # Collect the results in a copy so the caller's dict is left untouched.
    results = dict(opts)
    results['accuracy'] = float(np.mean(np.asarray(y_pred) == np.asarray(data['y_test'])))
    results['cm_accuracy'] = cm_score(cm)

    experiment_end = time.strftime(timestamp_format, time.gmtime())
    results['time_start'] = experiment_start
    results['time_end'] = experiment_end

    df = pd.DataFrame(results, index=[0])

    # makedirs creates a missing app_knn_dir as well, yet still raises when
    # this run's directory exists, so an earlier experiment is never overwritten.
    experiment_dir = os.path.join(app_knn_dir, experiment_end)
    os.makedirs(experiment_dir)

    with open(os.path.join(experiment_dir, 'model.knn'), 'wb') as fp:
        pickle.dump(knn, fp)

    # NOTE(review): 'results.cvs' looks like a typo for 'results.csv'; the name
    # is kept in case other tooling reads the existing files.
    df.to_csv(os.path.join(experiment_dir, 'results.cvs'), index=False)

    return df

In [11]:
# Hand over a shallow copy so `opts` keeps holding only the best parameters,
# whatever run_experiment() does with the dict it receives.
df = run_experiment(copy.copy(opts), data)
df

Unnamed: 0,accuracy,algorithm,cm_accuracy,n_neighbors,time_end,time_start
0,0.724549,brute,0.630094,1,2018-03-02-20-47-05,2018-03-02-20-46-56


In [12]:
# Sanity check: load the pickled model back from disk and score it again.
model_path = os.path.join(app_knn_dir, df.loc[0, 'time_end'], 'model.knn')

with open(model_path, 'rb') as fp:
    knn2 = pickle.load(fp)

knn2.score(X_test, y_test)

0.7245486692722873