In [131]:
%matplotlib inline
%load_ext autoreload
%autoreload 5
%autosave 15

import sklearn as sk
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing

import pickle
import copy

def open_pickle(name):
    with open(name, 'rb') as f:
        u = pickle._Unpickler(f)
        u.encoding = 'latin1'
        x, y = u.load()
        return (x, y)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Autosaving every 15 seconds


In [180]:
class KNNClassificator:
    
    def __init__(self, k, metric, minkovski_k=None, fair_k=True):
        self.k = k
        self.fair_k = fair_k
        if hasattr(metric, '__call__'):
            self.metric = metric
        elif metric == 'euclidean2':
            def euclidean2(a, b):
                t = 0
                for i in range(min(len(a), len(b))):
                    t += (a[i] - b[i]) ** 2
                return t
            self.metric = euclidean2
        elif metric == 'minkovski' and minkovski_k != None:
            self.minkovski_k = minkovski_k
            def minkovski(a, b):
                t = 0
                for i in range(min(len(a), len(b))):
                    t += (a[i] - b[i]) ** minkovski_k
                return t ** (1 / self.minkovski_k)
            self.metric = minkovski
        elif metric == 'module':
            def module(a, b):
                t = 0
                for i in range(min(len(a), len(b))):
                    t += np.abs(a[i] - b[i])
                return t
            self.metric = module
        else:
            raise ValueError('Wrong metric')
    
    def fit(self, x, y, w=None):
        assert(len(x) == len(y))
        self.X = x
        self.Y = y
        if w == None:
            self.W = [1 for _ in y]
        else:
            assert(len(x) == len(w))
            self.W = w
    
    def set_k(self, k):
        self.k = k
    
    
    def predict1(self, x):
        dist = np.apply_along_axis(lambda cur: self.metric(cur, x), 1, self.X)
        args = np.argsort(dist)
        b = {}
        k = self.k
        if self.fair_k:# несильно помогает, но почему бы и нет?
            while (np.abs(dist[args[k - 1]] - dist[args[k]]) < 0.00001 and k < len(args)):
                k += 1
        for i in range(k):
            arg = args[i]
            if self.Y[arg] in b:
                b[self.Y[arg]] += self.W[arg] 
            else:
                b[self.Y[arg]] = self.W[arg] * (k + 1 - i)
                
        max_key = None
        max_value = 0
        for key in b:
            if b[key] > max_value:
                max_values = b[key]
                max_key = key
        return max_key
    
    def predict(self, x):
        res = []
        for i in x:
            res.append(self.predict1(i))
        return res

In [177]:
x, y = open_pickle('iris.txt')

def cv(x, y, classificator, n_folds=5, k_iter=10):
    res = 0
    for state in range(k_iter):
        ires = 0
        kf = KFold(n=len(y), n_folds=n_folds, shuffle=True, random_state=state + 10)
        for train_index, test_index in kf:
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]
            cl = copy.copy(classificator)
            cl.fit(x_train, y_train)
            y_pred = cl.predict(x_test)
            ires += accuracy_score(y_test, y_pred)
        res += ires / n_folds
    res = res / k_iter
    return res


top = []
top.append(['my knnc k = 1, metric=euclidean2', cv(x, y, KNNClassificator(1, metric='euclidean2'))])
top.append(['my knnc k = 3, metric=euclidean2', cv(x, y, KNNClassificator(3, metric='euclidean2'))])
top.append(['my knnc k = 5, metric=euclidean2', cv(x, y, KNNClassificator(5, metric='euclidean2'))])
top.append(['my knnc k = 9, metric=euclidean2', cv(x, y, KNNClassificator(9, metric='euclidean2'))])

top.append(['my knnc k = 1, metric=module', cv(x, y, KNNClassificator(1, metric='module'))])
top.append(['my knnc k = 3, metric=module', cv(x, y, KNNClassificator(3, metric='module'))])
top.append(['my knnc k = 5, metric=module', cv(x, y, KNNClassificator(5, metric='module'))])
top.append(['my knnc k = 9, metric=module', cv(x, y, KNNClassificator(9, metric='module'))])

top.append(['sklearn knnc k = 1, metric=euclidean2', cv(x, y, KNeighborsClassifier(n_neighbors=1, algorithm='brute'))])
top.append(['sklearn knnc k = 3, metric=euclidean2', cv(x, y, KNeighborsClassifier(n_neighbors=3, algorithm='brute'))])
top.append(['sklearn knnc k = 5, metric=euclidean2', cv(x, y, KNeighborsClassifier(n_neighbors=5, algorithm='brute'))])
top.append(['sklearn knnc k = 9, metric=euclidean2', cv(x, y, KNeighborsClassifier(n_neighbors=9, algorithm='brute'))])

preprocessing.normalize(x, norm='l2', copy=False)

top.append(['norm my knnc k = 1, metric=euclidean2', cv(x, y, KNNClassificator(1, metric='euclidean2'))])
top.append(['norm my knnc k = 3, metric=euclidean2', cv(x, y, KNNClassificator(3, metric='euclidean2'))])
top.append(['norm my knnc k = 5, metric=euclidean2', cv(x, y, KNNClassificator(5, metric='euclidean2'))])
top.append(['norm my knnc k = 9, metric=euclidean2', cv(x, y, KNNClassificator(9, metric='euclidean2'))])

top.append(['norm my knnc k = 1, metric=module', cv(x, y, KNNClassificator(1, metric='module'))])
top.append(['norm my knnc k = 3, metric=module', cv(x, y, KNNClassificator(3, metric='module'))])
top.append(['norm my knnc k = 5, metric=module', cv(x, y, KNNClassificator(5, metric='module'))])
top.append(['norm my knnc k = 9, metric=module', cv(x, y, KNNClassificator(9, metric='module'))])

top.append(['norm sklearn knnc k = 1, metric=euclidean2', cv(x, y, KNeighborsClassifier(n_neighbors=1, algorithm='brute'))])
top.append(['norm sklearn knnc k = 3, metric=euclidean2', cv(x, y, KNeighborsClassifier(n_neighbors=3, algorithm='brute'))])
top.append(['norm sklearn knnc k = 5, metric=euclidean2', cv(x, y, KNeighborsClassifier(n_neighbors=5, algorithm='brute'))])
top.append(['norm sklearn knnc k = 9, metric=euclidean2', cv(x, y, KNeighborsClassifier(n_neighbors=9, algorithm='brute'))])

top = sorted(top, key=lambda p:-p[1])

for s, res in top:
    print(s, (60 - len(s)) * ' ', res)
                                                             

norm sklearn knnc k = 3, metric=euclidean2                    0.977333333333
norm sklearn knnc k = 5, metric=euclidean2                    0.974
norm sklearn knnc k = 9, metric=euclidean2                    0.97
sklearn knnc k = 9, metric=euclidean2                         0.968
norm my knnc k = 1, metric=module                             0.966666666667
norm my knnc k = 1, metric=euclidean2                         0.964
norm sklearn knnc k = 1, metric=euclidean2                    0.964
norm my knnc k = 3, metric=euclidean2                         0.963333333333
norm my knnc k = 3, metric=module                             0.958666666667
sklearn knnc k = 5, metric=euclidean2                         0.958
sklearn knnc k = 3, metric=euclidean2                         0.957333333333
my knnc k = 1, metric=euclidean2                              0.956
sklearn knnc k = 1, metric=euclidean2                         0.954666666667
my knnc k = 1, metric=module                                  0

In [182]:
x, y = open_pickle('iris.txt')
preprocessing.normalize(x, norm='l2', copy=False)

best_score = 0
best = []
for k in range(1, 21):
    for p in range(2, 5):
        for fair_k in [True, False]:
            score = cv(x, y, KNNClassificator(k, metric='minkovski', minkovski_k=p, fair_k=fair_k),
                       n_folds=5, k_iter=5)
            if best_score < score:
                best_score = score
                best = [k, p, fair_k]
print(best_score, best)

0.976 [2, 2, True]
