# kNN Hash Example

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from functools import partial
from random import random
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from collections import defaultdict
from collections import Counter

## Iris dataset

In [2]:
df = load_iris()
df.data.shape

(150, 4)

In [3]:
def f_hash(w,r,b,x):
    return int((np.dot(w,x)+b)/r)

* https://docs.python.org/2/library/functools.html Here you can read about "partial"
* http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html About mapping to [0,1]

In [4]:
class KNNHash(object):
    def __init__(self,m,L,nn):
        self.m = m
        self.L = L
        self.nn = nn

    def fit(self,X,y):
        self.t_hh = [] #hash table
        for j in range(self.L):
            f_hh = [] #compositional hash function
            for i in range(self.m):
                w = np.random.rand(1,X[0].shape[0]) #  weights of a hash function
                f_hh.append(partial(f_hash,w = w,r=random(),b=random())) # list of initialized hash function
            self.t_hh.append(
                (defaultdict(list),f_hh)
            )
        for n in range(X.shape[0]): 
            for j in range(self.L):
                ind = 0
                for i in range(self.m):
                    ind = ind + self.t_hh[j][1][i](x=X[n]) #calculation of index in hash table, simply sum of all hash func
                self.t_hh[j][0][ind].append((X[n],y[n])) #saving sample into corresponding index
    
    
    def distance(self, data1, data2):
            return sum(abs(data1 - data2))
        
    def compute_weights(self, distances):
            matches = [(1, y) for d, y in distances if d == 0]
            return matches if matches else [(1/d, y) for d, y in distances]
    
    def _predict_one(self, test):
            distances = sorted((self._distance(x, test), y) for x, y in zip(self.X, self.y))
            weights = self._compute_weights(distances[:self.n_neighbors])
            weights_by_class = defaultdict(list)
            for d, c in weights:
                weights_by_class[c].append(d)
            return max((sum(val), key) for key, val in weights_by_class.items())[1]

    def predict(self, X):
        return [self._predict_one(x) for x in X]
    
    def score(self, X, y):
        return sum(1 for p, t in zip(self.predict(X), y) if p == t) / len(y)
    //vrode should to work...

In [5]:
scaler = MinMaxScaler()
scaler.fit(df.data)
x = scaler.transform(df.data)
y = df.target


In [6]:

knnhash = KNNHash(4,4,4)
test1x = x[0]
test2x = x[75]
test3x = x[149]

test1y = y[0]
test2y = y[75]
test3y = y[149]
x = np.delete(x,[0,75,149],axis=0)
y = np.delete(y,[0,75,149],axis=0)
print("-------------")
knnhash.fit(x,y)
print(test1y)
knnhash.predict(test1x)
print("-------------")
knnhash.fit(x,y)
print(test2y)
knnhash.predict(test2x)
print("-------------")
knnhash.fit(x,y)
print(test3y)
knnhash.predict(test3x)

-------------
0


AttributeError: 'KNNHash' object has no attribute 'X'

* Each string above corresponds to the particular hash table. And index in counter maps to the class. For example Counter({0: 13, 1: 1}) means that there are 13 samples close to "u" with "0" class labels and 1 sample with "1" class label.