In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cdist
from collections import Counter

In [2]:
# Load the Iris dataset; the path is relative to the notebook's working directory.
iris = pd.read_csv('datasets/Iris.csv')
# Preview the first rows (head() default) as the cell output.
iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Use the 'Id' column as the row index so it is not mistaken for a feature.
# The guard keeps the cell idempotent: re-running it after 'Id' has already
# become the index would otherwise raise KeyError. A frame that has no 'Id'
# at all (neither column nor index name) still fails loudly in set_index.
if iris.index.name != 'Id':
    iris = iris.set_index('Id')

In [4]:
# Encode each species name as its integer category code in a new column.
iris['Species_int'] = iris['Species'].astype('category').cat.codes

In [5]:
# Show the first three rows to check the new index and the Species_int column.
iris.head(3)

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,Species_int
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,5.1,3.5,1.4,0.2,Iris-setosa,0
2,4.9,3.0,1.4,0.2,Iris-setosa,0
3,4.7,3.2,1.3,0.2,Iris-setosa,0


In [6]:
# Show the last three rows to check the encoding at the other end of the table.
iris.tail(3)

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,Species_int
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
148,6.5,3.0,5.2,2.0,Iris-virginica,2
149,6.2,3.4,5.4,2.3,Iris-virginica,2
150,5.9,3.0,5.1,1.8,Iris-virginica,2


In [7]:
# Feature matrix: the first four columns (sepal/petal length and width).
X = iris.iloc[:, :4].to_numpy()
# Target vector: the column at position 5, i.e. the integer-coded species.
y = iris.iloc[:, 5].to_numpy()

In [8]:
# Seed NumPy's global RNG right before splitting. No random_state is passed to
# train_test_split, so the split presumably draws from this global state
# (confirm against the scikit-learn docs); keep these two statements together.
np.random.seed(7)
# NOTE(review): an integer test_size is an absolute number of test samples per
# the scikit-learn docs, not a fraction. Assuming the 150 rows suggested by the
# last Id shown above, this leaves only 20 samples for training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=130)

### 1. KNN without sklearn library (unweighted and weighted neighbors)

In [9]:
class KNN(object):
    """
    Brute-force k-nearest-neighbours classifier using Euclidean distance.

    The training set is stored as-is by ``train``; ``predict`` ranks all
    training points by distance for every query point and votes among the
    ``k`` closest ones, either by plain majority or by inverse-distance
    weights.
    """

    def __init__(self):
        pass

    def train(self, X, y, weighted=False):
        """
        Memorise the training data; no model is fitted.

        Parameters
        ----------
        X : ndarray of shape (num_train, dim)
            Training feature matrix.
        y : ndarray of shape (num_train,)
            Training labels. They must be numeric, because ``predict`` writes
            them into a float array.
        weighted : bool, default False
            If True, ``predict`` uses inverse-distance weighted voting instead
            of a plain majority vote.
        """
        self.Xtrain = X
        self.ytrain = y
        self.weighted = weighted

    def compute_dist(self, X):
        """
        Compute the Euclidean distance between each point of X and each
        training point, without Python loops.

        Uses the expansion ``||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b``.

        Parameters
        ----------
        X : ndarray
            Query points; reshaped to (num_test, dim).

        Returns
        -------
        ndarray of shape (num_test, num_train)
            ``result[i, j]`` is the distance from query i to training point j.
        """
        dim = self.Xtrain.shape[1]
        X = X.reshape(-1, dim)
        test_sq = np.sum(X * X, 1).reshape(-1, 1)
        train_sq = np.sum(self.Xtrain * self.Xtrain, 1)
        sq_dists = test_sq + train_sq - 2 * X.dot(self.Xtrain.T)
        # Floating-point cancellation can leave tiny negative values for
        # (near-)identical points; clamp them so sqrt never produces NaN.
        return np.sqrt(np.maximum(sq_dists, 0))

    def predict(self, X, k=1):
        """
        Predict a label for every point in X.

        Parameters
        ----------
        X : ndarray
            Query points; reshaped to (num_test, dim).
        k : int, default 1
            Number of nearest neighbours that vote. If k exceeds the number of
            training points, all training points vote.

        Returns
        -------
        ndarray of shape (num_test,), dtype float
            Predicted labels.

        Raises
        ------
        ValueError
            If k is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be a positive integer, got %r" % (k,))

        dim = self.Xtrain.shape[1]  # number of features
        X = X.reshape(-1, dim)  # reshape input data into a 2-D array
        num_test = X.shape[0]

        # Compute the distance matrix once and derive both the neighbour
        # indices and their (ascending) distances from the same ordering.
        all_dists = self.compute_dist(X)
        sorted_idx = np.argsort(all_dists, 1)[:, :k]
        dists = np.take_along_axis(all_dists, sorted_idx, 1)
        labels = self.ytrain[sorted_idx]  # labels of the k nearest neighbours
        y_pred = np.zeros(num_test)

        if self.weighted:
            # Inverse-distance weights, normalised per query point. Rows that
            # contain a zero distance produce inf/nan here; they are never
            # read because such rows take the exact-match shortcut below, so
            # the floating-point warnings are silenced.
            with np.errstate(divide='ignore', invalid='ignore'):
                dists_inv = 1 / dists
                weights = dists_inv / np.sum(dists_inv, 1).reshape(-1, 1)

            for i in range(num_test):
                if dists[i, 0] == 0:
                    # Exact match with a training point: return its label.
                    y_pred[i] = labels[i, 0]
                else:
                    class_weights = {}
                    max_weight = 0
                    for label, weight in zip(labels[i], weights[i]):
                        # Accumulate the weight of each class and keep track
                        # of the class with the largest running total.
                        class_weights[label] = class_weights.get(label, 0) + weight
                        if class_weights[label] > max_weight:
                            y_pred[i] = label
                            max_weight = class_weights[label]
        else:
            # Plain majority vote among the k labels; on a tie the label seen
            # first (i.e. belonging to the closer neighbour) wins.
            for i in range(num_test):
                top_labels = Counter(labels[i]).most_common()
                y_pred[i] = top_labels[0][0]
        return y_pred

    def accuracy(self, y_test, y_pred):
        """
        Return the percentage (0-100) of positions where the two label
        arrays agree.
        """
        return 100 * np.sum(y_test == y_pred) / len(y_test)

#### Unweighted KNN

In [10]:
# Build the hand-written classifier and store the training split in it.
# weighted defaults to False, so predict() uses a plain majority vote.
knn = KNN()
knn.train(X_train, y_train)

In [11]:
# Classify the held-out samples using the 7 nearest training points.
y_pred = knn.predict(X_test, k=7)
# accuracy() is declared as (y_test, y_pred); pass the arguments in that order.
print('Accuracy using conventional KNN: %.2f%%' % knn.accuracy(y_test, y_pred))

Accuracy using conventional KNN: 93.85%


#### Weighted KNN

In [12]:
# Fresh classifier on the same training split, this time with
# weighted=True so predict() uses inverse-distance weighted voting.
knn = KNN()
knn.train(X_train, y_train, weighted=True)

In [13]:
# Classify the held-out samples using the 7 nearest training points.
y_pred = knn.predict(X_test, k=7)
# accuracy() is declared as (y_test, y_pred); pass the arguments in that order.
print('Accuracy using weighted KNN: %.2f%%' % knn.accuracy(y_test, y_pred))

Accuracy using weighted KNN: 94.62%




### 2. KNN using sklearn library (unweighted and weighted neighbors)

In [14]:
from sklearn import neighbors
from sklearn.metrics import accuracy_score

#### Unweighted KNN

In [15]:
# Reference implementation: scikit-learn k-NN with 7 neighbours and p=2
# (Euclidean distance); no weights argument, so the library default applies.
model = neighbors.KNeighborsClassifier(n_neighbors=7, p=2)
model.fit(X_train, y_train)
y_pred1 = model.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); pass the ground truth first.
print('Accuracy using conventional KNN: %.2f%%' % (100 * accuracy_score(y_test, y_pred1)))

Accuracy using conventional KNN: 93.85%


#### Weighted KNN

In [16]:
# Reference implementation: scikit-learn k-NN with 7 neighbours, p=2
# (Euclidean distance) and distance-based neighbour weights.
model = neighbors.KNeighborsClassifier(n_neighbors=7, p=2, weights='distance')
model.fit(X_train, y_train)
y_pred1 = model.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); pass the ground truth first.
print('Accuracy using weighted KNN: %.2f%%' % (100 * accuracy_score(y_test, y_pred1)))

Accuracy using weighted KNN: 94.62%
