In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import MinMaxScaler

In [2]:
cancer_data = pd.read_csv("cancer.csv")
print(cancer_data.shape)
cancer_labels = cancer_data['label'].values
cancer_features = cancer_data.drop(['label'], axis=1).values
cancer_data.head()

(569, 31)


Unnamed: 0,label,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
spam_data = pd.read_csv("spam.csv")
print(spam_data.shape)
spam_labels = spam_data['label'].values
spam_features = spam_data.drop(['label'], axis=1).values
spam_data.head()

(4601, 58)


Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_:,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,label
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [4]:
def dist(x1, x2):
    return np.linalg.norm(x1 - x2)


class KNN:
    def __init__(self, k=3):
        self.k = k
        
    def fit(self, X, y):
        self.data = X
        self.ys = y
        
    def predict(self, xs):
        return np.array([self.get_label(self.get_neighbours(x)) for x in xs])
    
    def get_neighbours(self, x):
        '''
        return list of pairs (ind, dist to x) sorted by dist
        '''
        dists = enumerate(map(lambda y: dist(x, y), self.data))
        return self.filter_neighbours(sorted(dists, key=lambda y: y[1]), self.k)
    
    def filter_neighbours(self, neighbours, k):
        raise NotImplementedError()
    
    def get_label(self, x):
        raise NotImplementedError()
        

class KNNClassifier(KNN):
    def __init__(self, k=3):
        super().__init__(k)
        
    def filter_neighbours(self, neighbours, k):
        return neighbours[:k]
        
    def get_label(self, neighbours):
        labels = {}
        for i, x in neighbours:
            if self.ys[i] not in labels:
                labels[self.ys[i]] = 0
            labels[self.ys[i]] += 1
        return sorted(list(labels.items()), key=lambda t: t[1])[0][0]
    

class RNNClassifier(KNN):
    def __init__(self, k=3):
        super().__init__(k)
        
    def filter_neighbours(self, neighbours, r):
        return list(filter(lambda x: x[1] < r, neighbours))
        
    def get_label(self, neighbours):
        labels = {}
        for i, x in neighbours:
            if self.ys[i] not in labels:
                labels[self.ys[i]] = 0
            labels[self.ys[i]] += 1
        if len(labels) == 0:
            return self.ys[0]
        return sorted(list(labels.items()), key=lambda t: t[1])[0][0]

In [5]:
def accuracy(y_pr, y_te):
    return np.mean(y_pr == y_te)

def evaluate_knn(X, y, ks, knn_constructor):
    loo = LeaveOneOut()
    match = np.zeros(len(ks))
    max_k = max(ks)
    for tr_ind, te_ind in loo.split(X):
        X_tr, X_te = X[tr_ind], X[te_ind]
        y_tr, y_te = y[tr_ind], y[te_ind]
        knn = knn_constructor(max_k)
        knn.fit(X_tr, y_tr)
        neighbours = knn.get_neighbours(X_te[0])
        for i, k in enumerate(ks):
            match[i] += 1 if knn.get_label(knn.filter_neighbours(neighbours, k)) == y_te[0] else 0
    return match / loo.get_n_splits(X)

In [6]:
print("kNN on cancer data with k in [1, 10]")
print(evaluate_knn(cancer_features, cancer_labels, list(range(1, 11)), KNNClassifier))

kNN on cancer data with k in [1, 10]
[0.91564148 0.91564148 0.89982425 0.87170475 0.84885764 0.82776801
 0.80843585 0.7943761  0.79086116 0.77680141]


In [7]:
print("kNN on spam data with k in [1, 10]")
print(evaluate_knn(spam_features, spam_labels, list(range(1, 11)), KNNClassifier))

kNN on spam data with k in [1, 10]
[0.83047164 0.83047164 0.73288416 0.70093458 0.62964573 0.61769181
 0.56596392 0.55314062 0.51119322 0.50836775]


In [8]:
# Find radius on big interval like binary search
print("Radius Neighbours on cancer data with R in [1, 50]")
print(evaluate_knn(cancer_features, cancer_labels, np.linspace(1, 50, 10), RNNClassifier))

Radius Neighbours on cancer data with R in [1, 50]
[0.37258348 0.4112478  0.66608084 0.86115993 0.87521968 0.83128295
 0.77504394 0.72759227 0.66080844 0.62038664]


In [9]:
print("Radius Neighbours on cancer data with R in [10, 35]")
print(evaluate_knn(cancer_features, cancer_labels, np.linspace(10, 35, 10), RNNClassifier))

Radius Neighbours on cancer data with R in [10, 35]
[0.53954306 0.72056239 0.82601054 0.87521968 0.88400703 0.88049209
 0.85588752 0.80492091 0.78910369 0.76977153]


In [10]:
print("Radius Neighbours on cancer data with R in [15, 30]")
print(evaluate_knn(cancer_features, cancer_labels, np.linspace(15, 30, 10), RNNClassifier))

Radius Neighbours on cancer data with R in [15, 30]
[0.80316344 0.85237258 0.87521968 0.8857645  0.88224956 0.87521968
 0.87170475 0.85588752 0.82952548 0.80140598]


In [11]:
print("Radius Neighbours on cancer data with R in [18, 28]. Best values achieved in this interval")
print(evaluate_knn(cancer_features, cancer_labels, np.linspace(18, 28, 10), RNNClassifier))

Radius Neighbours on cancer data with R in [18, 28]. Best values achieved in this interval
[0.87873462 0.87346221 0.8857645  0.88224956 0.88049209 0.87697715
 0.87346221 0.87346221 0.84885764 0.83479789]


In [12]:
print("Radius Neighbours on spam data with R in [1, 50]")
print(evaluate_knn(spam_features, spam_labels, np.linspace(1, 50, 5), RNNClassifier))

Radius Neighbours on spam data with R in [1, 50]
[0.47446207 0.40339057 0.34123017 0.33036296 0.3236253 ]


In [13]:
print("Radius Neighbours on spam data with R in [1, 20]")
print(evaluate_knn(spam_features, spam_labels, np.linspace(1, 20, 5), RNNClassifier))

Radius Neighbours on spam data with R in [1, 20]
[0.47446207 0.49945664 0.42621169 0.38969789 0.36252988]


In [14]:
print("Radius Neighbours on spam data with R in [1, 8]")
print(evaluate_knn(spam_features, spam_labels, np.linspace(1, 8, 5), RNNClassifier))

Radius Neighbours on spam data with R in [1, 8]
[0.47446207 0.5905238  0.57313627 0.48750272 0.45446642]


In [15]:
print("Radius Neighbours on spam data with R in [2, 6]")
print(evaluate_knn(spam_features, spam_labels, np.linspace(2, 6, 5), RNNClassifier))

Radius Neighbours on spam data with R in [2, 6]
[0.51858292 0.60704195 0.60247772 0.53901326 0.49576179]


In [16]:
print("Radius Neighbours on spam data with R in [3, 4]. Best values achieved in this interval")
print(evaluate_knn(spam_features, spam_labels, np.linspace(3, 4, 5), RNNClassifier))

Radius Neighbours on spam data with R in [3, 4]. Best values achieved in this interval
[0.60704195 0.62008259 0.62182134 0.61095414 0.60247772]


In [17]:
normed_cancer_features = MinMaxScaler().fit_transform(cancer_features)
normed_spam_features = MinMaxScaler().fit_transform(spam_features)

In [18]:
print("kNN on normed cancer data with k in [1, 10]")
print(evaluate_knn(normed_cancer_features, cancer_labels, list(range(1, 11)), KNNClassifier))
print("kNN on normed spam data with k in [1, 10]")
print(evaluate_knn(normed_spam_features, spam_labels, list(range(1, 11)), KNNClassifier))

kNN on normed cancer data with k in [1, 10]
[0.95254833 0.95254833 0.91739895 0.90509666 0.89103691 0.87521968
 0.85764499 0.83479789 0.81546573 0.80843585]
kNN on normed spam data with k in [1, 10]
[0.912193   0.912193   0.84286025 0.81634427 0.75722669 0.74483808
 0.69984786 0.68398174 0.64442512 0.63116714]


In [19]:
print("Radius Neighbours on normed cancer data with R in [0.1, 2]")
print(evaluate_knn(normed_cancer_features, cancer_labels, np.linspace(0.1, 2, 5), RNNClassifier))

Radius Neighbours on normed cancer data with R in [0.1, 2]
[0.37258348 0.37961336 0.20913884 0.31634446 0.35325132]


In [20]:
print("Radius Neighbours on normed cancer data with R in [0.05, 0.7]")
print(evaluate_knn(normed_cancer_features, cancer_labels, np.linspace(0.05, 0.7, 5), RNNClassifier))

Radius Neighbours on normed cancer data with R in [0.05, 0.7]
[0.37258348 0.53954306 0.73110721 0.44463972 0.29173989]


In [21]:
print("Radius Neighbours on normed cancer data with R in [0.2, 0.5]")
print(evaluate_knn(normed_cancer_features, cancer_labels, np.linspace(0.2, 0.5, 5), RNNClassifier))

Radius Neighbours on normed cancer data with R in [0.2, 0.5]
[0.4973638  0.73286467 0.771529   0.64850615 0.52196837]


In [22]:
print("Radius Neighbours on normed cancer data with R in [0.28, 0.35]. Best values achieved in this interval")
print(evaluate_knn(normed_cancer_features, cancer_labels, np.linspace(0.28, 0.35, 5), RNNClassifier))

Radius Neighbours on normed cancer data with R in [0.28, 0.35]. Best values achieved in this interval
[0.74340949 0.77328647 0.79789104 0.78031634 0.771529  ]


In [23]:
print("Radius Neighbours on normed spam data with R in [0.05, 2]")
print(evaluate_knn(normed_spam_features, spam_labels, np.linspace(0.05, 2, 5), RNNClassifier))

Radius Neighbours on normed spam data with R in [0.05, 2]
[0.53444903 0.41056292 0.40425994 0.39708759 0.39534884]


In [24]:
print("Radius Neighbours on normed spam data with R in [0.05, 1]")
print(evaluate_knn(normed_spam_features, spam_labels, np.linspace(0.05, 0.4, 5), RNNClassifier))

Radius Neighbours on normed spam data with R in [0.05, 1]
[0.53444903 0.58769833 0.47207129 0.40317322 0.38165616]


In [25]:
print("Radius Neighbours on normed spam data with R in [0.05, 0.2]. Best values achieved in this interval")
print(evaluate_knn(normed_spam_features, spam_labels, np.linspace(0.05, 0.2, 5), RNNClassifier))

Radius Neighbours on normed spam data with R in [0.05, 0.2]. Best values achieved in this interval
[0.53444903 0.58726364 0.59921756 0.55140187 0.50858509]
