In [25]:
import math
import statistics
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [26]:
data = pd.read_csv('data.csv', index_col='id').reset_index(drop=True)
data.drop('Unnamed: 32', axis=1, inplace=True)

# Preview data
print('Dataframe shape:', data.shape)
data.head(3)

Dataframe shape: (569, 31)


Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [27]:
# Features and labels
X = data.drop('diagnosis', axis=1)
y = data['diagnosis']

# Encode target to binary
y = (y=='M').astype('int')

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [43]:
class KNN:
    def __init__(self, metric, k=3, p=None):
        self.k = k
        self.metric = metric 
        self.p = p

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        preds = []
        for test_row in X_test:
            nn = self.get_neighbours(test_row)
            majority = statistics.mode([n[1] if not np.isscalar(n) else n for n in nn])
            preds.append(majority)
        return np.array(preds)

    def euclidean(self, dp1, dp2):
        return np.sqrt(np.sum((dp1 - dp2)**2))
    
    def manhattan(self, dp1, dp2):
        return np.sum(np.abs(dp1-dp2))
    
    def minkowski(self, dp1, dp2, p=2):
        return np.sum(np.abs(dp1-dp2)**p)**(1/p)
    
    def get_neighbours(self, test_row):
        distances = []
        for (train_row, train_class) in zip(self.X_train, self.y_train):
            if self.metric == 'euclidean':
                dist = self.euclidean(train_row, test_row)
            elif self.metric == 'manhattan':
                dist = self.manhattan(train_row, test_row)
            elif self.metric == 'minkowski':
                dist = self.minkowski(train_row, test_row)
            else:
                raise NameError('Supported metrics are euclidean, manhattan and minkowski.')
            distances.append((dist, train_class))
            neighbours = list()
            for i in range(min(self.k, len(distances))):
                neighbours.append(distances[i][1])

            return neighbours

In [44]:
def accuracy(preds, y_test):
    return 100 * (preds == y_test).mean()

In [45]:
for metric in ['euclidean', 'manhattan', 'minkowski']:
    clf = KNN(k=5, metric=metric)
    clf.fit(X_train.values, y_train.values)
    preds = clf.predict(X_test.values)
    print(f'Metric: {metric}, accuracy: {accuracy(preds, y_test):.3f} %')

IndexError: invalid index to scalar variable.