In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import math
from sklearn.model_selection import KFold

%matplotlib inline

In [2]:
# Load the dataset from 'cleaned_dataset.csv' in the notebook's working directory.
df = pd.read_csv('./cleaned_dataset.csv')

In [3]:
# Preview the first rows of the loaded frame (head() shows five by default).
df.head()

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
# Show the column labels of the loaded frame.
df.columns

Index(['id', 'clump_thickness', 'cell_size_uniformity',
       'cell_shape_uniformity', 'marginal_adhesion', 'epithelial_cell_size',
       'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses',
       'class'],
      dtype='object')

In [5]:
# Write the frame to 'modified.csv' with neither the header row nor the index.
# NOTE(review): presumably done so the file can be re-read with integer
# column labels in the next cell — confirm this round-trip is intended.
df.to_csv('modified.csv', header=False, index=False)

In [6]:
# Re-read the headerless file; with header=None pandas labels the columns
# with integers starting at 0, which is how later cells address them.
df = pd.read_csv('modified.csv', header=None)

In [7]:
# Remove the column labelled 0 so it is not used as a feature.
# NOTE(review): per the df.columns output above this should be `id` — verify.
# This cell rebinds `df`, so running it a second time fails because
# column 0 no longer exists.
df = df.drop([0], axis=1)

In [8]:
def accuracy(y_label, y_pred):
    """Return the percentage of positions at which the prediction matches the label.

    Parameters
    ----------
    y_label : indexable, sized collection of true labels. Its length decides
        how many positions are compared.
    y_pred : indexable collection of predictions, read at the same positions.

    Returns
    -------
    float
        ``matches / len(y_label) * 100``.

    Raises
    ------
    ZeroDivisionError
        If ``y_label`` is empty.
    """
    total = len(y_label)
    # Index-based comparison; the truthiness of each `==` result decides a match.
    matches = sum(
        1 for position in range(total) if y_label[position] == y_pred[position]
    )
    return (matches / total) * 100

In [9]:
def euclidean_distance(point1, point2):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    point1 : indexable, sized sequence of numbers. Its length sets how many
        coordinates are compared.
    point2 : indexable sequence of numbers, read at the same positions as
        ``point1``; any extra trailing entries are ignored.

    Returns
    -------
    float
        Square root of the summed squared coordinate differences
        (``0.0`` when ``point1`` is empty).
    """
    squared_deltas = [
        math.pow(point1[axis] - point2[axis], 2) for axis in range(len(point1))
    ]
    # Plain left-to-right running total of the squared differences.
    running_total = 0
    for squared in squared_deltas:
        running_total += squared
    return math.sqrt(running_total)

In [10]:
def mean(acc):
    """Return the arithmetic mean of the values in ``acc``.

    Parameters
    ----------
    acc : iterable, sized collection of numbers.

    Returns
    -------
    The running total of the values divided by ``len(acc)``.

    Raises
    ------
    ZeroDivisionError
        If ``acc`` is empty.
    """
    running_total = 0
    for value in acc:
        running_total = running_total + value
    return running_total / len(acc)

In [11]:
def mode(labels):
    """Return the most frequently occurring value in ``labels``.

    Parameters
    ----------
    labels : iterable of hashable values.

    Returns
    -------
    The value with the highest count, as reported by
    ``Counter.most_common(1)``.

    Raises
    ------
    IndexError
        If ``labels`` is empty.
    """
    tally = Counter(labels)
    top_entries = tally.most_common(1)  # [(value, count)] or [] when empty
    winner, _count = top_entries[0]
    return winner

In [12]:
def knn(data, query, k):
    """Classify ``query`` by majority vote among its ``k`` nearest rows of ``data``.

    Parameters
    ----------
    data : sized, indexable collection of rows. Each row holds the feature
        values followed by the class label in its last position.
    query : sequence of feature values, compared against ``row[:-1]`` of
        every row in ``data``.
    k : int
        Number of nearest rows that take part in the vote; must be >= 1.

    Returns
    -------
    The most common label among the ``k`` nearest rows (see ``mode``).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``data`` is empty.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {!r}".format(k))
    if len(data) == 0:
        raise ValueError("data must contain at least one row")

    neighbor_distances_and_indices = [
        (euclidean_distance(example[:-1], query), index)
        for index, example in enumerate(data)
    ]

    # Tuples sort by distance first and row index second, so rows at equal
    # distance are ranked by their position in `data`.
    k_nearest_distances_and_indices = sorted(neighbor_distances_and_indices)[:k]

    # Read the label from the last position: that is exactly the element
    # `example[:-1]` left out of the distance, whatever the row width is.
    k_nearest_labels = [
        data[index][-1] for _distance, index in k_nearest_distances_and_indices
    ]

    return mode(k_nearest_labels)

In [13]:
 def manual_split(df, split_index=600, k=3):
    """Evaluate ``knn`` on a positional hold-out split and print the accuracy.

    Rows before ``split_index`` form the training set and the remaining rows
    form the test set. The column labelled 10 is treated as the class label;
    it is removed from the test rows to build the queries, while the training
    rows are passed to ``knn`` unchanged (label included).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a column labelled 10 holding the class label.
    split_index : int, default 600
        Positional row index at which the frame is split.
    k : int, default 3
        Number of neighbours passed to ``knn``.

    Returns
    -------
    None
        The accuracy (in percent, as computed by ``accuracy``) is printed.
    """
    label_column = 10

    test_frame = df[split_index:]
    y_label = test_frame[label_column].values

    # Convert to arrays once instead of on every loop iteration.
    test_queries = test_frame.drop(label_column, axis=1).values
    train_rows = df[:split_index].values

    y_pred = [knn(train_rows, clf_query, k=k) for clf_query in test_queries]

    print(accuracy(y_label, y_pred))

In [14]:
# Run the hold-out evaluation; the function prints the accuracy in percent.
manual_split(df)

96.96969696969697


In [15]:
# Six-fold splitter. No shuffle argument is passed, so scikit-learn's default
# applies (documented as shuffle=False, i.e. folds follow row order).
kf = KFold(n_splits=6)
# Display the configured number of splits.
kf.get_n_splits(df)

6

In [16]:
# Convert the frame to an ndarray so the fold index arrays produced by
# kf.split can select rows directly (X[train_index], X[test_index]).
X = np.array(df)

In [17]:
# Cross-validated accuracy of knn (k=5): one accuracy value per fold of `kf`.
cv_acc = []
for train_index, test_index in kf.split(X):
    train_values, test_values = X[train_index], X[test_index]

    # The label is the last column — the same column stripped from the
    # queries below. Slicing with -1 yields a 1-D array of labels.
    y_label = test_values[:, -1]
    test_values = test_values[:, :-1].tolist()
    # Training rows keep their label; knn reads it from the last position.
    train_values = train_values.tolist()

    y_pred = [knn(train_values, clf_query, k=5) for clf_query in test_values]
    cv_acc.append(accuracy(y_label, y_pred))

In [18]:
# Average of the per-fold accuracies collected in cv_acc (in percent).
mean(cv_acc)

96.29015620394931