# Implement K Nearest Neighbors Algorithm

In [20]:
# import libraries
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import pandas as pd
import random

def k_nearest_neighbors(data, predict, k=3):
    """Classify ``predict`` by majority vote among its nearest training points.

    Args:
        data: Mapping of group label -> list of feature vectors belonging to
            that group.
        predict: Feature vector to classify; must have the same length as the
            training vectors.
        k: Number of nearest neighbours allowed to vote.

    Returns:
        A ``(vote_result, confidence)`` tuple: the winning group label and the
        fraction of the votes actually cast that went to it.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``data`` holds no training
            points (previously this surfaced as an opaque ``IndexError``).

    Warns:
        UserWarning: If ``k`` does not exceed the number of groups in ``data``.
    """
    if k < 1:
        raise ValueError('k must be at least 1')
    if len(data) >= k:
        warnings.warn('K is set to a value less than total voting groups!')

    # Convert the query point once instead of once per training point.
    target = np.array(predict)

    # Euclidean distance from the query to every training point, tagged with
    # the group the point belongs to.
    distances = [
        (np.linalg.norm(np.array(features) - target), group)
        for group in data
        for features in data[group]
    ]
    if not distances:
        raise ValueError('data contains no training points')

    # Ties in distance fall back to comparing the group labels, exactly as the
    # pairwise sort always did.
    nearest = sorted(distances)[:k]
    votes = Counter(group for _, group in nearest)
    vote_result, vote_count = votes.most_common(1)[0]

    # Divide by the number of neighbours that actually voted: when fewer than
    # k training points exist, dividing by k would understate the confidence.
    confidence = vote_count / len(nearest)
    return vote_result, confidence


In [21]:
# load the data set
df = pd.read_csv('breast-cancer-wisconsin.data.txt')
# handle missing data: '?' entries are replaced by a large negative sentinel
# rather than dropping the affected rows
df.replace('?', -99999, inplace=True)
# drop the id column; axis is passed by keyword because the positional form
# was deprecated in pandas 1.x and raises TypeError from pandas 2.0 onwards
df.drop(['id'], axis=1, inplace=True)
print(df.head())

   clump_thickness  unif_cel_size  unif_cell_shape  marg_adhesion  \
0                5              1                1              1   
1                5              4                4              5   
2                3              1                1              1   
3                6              8                8              1   
4                4              1                1              3   

   single_epith_cell_size bare_nuclei  bland_chrom  norm_nucleoli  mitoses  \
0                       2           1            3              1        1   
1                       7          10            3              2        1   
2                       2           2            3              1        1   
3                       3           4            3              7        1   
4                       2           1            3              1        1   

   class  
0      2  
1      2  
2      2  
3      2  
4      2  


In [22]:
# Cast every column to float, then turn the frame into a plain list of rows
# (each row is itself a list of floats).
float_frame = df.astype(float)
full_data = float_frame.values.tolist()
# Preview the first five rows.
print(full_data[:5])

[[5.0, 1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0], [5.0, 4.0, 4.0, 5.0, 7.0, 10.0, 3.0, 2.0, 1.0, 2.0], [3.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 1.0, 1.0, 2.0], [6.0, 8.0, 8.0, 1.0, 3.0, 4.0, 3.0, 7.0, 1.0, 2.0], [4.0, 1.0, 1.0, 3.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0]]


In [23]:
# Shuffle the rows in place so the train/test slices taken below are random.
# NOTE(review): no seed is set, so the split (and the reported accuracy)
# changes from run to run.
random.shuffle(full_data)
# Preview the first five rows after shuffling.
print(full_data[:5])

[[8.0, 4.0, 5.0, 1.0, 2.0, -99999.0, 7.0, 3.0, 1.0, 4.0], [8.0, 7.0, 8.0, 5.0, 10.0, 10.0, 7.0, 2.0, 1.0, 4.0], [1.0, 1.0, 1.0, 3.0, 2.0, 3.0, 1.0, 1.0, 1.0, 2.0], [5.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0], [3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0]]


In [26]:
# create training and testing sets
test_size = 0.4
train_set = {2: [], 4: []}
test_set = {2: [], 4: []}

# Split on an explicit index counted from the front. Slicing with a negative
# count breaks when that count is 0: ``[:-0]`` is empty and ``[-0:]`` is the
# whole list, which would put every row into the test set.
n_test = int(test_size * len(full_data))
split_at = len(full_data) - n_test
train_data = full_data[:split_at]
test_data = full_data[split_at:]

# populate the empty dictionaries for the train and test sets: the last
# element of each row is the class label (the dictionary key), everything
# before it is the feature vector
for row in train_data:
    train_set[row[-1]].append(row[:-1])

for row in test_data:
    test_set[row[-1]].append(row[:-1])


In [27]:
# Evaluate the classifier on the held-out test set.
correct = 0
total = 0

for group, samples in test_set.items():
    for sample in samples:
        vote, conf = k_nearest_neighbors(train_set, sample, k=5)
        total += 1
        if vote != group:
            # Misclassified: print the confidence of the wrong vote.
            print(conf)
            continue
        correct += 1

print(f'Accuracy: {correct/total}')

0.6
0.8
1.0
0.6
Accuracy: 0.985663082437276
