# Implement the K-Nearest Neighbors Algorithm

In [8]:
# import libraries
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import pandas as pd
import random
from sklearn import preprocessing, neighbors
from sklearn.model_selection import cross_validate, train_test_split

def k_nearest_neighbors(data, predict, k=3):
    """Classify ``predict`` by majority vote among its nearest training points.

    Args:
        data: Mapping from group label to a list of numeric feature vectors.
        predict: Numeric feature vector to classify.
        k: Maximum number of nearest neighbours that get a vote.

    Returns:
        A ``(vote_result, confidence)`` tuple: the group label that received
        the most votes, and the fraction of the votes cast that went to it.

    Raises:
        IndexError: If ``data`` contains no feature vectors at all, because
            there is then no vote to pick a winner from.
    """
    if len(data) >= k:
        warnings.warn('K is set to a value less than total voting groups!')

    # Convert the query point once instead of once per training point.
    target = np.array(predict)

    # Pair every training point's Euclidean distance to the query point with
    # that point's group label.
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - target)
            distances.append([euclidean_distance, group])

    # The k closest points each cast one vote for their own group.
    votes = [label for _, label in sorted(distances)[:k]]
    vote_result, winning_votes = Counter(votes).most_common(1)[0]

    # Normalise by the votes actually cast: when fewer than k training points
    # exist, dividing by k would understate the winner's share.
    confidence = winning_votes / len(votes)
    return vote_result, confidence


### Try Our K-NN Algorithm

In [9]:
# Load and clean the data set once; only the shuffle and split change per trial.
df = pd.read_csv('breast-cancer-wisconsin.data.txt')
# Replace the '?' placeholders with a numeric sentinel so every column can be
# cast to float below.
df.replace('?', -99999, inplace=True)
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
df.drop(['id'], axis=1, inplace=True)
full_data = df.astype(float).values.tolist()

test_size = 0.4
accuracies = []
for trial in range(25):
    random.shuffle(full_data)

    # create training and testing sets
    # An explicit split index avoids the `[:-0]` pitfall, which would make the
    # training slice empty whenever the test share rounds down to zero rows.
    split = len(full_data) - int(test_size * len(full_data))
    train_data = full_data[:split]
    test_data = full_data[split:]

    # Group the feature vectors by class label; the label is the last column
    # of each row and is removed from the stored features.
    train_set = {2: [], 4: []}
    test_set = {2: [], 4: []}
    for row in train_data:
        train_set[row[-1]].append(row[:-1])
    for row in test_data:
        test_set[row[-1]].append(row[:-1])

    # try our knn algorithm
    correct = 0
    total = 0
    for group in test_set:
        for data in test_set[group]:
            vote, conf = k_nearest_neighbors(train_set, data, k=5)
            if group == vote:
                correct += 1
            total += 1

    accuracies.append(correct / total)

# Mean accuracy over all trials.
print(sum(accuracies) / len(accuracies))

0.9661648745519714


### Try scikit-learn's K-NN Algorithm

In [11]:
# Load and clean the data set once; only the random train/test split changes
# from trial to trial.
df = pd.read_csv('breast-cancer-wisconsin.data.txt')
# Replace the '?' placeholders with a numeric sentinel value.
df.replace('?', -99999, inplace=True)
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
df.drop(['id'], axis=1, inplace=True)

# Features are every remaining column except 'class'; 'class' is the target.
X = np.array(df.drop(['class'], axis=1))
y = np.array(df['class'])

accuracies = []
for trial in range(25):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    clf = neighbors.KNeighborsClassifier(n_jobs=-1)
    clf.fit(X_train, y_train)

    accuracy = clf.score(X_test, y_test)
    accuracies.append(accuracy)

# Mean accuracy over all trials.
print(sum(accuracies) / len(accuracies))

0.9734285714285711
