## Question 0: Getting real data

In [11]:
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
from operator import itemgetter
from collections import Counter
import math

dataUrl = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
# Column names in file order: index 0 is the sample id, index 10 the class label.
dataHeaders = ['code_number', 'clump_thickness', 'cell_size', 'cell_shape', 'marginal_adhesion', 'single_cell_size', 'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class']

# Download and parse inside a context manager so the HTTP response is closed
# as soon as the rows have been read (rawData stays bound, but closed).
# Column 6 (bare_nuclei) uses '?' for missing values; they are filled with 999.
with urllib.request.urlopen(dataUrl) as rawData:
    data = np.genfromtxt(rawData, delimiter=',', dtype=int, missing_values={6:'?'}, filling_values={6:999})
# NOTE: rows are used in file order; shuffling (np.random.shuffle(data)) is disabled.

# The first 80% of the rows (rounded up) train the classifier, the rest are held out.
split = math.ceil(len(data) * .80)
trainingData, testData = data[:split], data[split:]

# Pull the class labels (column 10) out before the column is removed below.
y_train = [row[10] for row in trainingData]

# NOTE: despite its name, x_train holds the true labels of the *test* rows.
# It is kept to check prediction accuracy; the name is left unchanged because
# later cells read it.
x_train = [row[10] for row in testData]

# Drop the id (0) and class (10) columns in one call so only features remain.
trainingData = np.delete(trainingData, [0, 10], axis=1)
testData = np.delete(testData, [0, 10], axis=1)

## Question 1: k-Nearest Neighbor Classifier

In [12]:
"""
Function to find distance between two vectors
"""
def LP_distance(x, y, p):
    """Return the Lp (Minkowski) distance of order p between x and y.

    The two vectors are walked in lockstep with zip, so any components
    beyond the length of the shorter one are ignored.
    """
    accumulated = 0
    # Accumulate |a - b| ** p one coordinate pair at a time.
    for powered_gap in (abs(a - b) ** p for a, b in zip(x, y)):
        accumulated += powered_gap
    # The p-th root of the accumulated sum is the distance.
    return accumulated ** (1 / p)


"""
Function gets k neighbors for one tuple
"""
def get_k_neighbors(trainingData, testTuple, y_train, k, p):
    """Return the labels of the k training rows closest to testTuple.

    Each row of trainingData is paired with the label at the same position
    in y_train, ranked by its Lp distance (order p) to testTuple, and the
    labels of the k nearest rows are returned, closest first.
    """
    scored = [
        (LP_distance(row, testTuple, p), y_train[position])
        for position, row in enumerate(trainingData)
    ]
    # Stable sort on the distance only, so equal distances keep training order.
    scored.sort(key=itemgetter(0))
    return [label for _, label in scored[:k]]
    

def knn_classifier(x_test, x_train, y_train, k, p):
    """Predict a label for every row of x_test by k-nearest-neighbour vote.

    Args:
        x_test: iterable of feature vectors to classify.
        x_train: iterable of training feature vectors.
        y_train: training labels, indexed in step with x_train.
        k: number of neighbours that vote on each prediction.
        p: order of the Lp distance used to rank neighbours.

    Returns:
        A list with one predicted label per row of x_test, in the same order.
    """
    y_pred = []
    for testValue in x_test:
        neighbors = get_k_neighbors(x_train, testValue, y_train, k, p)
        # Take the most frequent label among the neighbours. The previous
        # code took the first Counter key, which is just the nearest
        # neighbour's label and ignores the vote when k > 1. On a tie,
        # most_common keeps first-seen order, so the closer label wins.
        majorityLabel = Counter(neighbors).most_common(1)[0][0]
        y_pred.append(majorityLabel)
    return y_pred


# Classify the held-out rows with k=1 and p=2 (Euclidean distance).
result = knn_classifier(testData, trainingData, y_train, 1, 2)

# x_train holds the true labels of the test rows; count the predictions that match.
sameCount = sum(1 for actual, predicted in zip(x_train, result) if actual == predicted)

# Report accuracy as a percentage.
print(sameCount/len(x_train) * 100)
        





99.28057553956835


## Sources

1. https://stackoverflow.com/questions/10695139/sort-a-list-of-tuples-by-2nd-item-integer-value
2. https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.delete.html