## Question 0: Getting real data

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
from operator import itemgetter
from collections import Counter
import math

dataUrl = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
rawData = urllib.request.urlopen(dataUrl)
dataHeaders = ['code_number', 'clump_thickness', 'cell_size', 'cell_shape', 'marginal_adhesion', 'single_cell_size', 'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class']

data = np.genfromtxt(rawData, delimiter=',', dtype=int, missing_values={6:'?'}, filling_values={6:999})
#np.random.shuffle(data)

split = math.ceil(len(data) * .80)

trainingData, testData = data[:split], data[split:]
y_train = [i[10] for i in trainingData]

# Need to keep this around to check if accurate
x_train = [i[10] for i in testData] 

trainingData = np.delete(trainingData, 10, axis=1)
testData = np.delete(testData, 10, axis=1)

trainingData = np.delete(trainingData, 0, axis=1)
testData = np.delete(testData, 0, axis=1)

## Question 1: k-Nearest Neighbor Classifier

In [3]:
"""
Function to find distance between two vectors
"""
def LP_distance(x,y,p):
    totalDistance = 0
    for i,j in zip(x,y):
        newDistance = (abs(i-j)**p)
        totalDistance += newDistance
    return (totalDistance**(1/p))


"""
Function gets k neighbors for one tuple
"""
def get_k_neighbors(trainingData, testTuple, y_train, k, p):
    allDistances = []
    trainCnt = 0
    for dataEntry in trainingData:
        newDistance = LP_distance(dataEntry, testTuple, p)
        allDistances.append((newDistance, y_train[trainCnt]))
        trainCnt += 1
    allDistances = sorted(allDistances, key=itemgetter(0))
    
    kDistances = [val[1] for val in allDistances[:k]]
    return kDistances
    

def knn_classifier(x_test, x_train, y_train, k, p):
    y_pred = []
    
    for testValue in x_test:
        
        neighbors = get_k_neighbors(x_train, testValue, y_train, k, p)
        countClass = list(Counter(neighbors).keys())
        y_pred.append(countClass[0])
    
    return y_pred


result = knn_classifier(testData, trainingData, y_train, 1, 2)

sameCount = 0
for i,j in zip(x_train, result):
    if i == j:
        sameCount += 1

print(sameCount/len(x_train) * 100)
        





99.28057553956835


## Question 2: Evaluation

### Splitting up the data into 10 sections

In [7]:
np.random.shuffle(data)
split_ten = len(data)//10

splitData = []
x_splitTrain = []

splitData = [data[i:i + split_ten] for i in range(0, len(data), split_ten)]

count = 0
for split_slice in splitData:
    x_train = [i[10] for i in split_slice]
    x_splitTrain.append(x_train)
    split_slice = np.delete(split_slice, 10, axis=1)
    splitData[count] = np.delete(split_slice, 0, axis=1)
    count += 1    

[[  3   2   1   2   2   1   3   1   1]
 [  3   1   3   1   2 999   2   1   1]
 [  4   1   1   1   2   3   2   1   1]
 [  3   1   1   1   2   1   2   1   1]
 [  1   1   1   3   2   1   1   1   1]
 [  5   1   1   1   2   1   1   1   1]
 [  1   3   3   2   2   1   7   2   1]
 [  3   1   1   1   2   1   2   1   1]
 [  1   1   2   1   3 999   1   1   1]
 [  5   1   2   1   2   1   1   1   1]
 [  4  10   8   5   4   1  10   1   1]
 [  1   1   1   1   2   1   1   1   1]
 [ 10  10  10   8   6   8   7  10   1]
 [  4   1   1   3   1   1   2   1   1]
 [  1   1   1   1   2   1   2   1   1]
 [  5   4   6  10   2  10   4   1   1]
 [  1   1   1   1   2   1   3   1   1]
 [  3   1   2   1   2   1   3   1   1]
 [  4   1   1   1   2   1   3   6   1]
 [  2   1   1   1   2   1   1   1   1]
 [  1   1   1   1   2   1   2   1   1]
 [  1   2   1   3   2   1   2   1   1]
 [  1   1   1   1   2   3   3   1   1]
 [  4   1   1   1   2   1   1   1   1]
 [  5   4   6   6   4  10   4   3   1]
 [  5   7  10   6   5  10

### Using knn classifier to get results 

In [10]:
"""
Input:

splitData: Contains test data and training data split into 10 sections
x_splitTrain: Contains training data classes

Output:
x_splitPred: Contains test data classes

"""
i = 0
y_predList = []
for testData in splitData:
    j = 0
    predictList = []
    for trainingData, trainingClasses in zip(splitData,x_splitTrain):
        if(j == i):
            j+=1
            continue
        else:
            prediction = knn_classifier(testData, trainingData, trainingClasses, 1, 2)
            predictList.append(prediction)
            j += 1
    y_predList.append(predictList)  
    i += 1

print(y_predList)
    


[[[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 4, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2], [2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 4, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2], [2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 4, 4, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2,

In [None]:
def testSensitivity(y_pred, y_train):
    return sensitivity

def testSpecificity(y_pred, y_train):
    return specificity

def testAccuracy(y_pred, y_train):
    sameCount = 0
    for i,j in zip(y_pred, y_train):
        if i == j:
            sameCount += 1
    accuracy = sameCount/len(x_train) * 100
    return accuracy

## Sources

1. https://stackoverflow.com/questions/10695139/sort-a-list-of-tuples-by-2nd-item-integer-value
2. https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.delete.html
3. 