## Question 0: Getting real data

In [44]:
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
from operator import itemgetter
from collections import Counter
import math

dataUrl = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
rawData = urllib.request.urlopen(dataUrl)
dataHeaders = ['code_number', 'clump_thickness', 'cell_size', 'cell_shape', 'marginal_adhesion', 'single_cell_size', 'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class']

data = np.genfromtxt(rawData, delimiter=',', dtype=int, missing_values={6:'?'}, filling_values={6:999})
#np.random.shuffle(data)

split = math.ceil(len(data) * .80)

trainingData, testData = data[:split], data[split:]
y_train = [i[10] for i in trainingData]

# Need to keep this around to check if accurate
x_train = [i[10] for i in testData] 

trainingData = np.delete(trainingData, 10, axis=1)
testData = np.delete(testData, 10, axis=1)

trainingData = np.delete(trainingData, 0, axis=1)
testData = np.delete(testData, 0, axis=1)

## Question 1: k-Nearest Neighbor Classifier

In [45]:
"""
Function to find distance between two vectors
"""
def LP_distance(x,y,p):
    totalDistance = 0
    for i,j in zip(x,y):
        newDistance = (abs(i-j)**p)
        totalDistance += newDistance
    return (totalDistance**(1/p))


"""
Function gets k neighbors for one tuple
"""
def get_k_neighbors(trainingData, testTuple, y_train, k, p):
    allDistances = []
    trainCnt = 0
    for dataEntry in trainingData:
        newDistance = LP_distance(dataEntry, testTuple, p)
        allDistances.append((newDistance, y_train[trainCnt]))
        trainCnt += 1
    allDistances = sorted(allDistances, key=itemgetter(0))
    
    kDistances = [val[1] for val in allDistances[:k]]
    return kDistances
    

def knn_classifier(x_test, x_train, y_train, k, p):
    y_pred = []
    
    for testValue in x_test:
        
        neighbors = get_k_neighbors(x_train, testValue, y_train, k, p)
        countClass = list(Counter(neighbors).keys())
        y_pred.append(countClass[0])
    
    return y_pred


result = knn_classifier(testData, trainingData, y_train, 1, 2)

sameCount = 0
for i,j in zip(x_train, result):
    if i == j:
        sameCount += 1

print(sameCount/len(x_train) * 100)
        





99.28057553956835


## Question 2: Evaluation

### Splitting up the data into 10 sections

In [46]:
np.random.shuffle(data)


splitData = data[:]

#splitData = [data[i:i + split_ten] for i in range(0, len(data), split_ten)]

count = 0
y_train = [i[10] for i in splitData]
splitData = np.delete(splitData, 10, axis=1)
splitData= np.delete(splitData, 0, axis=1)   

### Using knn classifier to get results 

In [64]:
"""
Input:

splitData: Contains test data and training data split into 10 sections
y_train: Contains training data classes

Output:
y_splitPred: Contains test data classes

"""
def tenCrossValidation(data, y_train, k, p):
    y_predList = []
    split_ten = math.ceil(len(data)/10)
    currentPos = 0

    for current in range(10):
        # Getting the current slice of data to test on
        testData = splitData[currentPos:currentPos+split_ten]
        y_testData = y_train[currentPos:currentPos+split_ten]

        # Getting the current data to be training from
        #trainingData = [e for e in splitData if e not in testData]
        #y_trainCopy = [d for d in y_train if d not in y_testData]

        trainingData = []
        y_trainCopy = []

        if currentPos == 0:
            trainingData = splitData[currentPos+split_ten:]
            y_trainCopy = y_train[currentPos+split_ten:]

        elif currentPos+split_ten >= (len(splitData)-1):
            trainingData = splitData[0:currentPos]
            y_trainCopy = y_train[0:currentPos]


        else:
            a_train = splitData[0:currentPos]
            b_train = splitData[currentPos+split_ten:]

            trainingData = np.concatenate((a_train, b_train), axis=0)

            a_y = y_train[0:currentPos]
            b_y = y_train[currentPos+split_ten:]

            y_trainCopy = np.concatenate((a_y, b_y), axis=0)


        y_pred = knn_classifier(testData, trainingData, y_trainCopy, k, p)
        y_predList.append(y_pred)

        currentPos += split_ten
    return y_predList

In [72]:
def testSensitivity(y_pred, y_train):
    tp_count = 0
    fn_count = 0
    for i,j in zip(y_pred, y_train):
        if i == 4 and j == 4:
            tp_count += 1
        if i == 2 and j == 4:
            fn_count += 1
        
    sensitivity = tp_count/(tp_count + fn_count) * 100
    return sensitivity

def testSpecificity(y_pred, y_train):
    tn_count = 0
    fp_count = 0
    for i,j in zip(y_pred, y_train):
        if i == 2 and j == 2:
            tn_count += 1
        if i == 4 and j == 2:
            fp_count += 1
    specificity = tn_count/(tn_count + fp_count) * 100
    return specificity

def testAccuracy(y_pred, y_train):
    sameCount = 0
    for i,j in zip(y_pred, y_train):
        if i == j:
            sameCount += 1
    accuracy = (sameCount/len(y_train)) * 100
    return accuracy

In [73]:
# tenCrossValidation will return a list of lists containing the
# prediction for our knn algorithm

y_predList = tenCrossValidation(data,y_train, 10,2)

# Need to flatten list of lists to a single list to allow for 
# easier testing

flat_y_predList = [item for sublist in y_predList for item in sublist]

print("Accuracy:", testAccuracy(flat_y_predList, y_train))
print("Sensitivity:", testSensitivity(flat_y_predList, y_train))
print("Specificity:", testSpecificity(flat_y_predList, y_train))

Accuracy: 96.13733905579399
Sensitivity: 93.7759336099585
Specificity: 97.37991266375546


## Sources

1. https://stackoverflow.com/questions/10695139/sort-a-list-of-tuples-by-2nd-item-integer-value
2. https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.delete.html
3. https://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python
4. 