# AUC ML LabExercise: k - Nearest Neighbours 
Ungraded exercise, deadline Thursday 9 November, 23:59.


In [7]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In this notebook you will implement the k-Nearest Neighbour classifier.

The step-wise recipe for k-nn:

(1) Handle Data: Open the dataset from CSV and split into test/train datasets.

(2) Similarity: Calculate the distance between two data instances.

(3) Neighbors: Locate k most similar data instances.

(4) Majority vote: Get the neighbours to vote on the class of the test points.

(5) Accuracy: Summarize the accuracy of predictions.


We provide the main function that brings all the steps together and your task is to implement the missing functions.


In [41]:
#In this notebook we will work with the Iris dataset again
#First we import all the modules that you need for this exercise
from sklearn import datasets # to load the dataset
from sklearn.model_selection import train_test_split #to split in train and test set
from sklearn.model_selection import cross_val_score #BONUS
from sklearn.metrics import classification_report, accuracy_score # for reporting
from scipy.spatial import distance #to calculate the Euclidean distance
from collections import Counter #to count unique occurances of items in array, for majority voting


# Missing function 1) given a training set and a test instance, use
# the Euclidean distance function to calculate the distance to every training point.
# Return a list of (index, distance) pairs for the k nearest neighbours of the test instance.
def get_neighbours(training_set, test_instance, k):
    """Find the k training points closest to ``test_instance``.

    The Euclidean distance from ``test_instance`` to every row of
    ``training_set`` is computed with ``distance.euclidean`` and the rows are
    ranked from nearest to farthest.

    Returns a list of ``(index, distance)`` tuples for the ``k`` closest rows,
    nearest first, where ``index`` is the row's position in ``training_set``.
    If ``k`` exceeds the number of rows, every row is returned.
    """
    # Keep the row index next to each distance so the matching label can be
    # looked up after sorting.
    ranked = [
        (row, distance.euclidean(training_set[row], test_instance))
        for row in range(len(training_set))
    ]
    # The sort is stable: rows at equal distance keep their original order.
    ranked.sort(key=lambda pair: pair[1])
    return ranked[:k]
    

# Missing function 2) given the list of nearest neighbours (each entry starts with a training index),
# tally up their classes to vote on the test case class.
# Return the label of the most common class.
def get_majority_vote(neighbours, training_labels):
    """Return the most frequent class label among the given neighbours.

    Each entry of ``neighbours`` must hold a training-set index at position 0
    (the shape returned by ``get_neighbours``); that index is used to look up
    the neighbour's class in ``training_labels``.
    """
    # Votes are tallied in neighbour order, one vote per neighbour.
    votes = Counter(training_labels[neighbour[0]] for neighbour in neighbours)
    winner, _count = votes.most_common(1)[0]
    return winner
    
    
# setting up main executable method
def main(k_start, k_end, random_state=None):
    """Tune k on a hold-out validation split of Iris, then report test accuracy.

    The data is split 60% / 20% / 20% into training, validation and test
    sets. Every k in ``range(k_start, k_end)`` (``k_end`` is exclusive) is
    scored on the validation set; the smallest k with the highest validation
    accuracy is then used to classify the test set. Per-instance predictions,
    the overall accuracy and a classification report are printed.

    Parameters
    ----------
    k_start : int
        Smallest number of neighbours to try; must be at least 1.
    k_end : int
        Upper bound (exclusive) of the k values to try; must exceed ``k_start``.
    random_state : int or None, optional
        Seed forwarded to both ``train_test_split`` calls. The default
        ``None`` gives a different random split on every run; pass an int for
        reproducible results.

    Raises
    ------
    ValueError
        If the requested range of k values is empty or contains k < 1.
    """
    # Fail early with a clear message instead of crashing later inside
    # max() (empty range) or the majority vote (k < 1 yields no neighbours).
    if k_start < 1 or k_end <= k_start:
        raise ValueError(
            'Need 1 <= k_start < k_end, got k_start=' + str(k_start)
            + ', k_end=' + str(k_end)
        )

    # load the data and create the training, validation and test sets
    iris = datasets.load_iris()
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        iris.data, iris.target, test_size=0.4, random_state=random_state)
    # split the 40% hold-out evenly: one half to choose k, the other half
    # for the final accuracy estimate on data not used for that choice
    X_crossv, X_test, y_crossv, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=random_state)

    # BONUS: rather than fixing k arbitrarily, score every candidate k
    #        on the validation set and keep the most accurate one.
    candidate_ks = list(range(k_start, k_end))
    accuracy_diff_k = [
        accuracy_score(y_crossv, _predict_labels(X_train, y_train, X_crossv, k))
        for k in candidate_ks
    ]
    print(accuracy_diff_k)

    # list.index returns the first maximum, so ties go to the smallest k
    k = candidate_ks[accuracy_diff_k.index(max(accuracy_diff_k))]

    # for each instance in the test set, get nearest neighbours and majority vote on predicted class
    predictions = _predict_labels(X_train, y_train, X_test, k)
    for x, (predicted, actual) in enumerate(zip(predictions, y_test)):
        print('Classifying test instance number ' + str(x) + ":")
        print('Predicted label=' + str(predicted) + ', Actual label=' + str(actual))

    # summarize performance of the classification
    print('\nFor value k = ' + str(k) + ', the overall accuracy of the model is: ' + str(accuracy_score(y_test, predictions)) + "\n")

    # BONUS: detailed per-class report. The labels are passed explicitly so the
    #        report still lines up with target_names when a class happens to be
    #        absent from the (small, randomly drawn) test split.
    target_names = ['class 0', 'class 1', 'class 2']
    report = classification_report(
        y_test, predictions,
        labels=list(range(len(target_names))),
        target_names=target_names)
    print('A detailed classification report: \n\n' + report)


def _predict_labels(X_train, y_train, X_query, k):
    """Classify each row of ``X_query`` by k-NN majority vote over the training data.

    Returns a list with one predicted label per query row, in row order.
    """
    predictions = []
    for instance in X_query:
        neighbours = get_neighbours(X_train, instance, k)
        predictions.append(get_majority_vote(neighbours, y_train))
    return predictions
    

# Try k = 1..19 (the upper bound passed to main is exclusive), pick the k with
# the best validation accuracy and evaluate it on the test set.
main(1, 20)


[0.96666666666666667, 0.96666666666666667, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Classifying test instance number 0:
Predicted label=2, Actual label=2
Classifying test instance number 1:
Predicted label=0, Actual label=0
Classifying test instance number 2:
Predicted label=0, Actual label=0
Classifying test instance number 3:
Predicted label=2, Actual label=2
Classifying test instance number 4:
Predicted label=0, Actual label=0
Classifying test instance number 5:
Predicted label=1, Actual label=1
Classifying test instance number 6:
Predicted label=1, Actual label=2
Classifying test instance number 7:
Predicted label=2, Actual label=2
Classifying test instance number 8:
Predicted label=0, Actual label=0
Classifying test instance number 9:
Predicted label=0, Actual label=0
Classifying test instance number 10:
Predicted label=1, Actual label=1
Classifying test instance number 11:
Predicted label=2, Actual label=1
Classifying test instance numb