In [1]:
## This notebook aims to build a tangible understanding of the KNN algorithm, for self-learning.
## It is inspired by a post by Jason Brownlee:
## https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/


## Load-file function

import csv 
import random

def LoadData(file, split, trainSet=None, testSet=None):
    """Read a CSV dataset and randomly split its rows into train and test sets.

    Every column except the last one is converted to ``float``; the last
    column is kept as read (the rest of the notebook uses it as the class
    label). Blank lines in the file are ignored.

    Parameters
    ----------
    file : str
        Path of the CSV file to read.
    split : float
        Threshold compared against ``random.random()`` for each row: a row
        goes to ``trainSet`` when the draw is below ``split``, otherwise to
        ``testSet``.
    trainSet : list, optional
        List that training rows are appended to (mutated in place). A new
        list is created when omitted.
    testSet : list, optional
        List that test rows are appended to (mutated in place). A new list
        is created when omitted.

    Returns
    -------
    tuple
        ``(trainSet, testSet)`` -- the same list objects that were passed in,
        or the newly created ones.

    Raises
    ------
    ValueError
        If a non-label column cannot be converted to ``float``.
    """
    # Create fresh lists per call: a mutable default ([]) is evaluated once
    # and would keep accumulating rows across calls.
    if trainSet is None:
        trainSet = []
    if testSet is None:
        testSet = []

    # newline='' lets the csv module do its own newline handling.
    with open(file, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            # csv.reader yields [] for a blank line (e.g. a trailing one).
            # Skip those explicitly instead of blindly dropping the last row.
            if not row:
                continue

            # All columns but the last are numeric attributes.
            row[:-1] = [float(value) for value in row[:-1]]

            # random.random() returns a float in the half-open interval [0.0, 1.0).
            if random.random() < split:
                trainSet.append(row)
            else:
                testSet.append(row)

    return trainSet, testSet
                    

In [2]:
# Similarity computation - Euclidean Distance
# length is the number of attributes of the instance we need to compute
import math

def EuclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance between two instances.

    Only the first ``length`` positions of ``instance1`` and ``instance2``
    are compared; anything after that (such as a trailing class label) is
    ignored.
    """
    squared_diffs = [(instance1[i] - instance2[i]) ** 2 for i in range(length)]

    # Accumulate left to right, starting from 0, one term at a time.
    total = 0
    for term in squared_diffs:
        total += term

    return math.sqrt(total)

In [3]:
# get K nearest neighbours
import operator 
def GetNeighbours(trainSet, testInstance, k):
    """Return the ``k`` training instances closest to ``testInstance``.

    Distances are computed with ``EuclideanDistance`` over every position of
    the instance except the last one. The result is ordered from nearest to
    farthest. Raises ``IndexError`` if ``k`` exceeds the number of training
    instances.
    """
    # The final position is left out of the distance computation.
    n_attributes = len(testInstance) - 1

    # Pair each training instance with its distance to the test instance.
    scored = [
        (candidate, EuclideanDistance(testInstance, candidate, n_attributes))
        for candidate in trainSet
    ]

    # Order the pairs by their second item, i.e. the distance.
    scored.sort(key=lambda pair: pair[1])

    return [scored[rank][0] for rank in range(k)]


In [4]:
# Vote for the classes of test instances

def GetVotes(neighbours):
    """Return the majority class label among ``neighbours``.

    The label of each neighbour is its last element. When several labels
    share the highest count, the one seen first among the neighbours wins
    (the sort is stable). Raises ``IndexError`` when ``neighbours`` is empty.
    """
    tally = {}
    for neighbour in neighbours:
        vote = neighbour[-1]
        tally[vote] = tally.get(vote, 0) + 1

    # Highest vote count first.
    ranked = sorted(tally.items(), key=lambda entry: entry[1], reverse=True)
    winner = ranked[0]
    return winner[0]

In [5]:
# Accuracy 

def GetAccuracy(testSet, predictions):
    """Return the percentage of correct predictions, rounded to 2 decimals.

    ``predictions[i]`` is compared with the last element of ``testSet[i]``
    for every index of ``testSet``.
    """
    total = len(testSet)
    hits = sum(1 for i in range(total) if testSet[i][-1] == predictions[i])
    ratio = hits / float(total)
    return round(ratio * 100.0, 2)

In [6]:
from colorama import Fore
from colorama import Style 

CRED = '\033[91m'
CEND = '\033[0m'

def main():
    """Run the KNN demo end to end on ``iris.data``.

    Loads and splits the data, predicts a label for each test instance from
    its 3 nearest training neighbours, prints each prediction (wrong ones
    wrapped in the CRED/CEND escape codes) and finally the overall accuracy.
    """
    split = 0.67
    trainSet, testSet = [], []
    LoadData('iris.data', split, trainSet, testSet)
    print(f'Train set: {len(trainSet)!r}')
    print(f'Test set: {len(testSet)!r}')

    # generate predictions
    k = 3
    predictions = []
    for sample in testSet:
        neighbours = GetNeighbours(trainSet, sample, k)
        predicted = GetVotes(neighbours)
        predictions.append(predicted)

        actual = sample[-1]
        report = f'>predicted = {predicted!r}, actural = {actual!r}'
        if predicted != actual:
            # Highlight misclassified instances.
            report = CRED + report + CEND
        print(report)

    accuracy = GetAccuracy(testSet, predictions)
    print(f'Accuracy is {accuracy!r}%.')
    
main()
    

Train set: 105
Test set: 45
>predicted = 'Iris-setosa', actural = 'Iris-setosa'
>predicted = 'Iris-setosa', actural = 'Iris-setosa'
>predicted = 'Iris-setosa', actural = 'Iris-setosa'
>predicted = 'Iris-setosa', actural = 'Iris-setosa'
>predicted = 'Iris-setosa', actural = 'Iris-setosa'
>predicted = 'Iris-setosa', actural = 'Iris-setosa'
>predicted = 'Iris-setosa', actural = 'Iris-setosa'
>predicted = 'Iris-setosa', actural = 'Iris-setosa'
>predicted = 'Iris-setosa', actural = 'Iris-setosa'
>predicted = 'Iris-setosa', actural = 'Iris-setosa'
>predicted = 'Iris-setosa', actural = 'Iris-setosa'
>predicted = 'Iris-setosa', actural = 'Iris-setosa'
>predicted = 'Iris-setosa', actural = 'Iris-setosa'
>predicted = 'Iris-setosa', actural = 'Iris-setosa'
>predicted = 'Iris-setosa', actural = 'Iris-setosa'
>predicted = 'Iris-versicolor', actural = 'Iris-versicolor'
>predicted = 'Iris-versicolor', actural = 'Iris-versicolor'
>predicted = 'Iris-versicolor', actural = 'Iris-versicolor'
>predicted =

### Summary

#### K-nearest neighbours
#### Lazy-learning
#### Competitive-learning
#### Instance-based method
#### Similarity Measure