<h1>Importing required libraries and reading data from given files</h1>

In [1]:
from scipy.io import arff
import pandas as pd
import math
import operator

data = arff.loadarff('trainProdSelection.arff')
training_set = pd.DataFrame(data[0])

data = arff.loadarff('testProdSelection.arff')
testing_set = pd.DataFrame(data[0])

<h1>Checking the training set</h1>

In [2]:
training_set.head()

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,b'student',b'spend>saving',6.0,40.0,13.62,3.2804,b'C1'
1,b'student',b'spend>saving',11.0,21.0,15.32,2.0232,b'C1'
2,b'student',b'spend>saving',7.0,64.0,16.55,3.1202,b'C1'
3,b'student',b'spend>saving',3.0,47.0,15.71,3.4022,b'C1'
4,b'student',b'spend>saving',15.0,10.0,16.96,2.2825,b'C1'


<h1>Checking the testing set</h1>

In [3]:
testing_set.head()

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,b'student',b'spend<saving',12.0,19.0,14.79,3.7697,b'C1'
1,b'student',b'spend>>saving',29.0,10.0,16.19,2.4839,b'C1'
2,b'student',b'spend<<saving',28.0,60.0,15.46,1.1885,b'C1'
3,b'engineer',b'spend>saving',15.0,41.0,21.26,1.4379,b'C1'
4,b'librarian',b'spend<saving',2.0,9.0,19.7207,0.6913,b'C1'


<h1>Training set pre-processing</h1>
<h3>Converting the columns from byte code to string</h3>

In [4]:
training_set.Type = training_set.Type.str.decode("UTF-8")
training_set.LifeStyle = training_set.LifeStyle.str.decode("UTF-8")
training_set.label = training_set.label.str.decode("UTF-8")

<h3>Scaling the values of columns in training set</h3>

In [5]:
minValue = training_set.Vacation.min()
maxValue = training_set.Vacation.max()
training_set.Vacation = training_set.Vacation.apply(lambda x:(x-minValue)/(maxValue-minValue))

minValue = training_set.eCredit.min()
maxValue = training_set.eCredit.max()
training_set.eCredit = training_set.eCredit.apply(lambda x:(x-minValue)/(maxValue-minValue))

minValue = training_set.salary.min()
maxValue = training_set.salary.max()
training_set.salary = training_set.salary.apply(lambda x:(x-minValue)/(maxValue-minValue))

minValue = training_set.property.min()
maxValue = training_set.property.max()
training_set.property = training_set.property.apply(lambda x:(x-minValue)/(maxValue-minValue))

<h1>Testing set pre-processing</h1>
<h3>Converting the columns from byte code to string</h3>

In [6]:
testing_set.Type=testing_set.Type.str.decode("UTF-8")
testing_set.LifeStyle=testing_set.LifeStyle.str.decode("UTF-8")
testing_set.label=testing_set.label.str.decode("UTF-8")

<h3>Scaling the values of the columns in testing set</h3>

In [7]:
minValue = testing_set.Vacation.min()
maxValue = testing_set.Vacation.max()
testing_set.Vacation = testing_set.Vacation.apply(lambda x:(x-minValue)/(maxValue-minValue))

minValue = testing_set.eCredit.min()
maxValue = testing_set.eCredit.max()
testing_set.eCredit = testing_set.eCredit.apply(lambda x:(x-minValue)/(maxValue-minValue))

minValue = testing_set.salary.min()
maxValue = testing_set.salary.max()
testing_set.salary = testing_set.salary.apply(lambda x:(x-minValue)/(maxValue-minValue))

minValue = testing_set.property.min()
maxValue = testing_set.property.max()
testing_set.property = testing_set.property.apply(lambda x:(x-minValue)/(maxValue-minValue))

<h1>Writing the required functions for the prediction</h1>
<ul>
    <li><b>Euclidean Distance --> </b>To calculate the distance between points in a graph</li>
    <li><b>Get Neighbours --> </b>Calculates the distance between the test instance and the whole training set. Returns 'K' number of neighbours</li>
    <li><b>Get Response --> </b>It calculates the frequency of the class from 'K' nearest neighbours and returns the class which has occured the most</li>
    <li><b>Get Accuracy --> </b>It calculates the accuracy</li>
</ul>

In [8]:
def euclidean_distance(instance1, instance2, length):
    distance = 0
    for i in range(2):
        if (instance1[i]!=instance2[i]):
            distance += pow((1), 2)
    for x in range(2,length):
        distance += pow((instance1[x] - instance2[x]), 2)
    return math.sqrt(distance)
 
def get_neighbours(trainingSet, testInstance, k):
    distances = []
    length = len(testInstance)-1
    for x in range(len(trainingSet)):
        dist = euclidean_distance(testInstance, trainingSet[x], length)
        distances.append((trainingSet[x], dist))
    distances.sort(key=operator.itemgetter(1))
    neighbours = []
    for x in range(k):
        neighbours.append(distances[x])
    return neighbours
 
def get_response(neighbours):
#     print(neighbours)
    classVotes = {}
#     print(len(neighbours))
    for x in range(len(neighbours)):
#         print(neighbours[x][0][-1])
#         print(neighbours[x][1])
        response = neighbours[x][0][-1]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
#     print(classVotes)
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return sortedVotes[0][0]
 
def get_accuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

<h1>This is the driver function which calls all the other functions by taking an Integer as input. The input given is the 'K' value.</h1>

In [9]:
def knn(k):
    predictions=[]
    for x in range(len(testing_set)):
        neighbours = get_neighbours(training_set.values, testing_set.values[x], k)
        result = get_response(neighbours)
        predictions.append(result)
    accuracy = get_accuracy(testing_set.values, predictions)
    print('Accuracy: ' + repr(accuracy) + '%')

In [10]:
knn(55)

Accuracy: 33.33333333333333%
