In [148]:
from scipy.io import arff
import pandas as pd
import math
import operator

data = arff.loadarff('trainProdSelection.arff')
training_set = pd.DataFrame(data[0])

data = arff.loadarff('testProdSelection.arff')
testing_set = pd.DataFrame(data[0])

training_set.head()

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,b'student',b'spend>saving',6.0,40.0,13.62,3.2804,b'C1'
1,b'student',b'spend>saving',11.0,21.0,15.32,2.0232,b'C1'
2,b'student',b'spend>saving',7.0,64.0,16.55,3.1202,b'C1'
3,b'student',b'spend>saving',3.0,47.0,15.71,3.4022,b'C1'
4,b'student',b'spend>saving',15.0,10.0,16.96,2.2825,b'C1'


In [149]:
testing_set.head()

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,b'student',b'spend<saving',12.0,19.0,14.79,3.7697,b'C1'
1,b'student',b'spend>>saving',29.0,10.0,16.19,2.4839,b'C1'
2,b'student',b'spend<<saving',28.0,60.0,15.46,1.1885,b'C1'
3,b'engineer',b'spend>saving',15.0,41.0,21.26,1.4379,b'C1'
4,b'librarian',b'spend<saving',2.0,9.0,19.7207,0.6913,b'C1'


In [150]:
pd.DataFrame(data[0]).dtypes

Type          object
LifeStyle     object
Vacation     float64
eCredit      float64
salary       float64
property     float64
label         object
dtype: object

<h1>Training set pre-processing</h1>

In [151]:
training_set.Type = training_set.Type.str.decode("UTF-8")
training_set.LifeStyle = training_set.LifeStyle.str.decode("UTF-8")
training_set.label = training_set.label.str.decode("UTF-8")

In [152]:
minValue = training_set.Vacation.min()
maxValue = training_set.Vacation.max()
training_set.Vacation = training_set.Vacation.apply(lambda x:(x-minValue)/(maxValue-minValue))

minValue = training_set.eCredit.min()
maxValue = training_set.eCredit.max()
training_set.eCredit = training_set.eCredit.apply(lambda x:(x-minValue)/(maxValue-minValue))

minValue = training_set.salary.min()
maxValue = training_set.salary.max()
training_set.salary = training_set.salary.apply(lambda x:(x-minValue)/(maxValue-minValue))

minValue = training_set.property.min()
maxValue = training_set.property.max()
training_set.property = training_set.property.apply(lambda x:(x-minValue)/(maxValue-minValue))

<h1>Training set pre-processing done</h1>

<h1>Testing set pre-processing</h1>

In [153]:
testing_set.Type=testing_set.Type.str.decode("UTF-8")
testing_set.LifeStyle=testing_set.LifeStyle.str.decode("UTF-8")
testing_set.label=testing_set.label.str.decode("UTF-8")

In [154]:
minValue = testing_set.Vacation.min()
maxValue = testing_set.Vacation.max()
testing_set.Vacation = testing_set.Vacation.apply(lambda x:(x-minValue)/(maxValue-minValue))

minValue = testing_set.eCredit.min()
maxValue = testing_set.eCredit.max()
testing_set.eCredit = testing_set.eCredit.apply(lambda x:(x-minValue)/(maxValue-minValue))

minValue = testing_set.salary.min()
maxValue = testing_set.salary.max()
testing_set.salary = testing_set.salary.apply(lambda x:(x-minValue)/(maxValue-minValue))

minValue = testing_set.property.min()
maxValue = testing_set.property.max()
testing_set.property = testing_set.property.apply(lambda x:(x-minValue)/(maxValue-minValue))

<h1>Testing set pre-processing done</h1>

In [158]:
def euclideanDistance(instance1, instance2, length):
    distance = 0
    for i in range(2):
        if (instance1[i]!=instance2[i]):
            distance += pow((1), 2)
    for x in range(2,length):
        distance += pow((instance1[x] - instance2[x]), 2)
    return math.sqrt(distance)
 
def getNeighbors(trainingSet, testInstance, k):
    distances = []
    length = len(testInstance)-1
    for x in range(len(trainingSet)):
        dist = euclideanDistance(testInstance, trainingSet[x], length)
        distances.append((trainingSet[x], dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x])
    return neighbors
 
def getResponse(neighbors):
#     print(neighbors)
    classVotes = {}
#     print(len(neighbors))
    for x in range(len(neighbors)):
#         print(neighbors[x][0][-1])
#         print(neighbors[x][1])
        response = neighbors[x][0][-1]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
#     print(classVotes)
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return sortedVotes[0][0]
 
def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

In [159]:
def knn(k):
    predictions=[]
    for x in range(len(testing_set)):
        neighbors = getNeighbors(training_set.values, testing_set.values[x], k)
        result = getResponse(neighbors)
        predictions.append(result)
#         print('> predicted=' + repr(result) + ', actual=' + repr(testing_set.values[x][-1]))
    accuracy = getAccuracy(testing_set.values, predictions)
    print('Accuracy: ' + repr(accuracy) + '%')

In [160]:
for i in range(1, 100, 2):
    print('for k = '+ str(i))
    knn(i)

for k = 1
Accuracy: 23.809523809523807%
for k = 3
Accuracy: 28.57142857142857%
for k = 5
Accuracy: 28.57142857142857%
for k = 7
Accuracy: 28.57142857142857%
for k = 9
Accuracy: 23.809523809523807%
for k = 11
Accuracy: 23.809523809523807%
for k = 13
Accuracy: 19.047619047619047%
for k = 15
Accuracy: 14.285714285714285%
for k = 17
Accuracy: 19.047619047619047%
for k = 19
Accuracy: 19.047619047619047%
for k = 21
Accuracy: 19.047619047619047%
for k = 23
Accuracy: 19.047619047619047%
for k = 25
Accuracy: 19.047619047619047%
for k = 27
Accuracy: 19.047619047619047%
for k = 29
Accuracy: 19.047619047619047%
for k = 31
Accuracy: 19.047619047619047%
for k = 33
Accuracy: 19.047619047619047%
for k = 35
Accuracy: 14.285714285714285%
for k = 37
Accuracy: 14.285714285714285%
for k = 39
Accuracy: 19.047619047619047%
for k = 41
Accuracy: 19.047619047619047%
for k = 43
Accuracy: 19.047619047619047%
for k = 45
Accuracy: 19.047619047619047%
for k = 47
Accuracy: 19.047619047619047%
for k = 49
Accuracy: 19.