# K Nearest Neighbours Classification 

#### Importing the custom classes

In [1]:
from classify import *
from wordvec import *
from get_emoji import *
import numpy as np
import operator
from random import shuffle
from IPython.display import Math

from sklearn.neighbors import KNeighborsClassifier

#### Training Word2Vec based on training set text

In [2]:
t2v_model = word2vec('./Data/Train/train.txt.text')
t2v_model.run()

#### Preprocessing of training set

In [3]:
label_file = open('./Data/Train/train.txt.labels', 'r') 
labels = label_file.read().split()
train_labels = list(map(int, labels))

In [4]:
train_classifier = Classifier('./Data/Train/train.txt.text', train_labels, 'KNN', t2v_model.tweet2vec)
train_classifier.make_features()

In [32]:
train_features = train_classifier.features
train_labels = train_classifier.labels[0:len(train_features)]

training_set = []

for i in range(len(train_features)):
    feature = list(train_features[i])
    label = train_labels[i]
    feature.append(label)
    training_set.append(feature)

#### Preprocessing of test set

In [6]:
label_file = open('./Data/Test/us_test.labels', 'r') 
labels = label_file.read().split()
test_labels = list(map(int, labels))

In [7]:
test_classifier = Classifier('./Data/Test/us_test.text', test_labels, 'KNN', t2v_model.tweet2vec)
test_classifier.make_features()

In [8]:
test_features = test_classifier.features
test_labels = test_classifier.labels

test_set = []

for i in range(len(test_features)):
    feature = list(test_features[i])
    label = test_labels[i]
    feature.append(label)
    test_set.append(feature)

#### Manhattan Distance

In [9]:
Math(r'ManhattanDistance(x,y) = \sum_{n=1}^{dim} \mid x - y\ \mid')

<IPython.core.display.Math object>

In [10]:
def manhattanDistance(instance1, instance2, length):
    distance = 0
    for x in range(length):
        distance += np.absolute(instance1[x]-instance2[x])
    return np.sqrt(distance)

#### Chebyshev Distance

In [11]:
Math(r'ChebyshevDistance(x,y) = \max_i( \mid x_i - y_i\ \mid )')

<IPython.core.display.Math object>

In [12]:
def chebyshevDistance(instance1, instance2, length):
    distance = 0
    for x in range(length):
        d = np.absolute(instance1[x] - instance2[x])
        if d>distance:
            distance = d
    return distance

#### Cosine Distance

In [13]:
def cosineDistance(instance1, instance2, length):
    dotProduct = np.dot(instance1, instance2)
    mag1 = np.linalg.norm(instance1)
    mag2 = np.linalg.norm(instance2)
    mag = mag1*mag2
    similarity = dotProduct/mag
    cosineDistance = 1 - similarity
    return cosineDistance

#### Euclidian Distance

In [14]:
def euclideanDistance(instance1, instance2, length):
	distance = 0
	for x in range(length):
		distance += np.power((instance1[x] - instance2[x]), 2)
	return np.sqrt(distance)

#### Getting the closest K Neighbours

In [15]:
def getNeighbors(trainingSet, testInstance, k, distanceMetric):
	distances = []
	length = len(testInstance)-1
	for x in range(len(trainingSet)):
		dist = distanceMetric(testInstance, trainingSet[x], length)
		distances.append((trainingSet[x], dist))
	distances.sort(key=operator.itemgetter(1))
	neighbors = []
	for x in range(k):
		neighbors.append(distances[x][0])
	return neighbors

#### Voting to classify the test tweet

In [16]:
def getResponse(neighbors):
	classVotes = {}
	for x in range(len(neighbors)):
		response = neighbors[x][-1]
		if response in classVotes:
			classVotes[response] += 1
		else:
			classVotes[response] = 1
	sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
	return sortedVotes[0][0]

#### KNN using Euclidean Distance Metric

In [24]:
correct_count = 0
for i in test_set[0:200]:
    prediction = getResponse(getNeighbors(training_set, i, 5, euclideanDistance))
    print("Actual answer is {} and the Predicted answer is {} ".format(i[-1],prediction))
    if(prediction == i[-1]):
        correct_count = correct_count + 1

accuracy = (correct_count/len(test_set[0:200]))*100
print("Accuracy of KNN classification using Euclidean Distance Metric is {} ".format(accuracy))

Actual answer is 2 and the Predicted answer is 1 
Actual answer is 10 and the Predicted answer is 0 
Actual answer is 6 and the Predicted answer is 0 
Actual answer is 1 and the Predicted answer is 8 
Actual answer is 16 and the Predicted answer is 0 
Actual answer is 17 and the Predicted answer is 4 
Actual answer is 4 and the Predicted answer is 10 
Actual answer is 10 and the Predicted answer is 1 
Actual answer is 12 and the Predicted answer is 1 
Actual answer is 18 and the Predicted answer is 5 
Actual answer is 0 and the Predicted answer is 8 
Actual answer is 1 and the Predicted answer is 11 
Actual answer is 5 and the Predicted answer is 0 
Actual answer is 7 and the Predicted answer is 11 
Actual answer is 7 and the Predicted answer is 0 
Actual answer is 4 and the Predicted answer is 0 
Actual answer is 0 and the Predicted answer is 16 
Actual answer is 7 and the Predicted answer is 1 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 10 and the Predicted an

Actual answer is 11 and the Predicted answer is 1 
Actual answer is 7 and the Predicted answer is 0 
Actual answer is 14 and the Predicted answer is 4 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 18 and the Predicted answer is 10 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 9 and the Predicted answer is 9 
Actual answer is 1 and the Predicted answer is 16 
Actual answer is 0 and the Predicted answer is 12 
Actual answer is 7 and the Predicted answer is 15 
Actual answer is 6 and the Predicted answer is 2 
Actual answer is 0 and the Predicted answer is 2 
Actual answer is 2 and the Predicted answer is 0 
Actual answer is 16 and the Predicted answer is 2 
Actual answer is 2 and the Predicted answer is 2 
Actual answer is 1 and the Predicted answer is 11 
Actual answer is 6 and the Predicted answer is 2 
Actual answer is 1 and the Predicted answer is 7 
Actual answer is 10 and the Predicted answer is 1 
Actual answer is 4 and the Predicted ans

In [36]:
neigh = KNeighborsClassifier(n_neighbors=5, metric = 'euclidean')
neigh.fit(train_features, train_labels)
predicted = neigh.predict(test_features[0:200])

correct_count = 0
for i in range(200):
    if test_labels[i] == predicted[i]:
        correct_count = correct_count + 1

accuracy = (correct_count/200)*100
print(accuracy)
print("Accuracy of KNN classification using Euclidean Distance Metric is {} ".format(accuracy))

18.0
Accuracy of KNN classification using Euclidean Distance Metric is 18.0 


#### KNN using Manhattan Distance Metric

In [43]:
correct_count = 0
for i in test_set[0:200]:
    prediction = getResponse(getNeighbors(training_set, i, 5, manhattanDistance))
    print("Actual answer is {} and the Predicted answer is {} ".format(i[-1],prediction))
    if(prediction == i[-1]):
        correct_count = correct_count + 1

accuracy = (correct_count/200)*100
print("Accuracy of KNN classification using Manhattan Distance Metric is {} ".format(accuracy))

Actual answer is 2 and the Predicted answer is 1 
Actual answer is 10 and the Predicted answer is 0 
Actual answer is 6 and the Predicted answer is 2 
Actual answer is 1 and the Predicted answer is 0 
Actual answer is 16 and the Predicted answer is 0 
Actual answer is 17 and the Predicted answer is 4 
Actual answer is 4 and the Predicted answer is 10 
Actual answer is 10 and the Predicted answer is 1 
Actual answer is 12 and the Predicted answer is 1 
Actual answer is 18 and the Predicted answer is 0 
Actual answer is 0 and the Predicted answer is 8 
Actual answer is 1 and the Predicted answer is 11 
Actual answer is 5 and the Predicted answer is 0 
Actual answer is 7 and the Predicted answer is 11 
Actual answer is 7 and the Predicted answer is 0 
Actual answer is 4 and the Predicted answer is 0 
Actual answer is 0 and the Predicted answer is 16 
Actual answer is 7 and the Predicted answer is 0 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 10 and the Predicted an

Actual answer is 11 and the Predicted answer is 1 
Actual answer is 7 and the Predicted answer is 0 
Actual answer is 14 and the Predicted answer is 4 
Actual answer is 0 and the Predicted answer is 2 
Actual answer is 18 and the Predicted answer is 17 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 9 and the Predicted answer is 9 
Actual answer is 1 and the Predicted answer is 16 
Actual answer is 0 and the Predicted answer is 12 
Actual answer is 7 and the Predicted answer is 15 
Actual answer is 6 and the Predicted answer is 2 
Actual answer is 0 and the Predicted answer is 2 
Actual answer is 2 and the Predicted answer is 0 
Actual answer is 16 and the Predicted answer is 0 
Actual answer is 2 and the Predicted answer is 2 
Actual answer is 1 and the Predicted answer is 3 
Actual answer is 6 and the Predicted answer is 0 
Actual answer is 1 and the Predicted answer is 7 
Actual answer is 10 and the Predicted answer is 0 
Actual answer is 4 and the Predicted answ

In [40]:
neigh = KNeighborsClassifier(n_neighbors=5, metric = 'manhattan')
neigh.fit(train_features, train_labels)
predicted = neigh.predict(test_features[0:200])

correct_count = 0
for i in range(200):
    if test_labels[i] == predicted[i]:
        correct_count = correct_count + 1

accuracy = (correct_count/200)*100
print(accuracy)
print("Accuracy of KNN classification using Manhattan Distance Metric is {} ".format(accuracy))

15.0
Accuracy of KNN classification using Manhattan Distance Metric is 15.0 


#### KNN using Chebyshev Distance Metric

In [44]:
correct_count = 0
for i in test_set[0:200]:
    prediction = getResponse(getNeighbors(training_set, i, 5, chebyshevDistance))
    print("Actual answer is {} and the Predicted answer is {} ".format(i[-1],prediction))
    if(prediction == i[-1]):
        correct_count = correct_count + 1

accuracy = (correct_count/200)*100
print("Accuracy of KNN classification using Chebyshev Distance Metric is {} ".format(accuracy))

Actual answer is 2 and the Predicted answer is 0 
Actual answer is 10 and the Predicted answer is 0 
Actual answer is 6 and the Predicted answer is 0 
Actual answer is 1 and the Predicted answer is 2 
Actual answer is 16 and the Predicted answer is 2 
Actual answer is 17 and the Predicted answer is 1 
Actual answer is 4 and the Predicted answer is 2 
Actual answer is 10 and the Predicted answer is 1 
Actual answer is 12 and the Predicted answer is 1 
Actual answer is 18 and the Predicted answer is 5 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 1 and the Predicted answer is 0 
Actual answer is 5 and the Predicted answer is 11 
Actual answer is 7 and the Predicted answer is 0 
Actual answer is 7 and the Predicted answer is 12 
Actual answer is 4 and the Predicted answer is 0 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 7 and the Predicted answer is 1 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 10 and the Predicted answ

Actual answer is 11 and the Predicted answer is 0 
Actual answer is 7 and the Predicted answer is 4 
Actual answer is 14 and the Predicted answer is 16 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 18 and the Predicted answer is 10 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 9 and the Predicted answer is 4 
Actual answer is 1 and the Predicted answer is 6 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 7 and the Predicted answer is 17 
Actual answer is 6 and the Predicted answer is 0 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 2 and the Predicted answer is 2 
Actual answer is 16 and the Predicted answer is 0 
Actual answer is 2 and the Predicted answer is 2 
Actual answer is 1 and the Predicted answer is 16 
Actual answer is 6 and the Predicted answer is 0 
Actual answer is 1 and the Predicted answer is 3 
Actual answer is 10 and the Predicted answer is 17 
Actual answer is 4 and the Predicted ans

In [42]:
neigh = KNeighborsClassifier(n_neighbors=5, metric = 'chebyshev')
neigh.fit(train_features, train_labels)
predicted = neigh.predict(test_features[0:200])

correct_count = 0
for i in range(200):
    if test_labels[i] == predicted[i]:
        correct_count = correct_count + 1

accuracy = (correct_count/200)*100
print(accuracy)
print("Accuracy of KNN classification using Euclidian Distance Metric is {} ".format(accuracy))

19.0
Accuracy of KNN classification using Euclidian Distance Metric is 19.0 


#### KNN using Cosine Distance Metric

In [46]:
correct_count = 0
for i in test_set[0:200]:
    prediction = getResponse(getNeighbors(training_set, i, 15, cosineDistance))
    print("Actual answer is {} and the Predicted answer is {} ".format(i[-1],prediction))
    if(prediction == i[-1]):
        correct_count = correct_count + 1

accuracy = (correct_count/200)*100
print("Accuracy of KNN classification using Cosine Distance Metric is {} ".format(accuracy))

Actual answer is 2 and the Predicted answer is 2 
Actual answer is 10 and the Predicted answer is 11 
Actual answer is 6 and the Predicted answer is 6 
Actual answer is 1 and the Predicted answer is 1 
Actual answer is 16 and the Predicted answer is 15 
Actual answer is 17 and the Predicted answer is 15 
Actual answer is 4 and the Predicted answer is 4 
Actual answer is 10 and the Predicted answer is 10 
Actual answer is 12 and the Predicted answer is 12 
Actual answer is 18 and the Predicted answer is 18 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 1 and the Predicted answer is 1 
Actual answer is 5 and the Predicted answer is 5 
Actual answer is 7 and the Predicted answer is 7 
Actual answer is 7 and the Predicted answer is 7 
Actual answer is 4 and the Predicted answer is 4 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 7 and the Predicted answer is 7 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 10 and the Predicted 

Actual answer is 2 and the Predicted answer is 2 
Actual answer is 11 and the Predicted answer is 11 
Actual answer is 7 and the Predicted answer is 7 
Actual answer is 14 and the Predicted answer is 14 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 18 and the Predicted answer is 17 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 9 and the Predicted answer is 8 
Actual answer is 1 and the Predicted answer is 1 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 7 and the Predicted answer is 7 
Actual answer is 6 and the Predicted answer is 6 
Actual answer is 0 and the Predicted answer is 0 
Actual answer is 2 and the Predicted answer is 2 
Actual answer is 16 and the Predicted answer is 16 
Actual answer is 2 and the Predicted answer is 2 
Actual answer is 1 and the Predicted answer is 1 
Actual answer is 6 and the Predicted answer is 6 
Actual answer is 1 and the Predicted answer is 1 
Actual answer is 10 and the Predicted answ