In [505]:
import math
from sklearn.model_selection import train_test_split
import operator
from sklearn.metrics import accuracy_score
from classifier import classifier
from scipy.io import arff
import pandas as pd
from knn import *

In [506]:
def load_data_set(filename):
    """Read an ARFF source into a pandas DataFrame of integers.

    Args:
        filename: Value handed straight to ``scipy.io.arff.loadarff``.

    Returns:
        A DataFrame built from the loaded records, with every column cast
        via ``astype('int')``.
    """
    # loadarff returns a (records, metadata) pair; only the records are used.
    records, _meta = arff.loadarff(filename)
    return pd.DataFrame(records).astype('int')


def euclidean_distance(instance1, instance2, length):
    """Euclidean distance over the first ``length`` positions of two sequences.

    Args:
        instance1: Indexable sequence of numbers.
        instance2: Indexable sequence of numbers.
        length: Number of leading positions to compare; positions at or
            beyond this index are ignored.

    Returns:
        The square root of the summed squared differences (0.0 when
        ``length`` is 0).
    """
    squared_diffs = [
        (instance1[pos] - instance2[pos]) ** 2 for pos in range(length)
    ]
    # Accumulate left to right so the floating-point result is unchanged.
    total = 0
    for term in squared_diffs:
        total += term
    return math.sqrt(total)


def get_neighbors(train_x, test_x, k):
    """Return the ``k`` training rows closest to ``test_x``.

    The last element of ``test_x`` is left out of the distance computation
    (``len(test_x) - 1`` leading positions are compared).

    Args:
        train_x: Indexable collection of training rows.
        test_x: The query row.
        k: Number of neighbours to return.

    Returns:
        A list of ``k`` rows from ``train_x`` ordered by increasing distance;
        rows at equal distance keep their original relative order.

    Raises:
        IndexError: If ``k`` is larger than ``len(train_x)``.
    """
    feature_count = len(test_x) - 1
    scored = [
        (train_x[i], euclidean_distance(test_x, train_x[i], feature_count))
        for i in range(len(train_x))
    ]
    ranked = sorted(scored, key=operator.itemgetter(1))
    return [ranked[pos][0] for pos in range(k)]


def get_predictions(neighbors):
    """Pick the most frequent label among ``neighbors``.

    The label of each neighbour is its last element. When several labels
    share the highest count, the one that appears first in ``neighbors``
    is returned.

    Args:
        neighbors: Sequence of rows whose last element is the label.

    Returns:
        The label with the highest vote count.

    Raises:
        IndexError: If ``neighbors`` is empty.
    """
    votes = {}
    for pos in range(len(neighbors)):
        label = neighbors[pos][-1]
        votes[label] = votes.get(label, 0) + 1
    # Stable descending sort: tied labels stay in first-seen order.
    ranked = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    winner = ranked[0]
    return winner[0]


def get_accuracy(test_set, predictions):
    """Fraction of rows in ``test_set`` whose label matches the prediction.

    The true label of each row is its last element; ``predictions[i]`` is
    compared against the label of ``test_set[i]``.

    Args:
        test_set: Sequence of rows whose last element is the true label.
        predictions: Sequence of predicted labels, indexed like ``test_set``.

    Returns:
        A float in [0.0, 1.0]. An empty ``test_set`` yields 0.0 instead of
        raising ``ZeroDivisionError``.

    Raises:
        IndexError: If ``predictions`` has fewer entries than ``test_set``.
    """
    total = len(test_set)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    correct = sum(
        1 for i in range(total) if test_set[i][-1] == predictions[i]
    )
    return correct / total


class knn(classifier):
    """k-nearest-neighbours classifier built on ``get_neighbors`` and
    ``get_predictions``."""

    def __init__(self, k):
        """Create an unfitted classifier that votes among ``k`` neighbours."""
        super().__init__()
        self.k = k
        self.train_x = []

    def fit(self, X, Y):
        """Remember ``X`` as the training rows.

        ``Y`` is accepted but not used: labels are read later from the last
        element of each stored row, so ``X`` must carry the label there.
        """
        self.train_x = X

    def predict(self, X):
        """Return a list with one predicted label per row of ``X``.

        Each row is matched against the stored training rows and the
        majority label of its ``k`` nearest neighbours is taken.
        """
        return [
            get_predictions(get_neighbors(self.train_x, X[i], self.k))
            for i in range(len(X))
        ]


In [507]:
# Load the ARFF file into an all-integer DataFrame.
df = load_data_set('data/PhishingData.arff')
# Hold out 20% of the rows as the test split. With shuffle=False the rows are
# split in their original order, so random_state should have no effect here.
trainSet, testSet = train_test_split(df, test_size=0.2, random_state=42, shuffle=False)
# Convert both splits to plain lists of rows; the helpers index rows by position
# and read the class label from the last element of each row.
trainSet = trainSet.values.tolist()
testSet = testSet.values.tolist()

# Evaluate the classifier for every k from 2 through 32 and print its accuracy
# on the held-out split.
# NOTE(review): the k values are compared on the same split that reports the
# score; consider a separate validation split if the best k is chosen from this.
for k in range(2, 33):
    knn_clf = knn(k)
    # knn.fit keeps only its first argument (full rows, label included); the
    # second argument is there to match the fit(X, Y) signature.
    knn_clf.fit(trainSet, trainSet)
    # Test rows still carry their label; get_neighbors leaves the last element
    # out of the distance computation.
    hyp = knn_clf.predict(testSet)
    score = get_accuracy(testSet, hyp)
    print("k=", k, 'Score=', score)

k= 2 Score= 0.8745387453874539
k= 3 Score= 0.8856088560885609
k= 4 Score= 0.8929889298892989
k= 5 Score= 0.8892988929889298
k= 6 Score= 0.8892988929889298
k= 7 Score= 0.8929889298892989
k= 8 Score= 0.8892988929889298
k= 9 Score= 0.8966789667896679
k= 10 Score= 0.8966789667896679
k= 11 Score= 0.8929889298892989
k= 12 Score= 0.8892988929889298
k= 13 Score= 0.8819188191881919
k= 14 Score= 0.8856088560885609
k= 15 Score= 0.8745387453874539
k= 16 Score= 0.8819188191881919
k= 17 Score= 0.8634686346863468
k= 18 Score= 0.8671586715867159
k= 19 Score= 0.8560885608856088
k= 20 Score= 0.8487084870848709
k= 21 Score= 0.8450184501845018
k= 22 Score= 0.8376383763837638
k= 23 Score= 0.8339483394833949
k= 24 Score= 0.8413284132841329
k= 25 Score= 0.8376383763837638
k= 26 Score= 0.8376383763837638
k= 27 Score= 0.8339483394833949
k= 28 Score= 0.8339483394833949
k= 29 Score= 0.8339483394833949
k= 30 Score= 0.8376383763837638
k= 31 Score= 0.8339483394833949
k= 32 Score= 0.8339483394833949
