# Libraries

Import the required libraries

In [1]:
import numpy as np
from math import sqrt
import time
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Sample Data

Loading the Irish and Ionosphere sample data and format it.

In [2]:
#Load iris data
iris = load_iris()

#Load ionosphere data
ionosphere_data = np.genfromtxt("ionosphere.txt", delimiter=',', names=True, dtype=None)

ionos_X = []
ionos_X = [row for row in ionosphere_data] #format ionos_X values
ionos_X = [list(x) for x in ionos_X]
ionos_X = np.array(ionos_X) # Convert list to array

ionos_y = [row[-1] for row in ionosphere_data] #format ionos_y values
ionos_y = np.array(ionos_y) #Convert list to array

# Sorting function

In [3]:
def sort_any_array(arr):
    for i in range(len(arr)):
        for j in range(len(arr) - 1):
            if arr[j] > arr[j+1]:
                arr[j], arr[j + 1] = arr[j + 1], arr[j]
    return arr

# Calculate Euclidean Distances

In [4]:
def euclidean_distances(X_test, X_train):
    distance = 0.0
    for i in range(len(X_test)-1):
        distance += (X_test[i] - X_train[i])**2
    dist = np.sqrt(distance)
    return dist

# KNN Algorithm

KNN algorithm function for general K value

In [5]:
def knn_algorithm(X, y, test_size=0.3,train_size=0.7, random_state=2707, k=1):
    #split the data into training and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
    
    predicted_labels = []
    prediction = ({})
    
    # Training the modal using X_train & y_train
    for j in range(len(X_test)):
        distances = []
        for i in range(len(X_train)):
            distance = euclidean_distances(X_test[j,:], X_train[i,:]) # Call function to calculate distance
            distances.append([distance, i])
        
        neighbors = sort_any_array(distances) # Call function to sort the neighbors based on calculated euclidean distance 
        targets = [y_train[neighbors[i][1]] for i in range(k)] # Get the k neigbhors labels
        predicted_labels.append(max(targets, key=targets.count))

    prediction['predicted_labels'] = predicted_labels
    prediction['accuracy'] = np.mean(predicted_labels == y_test)
    prediction['error'] = np.mean(predicted_labels != y_test)
    
    return prediction

# 1NN algorithm

Calculate Error rate for Irish dataset for 1NN

In [6]:
n = 1
iris_result = knn_algorithm(iris.data, iris.target, random_state=2707, k = n)
print("For k=%s: Accuracy = %s , Error Rate = %s" %(n, iris_result["accuracy"], iris_result["error"]))

For k=1: Accuracy = 0.9210526315789473 , Error Rate = 0.07894736842105263


Calculate Error rate for Irish dataset for 1NN

In [7]:
n = 1
ionos_result = knn_algorithm(ionos_X, ionos_y, random_state=2707, k = n)
print("For k=%s: Accuracy = %s , Error Rate = %s" %(n, ionos_result["accuracy"], ionos_result["error"]))

For k=1: Accuracy = 0.8977272727272727 , Error Rate = 0.10227272727272728


# 3NN algorithm

Calculate Error rate for Irish dataset for 3NN

In [8]:
n = 3
iris_result = knn_algorithm(iris.data, iris.target, random_state=2707, k = n)
print("For k=%s: Accuracy = %s , Error Rate = %s" %(n, iris_result["accuracy"], iris_result["error"]))

For k=3: Accuracy = 0.8947368421052632 , Error Rate = 0.10526315789473684


Calculate Error rate for Ionosphere dataset for 3NN

In [9]:
n = 3
ionos_result3 = knn_algorithm(ionos_X, ionos_y, random_state=2707, k = n)
print("For k=%s: Accuracy = %s , Error Rate = %s" %(n, ionos_result["accuracy"], ionos_result["error"]))

For k=3: Accuracy = 0.8977272727272727 , Error Rate = 0.10227272727272728


# KNN algorithm for a general K

Irish Dataset: Calculating error rate for K 1 to N

In [10]:
iris_errors = []
N = 20

for i in range(1, N):
    knn = knn_algorithm(iris.data, iris.target, random_state=2707, k=i)
    print("For k=%s: Accuracy = %s , Error Rate = %s" %(i, knn["accuracy"], knn["error"]))

For k=1: Accuracy = 0.9210526315789473 , Error Rate = 0.07894736842105263
For k=2: Accuracy = 0.9210526315789473 , Error Rate = 0.07894736842105263
For k=3: Accuracy = 0.8947368421052632 , Error Rate = 0.10526315789473684
For k=4: Accuracy = 0.9210526315789473 , Error Rate = 0.07894736842105263
For k=5: Accuracy = 0.868421052631579 , Error Rate = 0.13157894736842105
For k=6: Accuracy = 0.8947368421052632 , Error Rate = 0.10526315789473684
For k=7: Accuracy = 0.8947368421052632 , Error Rate = 0.10526315789473684
For k=8: Accuracy = 0.9210526315789473 , Error Rate = 0.07894736842105263
For k=9: Accuracy = 0.9210526315789473 , Error Rate = 0.07894736842105263
For k=10: Accuracy = 0.9210526315789473 , Error Rate = 0.07894736842105263
For k=11: Accuracy = 0.9210526315789473 , Error Rate = 0.07894736842105263
For k=12: Accuracy = 0.9210526315789473 , Error Rate = 0.07894736842105263
For k=13: Accuracy = 0.8947368421052632 , Error Rate = 0.10526315789473684
For k=14: Accuracy = 0.894736842105

Ionosphere Dataset: Calculating error rate for K 1 to N

In [11]:
ionos_errors = []
N = 20

for i in range(1, N):
    knn = knn_algorithm(iris.data, iris.target, random_state=2707, k=i)
    print("For k=%s: Accuracy = %s , Error Rate = %s" %(i, knn["accuracy"], knn["error"]))

For k=1: Accuracy = 0.9210526315789473 , Error Rate = 0.07894736842105263
For k=2: Accuracy = 0.9210526315789473 , Error Rate = 0.07894736842105263
For k=3: Accuracy = 0.8947368421052632 , Error Rate = 0.10526315789473684
For k=4: Accuracy = 0.9210526315789473 , Error Rate = 0.07894736842105263
For k=5: Accuracy = 0.868421052631579 , Error Rate = 0.13157894736842105
For k=6: Accuracy = 0.8947368421052632 , Error Rate = 0.10526315789473684
For k=7: Accuracy = 0.8947368421052632 , Error Rate = 0.10526315789473684
For k=8: Accuracy = 0.9210526315789473 , Error Rate = 0.07894736842105263
For k=9: Accuracy = 0.9210526315789473 , Error Rate = 0.07894736842105263
For k=10: Accuracy = 0.9210526315789473 , Error Rate = 0.07894736842105263
For k=11: Accuracy = 0.9210526315789473 , Error Rate = 0.07894736842105263
For k=12: Accuracy = 0.9210526315789473 , Error Rate = 0.07894736842105263
For k=13: Accuracy = 0.8947368421052632 , Error Rate = 0.10526315789473684
For k=14: Accuracy = 0.894736842105