In [1]:
import math
import random
import pandas as pd
import numpy as np

In [2]:
def normalize(x):
    min_values = x.min(axis=0)
    max_values = x.max(axis=0)
    return (x - min_values) / (max_values - min_values)

In [3]:
def knn(X, y, x_test, k):
    distances = np.sqrt(np.sum((X - x_test)**2, axis=1))
    indices = np.argpartition(distances, k)[:k]
    y_pred = np.argmax(np.bincount(y[indices]))
    return y_pred

def weighted_knn(X, y, x_test, k):
    distances = np.sqrt(np.sum((X - x_test)**2, axis=1))
    indices = np.argpartition(distances, k)[:k]
    weights = 1 / distances[indices]
    weights /= np.sum(weights)
    y_pred = np.argmax(np.bincount(y[indices], weights=weights))
    return y_pred

In [4]:
def data_prep(i, data):
    datas = np.array_split(data, 5)
    test_data = datas[i]
    datas.pop(i)
    train_data = np.concatenate(datas)

    y_train = train_data[:, -1]
    x_train = train_data[:, :-1]
    y_test = test_data[:, -1]
    x_test = test_data[:, :-1]
    
    length = len(test_data)
    
    return y_train, x_train, y_test, x_test, length

In [5]:
def prediction(length, x_train, y_train, x_test, k):
    predicted = list()
    for i in range(length):
        x = x_test[i]
        result = knn(x_train, y_train, x, k)
        predicted.append(result)
    return predicted

In [6]:
def weighted_prediction(length, x_train, y_train, x_test, k):
    predicted = list()
    for i in range(length):
        x = x_test[i]
        result = weighted_knn(x_train, y_train, x, k)
        predicted.append(result)
    return predicted

In [7]:
def find_perf_metrics(length, k, y_test, predicted):
    y_actu = pd.Series(y_test, name='Actual')
    y_pred = pd.Series(predicted, name='Predicted')
    cm = pd.crosstab(y_actu, y_pred)
    cm_np = cm.to_numpy()

    precisions = list()
    recalls = list()
    falses = list()
    tps = cm_np.diagonal().sum()
    tns = list()
        
    for i in range(16):
        tp = cm_np.diagonal()[i]
        tp_and_fn = cm_np.sum(1)[i]
        tp_and_fp = cm_np.sum(0)[i]
        fn = tp_and_fn - tp
        fp = tp_and_fp - tp
        tn = length - (tp + fn + fp)
        precisions.append(tp / tp_and_fp)
        recalls.append(tp / tp_and_fn)
        falses.append(fn + fp)
        tns.append(tn)
        
    false_count = sum(falses)
    tns = sum(tns)

    precision = sum(precisions) / 16 * 100
    recall = sum(recalls) / 16 * 100
    accuracy = (tps + tns) / (false_count + tps + tns) * 100
    
    output = "For k: {}\nPrecision: %{}\nRecall: %{}\nAccuracy: %{}\n".format(k, precision, recall, accuracy)
    
    return output

In [8]:
def test(data):
    output = ""
    for i in range(5):
        k = 2 * i + 1
        
        y_train, x_train, y_test, x_test, length = data_prep(i, data)
        
        predicted = prediction(length, x_train, y_train, x_test, k)
        
        output += find_perf_metrics(length, k, y_test, predicted)
        
    return output

In [9]:
def weighted_test(data):
    output = ""
    for i in range(5):
        k = 2 * i + 1
        
        y_train, x_train, y_test, x_test, length = data_prep(i, data)
        
        predicted = weighted_prediction(length, x_train, y_train, x_test, k)
        
        output += find_perf_metrics(length, k, y_test, predicted)
        
    return output

In [10]:
def norm_test(data):
    output = ""
    for i in range(5):
        k = 2 * i + 1
        
        y_train, x_train, y_test, x_test, length = data_prep(i, data)
        
        x_train = normalize(x_train)
        x_test = normalize(x_test)
        
        predicted = prediction(length, x_train, y_train, x_test, k)
        
        output += find_perf_metrics(length, k, y_test, predicted)
        
    return output

In [11]:
def norm_weighted_test(data):
    output = ""
    for i in range(5):
        k = 2 * i + 1
        
        y_train, x_train, y_test, x_test, length = data_prep(i, data)
        
        x_train = normalize(x_train)
        x_test = normalize(x_test)
        
        predicted = weighted_prediction(length, x_train, y_train, x_test, k)
        
        output += find_perf_metrics(length, k, y_test, predicted)
        
    return output

In [12]:
df = pd.read_csv("16P.csv", encoding="ISO-8859-1")

encoding_list = {"Personality":
{"ESTJ":0,
"ENTJ":1,
"ESFJ":2,
"ENFJ":3,
"ISTJ":4,
"ISFJ":5,
"INTJ":6,
"INFJ":7,
"ESTP":8,
"ESFP":9,
"ENTP":10,
"ENFP":11,
"ISTP":12,
"ISFP":13,
"INTP":14,
"INFP":15}}

df = df.drop(df.columns[0], axis=1)

df = df.replace(encoding_list)

array = df.to_numpy()

np.random.shuffle(array)

In [13]:
print(test(array))

For k: 1
Precision: %97.95284757174734
Recall: %97.93866608279865
Accuracy: %99.74374999999999
For k: 3
Precision: %98.8061760633038
Recall: %98.81219830176774
Accuracy: %99.85104166666666
For k: 5
Precision: %98.94289818113158
Recall: %98.9417287268162
Accuracy: %99.86770833333334
For k: 7
Precision: %98.85076794229558
Recall: %98.84968980464829
Accuracy: %99.85625
For k: 9
Precision: %99.00019229818999
Recall: %99.00399029378819
Accuracy: %99.8749895824652



In [14]:
print(norm_test(array))

For k: 1
Precision: %97.43817807799053
Recall: %97.42608299338062
Accuracy: %99.67916666666666
For k: 3
Precision: %98.53389218517822
Recall: %98.5338547022458
Accuracy: %99.81666666666666
For k: 5
Precision: %98.85272599609968
Recall: %98.85084065256773
Accuracy: %99.85625
For k: 7
Precision: %98.78493773384696
Recall: %98.78449123992884
Accuracy: %99.84791666666666
For k: 9
Precision: %98.9170050004587
Recall: %98.92028681224868
Accuracy: %99.86457204767063



In [15]:
print(weighted_test(array))

For k: 1
Precision: %97.95284757174734
Recall: %97.93866608279865
Accuracy: %99.74374999999999
For k: 3
Precision: %98.81393460852607
Recall: %98.8206705583094
Accuracy: %99.85208333333333
For k: 5
Precision: %98.93467664717011
Recall: %98.93368596379729
Accuracy: %99.86666666666667
For k: 7
Precision: %98.8585743675849
Recall: %98.8580120416656
Accuracy: %99.85729166666667
For k: 9
Precision: %98.9919116481722
Recall: %98.99558975615378
Accuracy: %99.87394782898575



In [16]:
print(norm_weighted_test(array))

For k: 1
Precision: %97.43817807799053
Recall: %97.42608299338062
Accuracy: %99.67916666666666
For k: 3
Precision: %98.58985725196136
Recall: %98.5930056007244
Accuracy: %99.82395833333332
For k: 5
Precision: %98.86084262298328
Recall: %98.85933489914999
Accuracy: %99.85729166666667
For k: 7
Precision: %98.76862692892453
Recall: %98.76745101943675
Accuracy: %99.84583333333333
For k: 9
Precision: %98.92538503603501
Recall: %98.92917470343421
Accuracy: %99.8656138011501



Error Analysis for Classification:

In conclusion, incorrect examples are usually due to these two reasons:

1- Since the algorithm predicts the class by looking at the closeness of the given sample to the classes, if there is not enough trained data close to that sample, it can make wrong predictions.

2- At the same time, if the sample is located at the intersection points of different classes, the algorithm may make an incorrect prediction.

The K value affects the accuracy in both these cases. If it is too high, it may cause incorrect predictions when there is not enough data, and if it is too low, it may make incorrect predictions when there is too much data. In this study, the ideal k value seems to be 9.

Normalization allows us to include each feature in the evaluation on an equal scale. But in this study, it does not cause a significant difference as the ranges of all the features are equal.

The weighted algorithm generally makes more accurate predictions in a variety of situations than the standard algorithm. The higher the K value, the larger the difference between the weighted algorithm and the normal algorithm.

Accuracy, precision and recall move in direct proportion throughout the entire operation. As the algorithm's predictions get better, they all go up, and as they get worse, they all go down.