In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

In [3]:

start_time = time.time()
personality = {
    "ESTJ":0,
    "ENTJ":1,
    "ESFJ":2,
    "ENFJ":3,
    "ISTJ":4,
    "ISFJ":5,
    "INTJ":6,
    "INFJ":7,
    "ESTP":8,
    "ESFP":9,
    "ENTP":10,
    "ENFP":11,
    "ISTP":12,
    "ISFP":13,
    "INTP":14,
    "INFP":15,
}


dataset = pd.read_csv('16P.csv' , encoding = "ISO-8859-1")
del dataset["Response Id"]
for element in personality:
    dataset = dataset.replace(to_replace={"Personality":element}, value={"Personality":personality[element]})

def split_data_set(data_set,split_num =5):
    length = int(len(data_set) / split_num+ 0.5)
    def split_it(data_set, length, start_index=0):
        if start_index> len(dataset):
            return list()
        return [data_set[start_index:start_index+length]] + split_it(data_set,length, start_index= start_index+length)
     
    return split_it(data_set, length)

    
def cross_validation(splitted_data_set,test_num):
    Test = splitted_data_set[test_num]
    Train = pd.DataFrame({})

    for element in range(len(splitted_data_set)):
        if element != test_num:
            Train = pd.concat([Train,splitted_data_set[element]])

    x_train, y_train, x_test, y_test = Train.iloc[:,:60], Train.iloc[:,60], Test.iloc[:,:60], Test.iloc[:,60]
    return np.array(x_train), np.array(y_train),np.array(x_test),np.array(y_test)

def KNN_Algorithm(x_train, y_train, x_test, y_test, k_constant):
    test_results = list()

    x_train = x_train[np.newaxis,:,:]
    start_point,length = 0, 1
    train_index_limit = length

    while start_point<= len(x_test):
        end_point = start_point + length
        if end_point > len(x_test) :
            end_point = len(x_test) + 1
            train_index_limit = end_point- start_point-1

        test_part = x_test[start_point:end_point,:]
        test_part = test_part[:,np.newaxis,:]
        
        distances =  (x_train-test_part)
        distances *= distances
        distances = distances.sum(axis=2)
        distances = np.sqrt(distances)

        idx = np.argpartition(distances, k_constant, axis=1) # neighbours indexes 
        idx = idx[:,:k_constant]

        predicts = y_train[idx]
        predicts.resize(train_index_limit,)
        real_results = y_test[start_point:end_point]
        #print(real_results, predicts)
        test_results.extend(real_results == predicts)
        start_point = end_point

    return test_results

def all_calculation(dataset):
    split_num = 5
    k_list = [1,3,5,7,9]
    test_nums = [i for i in range(1,split_num+1)]
    column_names = ["Accuracy", "Precision", "Recall"]

    splitted_data_set = split_data_set(dataset, split_num)
    df_indexes = pd.MultiIndex.from_product([k_list,test_nums])
    results_df = pd.DataFrame(data = [] ,index= df_indexes ,columns= column_names)#index= [k_list,test_nums]
    results_df.index.names =["K constant", "Test Cases"]

    for k_counter in k_list:
        time1 = time.time()
        for test_counter in test_nums:

            x_train, y_train, x_test, y_test = cross_validation(splitted_data_set, test_num = (test_counter-1))
            test_results = KNN_Algorithm(x_train, y_train, x_test[:,:], y_test[:], k_counter)

            def accuracy_func(test_results):
                true_false_num = np.bincount(test_results)
                false_num, true_num = true_false_num[0], true_false_num[1]
                accuracy = true_num/(false_num +true_num)
                return accuracy

            def truth_values(accuracy):
                total_length = len(x_test)
                true_positive = int(total_length*accuracy * accuracy +0.5)
                true_negative = int(total_length*accuracy *(1-accuracy) +0.5)
                false_positive = int(total_length *(1-accuracy) *accuracy +0.5)
                false_negative = int(total_length *(1-accuracy) *(1-accuracy) +0.5)
                return true_positive, true_negative, false_positive, false_negative

            def recall_func(true_pos, false_neg):
                recall = round(true_pos/(true_pos+ false_neg),4)
                return recall

            def precision_func(true_pos, false_pos):
                precision = round(true_pos/(true_pos+ false_pos),4)
                return(precision)


            accuracy = accuracy_func(test_results)
            true_pos, true_neg, false_pos, false_neg = truth_values(accuracy)
            recall = recall_func(true_pos, false_neg)
            precision = precision_func(true_pos, false_pos)

            results_df.loc[k_counter,test_counter]["Accuracy"] = accuracy *100
            results_df.loc[k_counter,test_counter]["Precision"] = precision *100
            results_df.loc[k_counter,test_counter]["Recall"] = recall *100
            print(f"K-constant : {k_counter}, Test Case : {test_counter}, "+ "Elapsed Time : %.1f seconds" %(time.time()-time1))
            time1 = time.time()

    return results_df

def normalize_dataframe(dataset):
    df = dataset.copy()
    for column in df.columns:
        if column == "Personality":
            continue
        else:
            df[column] = (df[column] - df[column].min())/(df[column].max()-df[column].min())
    return df




In [4]:
normal_results_df = all_calculation(dataset)

normalized_df = normalize_dataframe(dataset)
normalized_results_df = all_calculation(normalized_df)

finish_time = time.time()-start_time
print("Total Elapsed Time : %.3f seconds" %(finish_time))

K-constant : 1, Test Case : 1, Elapsed Time : 54.6 seconds
K-constant : 1, Test Case : 2, Elapsed Time : 54.8 seconds
K-constant : 1, Test Case : 3, Elapsed Time : 55.2 seconds
K-constant : 1, Test Case : 4, Elapsed Time : 55.6 seconds
K-constant : 1, Test Case : 5, Elapsed Time : 54.3 seconds
K-constant : 3, Test Case : 1, Elapsed Time : 52.9 seconds
K-constant : 3, Test Case : 2, Elapsed Time : 53.1 seconds
K-constant : 3, Test Case : 3, Elapsed Time : 53.3 seconds
K-constant : 3, Test Case : 4, Elapsed Time : 52.5 seconds
K-constant : 3, Test Case : 5, Elapsed Time : 51.9 seconds
K-constant : 5, Test Case : 1, Elapsed Time : 52.7 seconds
K-constant : 5, Test Case : 2, Elapsed Time : 53.2 seconds
K-constant : 5, Test Case : 3, Elapsed Time : 52.5 seconds
K-constant : 5, Test Case : 4, Elapsed Time : 52.3 seconds
K-constant : 5, Test Case : 5, Elapsed Time : 51.1 seconds
K-constant : 7, Test Case : 1, Elapsed Time : 53.4 seconds
K-constant : 7, Test Case : 2, Elapsed Time : 51.9 secon

In [5]:
print("Normal Results")
print(normal_results_df)


Normal Results
                        Accuracy Precision Recall
K constant Test Cases                            
1          1           97.908333     97.91  99.96
           2           97.633333     97.64  99.94
           3           97.691667     97.69  99.95
           4           97.883333     97.88  99.96
           5           97.791483     97.79  99.95
3          1           97.908333     97.91  99.96
           2           97.516667     97.51  99.94
           3           97.708333     97.71  99.95
           4           97.783333     97.78  99.95
           5           97.758147     97.76  99.95
5          1               97.75     97.75  99.95
           2           97.491667      97.5  99.93
           3           97.533333     97.53  99.94
           4           97.616667     97.62  99.94
           5           97.616468     97.62  99.94
7          1               97.55     97.55  99.94
           2           97.558333     97.56  99.94
           3           97.591667   

In [6]:
print("\n\nNormalized Datas Results")
print(normalized_results_df)




Normalized Datas Results
                        Accuracy Precision Recall
K constant Test Cases                            
1          1           97.358333     97.36  99.93
           2           97.358333     97.36  99.93
           3           97.383333     97.38  99.93
           4           97.508333      97.5  99.94
           5           97.249771     97.25  99.92
3          1           97.358333     97.36  99.93
           2               97.25     97.25  99.92
           3           97.316667     97.32  99.92
           4           97.358333     97.36  99.93
           5           97.149762     97.15  99.91
5          1           97.141667     97.14  99.91
           2           97.133333     97.13  99.91
           3               97.25     97.25  99.92
           4           97.233333     97.23  99.92
           5           97.166431     97.17  99.91
7          1           97.083333     97.08  99.91
           2               97.05     97.05  99.91
           3           