# Import Library

In [1]:
import numpy as np
import pandas as pd
import random
import math
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn import metrics #akurasi
from sklearn.metrics import confusion_matrix #confussion matrix
from sklearn.metrics import recall_score # recall
from sklearn.metrics import precision_score #precision
from sklearn.metrics import f1_score #f1 score
import dill
# Try to restore a previously saved interpreter session from 'KNN.db'.
# NOTE(review): dill session files are pickle-based, so loading one can execute
# arbitrary code -- only load files you created yourself. Restoring a session
# also brings back variables from an earlier run, which can hide stale state.
try:
    dill.load_session('KNN.db')
except Exception as ex:
    # Any failure (for example, no session file yet) is reported and the notebook
    # continues with a fresh namespace. The message means "Data not saved yet!".
    print("Data Belum Tersimpan!")

# Manual Method (Preprocessing and Support)

In [2]:
def count_label(data):
    """Collect the distinct class labels found in the last column of ``data``.

    Labels are kept in order of first appearance.

    Returns
    -------
    tuple
        ``(labels, n_labels)`` where ``labels`` is an integer numpy array of the
        distinct labels and ``n_labels`` is how many there are.
    """
    seen = [data[0][-1]]
    for row in data:
        label = row[-1]
        # Explicit equality scan (rather than ``in``) keeps plain ``==`` semantics.
        if not any(known == label for known in seen):
            seen.append(label)
    return np.array(seen).astype(int), len(seen)

In [3]:
def evaluation(arr_acc):
    """Return the mean and population standard deviation of ``arr_acc``.

    The variance divides by ``len(arr_acc)`` (no Bessel correction).
    """
    center = np.mean(arr_acc)
    deviations = arr_acc - center
    variance = sum(deviations * deviations) / len(arr_acc)
    return center, math.sqrt(variance)

In [4]:
def handleMissValue(data):
    """Replace NaN entries with the mean of their column within the same class.

    Rows are bucketed by the class label in the last column, using
    ``int(label) - 1`` as the bucket index (so labels are expected to be
    1..n_labels). Within each bucket every NaN is replaced by that bucket's
    NaN-ignoring column mean.

    Parameters
    ----------
    data : 2-D array-like of numbers
        One sample per row; the last column is the class label.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``data``. Rows are ordered bucket by
        bucket, not in their input order. The input is not modified.
    """
    n_labels = count_label(data)[1]
    buckets = [[] for _ in range(n_labels)]
    for row in data:
        buckets[int(row[-1]) - 1].append(row)

    # Float buffer on purpose: an integer buffer would silently truncate the
    # imputed means (and any non-integer feature) when rows are written into it.
    filled_data = np.zeros((len(data), len(data[0])), dtype=float)
    counter = 0
    for rows in buckets:
        if not rows:
            continue
        group = np.array(rows, dtype=float)  # copy, so the caller's rows stay intact
        col_mean = np.nanmean(group, axis=0)
        group = np.where(np.isnan(group), col_mean, group)
        filled_data[counter:counter + len(group)] = group
        counter += len(group)
    return filled_data

In [5]:
def randomize(arr):
    """Shuffle ``arr`` in place (walking from the last index down) and return it.

    Uses the module-level ``random`` generator, so seeding ``random`` makes the
    result reproducible.
    """
    position = len(arr) - 1
    while position > 0:
        pick = random.randint(0, position)
        arr[position], arr[pick] = arr[pick], arr[position]
        position -= 1
    return arr

In [6]:
def normalize(data):
    """Min-max scale every column except the last one into the range [0, 1].

    The last column (the class label) is left untouched. A feature column whose
    values are all equal has zero range; it is mapped to 0 instead of producing
    NaN through a division by zero.

    Parameters
    ----------
    data : 2-D array-like of numbers
        One sample per row; the last column is the label.

    Returns
    -------
    numpy.ndarray
        A new float array with the scaled features and the original labels.
    """
    data = np.array(data).astype(float)
    features = data[:, :-1]
    min_value = np.min(features, axis=0)
    value_range = np.max(features, axis=0) - min_value
    # Constant columns: divide by 1 so (x - min) / range evaluates to 0, not NaN.
    value_range[value_range == 0] = 1.0
    data[:, :-1] = (features - min_value) / value_range
    return data

In [7]:
def extraction(file, is_normal = True, is_random = True):
    """Load the dataset from an Excel file and run the preprocessing pipeline.

    Steps: read the sheet, impute missing values per class, drop duplicate
    rows, optionally min-max normalise the features, optionally shuffle.

    Parameters
    ----------
    file : path or file-like accepted by ``pandas.read_excel``
    is_normal : bool
        When true, features are scaled with ``normalize``.
    is_random : bool
        When true, rows are shuffled with ``randomize``.

    Returns
    -------
    list of rows when ``is_random`` is true, otherwise a numpy array.
    """
    data = pd.read_excel(file)
    data = handleMissValue(np.array(data))
    # drop_duplicates returns a new frame; the result must be kept, otherwise
    # the de-duplication has no effect.
    data = pd.DataFrame(data).drop_duplicates()
    data = np.array(data)
    if is_normal:
        data = normalize(data)
    if is_random:
        data = randomize(data.tolist())
    return data

In [8]:
def crossValidation(data, cross_val, cross_index, label_index):
    """Build the train/test split for one fold of a k-fold cross validation.

    The first ``len(data) // cross_val`` rows are swapped in place with the block
    that starts at ``(cross_index - 1) * fold_size``. The first block then
    becomes the test set and the remaining rows the training set.

    Parameters
    ----------
    data : list of rows or 2-D numpy array
        Modified in place and returned so the caller can pass it to the next fold.
    cross_val : int
        Number of folds.
    cross_index : int
        1-based index of the fold to use as the test set.
    label_index : int
        Column index of the class label.

    Returns
    -------
    tuple
        ``(data, train_features, train_labels, test_features, test_labels)``;
        the label arrays are integer arrays of shape ``(n, 1)``.
    """
    split_test = int(len(data) * (1 / cross_val))
    change_index = (cross_index - 1) * split_test

    for i in range(split_test):
        j = i + change_index
        if isinstance(data, np.ndarray):
            # Row views alias the array, so a tuple swap would duplicate one row;
            # fancy indexing copies the right-hand side first.
            data[[i, j]] = data[[j, i]]
        else:
            data[i], data[j] = data[j], data[i]

    frame = pd.DataFrame(data)
    train_part = frame.iloc[split_test:]
    test_part = frame.iloc[:split_test]

    # Built-in int: the ``np.int`` alias was removed from NumPy.
    data_train_feature = np.array(train_part.drop([label_index], axis=1))
    data_train_label = np.array(train_part[[label_index]].astype(int))
    data_test_feature = np.array(test_part.drop([label_index], axis=1))
    data_test_label = np.array(test_part[[label_index]].astype(int))

    return data, data_train_feature, data_train_label, data_test_feature, data_test_label

In [9]:
def transformArrLabel(X_label, Y_label):
    """Flatten two column-shaped label collections into 1-D numpy arrays.

    Each input is a sequence of rows; the first element of every row is taken.
    Returns ``(X_flat, Y_flat)``.
    """
    X_transformed = [row[0] for row in X_label]
    Y_transformed = [row[0] for row in Y_label]
    return np.array(X_transformed), np.array(Y_transformed)

In [10]:
def pickCoff(coff):
    """Choose one confusion matrix per outer entry of ``coff``.

    ``coff[i]`` is a sequence of confusion matrices. For each ``i`` the first
    matrix whose accuracy (diagonal sum / total sum) is not exactly 1 is chosen;
    if every matrix is perfect, the last one is chosen.

    Returns the chosen matrices as a numpy array.
    """
    picked = []
    for folds in coff:
        for matrix in folds:
            diagonal = sum(
                matrix[r][r] for r in range(len(matrix)) if r < len(matrix[r])
            )
            if diagonal / np.sum(matrix) != 1:
                picked.append(matrix)
                break
        else:
            # No imperfect matrix found: fall back to the last one, if any.
            if len(folds) > 0:
                picked.append(folds[len(folds) - 1])
    return np.array(picked)

# Inisialisasi

In [11]:
# Run configuration for the experiment.
# `k` sizes the per-k result lists in the testing cell.
k = 15
# Excel workbook read by extraction().
file = 'Dataset Pegawai.xlsx'
# Number of cross-validation folds.
cross_val = 10
label_index = 9 # column index of the class label ("Kinerja"; presumably the performance column -- confirm)
# Preprocessing switches passed to extraction(): min-max normalisation and shuffling.
normalisasi = True
random_data = True
# Note: the shuffle uses the unseeded `random` module, so `data` differs between runs.
data = extraction(file, normalisasi, random_data)

# Testing

In [12]:
# Evaluate every neighbour count from 1 up to the configured maximum `k`.
# Deriving the range from `k` keeps it in sync with the result buffers below.
k_range = range(1, k + 1)

arr_coff = [[] for _ in k_range]       # per k: one confusion matrix per fold
arr_mean_acc = [0 for _ in k_range]    # per k: mean accuracy over the folds
arr_std_acc  = [0 for _ in k_range]    # per k: standard deviation of the accuracy

# A dedicated loop variable is used so the configured `k` is not overwritten.
for n_neighbors in k_range:
    arr_acc = [0 for _ in range(cross_val)]
    for i in range(1, cross_val + 1):
        # crossValidation reorders `data` in place and returns it for the next fold.
        data, train_f, train_l, test_f, test_l = crossValidation(data, cross_val, i, label_index)
        train_l, test_l = transformArrLabel(train_l, test_l)
        knn = KNeighborsClassifier(n_neighbors = n_neighbors, algorithm='brute')
        knn.fit(train_f, train_l)
        y_pred = knn.predict(test_f)
        acc = metrics.accuracy_score(test_l, y_pred)
        arr_acc[i-1] = acc

        arr_coff[n_neighbors-1].append(confusion_matrix(test_l, y_pred))

    mean, std = evaluation(arr_acc)
    arr_mean_acc[n_neighbors-1] = mean
    arr_std_acc[n_neighbors-1] = std

In [12]:
# Show the mean accuracy and its standard deviation for each evaluated k.
print(np.array(arr_mean_acc), np.array(arr_std_acc), sep="\n")
# print(np.array(arr_coff))

[0.95258359 0.94893617 0.95714286 0.95197568 0.95835866 0.95303951
 0.95759878 0.95425532 0.9550152  0.95182371 0.95486322 0.95106383
 0.95455927 0.94969605 0.95182371]
[0.00507699 0.00738921 0.00636849 0.00849434 0.00763517 0.00967048
 0.00764273 0.00814028 0.00849434 0.00946773 0.00917032 0.00869853
 0.00631934 0.00791004 0.00634123]


## Manual Testing (Manual vs Library => Precision, Recall, and F1-Score)

In [13]:
# Keep one confusion matrix per evaluated k (see pickCoff for the selection rule).
picked_coff = pickCoff(np.array(arr_coff))
# print(np.array(picked_coff))

### Method (Manual)

In [14]:
def calcPrecRec(coff):
    """Compute per-class precision, recall and F1 from confusion matrices.

    Each matrix is read with rows as the actual class and columns as the
    predicted class: row sums minus the diagonal give the false negatives and
    column sums minus the diagonal give the false positives. Matrices are
    expected to be square with integer counts.

    Returns
    -------
    tuple of numpy arrays ``(precision, recall, f1score)``, each holding one
    float32 row per input matrix. A zero denominator yields NaN.
    """
    precision, recall, f1score = [], [], []
    for matrix in coff:
        true_pos = np.array(np.diagonal(matrix))
        false_neg = np.sum(matrix, axis=1) - true_pos
        false_pos = np.sum(matrix, axis=0) - true_pos

        class_precision = (true_pos / (true_pos + false_pos)).astype(np.float32)
        class_recall = (true_pos / (true_pos + false_neg)).astype(np.float32)
        # F1 is computed element by element from the float32 values so the
        # rounding matches a scalar-by-scalar evaluation.
        class_f1 = np.array(
            [2 * p * r / (p + r) for p, r in zip(class_precision, class_recall)]
        ).astype(np.float32)

        precision.append(class_precision)
        recall.append(class_recall)
        f1score.append(class_f1)
    return np.array(precision), np.array(recall), np.array(f1score)

In [15]:
def calcAvg(precision, recall, f1score):
    """Average each row of the three per-class metric tables.

    The number of rows processed is taken from ``precision``; ``recall`` and
    ``f1score`` are indexed with the same row indices.

    Returns ``(precision_mean, recall_mean, f1score_mean)`` as numpy arrays.
    """
    row_ids = range(len(precision))
    precision_mean = [np.average(precision[idx]) for idx in row_ids]
    recall_mean = [np.average(recall[idx]) for idx in row_ids]
    f1score_mean = [np.average(f1score[idx]) for idx in row_ids]
    return np.array(precision_mean), np.array(recall_mean), np.array(f1score_mean)

In [16]:
# Manual metrics: per-class values for each picked confusion matrix, then the
# unweighted mean over the classes of each matrix.
precision_arr, recall_arr, f1score_arr = calcPrecRec(picked_coff)
precision_arr_avg, recall_arr_avg, f1score_arr_avg = calcAvg(precision_arr, recall_arr, f1score_arr)

### Method (Library)

In [17]:
def splitCoff(picked_coff):
    """Expand confusion matrices back into actual/predicted label sequences.

    For every matrix, cell ``[row][col]`` with count ``n`` contributes ``n``
    samples whose actual label is ``row`` and whose predicted label is ``col``.

    Returns
    -------
    tuple ``(actual, predict)``: two lists holding one label list per matrix.
    """
    actual, predict = [], []
    for matrix in picked_coff:
        truth, guess = [], []
        for row_idx, row in enumerate(matrix):
            for col_idx, count in enumerate(row):
                for _ in range(count):
                    truth.append(row_idx)
                    guess.append(col_idx)
        actual.append(truth)
        predict.append(guess)
    return actual, predict

In [18]:
# Library metrics: recompute precision/recall/F1 with scikit-learn from the same
# picked confusion matrices, for comparison with the manual implementation.
picked_coff = pickCoff(np.array(arr_coff))

# Expand all matrices into label sequences once; the result does not depend on
# the loop index, so it is computed before the loop rather than on every pass.
actual, predict = splitCoff(picked_coff)

precision_lib_arr = []
recall_lib_arr = []
f1score_lib_arr = []

precision_lib_arr_avg = []
recall_lib_arr_avg = []
f1score_lib_arr_avg = []

for i in range(len(picked_coff)):
    # Per-class scores (average=None returns one value per class).
    recall_lib = recall_score(actual[i], predict[i], average=None)
    precision_lib = precision_score(actual[i], predict[i], average=None)
    f1score_lib = f1_score(actual[i], predict[i], average=None)

    # Macro average: unweighted mean over the classes.
    recall_lib_avg = recall_score(actual[i], predict[i], average='macro')
    precision_lib_avg = precision_score(actual[i], predict[i], average='macro')
    f1score_lib_avg = f1_score(actual[i], predict[i], average='macro')

    recall_lib_arr.append(recall_lib)
    precision_lib_arr.append(precision_lib)
    f1score_lib_arr.append(f1score_lib)

    precision_lib_arr_avg.append(precision_lib_avg)
    recall_lib_arr_avg.append(recall_lib_avg)
    f1score_lib_arr_avg.append(f1score_lib_avg)

### Output (Manual)

#### Precision, Recall and F1-Score (Each Class)

In [19]:
# Manual per-class precision, recall and F1 (one row per picked matrix).
print(
    "-------------Precision--------------",
    precision_arr,
    "",
    "---------------Recall----------------",
    recall_arr,
    "",
    "--------------F1-SCORE---------------",
    f1score_arr,
    "",
    sep="\n",
)

-------------Precision--------------
[[0.9353612  0.9559748  0.97402596]
 [0.89928055 0.9737705  1.        ]
 [0.9442231  0.9689441  0.9764706 ]
 [0.9139785  0.9661017  1.        ]
 [0.9390244  0.96755165 0.9589041 ]
 [0.96031743 0.9737705  1.        ]
 [0.9312977  0.9773463  1.        ]
 [0.91570884 0.95731705 1.        ]
 [0.9348659  0.9745223  1.        ]
 [0.8924731  0.986532   0.9878049 ]
 [0.8975265  0.9799331  0.9868421 ]
 [0.8932384  0.97359735 1.        ]
 [0.92607003 0.9746032  0.9767442 ]
 [0.9007092  0.97231835 1.        ]
 [0.92828685 0.9703264  0.98571426]]

---------------Recall----------------
[[0.95719844 0.9411765  0.96153843]
 [0.98039216 0.91384614 0.96153843]
 [0.97530866 0.9512195  0.954023  ]
 [0.9883721  0.9223301  0.9230769 ]
 [0.95454544 0.9479769  1.        ]
 [0.9797571  0.9674267  0.97115386]
 [0.9799197  0.94375    0.9775281 ]
 [0.956      0.9345238  0.9583333 ]
 [0.976      0.94736844 0.9764706 ]
 [0.9920319  0.904321   0.97590363]
 [0.98832685 0.90712076

#### Precision, Recall and F1-Score (Average)

In [20]:
# Manual class-averaged precision, recall and F1 (one value per picked matrix).
print(
    "-------------Precision--------------",
    precision_arr_avg,
    "",
    "---------------Recall----------------",
    recall_arr_avg,
    "",
    "--------------F1-SCORE---------------",
    f1score_arr_avg,
    "",
    sep="\n",
)

-------------Precision--------------
[0.9551206  0.95768374 0.96321267 0.96002674 0.9551601  0.9780293
 0.969548   0.9576753  0.969796   0.9556033  0.9547672  0.95561194
 0.9591391  0.9576759  0.9614425 ]

---------------Recall----------------
[0.95330447 0.9519256  0.9601838  0.944593   0.9675074  0.9727793
 0.967066   0.94961905 0.966613   0.95741886 0.9523287  0.946908
 0.96030563 0.949975   0.96453696]

--------------F1-SCORE---------------
[0.95413786 0.9537785  0.9615435  0.9511431  0.96113557 0.975298
 0.96796036 0.95330906 0.96794635 0.9550267  0.95229626 0.949914
 0.9593324  0.9526894  0.9628293 ]



### Output (Library)

#### Precision, Recall and F1-Score (Each Class)

In [21]:
# scikit-learn per-class precision, recall and F1 (one row per picked matrix).
print(
    "-------------Precision--------------",
    np.array(precision_lib_arr),
    "",
    "---------------Recall----------------",
    np.array(recall_lib_arr),
    "",
    "--------------F1-SCORE---------------",
    np.array(f1score_lib_arr),
    "",
    sep="\n",
)

-------------Precision--------------
[[0.93536122 0.95597484 0.97402597]
 [0.89928058 0.97377049 1.        ]
 [0.94422311 0.9689441  0.97647059]
 [0.91397849 0.96610169 1.        ]
 [0.93902439 0.96755162 0.95890411]
 [0.96031746 0.97377049 1.        ]
 [0.93129771 0.97734628 1.        ]
 [0.91570881 0.95731707 1.        ]
 [0.9348659  0.97452229 1.        ]
 [0.89247312 0.98653199 0.98780488]
 [0.8975265  0.97993311 0.98684211]
 [0.89323843 0.97359736 1.        ]
 [0.92607004 0.97460317 0.97674419]
 [0.90070922 0.97231834 1.        ]
 [0.92828685 0.97032641 0.98571429]]

---------------Recall----------------
[[0.95719844 0.94117647 0.96153846]
 [0.98039216 0.91384615 0.96153846]
 [0.97530864 0.95121951 0.95402299]
 [0.98837209 0.9223301  0.92307692]
 [0.95454545 0.94797688 1.        ]
 [0.97975709 0.96742671 0.97115385]
 [0.97991968 0.94375    0.97752809]
 [0.956      0.93452381 0.95833333]
 [0.976      0.94736842 0.97647059]
 [0.99203187 0.90432099 0.97590361]
 [0.98832685 0.90712074

#### Precision, Recall and F1-Score (Average)

In [22]:
# scikit-learn macro-averaged precision, recall and F1 (one value per picked matrix).
print(
    "-------------Precision--------------",
    np.array(precision_lib_arr_avg),
    "",
    "---------------Recall----------------",
    np.array(recall_lib_arr_avg),
    "",
    "--------------F1-SCORE---------------",
    np.array(f1score_lib_arr_avg),
    "",
    sep="\n",
)

-------------Precision--------------
[0.95512068 0.95768369 0.9632126  0.96002673 0.95516004 0.97802932
 0.969548   0.9576753  0.96979606 0.95560333 0.95476724 0.95561193
 0.95913913 0.95767585 0.96144252]

---------------Recall----------------
[0.95330446 0.95192559 0.96018371 0.94459304 0.96750744 0.97277921
 0.96706592 0.94961905 0.966613   0.95741882 0.95232868 0.94690799
 0.96030557 0.94997506 0.96453697]

--------------F1-SCORE---------------
[0.95413791 0.95377853 0.96154348 0.95114309 0.96113551 0.97529799
 0.96796032 0.95330909 0.96794633 0.9550267  0.9522963  0.94991394
 0.95933239 0.95268939 0.96282927]



# Save Data

In [24]:
# dill.dump_session('KNN.db')