# Import Library

In [1]:
import numpy as np
import pandas as pd
import math
import random
import dill
try:
    dill.load_session('Pengujian.db')
except Exception as ex:
    print("Data Belum Tersimpan!")

# Support

## Identify Label

In [2]:
def count_label(data):
    distinct = [data[0][-1]]
    for i in range(len(data)):
        check = True
        for j in range(len(distinct)):
            if distinct[j] == data[i][-1]:
                check = False
        if check:
            distinct.append(data[i][-1])
    return np.array(distinct).astype(int), len(distinct)

## Model Evaluation Performance

### Mean + STD

In [3]:
def evaluation(arr_acc):
    mean = np.mean(arr_acc)
    varian = sum((arr_acc - mean) * (arr_acc - mean)) / len(arr_acc)
    std = math.sqrt(varian)
    return mean, std

### Pick Random Confussion Matrix

In [4]:
def toNumpyArray(coff):
    coff = np.array(coff)
    for i in range(len(coff)):
        coff[i] = np.array(coff[i])
        for j in range(len(coff[i])):
            coff[i][j] = np.array(coff[i][j])
    return coff

def pickCoff(coff):
    coff = toNumpyArray(coff)
    choosen_coff = []
    for i in range(len(coff)):
        for j in range(len(coff[i])):
            acc = 0
            for k in range(len(coff[i][j])):
                for l in range(len(coff[i][j][k])):
                    if k == l:
                        acc += coff[i][j][k][l]
            acc /= np.sum(coff[i][j])
            if acc != 1:
                choosen_coff.append(coff[i][j])
                break
            elif j == len(coff[i]) - 1:
                choosen_coff.append(coff[i][j])
    return np.array(choosen_coff)

## Euclidian Distance

In [5]:
def euclidian(X, Y, categories = []):
    euc = 0
    for i in range(len(X)):
        if not i in categories: ## similarity matrix
            euc += math.pow((X[i]-Y[i]), 2)
    euc = math.sqrt(euc)
    ##------------Similiratity Matrix-------------##
    if len(categories) != 0:
        p = len(categories)
        m = 0
        for i in range(len(categories)):
            if X[categories[i]] == Y[categories[i]]:
                m += 1
        cat_dist = (p-m)/p
        euc += cat_dist
    ##--------------------------------------------##
    return euc

# Pre Processing

## Random Data

In [6]:
def randomize(arr):
    for i in range(len(arr)-1, 0, -1):
        j = random.randint(0, i)
        arr[i], arr[j] = arr[j], arr[i]
    return arr

## Normalisasi

In [7]:
def normalize(data):
    data = np.array(data).astype(float)
    max_value = np.max(data, axis=0)
    min_value = np.min(data, axis=0)
    for i in range(len(data)):
        for j in range(len(data[0])-1):
            data[i][j] = (data[i][j] - min_value[j]) / (max_value[j] - min_value[j])
    return data

## Missing Value

In [8]:
def handleMissValue(data):
    filled_data = np.array([[0 for i in range(len(data[0]))] for j in range(len(data))])
    cLabel = count_label(data)[1]
    arr_data = [[] for j in range(cLabel)]
    mean = []
    for i in range(len(data)):
        arr_data[int(data[i][-1])-1].append(data[i])
    counter = 0
    for i in range(len(arr_data)):
        mean.append(np.nanmean(arr_data[i], axis=0))
        for j in range(len(arr_data[i])):
            for k in range(len(arr_data[i][j])):
                if np.isnan(arr_data[i][j][k]):
                    arr_data[i][j][k] = mean[i][k]
            filled_data[counter] = arr_data[i][j]
            counter += 1
    return filled_data

## Ekstraksi File 


In [9]:
def extraction(file, is_normal = True, is_random = True):
    data = pd.read_excel(file)
    data = handleMissValue(np.array(data))
    data = pd.DataFrame(data)
    data.drop_duplicates()
    data = np.array(data)
    if is_normal:
        data = normalize(data)
    if is_random:
        data = randomize(data.tolist())
    return data

# LMKHNCN

## Set of KNCN

### 1 NCN

In [10]:
def closest(Xx, Xx_label, Y, cLabel, nominal):
    distance = [[None for i in range(len(Xx))] for j in range(cLabel)]
    min_distance_value = [0 for i in range(cLabel)]
    min_distance_index = [0 for i in range(cLabel)]
    for i in range(len(Xx)):
        distance[((Xx_label[i][0])-1)][i] = euclidian(Xx[i], Y, nominal)
    for i in range(cLabel):
        min_distance_value[i] = min(x for x in distance[i] if x is not None)
        min_distance_index[i] = distance[i].index(min(x for x in distance[i] if x is not None))
    return np.array(distance), min_distance_index, min_distance_value

### N NCN

In [11]:
def kncn(k, Xx, Xx_label, Y, cLabel, close_dist, nominal):
    ncn = np.array([[0 for i in range(k)] for j in range(cLabel)])
    for i in range(cLabel):
        ncn[i][0] = close_dist[i]

    for i in range (1, k):
        distance = [[None for i in range(len(Xx))] for j in range(cLabel)]
        min_distance_value = [0 for i in range(cLabel)]
        min_distance_index = [0 for i in range(cLabel)]
        for j in range(len(Xx)):
            label = Xx_label[j][0] - 1
            check = True
            for a in range(i):
                if ncn[label][a] == j:
                    check = False
            if check:
                add = 0 + Xx[j]
                for a in range(i):
                    indeks_ncn = ncn[label][a]
                    add += Xx[indeks_ncn]
                centroid = add / (i+1)
                distance[label][j] = euclidian(Y, centroid, nominal)
        for j in range(cLabel):
            min_distance_value[j] = min(x for x in distance[j] if x is not None)
            min_distance_index[j] = distance[j].index(min(x for x in distance[j] if x is not None))
            ncn[j][i] = min_distance_index[j]
    return ncn

## Local Mean Vector

In [12]:
def localMeanVector(Xx, ncn):
    lm_ncn = [[0 for i in range(len(ncn[0]))] for j in range(len(ncn))]
    for i in range(len(ncn)):
        lm_ncn[i][0] = Xx[ncn[i][0]]
        for j in range(1, len(ncn[0])):
            add = 0 + lm_ncn[i][0]
            for a in range(j):
                add += Xx[ncn[i][a + 1]]
            lm_ncn[i][j] = add / (j+1)
    return np.array(lm_ncn)

## Harmonic Mean Distance

In [13]:
def harmonicMean(lm_ncn, Y, nominal):
    k = len(lm_ncn[0])
    hm_ncn = []
    for i in range(len(lm_ncn)):
        add = 0
        for j in range(len(lm_ncn[0])):
            distance = euclidian(Y, lm_ncn[i][j], nominal)
            ## Centroid Lokal i Tepat Pada Data Uji
            if distance == 0:
                add =+ 0
            else:
                add += (1/distance)
        ## Semua Centroid Lokal Tepat Pada Data Uji
        if add == 0:
            harmonic_mean = 0
        else:
            harmonic_mean = (k/add)
        hm_ncn.append(harmonic_mean)
    return hm_ncn

## Classification Decision

In [14]:
def decision(hm_ncn, Y_label):
    min_distance_index = hm_ncn.index(min(x for x in hm_ncn))
    return (min_distance_index + 1), Y_label[0]

# Performance Evaluation

## K Cross Fold Validation

In [15]:
def crossValidation(data, cross_val, cross_index, label_index):
    split_test = int(len(data) * (1 / cross_val))
    change_index = (cross_index - 1) * split_test
    
    for i in range(split_test):
        j = (i + change_index)
        data[i], data[j] = data[j], data[i]
    
    data_train_feature = pd.DataFrame(data[split_test:]).drop([label_index], axis=1)
    data_train_label = pd.DataFrame(pd.DataFrame(data[split_test:]), columns=[label_index]).astype(np.int)
    data_test_feature = pd.DataFrame(data[0:split_test]).drop([label_index], axis=1)
    data_test_label = pd.DataFrame(pd.DataFrame(data[0:split_test]), columns=[label_index]).astype(np.int)
    
    data_train_feature = np.array(data_train_feature)
    data_train_label = np.array(data_train_label)
    data_test_feature = np.array(data_test_feature)
    data_test_label = np.array(data_test_label)

    return data, data_train_feature, data_train_label, data_test_feature, data_test_label

## LMKHNCN

In [16]:
def lmkhncn(k, Xx, Xx_label, Yy, Yy_label, cLabel, nominal):
    acc = 0
    coff = np.array([[0 for i in range(jml_kelas)] for j in range(jml_kelas)])
    
    for i in range(len(Yy)):
        jarak, jarak_index_min, jarak_nilai_min = closest(Xx, Xx_label, Yy[i], cLabel, nominal)
        ncn = kncn(k, Xx, Xx_label, Yy[i], cLabel, jarak_index_min, nominal)
        local_mean = localMeanVector(Xx, ncn)
        harmonic_mean = harmonicMean(local_mean, Yy[i], nominal)
        predict, actual = decision(harmonic_mean, Yy_label[i])
        
        coff[actual-1][predict-1] += 1
        if(predict == actual):
            acc += 1
#         print(str(round((i+1)/len(Yy),2)*100)+"%. Prediksi :"+ str(predict) + " Actual :" + str(actual) + " Acc Lokal: "+ str(acc/(i+1)))
    
    acc /= len(Yy)
    return acc, coff

# Inisialisasi

In [17]:
k = 15
nominal = [8] ##Kawin Attribute
file = 'Dataset Pegawai.xlsx'
cross_val = 10
label_index = 9 #Kinerja
normalisasi = False
random_data = True
data = extraction(file, normalisasi, random_data)
arr_kelas, jml_kelas = count_label(data)

# Testing

In [None]:
arr_coff = [[] for i in range(k)]
arr_mean_acc = [0 for i in range(k)]
arr_std_acc  = [0 for i in range(k)]

for a in range(1, k + 1): #Pengujian Parameter K
    arr_acc = [0 for i in range(cross_val)]
    for i in range(1, cross_val + 1):
        data, train_f, train_l, test_f, test_l = crossValidation(data, cross_val, i, label_index)
        acc, coff = lmkhncn(a, train_f, train_l, test_f, test_l, jml_kelas, nominal)
        arr_acc[i-1] = acc
        arr_coff[a-1].append(np.array(coff))
        print("Cross-"+str(i)+" Acc:"+ str(acc))
    mean, std = evaluation(arr_acc)
    arr_mean_acc[a-1] = mean
    arr_std_acc[a-1] = std
    print('K-{0} Mean : {1} with Standar Deviation : {2}.'.format(a,mean,std))

In [18]:
print(np.array(arr_mean_acc))
print(np.array(arr_std_acc))
# print(np.array(arr_coff))

[0.91352584 0.91306991 0.91231003 0.9106383  0.9106383  0.9100304
 0.90927052 0.90790274 0.90638298 0.90683891 0.90668693 0.90714286
 0.90683891 0.90668693 0.90668693]
[0.0074592  0.00534298 0.00763063 0.00734532 0.00583872 0.00595621
 0.0063047  0.00612445 0.0075132  0.00741573 0.00690445 0.00667483
 0.00666097 0.00649063 0.00706973]


## Manual Testing (Manual vs Library => Precision, Recall, dan F1-Score)

In [19]:
picked_coff = pickCoff(np.array(arr_coff))
# print(np.array(picked_coff))

### Method (Manual)

In [20]:
def calcPrecRec(coff):
    tfn = []
    tfp = []
    ttp = []
    precision = []
    recall = []
    f1score = []
    for i in range(len(coff)):
        tfn.append(np.sum(coff[i], axis = 1))
        tfp.append(np.sum(coff[i], axis = 0))
        ttp.append(np.sum(coff[i], axis = 1))
        for j in range(len(coff[i])):
            for k in range(len(coff[i][j])):
                if j == k:
                    tfn[i][j] -= coff[i][j][k]
                    tfp[i][j] -= coff[i][j][k]
                else:
                    ttp[i][j] -= coff[i][j][k]
    for i in range(len(tfn)):
        precision.append(np.array([0 for i in range(len(tfn[i]))]).astype(np.float32))
        recall.append(np.array([0 for i in range(len(tfn[i]))]).astype(np.float32))
        f1score.append(np.array([0 for i in range(len(tfn[i]))]).astype(np.float32))
        for j in range(len(tfn[i])):
            precision[i][j] = ttp[i][j]/(ttp[i][j] + tfp[i][j])
            recall[i][j] = ttp[i][j]/(ttp[i][j] + tfn[i][j])
            f1score[i][j] = 2 * precision[i][j] * recall[i][j] / (precision[i][j] + recall[i][j])
    return np.array(precision), np.array(recall), np.array(f1score)

In [21]:
def calcAvg(precision, recall, f1score):
    precision_mean = []
    recall_mean = []
    f1score_mean = []
    for i in range(len(precision)):
        precision_mean.append(np.average(precision[i]))
        recall_mean.append(np.average(recall[i]))
        f1score_mean.append(np.average(f1score[i]))
    return np.array(precision_mean), np.array(recall_mean), np.array(f1score_mean)

In [22]:
precision_arr, recall_arr, f1score_arr = calcPrecRec(picked_coff)
f1Score_arr = calcF1Score(precision,recall)
precision_arr_avg, recall_arr_avg, f1score_arr_avg = calcAvg(precision_arr, recall_arr, f1score_arr)

### Method (Library)

In [23]:
from sklearn.metrics import confusion_matrix #confussion matrix
from sklearn.metrics import recall_score # recall
from sklearn.metrics import precision_score #precision
from sklearn.metrics import f1_score #f1 score

In [24]:
def splitCoff(picked_coff):
    actual = []
    predict = []
    for i in range(len(picked_coff)):
        predict.append([])
        actual.append([])
        for j in range(len(picked_coff[i])):
            for k in range(len(picked_coff[i][j])):
                for l in range(picked_coff[i][j][k]):
                    if j == k:
                        actual[i].append(j)
                        predict[i].append(j)
                    else:
                        actual[i].append(j)
                        predict[i].append(k)
    return actual, predict

In [25]:
picked_coff = pickCoff(np.array(arr_coff))

precision_lib_arr = []
recall_lib_arr = []
f1score_lib_arr = []

precision_lib_arr_avg = []
recall_lib_arr_avg = []
f1score_lib_arr_avg = []

for i in range(len(picked_coff)):
    actual, predict = splitCoff(picked_coff)
    
    recall_lib = recall_score(actual[i], predict[i], average=None)
    precision_lib = precision_score(actual[i], predict[i], average=None)
    f1score_lib = f1_score(actual[i], predict[i], average=None)
    
    recall_lib_avg = recall_score(actual[i], predict[i], average='macro')
    precision_lib_avg = precision_score(actual[i], predict[i], average='macro')
    f1score_lib_avg = f1_score(actual[i], predict[i], average='macro')
    
    recall_lib_arr.append(recall_lib)
    precision_lib_arr.append(precision_lib)
    f1score_lib_arr.append(f1score_lib)
    
    precision_lib_arr_avg.append(precision_lib_avg)
    recall_lib_arr_avg.append(recall_lib_avg)
    f1score_lib_arr_avg.append(f1score_lib_avg)

### Output (Manual)

#### Precision, Recall and F1-Score (Each Class)

In [26]:
print("-------------Precision--------------")
print(precision_arr)
print()
print("---------------Recall----------------")
print(recall_arr)
print()
print("--------------F1-SCORE---------------")
print(f1score_arr)
print()

-------------Precision--------------
[[0.8955823  0.92705166 0.9375    ]
 [0.94505495 0.88535035 0.87323946]
 [0.9285714  0.91131496 0.9113924 ]
 [0.8978723  0.9046243  0.9480519 ]
 [0.9327731  0.8961424  0.96385545]
 [0.94466406 0.88081396 0.91803277]
 [0.9437751  0.9032258  0.9558824 ]
 [0.91189426 0.9093567  0.8876405 ]
 [0.91845495 0.8959538  0.9113924 ]
 [0.90833336 0.87941176 0.974359  ]
 [0.90869564 0.88135594 0.9189189 ]
 [0.9379562  0.886076   0.9264706 ]
 [0.92741936 0.88724035 0.9315069 ]
 [0.9166667  0.89044946 0.9594595 ]
 [0.93191487 0.88529414 0.96385545]]

---------------Recall----------------
[[0.9291667  0.92145014 0.86206895]
 [0.92142856 0.9205298  0.81578946]
 [0.91764706 0.9341693  0.85714287]
 [0.8940678  0.9343284  0.83908045]
 [0.90983605 0.95268136 0.82474226]
 [0.8786765  0.9498433  0.8358209 ]
 [0.90733594 0.9506173  0.8666667 ]
 [0.9        0.92011833 0.87777776]
 [0.8953975  0.92537314 0.85714287]
 [0.88259107 0.934375   0.83516484]
 [0.87083334 0.94259816

#### Precision, Recall and F1-Score (Average)

In [27]:
print("-------------Precision--------------")
print(precision_arr_avg)
print()
print("---------------Recall----------------")
print(recall_arr_avg)
print()
print("--------------F1-SCORE---------------")
print(f1score_arr_avg)
print()

-------------Precision--------------
[0.92004466 0.9012149  0.917093   0.91684955 0.9309237  0.9145036
 0.9342944  0.9029638  0.9086004  0.9207013  0.90299016 0.9168343
 0.9153889  0.92219186 0.9270215 ]

---------------Recall----------------
[0.9042286  0.88591594 0.90298647 0.8891589  0.8957532  0.88811356
 0.90820664 0.89929867 0.8926378  0.88404363 0.8650136  0.891319
 0.88292956 0.8826513  0.89060336]

--------------F1-SCORE---------------
[0.9115038  0.8930757  0.9097044  0.9018154  0.91119933 0.89983445
 0.9202011  0.9010985  0.9002137  0.9002487  0.881677   0.90298223
 0.8974468  0.9001202  0.9065266 ]



### Output (Library)

#### Precision, Recall and F1-Score (Each Class)

In [28]:
print("-------------Precision--------------")
print(np.array(precision_lib_arr))
print()
print("---------------Recall----------------")
print(np.array(recall_lib_arr))
print()
print("--------------F1-SCORE---------------")
print(np.array(f1score_lib_arr))
print()

-------------Precision--------------
[[0.89558233 0.92705167 0.9375    ]
 [0.94505495 0.88535032 0.87323944]
 [0.92857143 0.91131498 0.91139241]
 [0.89787234 0.90462428 0.94805195]
 [0.93277311 0.89614243 0.96385542]
 [0.94466403 0.88081395 0.91803279]
 [0.9437751  0.90322581 0.95588235]
 [0.91189427 0.90935673 0.88764045]
 [0.91845494 0.89595376 0.91139241]
 [0.90833333 0.87941176 0.97435897]
 [0.90869565 0.88135593 0.91891892]
 [0.9379562  0.88607595 0.92647059]
 [0.92741935 0.88724036 0.93150685]
 [0.91666667 0.89044944 0.95945946]
 [0.93191489 0.88529412 0.96385542]]

---------------Recall----------------
[[0.92916667 0.92145015 0.86206897]
 [0.92142857 0.9205298  0.81578947]
 [0.91764706 0.93416928 0.85714286]
 [0.8940678  0.93432836 0.83908046]
 [0.90983607 0.95268139 0.82474227]
 [0.87867647 0.94984326 0.8358209 ]
 [0.90733591 0.95061728 0.86666667]
 [0.9        0.92011834 0.87777778]
 [0.89539749 0.92537313 0.85714286]
 [0.88259109 0.934375   0.83516484]
 [0.87083333 0.94259819

#### Precision, Recall and F1-Score (Average)

In [29]:
print("-------------Precision--------------")
print(np.array(precision_lib_arr_avg))
print()
print("---------------Recall----------------")
print(np.array(recall_lib_arr_avg))
print()
print("--------------F1-SCORE---------------")
print(np.array(f1score_lib_arr_avg))
print()

-------------Precision--------------
[0.92004467 0.9012149  0.91709294 0.91684952 0.93092365 0.91450359
 0.93429442 0.90296382 0.90860037 0.92070136 0.90299017 0.91683425
 0.91538885 0.92219185 0.92702148]

---------------Recall----------------
[0.90422859 0.88591595 0.9029864  0.88915887 0.89575324 0.88811354
 0.90820662 0.89929871 0.89263783 0.88404364 0.86501357 0.89131894
 0.88292956 0.88265128 0.89060336]

--------------F1-SCORE---------------
[0.91150382 0.89307568 0.90970438 0.90181545 0.91119937 0.89983445
 0.92020118 0.90109851 0.9002137  0.9002487  0.88167703 0.90298223
 0.89744678 0.90012017 0.90652668]



# Save Data

In [None]:
# dill.dump_session('Pengujian.db')