# Import Library

In [1]:
import numpy as np
import pandas as pd
import math
import random
import dill
try:
    dill.load_session('Pengujian.db')
except Exception as ex:
    print("Data Belum Tersimpan!")

# Support

## Identify Label

In [2]:
def count_label(data):
    distinct = [data[0][-1]]
    for i in range(len(data)):
        check = True
        for j in range(len(distinct)):
            if distinct[j] == data[i][-1]:
                check = False
        if check:
            distinct.append(data[i][-1])
    return np.array(distinct).astype(int), len(distinct)

## Evaluation STD dan Mean

In [3]:
def evaluation(arr_acc):
    mean = np.mean(arr_acc)
    varian = sum((arr_acc - mean) * (arr_acc - mean)) / len(arr_acc)
    std = math.sqrt(varian)
    return mean, std

## Euclidian Distance

In [4]:
def euclidian(X, Y):
    euc = 0
    for i in range(len(X)):
        euc += math.pow((X[i]-Y[i]), 2)
    euc = math.sqrt(euc)
    return euc

# Pre Processing

## Random Data

In [5]:
def randomize(arr):
    for i in range(len(arr)-1, 0, -1):
        j = random.randint(0, i)
        arr[i], arr[j] = arr[j], arr[i]
    return arr

## Normalisasi

In [6]:
def normalize(data):
    data = np.array(data).astype(float)
    max_value = np.max(data, axis=0)
    min_value = np.min(data, axis=0)
    for i in range(len(data)):
        for j in range(len(data[0])-1):
            data[i][j] = (data[i][j] - min_value[j]) / (max_value[j] - min_value[j])
    return data

## Missing Value

In [7]:
def handleMissValue(data):
    filled_data = np.array([[0 for i in range(len(data[0]))] for j in range(len(data))])
    cLabel = count_label(data)[1]
    arr_data = [[] for j in range(cLabel)]
    mean = []
    for i in range(len(data)):
        arr_data[int(data[i][-1])-1].append(data[i])
    counter = 0
    for i in range(len(arr_data)):
        mean.append(np.nanmean(arr_data[i], axis=0))
        for j in range(len(arr_data[i])):
            for k in range(len(arr_data[i][j])):
                if np.isnan(arr_data[i][j][k]):
                    arr_data[i][j][k] = mean[i][k]
            filled_data[counter] = arr_data[i][j]
            counter += 1
    return filled_data

## Ekstraksi File 


In [8]:
def extraction(file, is_normal = True, is_random = True):
    data = pd.read_excel(file)
    data = handleMissValue(np.array(data))
    data = pd.DataFrame(data)
    data.drop_duplicates()
    data = np.array(data)
    if is_normal:
        data = normalize(data)
    if is_random:
        data = randomize(data.tolist())
    return data

# LMKHNCN

## Set of KNCN

### 1 NCN

In [9]:
def closest(Xx, Xx_label, Y, cLabel):
    distance = [[None for i in range(len(Xx))] for j in range(cLabel)]
    min_distance_value = [0 for i in range(cLabel)]
    min_distance_index = [0 for i in range(cLabel)]
    for i in range(len(Xx)):
        distance[((Xx_label[i][0])-1)][i] = euclidian(Xx[i], Y)
    for i in range(cLabel):
        min_distance_value[i] = min(x for x in distance[i] if x is not None)
        min_distance_index[i] = distance[i].index(min(x for x in distance[i] if x is not None))
    return np.array(distance), min_distance_index, min_distance_value

### N NCN

In [10]:
def kncn(k, Xx, Xx_label, Y, cLabel, close_dist):
    ncn = np.array([[0 for i in range(k)] for j in range(cLabel)])
    for i in range(cLabel):
        ncn[i][0] = close_dist[i]

    for i in range (1, k):
        distance = [[None for i in range(len(Xx))] for j in range(cLabel)]
        min_distance_value = [0 for i in range(cLabel)]
        min_distance_index = [0 for i in range(cLabel)]
        for j in range(len(Xx)):
            label = Xx_label[j][0] - 1
            check = True
            for a in range(i):
                if ncn[label][a] == j:
                    check = False
            if check:
                add = 0 + Xx[j]
                for a in range(i):
                    indeks_ncn = ncn[label][a]
                    add += Xx[indeks_ncn]
                centroid = add / (i+1)
                distance[label][j] = euclidian(Y, centroid)
        for j in range(cLabel):
            min_distance_value[j] = min(x for x in distance[j] if x is not None)
            min_distance_index[j] = distance[j].index(min(x for x in distance[j] if x is not None))
            ncn[j][i] = min_distance_index[j]
    return ncn

## Local Mean Vector

In [11]:
def localMeanVector(Xx, ncn):
    lm_ncn = [[0 for i in range(len(ncn[0]))] for j in range(len(ncn))]
    for i in range(len(ncn)):
        lm_ncn[i][0] = Xx[ncn[i][0]]
        for j in range(1, len(ncn[0])):
            add = 0 + lm_ncn[i][0]
            for a in range(j):
                add += Xx[ncn[i][a + 1]]
            lm_ncn[i][j] = add / (j+1)
    return np.array(lm_ncn)

## Harmonic Mean Distance

In [12]:
def harmonicMean(lm_ncn, Y):
    k = len(lm_ncn[0])
    hm_ncn = []
    for i in range(len(lm_ncn)):
        add = 0
        for j in range(len(lm_ncn[0])):
            distance = euclidian(Y, lm_ncn[i][j])
            ## Centroid Lokal i Tepat Pada Data Uji
            if distance == 0:
                add =+ 0
            else:
                add += (1/distance)
        ## Semua Centroid Lokal Tepat Pada Data Uji
        if add == 0:
            harmonic_mean = 0
        else:
            harmonic_mean = (k/add)
        hm_ncn.append(harmonic_mean)
    return hm_ncn

## Classification Decision

In [13]:
def decision(hm_ncn, Y_label):
    min_distance_index = hm_ncn.index(min(x for x in hm_ncn))
    return (min_distance_index + 1), Y_label[0]

# Performance Evaluation

## K Cross Fold Validation

In [14]:
def crossValidation(data, cross_val, cross_index, label_index):
    split_test = int(len(data) * (1 / cross_val))
    change_index = (cross_index - 1) * split_test
    
    for i in range(split_test):
        j = (i + change_index)
        data[i], data[j] = data[j], data[i]
    
    data_train_feature = pd.DataFrame(data[split_test:]).drop([label_index], axis=1)
    data_train_label = pd.DataFrame(pd.DataFrame(data[split_test:]), columns=[label_index]).astype(np.int)
    data_test_feature = pd.DataFrame(data[0:split_test]).drop([label_index], axis=1)
    data_test_label = pd.DataFrame(pd.DataFrame(data[0:split_test]), columns=[label_index]).astype(np.int)
    
    data_train_feature = np.array(data_train_feature)
    data_train_label = np.array(data_train_label)
    data_test_feature = np.array(data_test_feature)
    data_test_label = np.array(data_test_label)

    return data, data_train_feature, data_train_label, data_test_feature, data_test_label

## LMKHNCN

In [15]:
def lmkhncn(k, Xx, Xx_label, Yy, Yy_label, cLabel):
    acc = 0
    coff = np.array([[0 for i in range(jml_kelas)] for j in range(jml_kelas)])
    
    for i in range(len(Yy)):
        jarak, jarak_index_min, jarak_nilai_min = closest(Xx, Xx_label, Yy[i], cLabel)
        ncn = kncn(k, Xx, Xx_label, Yy[i], cLabel, jarak_index_min)
        local_mean = localMeanVector(Xx, ncn)
        harmonic_mean = harmonicMean(local_mean, Yy[i])
        predict, actual = decision(harmonic_mean, Yy_label[i])
        
        coff[predict-1][actual-1] += 1
        if(predict == actual):
            acc += 1
#         print(str(round((i+1)/len(Yy),2)*100)+"%. Prediksi :"+ str(predict) + " Actual :" + str(actual) + " Acc Lokal: "+ str(acc/(i+1)))
    
    acc /= len(Yy)
    return acc, coff

# Inisialisasi

In [16]:
# k = 1
k = 15
file = 'Dataset Pegawai.xlsx'
cross_val = 10
label_index = 9 #Kinerja
normalisasi = False
random_data = True
data = extraction(file, normalisasi, random_data)
arr_kelas, jml_kelas = count_label(data)

# Testing

In [None]:
arr_coff = [[] for i in range(k)]
arr_mean_acc = [0 for i in range(k)]
arr_std_acc  = [0 for i in range(k)]

for a in range(1, k + 1): #Pengujian Parameter K
    arr_acc = [0 for i in range(cross_val)]
    for i in range(1, cross_val + 1):
        data, train_f, train_l, test_f, test_l = crossValidation(data, cross_val, i, label_index)
        acc, coff = lmkhncn(a, train_f, train_l, test_f, test_l, jml_kelas)
        arr_acc[i-1] = acc
        arr_coff[a-1].append(np.array(coff))
#         print("Cross-"+str(i)+" Acc:"+ str(acc))
    mean, std = evaluation(arr_acc)
    arr_mean_acc[a-1] = mean
    arr_std_acc[a-1] = std
    print('K-{0} Mean : {1} with Standar Deviation : {2}.'.format(a,mean,std))

In [18]:
print(np.array(arr_mean_acc))
print(np.array(arr_std_acc))

[0.99969605 1.         0.95987842 0.9331307  0.92142857 0.91048632
 0.91291793 0.91291793 0.91458967 0.91322188 0.91337386 0.91276596
 0.9156535  0.91474164 0.91428571]
[0.00091185 0.         0.01735719 0.0159972  0.01851423 0.01724037
 0.01772391 0.01744811 0.01737847 0.01899939 0.0176318  0.01866519
 0.01870289 0.01882843 0.01881309]


In [19]:
print(np.array(arr_coff))

[[[[  0   2   0]
   [  0 418   0]
   [  0   0 238]]

  [[147   0   0]
   [  0 376   0]
   [  0   0 135]]

  [[227   0   0]
   [  0 348   0]
   [  0   0  83]]

  ...

  [[320   0   0]
   [  0 286   0]
   [  0   0  52]]

  [[296   0   0]
   [  0 306   0]
   [  0   0  56]]

  [[253   0   0]
   [  0 346   0]
   [  0   0  59]]]


 [[[253   0   0]
   [  0 346   0]
   [  0   0  59]]

  [[147   0   0]
   [  0 376   0]
   [  0   0 135]]

  [[227   0   0]
   [  0 348   0]
   [  0   0  83]]

  ...

  [[320   0   0]
   [  0 286   0]
   [  0   0  52]]

  [[296   0   0]
   [  0 306   0]
   [  0   0  56]]

  [[253   0   0]
   [  0 346   0]
   [  0   0  59]]]


 [[[245  25   0]
   [  6 319  10]
   [  2   2  49]]

  [[147   2   1]
   [  0 373   3]
   [  0   1 131]]

  [[222   7   1]
   [  5 341   1]
   [  0   0  81]]

  ...

  [[304   7   2]
   [ 16 278   4]
   [  0   1  46]]

  [[285  10   1]
   [ 11 296   3]
   [  0   0  52]]

  [[245  25   0]
   [  6 319  10]
   [  2   2  49]]]


 ...


 [[[235  24 

# Save Data

In [None]:
dill.dump_session('Pengujian.db')

In [20]:
def coff_transform(coff):
    prec = np.array([[0 for i in range(len(coff[0][0]))] for j in range(len(coff))]).astype(np.float)
    rec = np.array([[0 for i in range(len(coff[0][0]))] for j in range(len(coff))]).astype(np.float)
    acc = np.array([0 for j in range(len(coff))]).astype(np.float)
    for i in range(len(coff)):
        ttp = 0
        tfp = np.array([0 for i in range(len(coff[0][0]))])
        tfn = np.array([0 for i in range(len(coff[0][0]))])
        for j in range(len(coff[i])):
            count_test = sum(sum(arr_coff[i][j]))
            for k in range(len(coff[i][j])):
                ttp += coff[i][j][k][k]
#                 tfp[k] = sum(coff[i][j])[k]
#                 tfn[k] = sum(coff[i][j][k])
                tfp[k] = sum(coff[i][j][k])
                tfn[k] = sum(coff[i][j])[k]
            for k in range(len(coff[i][j])):
                prec[i][k] += ttp / (ttp + tfp[k])
                rec[i][k] += ttp / (ttp + tfn[k])
        for j in range(len(coff[i][j])):
            prec[i][j] /= len(coff[i])
            rec[i][j] /= len(coff[i])
        acc[i] = (ttp / count_test) / len(coff[i])  
    return prec, rec, acc

In [21]:
precision, recall, accuracy = coff_transform(arr_coff)
print(precision)
print()
print(recall)
print()
print(accuracy)

[[0.93247584 0.8766382  0.95203487]
 [0.90505356 0.88115786 0.97044897]
 [0.89983666 0.87735683 0.97030519]
 [0.89835685 0.87471203 0.97010332]
 [0.8970579  0.87269726 0.97043041]
 [0.89687101 0.87081146 0.97029016]
 [0.89736744 0.87056183 0.9708983 ]
 [0.89751105 0.8707635  0.97074801]
 [0.89771301 0.870859   0.97074662]
 [0.89791955 0.87074665 0.97087181]
 [0.89772192 0.87079459 0.97101457]
 [0.89733245 0.87100333 0.97120672]
 [0.89816754 0.870723   0.97149089]
 [0.89806837 0.87050191 0.97165386]
 [0.8978679  0.87063569 0.9712309 ]]

[[0.93277979 0.87652466 0.95203487]
 [0.90505356 0.88115786 0.97044897]
 [0.90134695 0.87681365 0.96913198]
 [0.89949231 0.87473035 0.96859035]
 [0.89822637 0.87327141 0.96815777]
 [0.89726997 0.87217153 0.96785904]
 [0.89752652 0.87245534 0.96792992]
 [0.89765192 0.87261865 0.96799498]
 [0.89778395 0.87275746 0.96802782]
 [0.89785033 0.87285784 0.96807534]
 [0.89783024 0.87283332 0.96806952]
 [0.89780666 0.87281299 0.96806715]
 [0.89809459 0.87314702 0.