In [1]:
import numpy as np
import pandas as pd
import math
import random
import dill
try:
    dill.load_session('Pengujian.db')
except Exception as ex:
    print("Data Belum Tersimpan!")

# Support

In [2]:
def count_label(data):
    distinct = [data[0][-1]]
    for i in range(len(data)):
        check = True
        for j in range(len(distinct)):
            if distinct[j] == data[i][-1]:
                check = False
        if check:
            distinct.append(data[i][-1])
    return np.array(distinct).astype(int), len(distinct)

In [3]:
def evaluation(arr_acc):
    mean = np.mean(arr_acc)
    varian = sum((arr_acc - mean) * (arr_acc - mean)) / len(arr_acc)
    std = math.sqrt(varian)
    return mean, std

In [4]:
def euclidian(X, Y):
    euc = 0
    for i in range(len(X)):
        euc += math.pow((X[i]-Y[i]), 2)
    euc = math.sqrt(euc)
    return euc

# Pre Processing

In [5]:
def randomize(arr):
    for i in range(len(arr)-1, 0, -1):
        j = random.randint(0, i)
        arr[i], arr[j] = arr[j], arr[i]
    return arr

In [6]:
def normalize(data):
    data = np.array(data).astype(float)
    max_value = np.max(data, axis=0)
    min_value = np.min(data, axis=0)
    for i in range(len(data)):
        for j in range(len(data[0])-1):
            data[i][j] = (data[i][j] - min_value[j]) / (max_value[j] - min_value[j])
    return data

In [7]:
def handleMissValue(data):
    filled_data = np.array([[0 for i in range(len(data[0]))] for j in range(len(data))])
    cLabel = count_label(data)[1]
    arr_data = [[] for j in range(cLabel)]
    mean = []
    for i in range(len(data)):
        arr_data[int(data[i][-1])-1].append(data[i])
    counter = 0
    for i in range(len(arr_data)):
        mean.append(np.nanmean(arr_data[i], axis=0))
        for j in range(len(arr_data[i])):
            for k in range(len(arr_data[i][j])):
                if np.isnan(arr_data[i][j][k]):
                    arr_data[i][j][k] = mean[i][k]
            filled_data[counter] = arr_data[i][j]
            counter += 1
    return filled_data

In [8]:
def extraction(file, is_normal = True, is_random = True):
    data = pd.read_excel(file)
    data = handleMissValue(np.array(data))
    data = pd.DataFrame(data)
    data.drop_duplicates()
    data = np.array(data)
    if is_normal:
        data = normalize(data)
    if is_random:
        data = randomize(data.tolist())
    return data

In [9]:
def crossValidation(data, cross_val, cross_index, label_index):
    split_test = int(len(data) * (1 / cross_val))
    change_index = (cross_index - 1) * split_test
    
    for i in range(split_test):
        j = (i + change_index)
        data[i], data[j] = data[j], data[i]
    
    data_train_feature = pd.DataFrame(data[split_test:]).drop([label_index], axis=1)
    data_train_label = pd.DataFrame(pd.DataFrame(data[split_test:]), columns=[label_index]).astype(np.int)
    data_test_feature = pd.DataFrame(data[0:split_test]).drop([label_index], axis=1)
    data_test_label = pd.DataFrame(pd.DataFrame(data[0:split_test]), columns=[label_index]).astype(np.int)
    
    data_train_feature = np.array(data_train_feature)
    data_train_label = np.array(data_train_label)
    data_test_feature = np.array(data_test_feature)
    data_test_label = np.array(data_test_label)

    return data, data_train_feature, data_train_label, data_test_feature, data_test_label

# LMKNN

In [10]:
def closest(Xx, Xx_label, Y, cLabel, k):
    distance = [[None for i in range(len(Xx))] for j in range(cLabel)]
    min_distance_value = []
    min_distance_index = []
    for i in range(len(Xx)):
        distance[((Xx_label[i][0])-1)][i] = euclidian(Xx[i], Y)
    for i in range(cLabel):
        min_distance_value.append([])
        min_distance_index.append([])
        for j in range(k):
            mdv = min(x for x in distance[i] if x not in min_distance_value[i] and x is not None)
            mdi = distance[i].index(min(x for x in distance[i] if x not in min_distance_value[i] and x is not None)) 
            min_distance_value[i].append(mdv)
            min_distance_index[i].append(mdi)
    return distance, min_distance_value, min_distance_index

In [11]:
def localMeanVector(closest, Xx, Y, k):
    local_mean = [0 for i in range(len(closest))]
    for i in range(len(closest)):
        for j in range(len(closest[i])):
            local_mean[i] += Xx[closest[i][j]]
        local_mean[i] = local_mean[i] / k
    return local_mean

In [12]:
def decision(local_mean, Y, Y_label):
    distance = [0 for i in range(len(local_mean))]
    for i in range(len(local_mean)):
        distance[i] = euclidian(local_mean[i], Y)
    argmin = distance.index(min(distance)) + 1
    return argmin, Y_label[0]

# Testing

In [13]:
k = 15
file = 'Dataset Pegawai.xlsx'
cross_val = 10
label_index = 9 #Kinerja
normalisasi = False
random_data = True
data = extraction(file, normalisasi, random_data)
arr_kelas, jml_kelas = count_label(data)

In [None]:
arr_mean_acc = [0 for i in range(k)]
arr_std_acc  = [0 for i in range(k)]

for a in range(1, k + 1): #Pengujian Parameter K
    arr_acc = [0 for i in range(cross_val)]
    for i in range(1, cross_val + 1):
        acc = 0
        data, train_f, train_l, test_f, test_l = crossValidation(data, cross_val, i, label_index)
        for j in range(len(test_f)):
            distance, min_value, min_index = closest(train_f, train_l, test_f[j], jml_kelas, a)
            local_mean = localMeanVector(min_index, train_f, test_f[j], a)
            predict, actual = decision(local_mean, test_f[j], test_l[j])
            if predict == actual:
                acc += 1
        arr_acc[i-1] = acc/len(test_f)
    mean, std = evaluation(arr_acc)
    arr_mean_acc[a-1] = mean
    arr_std_acc[a-1] = std
    print('K-{0} Mean : {1} with Standar Deviation : {2}.'.format(a,mean,std))

In [14]:
print(arr_mean_acc)
print(arr_std_acc)

[0.9121580547112462, 0.9037993920972645, 0.898176291793313, 0.8919452887537993, 0.8817629179331306, 0.874468085106383, 0.8673252279635258, 0.8585106382978722, 0.8579027355623101, 0.8465045592705167, 0.8399696048632219, 0.8382978723404255, 0.8337386018237082, 0.825531914893617, 0.8240121580547113]
[0.009388112590565632, 0.008128923477395945, 0.011811160624757119, 0.010339932336276007, 0.009189189336067452, 0.012809305474593009, 0.012514715426164252, 0.01049512867514866, 0.012295011288863747, 0.014449683068698732, 0.009966870779373607, 0.013041603618799356, 0.015396869473451607, 0.009984235687329396, 0.015936445871025315]


# Save Data

In [None]:
dill.dump_session('Pengujian.db')