# LIBRARY

In [None]:
# core libraries: data handling, utilities and plotting
import pandas as pd
import numpy as np
import warnings
import time
import random
import matplotlib.pyplot as plt
import seaborn as sns
import time
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

# library teknik sampling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN

# library algoritma klasifikasi
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from keras.layers import BatchNormalization, Dense, Dropout, Flatten
from keras.optimizers import Adam
from keras.models import Sequential
from keras.callbacks import EarlyStopping

# library evaluation metric
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from statistics import mean
from sklearn import metrics

# DATASET

In [None]:
class GenerateDataset:
    """Load a CSV dataset and split it into training and testing parts."""

    # Dataset used when ``get_data`` is called without an explicit URL.
    DEFAULT_URL = "https://raw.githubusercontent.com/adizaen/dataset-tesis/main/spam.csv"

    # Other datasets from the same repository that can be passed as ``url``:
    #   "https://raw.githubusercontent.com/adizaen/dataset-tesis/main/ionosphere.csv"
    #   "https://raw.githubusercontent.com/adizaen/dataset-tesis/main/voice.csv"
    #   "https://raw.githubusercontent.com/adizaen/dataset-tesis/main/parkinson.csv"
    #   "https://raw.githubusercontent.com/adizaen/dataset-tesis/main/malware.csv"
    #   "https://raw.githubusercontent.com/adizaen/dataset-tesis/main/mri.csv"
    #   "https://raw.githubusercontent.com/adizaen/dataset-tesis/main/heart-attack.csv"
    #   "https://raw.githubusercontent.com/adizaen/dataset-tesis/main/ozone-level.csv"

    def get_data(self, url=None, test_size=0.20, random_state=42):
        """Read the dataset and return a train/test split.

        The last column of the CSV is taken as the class label; every other
        column is a feature.

        Parameters
        ----------
        url : str, optional
            Location of the CSV file. Defaults to ``DEFAULT_URL``.
        test_size : float, optional
            Value forwarded to ``train_test_split`` as ``test_size``.
        random_state : int, optional
            Seed forwarded to ``train_test_split``.

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test, url)`` where ``url`` is the
            location the data was actually read from.
        """
        if url is None:
            url = self.DEFAULT_URL

        # load the dataset from the given location
        dataset = pd.read_csv(url)

        # features are all columns but the last, the label is the last column
        X = dataset.iloc[:, :-1]
        y = dataset.iloc[:, -1]

        # split into training and testing data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        return X_train, X_test, y_train, y_test, url

# PRE-PROCESSING

In [None]:
# class yang berisi sekumpulan fungsi untuk melakukan pre-processing

class Preprocessing:
    """Pre-processing steps for a feature frame ``X`` and a label series ``y``.

    Every method returns the (possibly transformed) data; the objects passed
    in by the caller are left unmodified.
    """

    def handling_constant_value(self, X, y):
        """Drop every column of ``X`` that contains a single distinct value.

        Returns ``(X, y)``; ``y`` is passed through unchanged.
        """
        # a column with exactly one unique value is constant
        constant_columns = [col for col in X.columns if X[col].nunique() == 1]

        if constant_columns:
            # build a new frame instead of dropping in place so the caller's
            # DataFrame (often a slice of a larger one) is not altered
            X = X.drop(columns=constant_columns)

        return X, y

    def handling_missing_value(self, X, y):
        """Fill missing values in ``X`` with the median of their column.

        Returns ``(X, y)``; ``y`` is passed through unchanged.
        """
        # columns that contain at least one missing value
        columns_with_gaps = X.columns[X.isnull().any()].tolist()

        if columns_with_gaps:
            medians = {col: X[col].median() for col in columns_with_gaps}
            X = X.fillna(medians)

        return X, y

    def label_encoding(self, X, y):
        """Encode the most frequent class as 0 and the least frequent as 1.

        Labels that are neither of those two are left as they are. When all
        classes are equally frequent, the first class encountered becomes 0
        and the next distinct class becomes 1.

        Returns ``(X, y)``; ``X`` is passed through unchanged.
        """
        class_counts = Counter(y)

        majority_class = max(class_counts, key=class_counts.get)
        minority_class = min(class_counts, key=class_counts.get)

        # max() and min() both return the first key on a tie, which would make
        # the two classes identical for a balanced dataset; pick another label
        if minority_class == majority_class:
            minority_class = next(
                (label for label in class_counts if label != majority_class),
                majority_class,
            )

        mapping = {majority_class: 0}
        if minority_class != majority_class:
            mapping[minority_class] = 1

        # unmapped labels fall through unchanged
        y = y.map(lambda label: mapping.get(label, label))

        return X, y

    def get_clean_dataset(self, X, y):
        """Run constant-column removal, median imputation and label encoding.

        Returns the cleaned ``(X, y)``.
        """
        X, y = self.handling_constant_value(X, y)
        X, y = self.handling_missing_value(X, y)
        X, y = self.label_encoding(X, y)

        return X, y

    def normalisasi(self, X, y):
        """Scale every column of ``X`` with ``MinMaxScaler`` to (0.1, 0.9).

        Returns ``(X, y, attribute_name, scaler)`` where ``attribute_name``
        holds the scaled column labels and ``scaler`` is the fitted scaler.
        """
        scaler = MinMaxScaler(feature_range=(0.1, 0.9))

        attribute_name = X.columns

        # work on a copy so the caller's frame keeps its original values
        X = X.copy()
        X[attribute_name] = scaler.fit_transform(X[attribute_name])

        return X, y, attribute_name, scaler

    def seleksi_fitur(self, X, y, threshold=0.7):
        """Remove features that are strongly correlated with a better feature.

        Features are ranked by the absolute correlation with the class label.
        Walking down that ranking, every lower-ranked feature whose absolute
        correlation with the current feature is at least ``threshold`` is
        dropped.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame.
        y : pandas.Series
            Class labels (assumed to share the index of ``X`` - TODO confirm
            against callers).
        threshold : float, optional
            Absolute correlation from which two features count as redundant.

        Returns
        -------
        tuple
            ``(X, y, list_kolom)`` with the reduced frame, the untouched
            labels and the labels of the columns that were kept.
        """
        # join features and target so correlations can be computed together
        class_target = pd.Series(y, name="class")
        data = pd.concat([X, class_target], axis=1)

        target = data.columns[-1]

        # absolute correlation of every feature with the target; the target is
        # removed by label, not by position, so a feature with |r| == 1 can
        # never push the target itself into the feature ranking
        relevance = data.corr()[target].abs().drop(target)

        # most relevant feature first; a stable sort keeps tied features in
        # their original column order
        ranked = relevance.sort_values(ascending=False, kind="mergesort").index.tolist()

        columns_to_drop = []

        # NOTE(review): a feature that was already marked for removal is still
        # used as a reference here, exactly as before - confirm this is intended
        for position, reference in enumerate(ranked):
            for candidate in ranked[position + 1:]:
                if candidate in columns_to_drop:
                    continue
                correlation_coefficient = abs(data[reference].corr(data[candidate]))
                if correlation_coefficient >= threshold:
                    columns_to_drop.append(candidate)

        # drop the redundant features and split the target off again
        X = data.drop(columns=columns_to_drop + [target])

        list_kolom = X.columns

        return X, y, list_kolom

# SAMPLING

In [None]:
class FILTERING:
    """Noise filter that removes every sample a kNN classifier misclassifies.

    Parameters
    ----------
    neighbors : int
        Number of neighbours used by the kNN classifier in ``filter``.
    """

    def __init__(self, neighbors):
        self.neighbors = neighbors

    def fit_transform(self, X, y):
        """Filter a DataFrame/Series pair and return the cleaned pair.

        The column labels of ``X`` and the name of ``y`` are preserved; the
        returned objects carry a fresh default index.
        """
        # remember the labels so they can be restored after the numpy round trip
        X_columns = X.columns.tolist()
        y_name = y.name

        X_filtered, y_filtered = self.filter(X.to_numpy(), y.to_numpy())

        X = pd.DataFrame(X_filtered, columns=X_columns)
        y = pd.Series(y_filtered, name=y_name)

        return X, y

    def get_attribute_weight(self, X, y):
        """Return per-feature weights derived from the correlation with ``y``.

        Returns
        -------
        tuple of numpy.ndarray
            ``(normalized_weight, normalized_reversed_weight)``. The first is
            the absolute correlation of each feature with ``y`` scaled to sum
            to 1. The second is the inverse of those correlations scaled to
            sum to 1. A feature whose correlation is undefined (for example a
            constant column) or exactly zero gets weight 0 in both arrays; if
            no feature has a non-zero correlation both arrays are all zeros.
        """
        weight = np.abs(X.corrwith(y).to_numpy(dtype=float))

        # an undefined correlation must not poison the sums below, otherwise
        # every weight would turn into NaN
        weight[np.isnan(weight)] = 0.0

        total = weight.sum()
        if total == 0:
            return np.zeros_like(weight), np.zeros_like(weight)

        normalized_weight = weight / total

        # invert only the non-zero weights; dividing by a zero weight would
        # yield inf and wipe out the whole reversed vector on normalisation
        informative = weight > 0
        reversed_weight = np.zeros_like(weight)
        reversed_weight[informative] = total / weight[informative]
        normalized_reversed_weight = reversed_weight / reversed_weight.sum()

        return normalized_weight, normalized_reversed_weight

    def get_minority_majority_data(self, X_train, y_train):
        """Split numpy arrays into the most and the least frequent class.

        Returns ``(X_majority, X_minority, y_majority, y_minority,
        majority_class, minority_class)``. On a tie in the class counts both
        classes resolve to the first label encountered.
        """
        class_counts = Counter(y_train)

        majority_class = max(class_counts, key=class_counts.get)
        minority_class = min(class_counts, key=class_counts.get)

        # row positions of each class
        index_majority = np.where(y_train == majority_class)[0]
        index_minority = np.where(y_train == minority_class)[0]

        return (
            X_train[index_majority],
            X_train[index_minority],
            y_train[index_majority],
            y_train[index_minority],
            majority_class,
            minority_class,
        )

    def filter(self, X, y):
        """Drop the samples whose kNN prediction disagrees with their label.

        The classifier is fitted on ``(X, y)`` and then asked to predict the
        very same ``X``; rows that are predicted incorrectly are treated as
        noise and removed. Returns the remaining ``(X, y)`` as numpy arrays.
        """
        # use the configured neighbour count instead of the library default
        classifier = KNeighborsClassifier(n_neighbors=self.neighbors)
        classifier.fit(X, y)

        # True where the prediction matches the true label
        keep = classifier.predict(X) == y

        return X[keep], y[keep]

In [None]:
class WKSMOTE_RENN:
    """Hybrid resampler: kNN noise filter, weighted SMOTE-style oversampling
    and a weighted nearest-neighbour cleaning step.

    Parameters
    ----------
    neighbors : int
        Number of neighbours used while oversampling and while cleaning.
    n_neighbors_filter : int, optional
        Number of neighbours of the kNN classifier used by ``filter``.
    """

    def __init__(self, neighbors, n_neighbors_filter=5):
        self.neighbors = neighbors
        self.n_neighbors_filter = n_neighbors_filter

    def fit_resample(self, X, y):
        """Resample a DataFrame/Series pair.

        Feature weights are computed on the input data, then the data is
        filtered, oversampled (``wksmote``) and cleaned (``renn``). The column
        labels of ``X`` and the name of ``y`` are preserved; the returned
        objects carry a fresh default index.
        """
        # weights are computed on the labelled frame before it becomes numpy
        weight, reversed_weight = self.get_attribute_weight(X, y)

        X_columns = X.columns.tolist()
        y_name = y.name

        X_values, y_values = self.filter(X.to_numpy(), y.to_numpy())
        X_values, y_values = self.wksmote(X_values, y_values, weight)
        X_values, y_values = self.renn(X_values, y_values, reversed_weight)

        X = pd.DataFrame(X_values, columns=X_columns)
        y = pd.Series(y_values, name=y_name)

        return X, y

    def wksmote(self, X, y, weight):
        """Append synthetic minority samples until ``is_balance`` is satisfied.

        Minority points are visited in order (wrapping around at the end). For
        each one the ``neighbors`` nearest minority points are looked up with a
        Minkowski distance (``p=1``) weighted by ``weight`` and handed to
        ``create_synthetic_data``. At least one batch of synthetic samples is
        always generated.

        Returns ``(X, y)`` with the synthetic rows appended at the end.

        Raises
        ------
        ValueError
            If ``neighbors`` is smaller than 1, because no synthetic sample
            could ever be produced and the loop would not terminate.
        """
        if self.neighbors < 1:
            raise ValueError("neighbors must be at least 1")

        X_majority, X_minority, y_majority, y_minority, _, minority_class = (
            self.get_minority_majority_data(X, y)
        )

        # one extra neighbour is requested because the nearest hit of a
        # training point is expected to be the point itself
        neighbor_model = KNeighborsClassifier(
            n_neighbors=self.neighbors + 1, metric='minkowski', p=1, metric_params={'w': weight}
        )
        neighbor_model.fit(X_minority, y_minority)

        X_synthetic_data = np.zeros((0, X_majority.shape[1]))

        n_minority = len(y_minority)
        current_index = 0

        while True:
            data_point = np.array([X_minority[current_index]])
            distances, indices = neighbor_model.kneighbors(data_point, return_distance=True)

            # skip the first hit (the point itself) and key distances by row position
            sorted_nearest_distance = pd.Series(distances[0][1:], index=indices[0][1:])

            X_synthetic_data = self.create_synthetic_data(
                X_synthetic_data, data_point, X_minority, sorted_nearest_distance
            )

            if self.is_balance(y_majority, y_minority, X_synthetic_data):
                break

            # move on to the next minority point, wrapping around at the end
            current_index = (current_index + 1) % n_minority

        y_synthetic_data = np.full(X_synthetic_data.shape[0], minority_class)

        # np.vstack replaces np.row_stack, which is deprecated in NumPy 2.0
        X_wksmote = np.vstack((X, X_synthetic_data))
        y_wksmote = np.append(y, y_synthetic_data)

        return X_wksmote, y_wksmote

    def renn(self, X, y, weight):
        """Remove samples that disagree with their neighbourhood.

        Every row is compared with its ``neighbors`` nearest rows (weighted
        Minkowski distance, ``p=1``); ``remove_data`` decides which rows to
        discard. Despite the name, a single pass over the data is performed.

        Returns the remaining ``(X, y)`` as numpy arrays.
        """
        neighbor_model = KNeighborsClassifier(
            n_neighbors=self.neighbors + 1, metric='minkowski', p=1, metric_params={'w': weight}
        )
        neighbor_model.fit(X, y)

        index_remove = []

        for current_index in range(len(y)):
            data_point = np.array([X[current_index]])
            distances, indices = neighbor_model.kneighbors(data_point, return_distance=True)

            # skip the first hit (the point itself) and key distances by row position
            sorted_nearest_distance = pd.Series(distances[0][1:], index=indices[0][1:])

            # the row position is passed explicitly so duplicated rows are
            # attributed to the right sample and no search over X is needed
            index_remove = self.remove_data(
                index_remove, X, y, data_point, sorted_nearest_distance,
                data_point_index=current_index,
            )

        rows_to_drop = np.array(index_remove, dtype=int)
        X_renn = np.delete(X, rows_to_drop, axis=0)
        y_renn = np.delete(y, rows_to_drop, axis=0)

        return X_renn, y_renn

    def is_balance(self, y_majority, y_minority, X_synthetic_data):
        """Return True once minority plus synthetic rows reach the target.

        The target is the majority count minus ``2 * neighbors``.
        """
        count_minority = len(y_minority) + X_synthetic_data.shape[0]
        lower_limit = len(y_majority) - self.neighbors * 2
        return count_minority >= lower_limit

    def get_attribute_weight(self, X, y):
        """Return per-feature weights derived from the correlation with ``y``.

        Returns
        -------
        tuple of numpy.ndarray
            ``(normalized_weight, normalized_reversed_weight)``. The first is
            the absolute correlation of each feature with ``y`` scaled to sum
            to 1. The second is the inverse of those correlations scaled to
            sum to 1. A feature whose correlation is undefined (for example a
            constant column) or exactly zero gets weight 0 in both arrays; if
            no feature has a non-zero correlation both arrays are all zeros.
        """
        weight = np.abs(X.corrwith(y).to_numpy(dtype=float))

        # an undefined correlation must not poison the sums below, otherwise
        # every weight would turn into NaN
        weight[np.isnan(weight)] = 0.0

        total = weight.sum()
        if total == 0:
            return np.zeros_like(weight), np.zeros_like(weight)

        normalized_weight = weight / total

        # invert only the non-zero weights; dividing by a zero weight would
        # yield inf and wipe out the whole reversed vector on normalisation
        informative = weight > 0
        reversed_weight = np.zeros_like(weight)
        reversed_weight[informative] = total / weight[informative]
        normalized_reversed_weight = reversed_weight / reversed_weight.sum()

        return normalized_weight, normalized_reversed_weight

    def create_synthetic_data(self, X_synthetic_data, data_point, X_safe_minority, sorted_nearest_distance):
        """Create two synthetic rows per neighbour and append them.

        For every neighbour one row is placed between the neighbour and
        ``data_point`` and one row on the opposite side of the neighbour, each
        at a random fraction in [0.2, 0.4] of their distance.

        Returns ``X_synthetic_data`` with the new rows stacked underneath.
        """
        # A private generator seeded with 42 yields the same numbers as seeding
        # the module-level generator did, without clobbering global state.
        # NOTE(review): the seed is reset on every call, so each call draws the
        # same sequence of fractions - confirm this is intended.
        rng = random.Random(42)

        blocks = [X_synthetic_data]
        for i in sorted_nearest_distance.index:
            neighbor = X_safe_minority[i]

            # offsets from the neighbour towards and away from the data point
            towards_point = data_point - neighbor
            away_from_point = neighbor - data_point

            blocks.append(neighbor + rng.uniform(0.2, 0.4) * towards_point)
            blocks.append(neighbor + rng.uniform(0.2, 0.4) * away_from_point)

        return np.vstack(blocks)

    def remove_data(self, index_remove, X_wksmote, y_wksmote, data_point, sorted_nearest_distance,
                    data_point_index=None):
        """Collect the rows to discard around one data point.

        If the most frequent class among the neighbours differs from the class
        of the data point, the data point and every neighbour that is not of
        that most frequent class are appended to ``index_remove``.

        Parameters
        ----------
        index_remove : list
            Row positions collected so far; extended in place and returned.
        X_wksmote, y_wksmote : numpy.ndarray
            Features and labels the row positions refer to.
        data_point : numpy.ndarray
            The row being examined, shaped ``(1, n_features)``.
        sorted_nearest_distance : pandas.Series
            Neighbour distances indexed by neighbour row position.
        data_point_index : int, optional
            Row position of ``data_point``. When omitted, the first row of
            ``X_wksmote`` equal to ``data_point`` is used.
        """
        neighbor_labels = y_wksmote[sorted_nearest_distance.index]

        class_counts = Counter(neighbor_labels)
        majority_class_k_neighbors = max(class_counts, key=class_counts.get)

        if data_point_index is None:
            # fallback lookup by value; ambiguous when rows are duplicated
            data_point_index = np.where((X_wksmote == data_point).all(axis=1))[0][0]

        data_point_class = y_wksmote[data_point_index]

        if majority_class_k_neighbors != data_point_class:
            index_remove.append(data_point_index)
            index_remove.extend(
                index
                for index in sorted_nearest_distance.index
                if y_wksmote[index] != majority_class_k_neighbors
            )

        return index_remove

    def get_minority_majority_data(self, X_train, y_train):
        """Split numpy arrays into the most and the least frequent class.

        Returns ``(X_majority, X_minority, y_majority, y_minority,
        majority_class, minority_class)``. On a tie in the class counts both
        classes resolve to the first label encountered.
        """
        class_counts = Counter(y_train)

        majority_class = max(class_counts, key=class_counts.get)
        minority_class = min(class_counts, key=class_counts.get)

        # row positions of each class
        index_majority = np.where(y_train == majority_class)[0]
        index_minority = np.where(y_train == minority_class)[0]

        return (
            X_train[index_majority],
            X_train[index_minority],
            y_train[index_majority],
            y_train[index_minority],
            majority_class,
            minority_class,
        )

    def filter(self, X, y):
        """Drop the samples whose kNN prediction disagrees with their label.

        The classifier (``n_neighbors_filter`` neighbours) is fitted on
        ``(X, y)`` and then asked to predict the very same ``X``; rows that are
        predicted incorrectly are treated as noise and removed. Returns the
        remaining ``(X, y)`` as numpy arrays.
        """
        classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors_filter)
        classifier.fit(X, y)

        # True where the prediction matches the true label
        keep = classifier.predict(X) == y

        return X[keep], y[keep]

In [None]:
class SMOTE_ENN:
    """Wrapper around ``SMOTEENN`` that uses one neighbour count for both the
    SMOTE step and the Edited Nearest Neighbours step."""

    def __init__(self, neighbors):
        self.neighbors = neighbors

    def fit_resample(self, X, y):
        """Resample ``(X, y)`` with SMOTE-ENN and return the resampled pair."""
        k = self.neighbors

        # SMOTE is seeded with 42; ENN takes no seed here
        sampler = SMOTEENN(
            smote=SMOTE(k_neighbors=k, random_state=42),
            enn=EditedNearestNeighbours(n_neighbors=k),
        )

        return sampler.fit_resample(X, y)

# CLASSIFIER

In [None]:
class Classification:
    """Train a classifier on the training data given at construction time."""

    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def naive_bayes(self):
        """Return a ``GaussianNB`` model fitted on the stored training data."""
        # fit() returns the fitted estimator itself
        return GaussianNB().fit(self.X_train, self.y_train)

    def ann(self):
        """Return an ``MLPClassifier`` fitted on the stored training data."""
        network = MLPClassifier(random_state=42, tol=0.001, max_iter=500)
        # fit() returns the fitted estimator itself
        return network.fit(self.X_train, self.y_train)

# TRAINING & TESTING PROCESS

In [None]:
class Train_Test_Process:
    """Cross-validated training and hold-out evaluation of a classifier."""

    @staticmethod
    def _recall_specificity(tn, fp, fn, tp):
        """Return ``(recall, specificity)`` as fractions from confusion counts."""
        return tp / (tp + fn), tn / (tn + fp)

    def train(self, X_train, y_train, object_class_sampling=None, classifier="NB"):
        """Pick the best model from 10-fold stratified cross-validation.

        In every fold the training part is optionally resampled, a model is
        trained and its G-mean on the validation part is computed. The model
        of the first fold with the highest G-mean is then refitted on that
        fold's (resampled) training part plus its validation part.

        Parameters
        ----------
        X_train : pandas.DataFrame
            Training features.
        y_train : pandas.Series
            Training labels (binary; the confusion matrix is unpacked into
            four counts).
        object_class_sampling : object, optional
            Resampler exposing ``fit_resample(X, y)``; ``None`` disables
            resampling.
        classifier : str, optional
            ``"NB"`` selects naive Bayes; any other value selects the ANN.

        Returns
        -------
        tuple
            ``(model, X_train, y_train)`` - the refitted model and the data it
            was refitted on.
        """
        # the splitter works on numpy arrays
        X = X_train.to_numpy()
        y = y_train.to_numpy()
        feature_names = X_train.columns

        splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

        # only the best fold is needed afterwards, so only that one is kept
        best_g_mean = None
        best_fold = None

        for train_index, test_index in splitter.split(X, y):
            # rebuild labelled pandas objects for this fold
            X_train_fold = pd.DataFrame(X[train_index], columns=feature_names)
            X_test_fold = pd.DataFrame(X[test_index], columns=feature_names)
            y_train_fold = pd.Series(y[train_index])
            y_test_fold = pd.Series(y[test_index])

            # resample the training part only
            if object_class_sampling is None:
                X_sampling, y_sampling = X_train_fold, y_train_fold
            else:
                X_sampling, y_sampling = object_class_sampling.fit_resample(X_train_fold, y_train_fold)

            trainer = Classification(X_sampling, y_sampling)
            if classifier == "NB":
                model = trainer.naive_bayes()
            else:
                model = trainer.ann()

            y_predict = pd.Series(model.predict(X_test_fold))

            tn, fp, fn, tp = confusion_matrix(y_test_fold, y_predict).ravel()
            recall, specificity = self._recall_specificity(tn, fp, fn, tp)
            g_mean = (recall * specificity) ** 0.5

            # strict comparison: on equal scores the earlier fold wins
            if best_fold is None or best_g_mean < g_mean:
                best_g_mean = g_mean
                best_fold = (model, X_sampling, X_test_fold, y_sampling, y_test_fold)

        model, X_train_best, X_test_best, y_train_best, y_test_best = best_fold

        # refit the best model on its fold's training and validation data
        X_train = pd.concat([X_train_best, X_test_best], axis=0)
        y_train = pd.concat([y_train_best, y_test_best], axis=0)

        model.fit(X_train, y_train)

        return model, X_train, y_train

    def test(self, model, X_test, y_test):
        """Evaluate ``model`` on the test data.

        Returns ``[accuracy, recall, specificity, g_mean]`` as percentages
        rounded to two decimals.
        """
        y_predict = pd.Series(model.predict(X_test))

        tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
        recall, specificity = self._recall_specificity(tn, fp, fn, tp)

        accuracy = ((tn + tp) / (tn + tp + fp + fn)) * 100
        recall = recall * 100
        specificity = specificity * 100
        # both factors are percentages, so the geometric mean is one as well
        g_mean = (recall * specificity) ** 0.5

        list_performa = [
            round(accuracy, 2),
            round(recall, 2),
            round(specificity, 2),
            round(g_mean, 2),
        ]

        return list_performa

# VISUALIZATION

In [None]:
def visualize(list_smote_enn, list_wksmote_renn):
    """Draw a grouped bar chart comparing SMOTE-ENN with WKSMOTE-RENN.

    Each argument is a sequence of four values in the order accuracy, recall,
    specificity, G-mean. The y-axis is fixed to 0-100.
    """
    metric_names = ["Accuracy", "Recall", "Specificity", "G-Mean"]
    x = np.arange(len(metric_names))
    width = 0.30
    border_color = "#000000"

    fig, ax = plt.subplots()

    # one bar group per sampling method, placed left and right of each tick
    bar_groups = [
        ax.bar(x - 0.5 * width, list_smote_enn, width, label='SMOTE-ENN',
               color="#694897", edgecolor=border_color),
        ax.bar(x + 0.5 * width, list_wksmote_renn, width, label='WKSMOTE-RENN',
               color="#FEE21D", edgecolor=border_color),
    ]

    # title, tick labels and a legend below the plot
    ax.set_title('Perbandingan Performa')
    ax.set_xticks(x)
    ax.set_xticklabels(metric_names)
    ax.legend(loc="lower center", ncol=2, bbox_to_anchor=(0.5, -0.2, 0, 0))

    # write the value of every bar just above it
    for group in bar_groups:
        for bar in group:
            value = bar.get_height()
            ax.annotate('{}'.format(value),
                        xy=(bar.get_x() + bar.get_width() / 2, value),
                        xytext=(0, 1),
                        textcoords="offset points",
                        ha='center', va='bottom')

    fig.tight_layout()
    plt.ylim(0, 100)
    plt.show()

In [None]:
def visualize_v2(list_base, list_smote_enn, list_wksmote_renn):
    """Draw a grouped bar chart comparing the original data, SMOTE-ENN and
    WKSMOTE-RENN.

    Each argument is a sequence of four values in the order accuracy, recall,
    specificity, G-mean. The y-axis is fixed to 0-100.
    """
    metric_names = ["Accuracy", "Recall", "Specificity", "G-Mean"]
    x = np.arange(len(metric_names))
    width = 0.30
    border_color = "#000000"

    fig, ax = plt.subplots()

    # (bar positions, values, legend label, fill colour) per method
    series = (
        (x - width, list_base, 'Original', "#00B8BC"),
        (x, list_smote_enn, 'SMOTE-ENN', "#694897"),
        (x + width, list_wksmote_renn, 'WKSMOTE-RENN', "#FEE21D"),
    )
    bar_groups = [
        ax.bar(positions, values, width, label=label, color=color, edgecolor=border_color)
        for positions, values, label, color in series
    ]

    # title, tick labels and a legend below the plot
    ax.set_title('Perbandingan Performa')
    ax.set_xticks(x)
    ax.set_xticklabels(metric_names)
    ax.legend(loc="lower center", ncol=3, bbox_to_anchor=(0.5, -0.2, 0, 0))

    # write the value of every bar just above it
    for group in bar_groups:
        for bar in group:
            value = bar.get_height()
            ax.annotate('{}'.format(value),
                        xy=(bar.get_x() + bar.get_width() / 2, value),
                        xytext=(0, 1),
                        textcoords="offset points",
                        ha='center', va='bottom')

    fig.tight_layout()
    plt.ylim(0, 100)
    plt.show()

In [None]:
def show_plot(X_base, X_smote_enn, X_wksmote_renn, y_base, y_smote_enn, y_wksmote_renn, column1, column2):
    """Show three side-by-side scatter plots of ``column1`` against ``column2``.

    The panels show the original data, the SMOTE-ENN result and the
    WKSMOTE-RENN result, each coloured by its own labels.
    """
    # two colours, one per class
    colors = ["#440154", "#FC9900"]

    # one row with three panels
    f, axes = plt.subplots(1, 3, figsize=(10, 5), dpi=100)

    panels = (
        ('Data Asli', X_base, y_base),
        ('SMOTE-ENN', X_smote_enn, y_smote_enn),
        ('WKSMOTE-RENN', X_wksmote_renn, y_wksmote_renn),
    )
    for axis, (title, frame, labels) in zip(axes, panels):
        sns.scatterplot(x=column1, y=column2, data=frame, hue=labels, ax=axis, s=20, palette=colors)
        axis.set_title(title)

    plt.legend(loc="best")
    plt.tight_layout()
    plt.show()

In [None]:
def show_confusion_matrix(model, X_test, y_test):
    """Plot the confusion matrix of ``model`` on a test split.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test
        Features handed to ``model.predict``.
    y_test
        True labels compared against the predictions.
    """
    # wrap the raw predictions in a Series before scoring
    predictions = pd.Series(model.predict(X_test))

    matrix = metrics.confusion_matrix(y_test, predictions)

    # the two classes are displayed as False / True
    display = metrics.ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=[False, True])
    display.plot()
    plt.show()

# WORKING AREA

In [None]:
# # get data training dan data testing dengan memanggil method
# X_train, X_test, y_train, y_test, URL = GenerateDataset().get_data()

# # encoding
# X_train, y_train = Preprocessing().label_encoding(X_train, y_train)

# # cleaning data
# X_train, y_train = Preprocessing().get_clean_dataset(X_train, y_train)

# # implementasi normalisasi data pada data latih
# X_train, y_train, list_column_norm, scaler = Preprocessing().normalisasi(X_train, y_train)

# # implementasi seleksi fitur
# X_train, y_train, list_column_selected = Preprocessing().seleksi_fitur(X_train, y_train)

# # membuang fitur yang tidak digunakan pada proses preprocessing
# X_test = X_test[list_column_norm]

# # encoding
# X_test, y_test = Preprocessing().label_encoding(X_test, y_test)

# # implementasi normalisasi data pada data uji
# X_test, y_test = pd.DataFrame(scaler.transform(X_test), columns=list_column_norm), y_test.reset_index(drop=True)

# # menggunakan kolom yang akan digunakan saja
# X_test, y_test = X_test[list_column_selected], y_test

In [None]:
# object dari class beberapa algoritma sampling
# smote_enn = SMOTE_ENN(neighbors=5)
# wksmote_renn = WKSMOTE_RENN(neighbors=5)

# proses training dengan menerapkan beberapa algoritma sampling di naive bayes
# model_base, X_base, y_base = Train_Test_Process().train(X_train, y_train, classifier="NB")
# model_smote_enn, X_smote_enn, y_smote_enn = Train_Test_Process().train(X_train, y_train, smote_enn, classifier="NB")
# model_wksmote_renn, X_wksmote_renn, y_wksmote_renn = Train_Test_Process().train(X_train, y_train, wksmote_renn, classifier="NB")

In [None]:
# proses testing
# result_base = Train_Test_Process().test(model_base, X_test, y_test)
# result_smote_enn = Train_Test_Process().test(model_smote_enn, X_test, y_test)
# result_wksmote_renn = Train_Test_Process().test(model_wksmote_renn, X_test, y_test)

# HASIL

In [None]:
# visualize(result_smote_enn, result_wksmote_renn)

In [None]:
# show_confusion_matrix(model_wksmote_renn, X_test, y_test)
# print("accuracy: ", result_wksmote_renn[0])
# print("recall: ", result_wksmote_renn[1])
# print("specificity: ", result_wksmote_renn[2])
# print("gmean: ", result_wksmote_renn[3])

In [None]:
# URL_DATA = URL.split("/")[-1]

# if URL_DATA == 'spam.csv':
#     column1 = "word_freq_all"
#     column2 = "char_freq_$"
# elif URL_DATA == 'ionosphere.csv':
#     column1 = "V8"
#     column2 = "V30"
# elif URL_DATA == 'colon-cancer.csv':
#     column1 = "V53"
#     column2 = "V30"
# elif URL_DATA == 'voice.csv':
#     column1 = "Jitter->pitch_PQ5_classical_Baken"
#     column2 = "HNR->HNR_dB_Praat_std"
# elif URL_DATA == 'parkinson.csv':
#     column1 = "f1"
#     column2 = "tqwt_kurtosisValue_dec_21"
# elif URL_DATA == 'mri.csv':
#     column1 = "obstime"
#     column2 = "plt"
# elif URL_DATA == 'heart-attack.csv':
#     column1 = "BMI"
#     column2 = "PhysHlth"
# elif URL_DATA == 'ozone-level.csv':
#     column1 = "V12"
#     column2 = "V59"

# show_plot(X_train, X_smote_enn, X_wksmote_renn, y_train, y_smote_enn, y_wksmote_renn, column1, column2)

# AUTOMATION OF FEATURE SELECTION

In [None]:
# class GenerateDatasetAutomation:
#     def get_data(self, URL):

#         # akses dataset melalui URL
#         dataset = pd.read_csv(URL)

#         # menentukan data dan kelas
#         X = dataset.iloc[:, :-1]
#         y = dataset.iloc[:, -1]

#         # membagi dataset menjadi data training dan data testing
#         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

#         # mengembalikan dataset yang sudah displit
#         return X_train, X_test, y_train, y_test, URL

In [None]:
# def GetAutomationResult():
#     list_dataset = ["spam.csv", "ionosphere.csv", "voice.csv", "parkinson.csv", "malware.csv", "mri.csv", "heart-attack.csv", "ozone-level.csv"]

#     for i in list_dataset:
#         url = "https://raw.githubusercontent.com/adizaen/dataset-tesis/main/" + i

#         # get data training dan data testing dengan memanggil method
#         X_train, X_test, y_train, y_test, URL = GenerateDatasetAutomation().get_data(url)

#         # encoding
#         X_train, y_train = Preprocessing().label_encoding(X_train, y_train)

#         # cleaning data
#         X_train, y_train = Preprocessing().get_clean_dataset(X_train, y_train)

#         # implementasi normalisasi data pada data latih
#         X_train, y_train, list_column_norm, scaler = Preprocessing().normalisasi(X_train, y_train)

#         # 0: tidak memakai seleksi fitur, 1: memakai seleksi fitur
#         for j in range(2):
#             if j == 1:
#                 # implementasi seleksi fitur
#                 X_train, y_train, list_column_selected = Preprocessing().seleksi_fitur(X_train, y_train)
#                 print("Jumlah fitur setelah SF: ", len(X_train.columns))
#             else:
#                 print("Jumlah fitur original: ", len(X_train.columns))

#             # # membuang fitur yang tidak digunakan pada proses preprocessing
#             # X_test = X_test[list_column_norm]

#             # # encoding
#             # X_test, y_test = Preprocessing().label_encoding(X_test, y_test)

#             # # implementasi normalisasi data pada data uji
#             # X_test, y_test = pd.DataFrame(scaler.transform(X_test), columns=list_column_norm), y_test.reset_index(drop=True)

#             # # menggunakan kolom yang akan digunakan saja
#             # if j == 1:
#             #     X_test, y_test = X_test[list_column_selected], y_test

#             # # generate model
#             # start = time.time()
#             # model_base, X_base, y_base = Train_Test_Process().train(X_train, y_train, classifier="ANN")
#             # end = time.time()

#             # # get performa model pada data test
#             # result_base = Train_Test_Process().test(model_base, X_base, y_base)

#             # # print hasil
#             # if j == 0:
#             #     print("----------- DATASET " + i + " ORI -----------")
#             # else:
#             #     print("----------- DATASET " + i + " DENGAN SF -----------")

#             # print(result_base)
#             # print("Training time {:.3f} detik" . format(end - start))
#             # print("\n")

In [None]:
# GetAutomationResult()

# AUTOMATION OF FILTER

In [None]:
class GenerateDatasetParallel:
    """Loader that reads one CSV dataset and splits it into train/test parts."""

    def get_data(self, URL):
        """Read the CSV at ``URL`` and return a train/test split of it.

        The last column is taken as the class label and every preceding
        column as a feature. The split uses ``test_size=0.20`` and
        ``random_state=42``.

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test, URL)``.
        """
        frame = pd.read_csv(URL)

        # every column but the last is a feature; the last one is the label
        features, labels = frame.iloc[:, :-1], frame.iloc[:, -1]

        parts = train_test_split(features, labels, test_size=0.20, random_state=42)

        # hand the source URL back alongside the four split parts
        return (*parts, URL)

In [None]:
def GetParallelResult(n_neighbors, n_neighbors_filter, classifier, datasets=None):
    """Compare SMOTE-ENN and WKSMOTE-RENN on a series of datasets.

    For every dataset the CSV is loaded and split, the training split is
    label-encoded, cleaned, normalised and feature-selected, and the test
    split is encoded, transformed with the scaler returned by the training
    normalisation and reduced to the selected columns. One model is then
    trained per sampling technique, both are evaluated on the test split and
    the two results are handed to ``visualize``.

    Parameters
    ----------
    n_neighbors
        Passed as ``neighbors`` to both ``SMOTE_ENN`` and ``WKSMOTE_RENN``.
    n_neighbors_filter
        Passed as ``n_neighbors_filter`` to ``WKSMOTE_RENN``.
    classifier
        Forwarded to ``Train_Test_Process().train`` (this notebook calls it
        with ``"NB"`` and ``"ANN"``).
    datasets
        Optional iterable of CSV file names located under the dataset
        repository URL. When omitted, the eight default datasets are used.

    Returns
    -------
    None
        Results are printed and plotted, not returned.
    """
    base_url = "https://raw.githubusercontent.com/adizaen/dataset-tesis/main/"

    if datasets is None:
        datasets = ["spam.csv", "ionosphere.csv", "voice.csv", "parkinson.csv", "malware.csv", "mri.csv", "heart-attack.csv", "ozone-level.csv"]

    for dataset_name in datasets:
        url = base_url + dataset_name

        # load the dataset and split it into training and test data
        X_train, X_test, y_train, y_test, _ = GenerateDatasetParallel().get_data(url)

        # encoding
        X_train, y_train = Preprocessing().label_encoding(X_train, y_train)

        # data cleaning
        X_train, y_train = Preprocessing().get_clean_dataset(X_train, y_train)

        # normalise the training data; keep the scaler for the test data
        X_train, y_train, list_column_norm, scaler = Preprocessing().normalisasi(X_train, y_train)

        # feature selection
        X_train, y_train, list_column_selected = Preprocessing().seleksi_fitur(X_train, y_train)

        # drop the test features that were not kept during preprocessing
        X_test = X_test[list_column_norm]

        # encoding
        X_test, y_test = Preprocessing().label_encoding(X_test, y_test)

        # normalise the test data with the scaler obtained from the training data
        X_test = pd.DataFrame(scaler.transform(X_test), columns=list_column_norm)
        y_test = y_test.reset_index(drop=True)

        # keep only the selected feature columns
        X_test = X_test[list_column_selected]

        # sampling algorithm objects
        smote_enn = SMOTE_ENN(neighbors=n_neighbors)
        wksmote_renn = WKSMOTE_RENN(neighbors=n_neighbors, n_neighbors_filter=n_neighbors_filter)

        # training with each sampling algorithm applied
        model_smote_enn, X_smote_enn, y_smote_enn = Train_Test_Process().train(X_train, y_train, smote_enn, classifier=classifier)
        model_wksmote_renn, X_wksmote_renn, y_wksmote_renn = Train_Test_Process().train(X_train, y_train, wksmote_renn, classifier=classifier)

        # testing
        result_smote_enn = Train_Test_Process().test(model_smote_enn, X_test, y_test)
        result_wksmote_renn = Train_Test_Process().test(model_wksmote_renn, X_test, y_test)

        print("Dataset: ", dataset_name)
        visualize(result_smote_enn, result_wksmote_renn)

In [None]:
# Compare SMOTE-ENN and WKSMOTE-RENN on all datasets: n_neighbors=3, n_neighbors_filter=5, classifier "NB".
GetParallelResult(n_neighbors=3, n_neighbors_filter = 5, classifier="NB")

In [None]:
# Compare SMOTE-ENN and WKSMOTE-RENN on all datasets: n_neighbors=3, n_neighbors_filter=5, classifier "ANN".
GetParallelResult(n_neighbors=3, n_neighbors_filter = 5, classifier="ANN")

In [None]:
# Compare SMOTE-ENN and WKSMOTE-RENN on all datasets: n_neighbors=4, n_neighbors_filter=5, classifier "NB".
GetParallelResult(n_neighbors=4, n_neighbors_filter = 5, classifier="NB")

In [None]:
# Compare SMOTE-ENN and WKSMOTE-RENN on all datasets: n_neighbors=4, n_neighbors_filter=5, classifier "ANN".
GetParallelResult(n_neighbors=4, n_neighbors_filter = 5, classifier="ANN")

In [None]:
# Compare SMOTE-ENN and WKSMOTE-RENN on all datasets: n_neighbors=5, n_neighbors_filter=5, classifier "NB".
GetParallelResult(n_neighbors=5, n_neighbors_filter = 5, classifier="NB")

In [None]:
# Compare SMOTE-ENN and WKSMOTE-RENN on all datasets: n_neighbors=5, n_neighbors_filter=5, classifier "ANN".
GetParallelResult(n_neighbors=5, n_neighbors_filter = 5, classifier="ANN")

In [None]:
# Compare SMOTE-ENN and WKSMOTE-RENN on all datasets: n_neighbors=6, n_neighbors_filter=5, classifier "NB".
GetParallelResult(n_neighbors=6, n_neighbors_filter = 5, classifier="NB")

In [None]:
# Compare SMOTE-ENN and WKSMOTE-RENN on all datasets: n_neighbors=6, n_neighbors_filter=5, classifier "ANN".
GetParallelResult(n_neighbors=6, n_neighbors_filter = 5, classifier="ANN")

In [None]:
# Compare SMOTE-ENN and WKSMOTE-RENN on all datasets: n_neighbors=7, n_neighbors_filter=5, classifier "NB".
GetParallelResult(n_neighbors=7, n_neighbors_filter = 5, classifier="NB")

In [None]:
# Compare SMOTE-ENN and WKSMOTE-RENN on all datasets: n_neighbors=7, n_neighbors_filter=5, classifier "ANN".
GetParallelResult(n_neighbors=7, n_neighbors_filter = 5, classifier="ANN")