# IMPORT DATA

Impor dataset asli untuk kemudian dilakukan preprocessing data.

https://blog.tensorflow.org/2020/01/hyperparameter-tuning-with-keras-tuner.html

In [None]:
#import library

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from sklearn.preprocessing import StandardScaler

In [None]:
# Function that cleans up the column names

def HeaderFile():
    """Strip double-quote characters from the column names of the global ``df``.

    The cleaned labels are written back to ``df.columns`` in place, in the
    original column order. Nothing is returned.
    """
    # Build the cleaned labels with a comprehension; the previous version kept
    # them in a local variable named ``list``, which shadowed the builtin.
    df.columns = [col.replace('"', '') for col in df.columns]

In [None]:
# Load the CSV file. csv.QUOTE_NONE turns off quote handling in the parser.

df = pd.read_csv('fraud_data.csv', quoting = csv.QUOTE_NONE)
# Remove the double-quote characters from the column names of df.
HeaderFile()
# Preview the first five rows.
df.head(5)

# PREPROCESSING

Proses untuk membersihkan dan mentransformasikan data sebelum dilakukan klasifikasi. Langkah-langkah yang dilakukan pada tahap ini yaitu mengisi nilai kosong dengan median, menghapus kolom dengan nilai konstan, dan normalisasi menggunakan Z-score (setiap fitur diskalakan sehingga memiliki rerata 0 dan standar deviasi 1).

In [None]:
# Function that handles missing values

def HandlingMissingValue():
    """Fill the missing values of the global ``df`` with per-column medians.

    ``df`` is modified in place. A status message is printed either way; when
    something was filled, the list of affected column names is printed as
    well. Nothing is returned.
    """
    # Names of the columns that contain at least one null value.
    missColumn = df.columns[df.isnull().any()].tolist()

    # Nothing to do: report and leave early.
    if not missColumn:
        print("Your data is complete! No handling missing value required.")
        return

    # Map every affected column to its own median and fill them all in a
    # single call instead of one fillna per column.
    df.fillna({col: df[col].median() for col in missColumn}, inplace=True)

    print("Successfully handle missing values!")
    print(missColumn)

In [None]:
# Function that handles constant-valued columns

def HandlingConstantValue():
    """Remove the constant columns of the global ``df``.

    A column counts as constant when ``nunique()`` reports exactly one
    distinct value. Matching columns are dropped from ``df`` in place and
    their names are printed; otherwise only a status message is printed.
    Nothing is returned.
    """
    # Collect the names of all columns with a single distinct value.
    constantValue = [name for name in df.columns if df[name].nunique() == 1]

    # No constant column found: report and stop.
    if not constantValue:
        print("Your data is good! No handling constant value required.")
        return

    # Drop every constant column in one call.
    df.drop(columns=constantValue, inplace=True)

    print("Successfully handle constant values!")
    print(constantValue)

In [None]:
# Function that normalises the features with the Z-score

def Normalization(dataFrame):
    """Standardise every feature column and re-attach the class column.

    Args:
        dataFrame: frame holding the feature columns plus the class column
            ``flag_transaksi_fraud``.

    Returns:
        A new DataFrame with a fresh default index: the feature columns
        transformed by ``StandardScaler`` (fit on ``dataFrame`` itself),
        followed by the unchanged class column as the last column.
    """
    target = "flag_transaksi_fraud"

    # Every column except the class column is treated as a feature.
    cols = [c for c in dataFrame.columns if c != target]

    # fit computes each feature's mean and standard deviation, transform
    # applies them to the data.
    std_scaler = StandardScaler()
    df_std = pd.DataFrame(std_scaler.fit_transform(dataFrame[cols]), columns=cols)

    # Attach the labels by position. df_std has a fresh default index, so an
    # index-aligned assignment would produce NaN or misplaced labels whenever
    # dataFrame's index is not the default 0..n-1 range.
    df_std[target] = dataFrame[target].to_numpy()

    print("Data normalization complete!")

    return df_std

In [None]:
# Fill the missing values of df in place (see HandlingMissingValue).
HandlingMissingValue()

In [None]:
# Drop the constant-valued columns of df in place (see HandlingConstantValue).
HandlingConstantValue()

In [None]:
# Replace df with the frame returned by Normalization, then display it.
df = Normalization(df)
df

# SAMPLING DATA DENGAN SMOTE

In [None]:
# Resample the data set with SMOTE
import imblearn
from imblearn.over_sampling import SMOTE

# Name of the class column; every other column of df is used as a feature.
target = "flag_transaksi_fraud"
cols = [name for name in df.columns if name != target]

# Feature matrix X and label vector Y
X = df[cols]
Y = df[target]

# NOTE(review): the resampling below is applied to the complete data set. If a
# hold-out split is taken from X_smote / Y_smote afterwards, synthetic samples
# may be derived from rows that end up in the hold-out part - confirm whether
# that is intended.
sm = SMOTE(random_state=42)
X_smote, Y_smote = sm.fit_resample(X, Y)

In [None]:
from pandas import DataFrame
# Put the resampled labels into a one-column frame so they can be plotted and
# counted next to the original data.
test = pd.DataFrame(Y_smote, columns = ['flag_transaksi_fraud'])

In [None]:
#visualizing smote results
# Two side-by-side count plots of the class column: df on the left, the
# resampled labels (test) on the right.
fig, axs = plt.subplots(ncols=2, figsize=(13,4.5))
sns.countplot(x="flag_transaksi_fraud", data=df, ax=axs[0])
sns.countplot(x="flag_transaksi_fraud", data=test, ax=axs[1])

# Figure title plus one title per subplot.
fig.suptitle("Class repartition before and after smote")
a1=fig.axes[0]
a1.set_title("Before")
a2=fig.axes[1]
a2.set_title("After")

# Number of rows labelled 0 and 1 before resampling.
print('Before SMOTE')
print('--------------------')
print('Normal Transactions \t: ', (Y.values == 0).sum())
print('Fraud Transactions \t: ', (Y.values == 1).sum())

# Number of rows labelled 0 and 1 after resampling.
print('\nAfter SMOTE')
print('--------------------')
print('Normal Transactions \t: ', (test.values == 0).sum())
print('Fraud Transactions \t: ', (test.values == 1).sum())

# BUILDING NN MODEL

In [None]:
from sklearn.model_selection import train_test_split

# Split the resampled data: 20% is held out as the test set, and random_state
# fixes the seed of the split.
X_train, X_test, y_train, y_test = train_test_split(X_smote, Y_smote, test_size=0.2, random_state=1)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# MANUAL TUNING HIDDEN LAYER

In [None]:
#train the model

def TuningHiddenLayer():
    """Train and evaluate the hard-coded network five times and print the scores.

    Every run builds a fresh model (hidden layers of 512, 256, 128 and 64 ReLU
    units, one sigmoid output), trains it on the global ``X_train`` /
    ``y_train`` with Adam (learning rate 0.01), batch size 32 and at most 100
    epochs with early stopping on the validation loss, then scores it on the
    global ``X_test`` / ``y_test``.

    Prints eight lists with one entry per run, in this order: training time in
    seconds, accuracy, precision, recall, specificity, F1 score, error and AUC
    (all but the time as percentages). Label 1 is the positive class.
    Returns nothing.
    """
    # import library for count training time
    import time

    List_Time = []
    List_Accuracy = []
    List_Precission = []
    List_Recall = []
    List_Specificity = []
    List_F1_Score = []
    List_Error = []
    List_AUC = []

    # Size the input layer from the training data instead of hard-coding it.
    n_features = X_train.shape[1]

    for _ in range(5):
        start_time = time.time()

        model = Sequential()
        model.add(Dense(512, activation='relu', input_dim=n_features))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(128, activation='relu'))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))

        opt = tf.keras.optimizers.Adam(learning_rate=0.01) #optimizer
        model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy']) #metrics

        earlystopper = EarlyStopping(monitor='val_loss', min_delta=0, patience=15, verbose=1, mode='auto', baseline=None, restore_best_weights=False)

        model.fit(X_train.values, y_train.values, epochs = 100, batch_size=32, validation_split = 0.20, verbose = 0, callbacks = [earlystopper])

        # The measured time covers model construction and training.
        training_time = time.time() - start_time

        # Predict once: the probabilities feed the ROC curve, the thresholded
        # values feed the confusion matrix.
        y_proba = model.predict(X_test).ravel()
        y_pred_nn = (y_proba > 0.5).astype("int32")

        # scikit-learn lays the binary matrix out as [[tn, fp], [fn, tp]], so
        # the flattened order is tn, fp, fn, tp. labels=[0, 1] pins that
        # layout even if one class is absent.
        matrix_nn = confusion_matrix(y_test, y_pred_nn, labels=[0, 1])
        tn, fp, fn, tp = matrix_nn.ravel()

        accuracy = ((tn+tp)/(tn+tp+fp+fn)) * 100
        precision = (tp/(tp+fp)) * 100
        recall = (tp/(tp+fn)) * 100
        specificity = (tn/(tn+fp)) * 100
        # Local name f1 keeps the imported sklearn f1_score function usable.
        f1 = 2 * (recall * precision) / (recall + precision)
        error = 100 - accuracy

        #AUC
        fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_test, y_proba)
        auc_keras = auc(fpr_keras, tpr_keras) * 100

        List_Time.append(training_time)
        List_Accuracy.append(accuracy)
        List_Precission.append(precision)
        List_Recall.append(recall)
        List_Specificity.append(specificity)
        List_F1_Score.append(f1)
        List_Error.append(error)
        List_AUC.append(auc_keras)

    print(List_Time)
    print(List_Accuracy)
    print(List_Precission)
    print(List_Recall)
    print(List_Specificity)
    print(List_F1_Score)
    print(List_Error)
    print(List_AUC)

In [None]:
# TuningHiddenLayer()

# MANUAL TUNING BATCH SIZE

In [None]:
#train the model

def TuningBatchSize(numBatchSize):
    """Train and evaluate the network five times with the given batch size.

    Every run builds a fresh model (hidden layers of 512, 256 and 128 ReLU
    units, one sigmoid output), trains it on the global ``X_train`` /
    ``y_train`` with Adam (learning rate 0.01) and at most 100 epochs with
    early stopping on the validation loss, then scores it on the global
    ``X_test`` / ``y_test``.

    Args:
        numBatchSize: batch size passed to ``model.fit``.

    Prints eight lists with one entry per run, in this order: training time in
    seconds, accuracy, precision, recall, specificity, F1 score, error and AUC
    (all but the time as percentages). Label 1 is the positive class.
    Returns nothing.
    """
    # import library for count training time
    import time

    List_Time = []
    List_Accuracy = []
    List_Precission = []
    List_Recall = []
    List_Specificity = []
    List_F1_Score = []
    List_Error = []
    List_AUC = []

    # Size the input layer from the training data instead of hard-coding it.
    n_features = X_train.shape[1]

    for _ in range(5):
        start_time = time.time()

        model = Sequential()
        model.add(Dense(512, activation='relu', input_dim=n_features))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(128, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))

        opt = tf.keras.optimizers.Adam(learning_rate=0.01) #optimizer
        model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy']) #metrics

        earlystopper = EarlyStopping(monitor='val_loss', min_delta=0, patience=15, verbose=1, mode='auto', baseline=None, restore_best_weights=False)

        model.fit(X_train.values, y_train.values, epochs = 100, batch_size = numBatchSize, validation_split = 0.20, verbose = 0, callbacks = [earlystopper])

        # The measured time covers model construction and training.
        training_time = time.time() - start_time

        # Predict once: the probabilities feed the ROC curve, the thresholded
        # values feed the confusion matrix.
        y_proba = model.predict(X_test).ravel()
        y_pred_nn = (y_proba > 0.5).astype("int32")

        # scikit-learn lays the binary matrix out as [[tn, fp], [fn, tp]], so
        # the flattened order is tn, fp, fn, tp. labels=[0, 1] pins that
        # layout even if one class is absent.
        matrix_nn = confusion_matrix(y_test, y_pred_nn, labels=[0, 1])
        tn, fp, fn, tp = matrix_nn.ravel()

        accuracy = ((tn+tp)/(tn+tp+fp+fn)) * 100
        precision = (tp/(tp+fp)) * 100
        recall = (tp/(tp+fn)) * 100
        specificity = (tn/(tn+fp)) * 100
        # Local name f1 keeps the imported sklearn f1_score function usable.
        f1 = 2 * (recall * precision) / (recall + precision)
        error = 100 - accuracy

        #AUC
        fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_test, y_proba)
        auc_keras = auc(fpr_keras, tpr_keras) * 100

        List_Time.append(training_time)
        List_Accuracy.append(accuracy)
        List_Precission.append(precision)
        List_Recall.append(recall)
        List_Specificity.append(specificity)
        List_F1_Score.append(f1)
        List_Error.append(error)
        List_AUC.append(auc_keras)

    print(List_Time)
    print(List_Accuracy)
    print(List_Precission)
    print(List_Recall)
    print(List_Specificity)
    print(List_F1_Score)
    print(List_Error)
    print(List_AUC)

In [None]:
# TuningBatchSize(16)

In [None]:
# TuningBatchSize(32)

In [None]:
# TuningBatchSize(64)

In [None]:
# TuningBatchSize(128)

In [None]:
# TuningBatchSize(256)

# MANUAL TUNING LEARNING RATE

In [None]:
#train the model

def TuningLearningRate(numLearningRate):
    """Train and evaluate the network five times with the given learning rate.

    Every run builds a fresh model (hidden layers of 512, 256 and 128 ReLU
    units, one sigmoid output), trains it on the global ``X_train`` /
    ``y_train`` with Adam, batch size 128 and at most 100 epochs with early
    stopping on the validation loss, then scores it on the global ``X_test`` /
    ``y_test``.

    Args:
        numLearningRate: learning rate passed to the Adam optimizer.

    Prints eight lists with one entry per run, in this order: training time in
    seconds, accuracy, precision, recall, specificity, F1 score, error and AUC
    (all but the time as percentages). Label 1 is the positive class.
    Returns nothing.
    """
    # import library for count training time
    import time

    List_Time = []
    List_Accuracy = []
    List_Precission = []
    List_Recall = []
    List_Specificity = []
    List_F1_Score = []
    List_Error = []
    List_AUC = []

    # Size the input layer from the training data instead of hard-coding it.
    n_features = X_train.shape[1]

    for _ in range(5):
        start_time = time.time()

        model = Sequential()
        model.add(Dense(512, activation='relu', input_dim=n_features))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(128, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))

        opt = tf.keras.optimizers.Adam(learning_rate = numLearningRate) #optimizer
        model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy']) #metrics

        earlystopper = EarlyStopping(monitor='val_loss', min_delta=0, patience=15, verbose=1, mode='auto', baseline=None, restore_best_weights=False)

        model.fit(X_train.values, y_train.values, epochs = 100, batch_size = 128, validation_split = 0.20, verbose = 0, callbacks = [earlystopper])

        # The measured time covers model construction and training.
        training_time = time.time() - start_time

        # Predict once: the probabilities feed the ROC curve, the thresholded
        # values feed the confusion matrix.
        y_proba = model.predict(X_test).ravel()
        y_pred_nn = (y_proba > 0.5).astype("int32")

        # scikit-learn lays the binary matrix out as [[tn, fp], [fn, tp]], so
        # the flattened order is tn, fp, fn, tp. labels=[0, 1] pins that
        # layout even if one class is absent.
        matrix_nn = confusion_matrix(y_test, y_pred_nn, labels=[0, 1])
        tn, fp, fn, tp = matrix_nn.ravel()

        accuracy = ((tn+tp)/(tn+tp+fp+fn)) * 100
        precision = (tp/(tp+fp)) * 100
        recall = (tp/(tp+fn)) * 100
        specificity = (tn/(tn+fp)) * 100
        # Local name f1 keeps the imported sklearn f1_score function usable.
        f1 = 2 * (recall * precision) / (recall + precision)
        error = 100 - accuracy

        #AUC
        fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_test, y_proba)
        auc_keras = auc(fpr_keras, tpr_keras) * 100

        List_Time.append(training_time)
        List_Accuracy.append(accuracy)
        List_Precission.append(precision)
        List_Recall.append(recall)
        List_Specificity.append(specificity)
        List_F1_Score.append(f1)
        List_Error.append(error)
        List_AUC.append(auc_keras)

    print(List_Time)
    print(List_Accuracy)
    print(List_Precission)
    print(List_Recall)
    print(List_Specificity)
    print(List_F1_Score)
    print(List_Error)
    print(List_AUC)

In [None]:
# TuningLearningRate(0.1)

In [None]:
# TuningLearningRate(0.01)

In [None]:
# TuningLearningRate(0.001)

In [None]:
# TuningLearningRate(0.0001)

In [None]:
# TuningLearningRate(0.00001)

 # FINAL TEST

In [None]:
#train the model

def BuildModel():
    """Train and evaluate the final network ten times and print the scores.

    Every run builds a fresh model (hidden layers of 512, 256 and 128 ReLU
    units, one sigmoid output), trains it on the global ``X_train`` /
    ``y_train`` with Adam (learning rate 0.0001), batch size 128 and at most
    100 epochs with early stopping on the validation loss, then scores it on
    the global ``X_test`` / ``y_test``.

    Prints eight lists with one entry per run, in this order: training time in
    seconds, accuracy, precision, recall, specificity, F1 score, error and AUC
    (all but the time as percentages). Label 1 is the positive class.
    Returns nothing.
    """
    # import library for count training time
    import time

    # BEST PARAMETER
    learningRate = 0.0001
    batchSize = 128

    List_Time = []
    List_Accuracy = []
    List_Precission = []
    List_Recall = []
    List_Specificity = []
    List_F1_Score = []
    List_Error = []
    List_AUC = []

    # Size the input layer from the training data instead of hard-coding it.
    n_features = X_train.shape[1]

    for _ in range(10):
        start_time = time.time()

        model = Sequential()
        model.add(Dense(512, activation='relu', input_dim=n_features))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(128, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))

        opt = tf.keras.optimizers.Adam(learning_rate = learningRate) #optimizer
        model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy']) #metrics

        earlystopper = EarlyStopping(monitor = 'val_loss', min_delta = 0, patience = 15, verbose = 1, mode = 'auto', baseline = None, restore_best_weights=False)

        model.fit(X_train.values, y_train.values, epochs = 100, batch_size = batchSize, validation_split = 0.20, verbose = 0, callbacks = [earlystopper])

        # The measured time covers model construction and training.
        training_time = time.time() - start_time

        # Predict once: the probabilities feed the ROC curve, the thresholded
        # values feed the confusion matrix.
        y_proba = model.predict(X_test).ravel()
        y_pred_nn = (y_proba > 0.5).astype("int32")

        # scikit-learn lays the binary matrix out as [[tn, fp], [fn, tp]], so
        # the flattened order is tn, fp, fn, tp. labels=[0, 1] pins that
        # layout even if one class is absent.
        matrix_nn = confusion_matrix(y_test, y_pred_nn, labels=[0, 1])
        tn, fp, fn, tp = matrix_nn.ravel()

        accuracy = ((tn+tp)/(tn+tp+fp+fn)) * 100
        precision = (tp/(tp+fp)) * 100
        recall = (tp/(tp+fn)) * 100
        specificity = (tn/(tn+fp)) * 100
        # Local name f1 keeps the imported sklearn f1_score function usable.
        f1 = 2 * (recall * precision) / (recall + precision)
        error = 100 - accuracy

        #AUC
        fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_test, y_proba)
        auc_keras = auc(fpr_keras, tpr_keras) * 100

        List_Time.append(training_time)
        List_Accuracy.append(accuracy)
        List_Precission.append(precision)
        List_Recall.append(recall)
        List_Specificity.append(specificity)
        List_F1_Score.append(f1)
        List_Error.append(error)
        List_AUC.append(auc_keras)

    print(List_Time)
    print(List_Accuracy)
    print(List_Precission)
    print(List_Recall)
    print(List_Specificity)
    print(List_F1_Score)
    print(List_Error)
    print(List_AUC)

In [None]:
# Run the final training and evaluation and print the per-run scores.
BuildModel()