PACKAGES

In [1]:
from deepod.models.tabular import DevNet, PReNet, DeepSAD, FeaWAD, RoSAS
from deepod.metrics import tabular_metrics
from autoencodernn import *
from tapnet import *

In [2]:
import pickle
import os
from datetime import datetime

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.metrics import geometric_mean_score

DATASET

In [5]:
# Pick the preprocessed files first and read them in sorted order.
# os.listdir returns entries in arbitrary order, so without sorting the row
# order of `df` can differ between runs and machines. Filtering before the
# progress bar also makes tqdm count only the files that are actually read.
preprocessed_files = sorted(name for name in os.listdir('./data/') if 'preprocessed' in name)

dfs = []
for filename in tqdm(preprocessed_files):
    dfs.append(pd.read_csv(f'./data/{filename}', index_col = 0))
df = pd.concat(dfs).reset_index(drop = True)
del(dfs)

 77%|███████▋  | 27/35 [00:03<00:00,  8.41it/s]


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Fixed seed so the stratified split is identical on every run; without it
# each kernel restart produces different train/valid/test rows.
SPLIT_SEED = 42

# 20% test, then 12.5% of the remaining 80% (= 10% overall) for validation.
train_valid_df, test_df = train_test_split(df, test_size = 0.2, stratify = df['label'], random_state = SPLIT_SEED)
train_df, valid_df = train_test_split(train_valid_df, test_size = 0.125, stratify = train_valid_df['label'], random_state = SPLIT_SEED)

In [None]:
# Separate the feature columns from the 'label' target for each split.
X_train, y_train = train_df.drop(columns = ['label']), train_df['label']
X_valid, y_valid = valid_df.drop(columns = ['label']), valid_df['label']
X_test, y_test = test_df.drop(columns = ['label']), test_df['label']

In [None]:
# train_df = pd.read_csv('./checkpoint/train_df.csv', index_col = 0)
# valid_df = pd.read_csv('./checkpoint/valid_df.csv', index_col = 0)
# test_df = pd.read_csv('./checkpoint/test_df.csv', index_col = 0)

In [None]:
# X_train = pd.read_csv('./checkpoint/x_train.csv')
# y_train = pd.read_csv('./checkpoint/y_train.csv')

# X_valid = pd.read_csv('./checkpoint/x_valid.csv')
# y_valid = pd.read_csv('./checkpoint/y_valid.csv')

# X_test = pd.read_csv('./checkpoint/x_test.csv')
# y_test = pd.read_csv('./checkpoint/y_test.csv')

MODEL

In [None]:
# Collects every fitted model; the training cells below append to it.
models = []

In [None]:
def anomaly_preprocessing(df1, df2, df3):
    """Build the feature frames used by the anomaly-detection models.

    The three splits are concatenated so that one-hot encoding sees every
    category and all outputs share the same columns, then split back apart
    by row position.

    Unlike the previous version, the input frames are NOT modified: no
    helper 'type' column is written into the caller's data.

    Parameters
    ----------
    df1, df2, df3 : pandas.DataFrame
        Train, validation and test splits. Each must contain the columns
        'country', 'region_risk_grade', 'city_risk_grade', 'name_risk_grade',
        'browser_name', 'os_name', 'browser_is_legacy', 'os_is_legacy',
        'device_type', 'rtt' and 'label'.

    Returns
    -------
    tuple of pandas.DataFrame
        (train, valid, test) frames holding the one-hot encoded
        'country', 'browser_name', 'os_name' and 'device_type' columns, the
        risk-grade and legacy columns, 'rtt' and 'label'. The index is the
        row position in the concatenated frame (train rows first).
    """
    n_train, n_valid = len(df1), len(df2)
    combined = pd.concat([df1, df2, df3]).reset_index(drop = True)

    def one_hot(column):
        # Integer 0/1 indicator columns, one per distinct value of `column`.
        return pd.get_dummies(combined[column]).astype(int)

    features = pd.concat(
        [
            one_hot('country'),
            combined[['region_risk_grade', 'city_risk_grade', 'name_risk_grade']],
            one_hot('browser_name'),
            one_hot('os_name'),
            combined[['browser_is_legacy', 'os_is_legacy']],
            one_hot('device_type'),
            combined['rtt'],
            combined['label'],
        ],
        axis = 1,
    )

    # Rows keep their concatenation order, so positional slices recover the
    # splits without tagging them with a marker column. Copies are returned
    # so callers can modify the results independently.
    train_part = features.iloc[:n_train].copy()
    valid_part = features.iloc[n_train:n_train + n_valid].copy()
    test_part = features.iloc[n_train + n_valid:].copy()

    return train_part, valid_part, test_part

In [None]:
# Anomaly Detection
print('Anomaly Detection Model')

# Each name is stored next to its model. With two separate lists, commenting
# out a model shifted the pairing, so DeepSAD was labelled 'DevNet'.
ad_candidates = [
    # ('DevNet', DevNet()),           # 86.5s
    # ('PReNet', PReNet()),           # Too long. (2.4 hours)
    ('DeepSAD', DeepSAD(epochs = 50)),          # 75.4s
    # ('FeaWAD', FeaWAD(epochs = 10000, lr = 0.01)),           # Very fast but poor.
    # ('RoSAS', RoSAS()),            # Too long.
]
# Kept for backward compatibility with code that expects the two lists.
ad_model_names = [name for name, _ in ad_candidates]
ad_models = [candidate for _, candidate in ad_candidates]

train_df_ad, valid_df_ad, test_df_ad = anomaly_preprocessing(train_df, valid_df, test_df)

X_train_ad = train_df_ad.drop(['label'], axis = 1)
y_train_ad = train_df_ad['label']

X_valid_ad = valid_df_ad.drop(['label'], axis = 1)
y_valid_ad = valid_df_ad['label']

X_test_ad = test_df_ad.drop(['label'], axis = 1)
y_test_ad = test_df_ad['label']

trained_ad_models = {}
for model_name, model in ad_candidates:
    print('start -', datetime.now())

    model.fit(X_train_ad.to_numpy(), y_train_ad.to_numpy())
    print('Train Finish')

    # Pass the raw anomaly scores: AUC and AP are ranking metrics, and
    # thresholding at 0.5 first collapses the ranking to two values.
    # NOTE(review): deepod's tabular_metrics is documented as taking
    # (y_true, y_score) - confirm against the installed version.
    score_train = model.decision_function(X_train_ad.to_numpy())
    auc_train, ap_train, f1_train = tabular_metrics(y_train_ad.to_numpy(), score_train)

    score_valid = model.decision_function(X_valid_ad.to_numpy())
    auc_valid, ap_valid, f1_valid = tabular_metrics(y_valid_ad.to_numpy(), score_valid)

    print(f'Trained with {model}')
    print(f'Train - AUC: {auc_train}, AP: {ap_train}, F1: {f1_train}')
    print(f'Valid - AUC: {auc_valid}, AP: {ap_valid}, F1: {f1_valid}')

    models.append(model)
    trained_ad_models[model_name] = model
    print('end -', datetime.now(), '\n')

# The evaluation cells below read `deepsad_model`; define it here so a fresh
# Restart & Run All works without loading the pickled checkpoint.
if 'DeepSAD' in trained_ad_models:
    deepsad_model = trained_ad_models['DeepSAD']

In [None]:
# # NOTE: For faster execution, use this cell and comment the upper one.
# with open('./checkpoint/deepsad.pkl', 'rb') as f:
#     deepsad_model = pickle.load(f)[0]

In [None]:
# AutoEncoder + NN
print('AutoEncoder + NN Model')

# Settings gathered in one place so they can be read and tuned together.
autoencoder_nn_config = {
    'train_data': train_df,          # Train set
    'valid_data': valid_df,          # Validation set
    'test_data': test_df,            # Test set
    'criteria': 0.5,                 # Classification threshold
    'split_ratio': [7, 1, 2],        # split ratio (format: [train,validation,test])
    'autoencoder_epochs': 50,        # epochs of autoencoder
    'classifier_epochs': 200,        # epochs of classifier
    'weight_for_attack': 15,         # weight for attack
}
autoencoder_nn_model = model_v2(**autoencoder_nn_config)

models.append(autoencoder_nn_model)

In [None]:
# # NOTE: For faster execution, use this cell and comment the upper one.
# with open('./checkpoint/autoencoder_nn.pkl', 'rb') as f:
#     autoencoder_nn_model = pickle.load(f)[0]

In [None]:
# TabNet
print('TabNet Model')

# Columns handed to TabNetModel; 'label' is passed as the target column name.
selected_columns = [
    'country_code', 'region', 'city_risk_grade', 'name_risk_grade', 'login_success',
    'browser_is_legacy', 'os_is_legacy', 'rtt', 'device_type', 'label',
]
categorical_columns = ['country_code', 'device_type', 'region']

tabnet_model = TabNetModel(
    train_df, valid_df, test_df,
    selected_columns, categorical_columns, 'label',
)         #, pre_train_epochs = 5, epochs = 5)
models.append(tabnet_model)

In [None]:
# # NOTE: For faster execution, use this cell and comment the upper one.
# with open('./checkpoint/tabnet.pkl', 'rb') as f:
#     tabnet_model = pickle.load(f)[0]

In [None]:
anomaly_pred = (deepsad_model.decision_function(X_test_ad.to_numpy()) > 0.5).astype(int)

In [None]:
print(classification_report(y_test.to_numpy(), anomaly_pred))

In [None]:
print(accuracy_score(y_test.to_numpy(), anomaly_pred))
print(precision_score(y_test.to_numpy(), anomaly_pred))
print(recall_score(y_test.to_numpy(), anomaly_pred))
print(f1_score(y_test.to_numpy(), anomaly_pred))
print(geometric_mean_score(y_test.to_numpy(), anomaly_pred))

In [None]:
an_df = autoencoder_nn_model.predicted_df
autonn_pred = an_df['Predicted Label'].to_numpy().astype(int)
print(classification_report(an_df['Actual Label'], an_df['Predicted Label']))

In [None]:
print(accuracy_score(an_df['Actual Label'].to_numpy(), autonn_pred))
print(precision_score(an_df['Actual Label'].to_numpy(), autonn_pred))
print(recall_score(an_df['Actual Label'].to_numpy(), autonn_pred))
print(f1_score(an_df['Actual Label'].to_numpy(), autonn_pred))
print(geometric_mean_score(an_df['Actual Label'].to_numpy(), autonn_pred))

In [None]:
tabnet_soft = tabnet_model.clf.predict_proba(tabnet_model.X_test.values)[:, 1]
tabnet_pred = tabnet_soft > 0.5
print(classification_report(tabnet_model.y_test, tabnet_pred))

In [None]:
print(classification_report(tabnet_model.y_test, tabnet_pred))

In [None]:
print(accuracy_score(y_test.to_numpy(), tabnet_pred))
print(precision_score(y_test.to_numpy(), tabnet_pred))
print(recall_score(y_test.to_numpy(), tabnet_pred))
print(f1_score(y_test.to_numpy(), tabnet_pred))
print(geometric_mean_score(y_test.to_numpy(), tabnet_pred))

In [None]:
# Row-normalised confusion matrices for the three models, drawn with the same styling.
cm_style = dict(normalize = 'true', cmap = plt.cm.Blues, text_kw = {'fontsize': 'x-large'})

disp1 = ConfusionMatrixDisplay.from_predictions(y_test.to_numpy(), anomaly_pred, **cm_style)
disp2 = ConfusionMatrixDisplay.from_predictions(
    an_df['Actual Label'].astype(int),
    an_df['Predicted Label'].astype(int),
    **cm_style,
)
disp3 = ConfusionMatrixDisplay.from_predictions(tabnet_model.y_test, tabnet_pred, **cm_style)

In [None]:
# Majority Voting
majority_pred = (np.stack([anomaly_pred, autonn_pred, tabnet_pred]).mean(axis = 0) > 0.5).astype(int)
majority_disp = ConfusionMatrixDisplay.from_predictions(y_test.to_numpy(), majority_pred, normalize = 'true', cmap = plt.cm.Blues, text_kw = {'fontsize': 'x-large'})

In [None]:
print(accuracy_score(y_test.to_numpy(), majority_pred))
print(precision_score(y_test.to_numpy(), majority_pred))
print(recall_score(y_test.to_numpy(), majority_pred))
print(f1_score(y_test.to_numpy(), majority_pred))
print(geometric_mean_score(y_test.to_numpy(), majority_pred))

In [None]:
# Or Voting
or_pred = (np.stack([anomaly_pred, autonn_pred, tabnet_pred]).sum(axis = 0) != 0).astype(int)
or_disp = ConfusionMatrixDisplay.from_predictions(y_test.to_numpy(), or_pred, normalize = 'true', cmap = plt.cm.Blues, text_kw = {'fontsize': 'x-large'})

In [None]:
print(accuracy_score(y_test.to_numpy(), or_pred))
print(precision_score(y_test.to_numpy(), or_pred))
print(recall_score(y_test.to_numpy(), or_pred))
print(f1_score(y_test.to_numpy(), or_pred))
print(geometric_mean_score(y_test.to_numpy(), or_pred))

In [None]:
# Soft Voting
# Continuous per-row outputs of each model on the test split, used by the
# soft vote and the threshold search below.
deepsad_soft = deepsad_model.decision_function(X_test_ad.to_numpy())
autonn_soft = an_df['Probability'].to_numpy()
# tabnet_soft was already computed in the TabNet evaluation cell above:
# tabnet_soft = tabnet_model.clf.predict_proba(tabnet_model.X_test.values)[:, 1]
tabnet_soft

In [None]:
# Save each model's soft scores to its own checkpoint file.
soft_checkpoints = (
    ('./checkpoint/deepsad_soft.pkl', deepsad_soft),
    ('./checkpoint/autonn_soft.pkl', autonn_soft),
    ('./checkpoint/tabnet_soft.pkl', tabnet_soft),
)
for checkpoint_path, soft_scores in soft_checkpoints:
    with open(checkpoint_path, 'wb') as f:
        pickle.dump(soft_scores, f)

In [None]:
soft_pred = deepsad_soft.astype(float) + autonn_soft.astype(float) + tabnet_soft.astype(float) > 1.5
soft_disp = ConfusionMatrixDisplay.from_predictions(y_test.to_numpy(), soft_pred, normalize = 'true', cmap = plt.cm.Blues, text_kw = {'fontsize': 'x-large'})

In [None]:
print(accuracy_score(y_test.to_numpy(), soft_pred))
print(precision_score(y_test.to_numpy(), soft_pred))
print(recall_score(y_test.to_numpy(), soft_pred))
print(f1_score(y_test.to_numpy(), soft_pred))
print(geometric_mean_score(y_test.to_numpy(), soft_pred))

In [None]:
def get_threshold(y_true, y_pred, target_tpr):
    """Return the ROC threshold whose true-positive rate is closest to `target_tpr`.

    Parameters
    ----------
    y_true : labels passed to `roc_curve` as the ground truth.
    y_pred : scores passed to `roc_curve`.
    target_tpr : desired true-positive rate.

    Returns
    -------
    The entry of roc_curve's threshold array at the position where
    |tpr - target_tpr| is smallest (the first such position on ties).
    """
    _, tpr_values, cutoffs = roc_curve(y_true, y_pred)
    closest = np.abs(tpr_values - target_tpr).argmin()
    return cutoffs[closest]

In [None]:
# DeepSAD score cut-offs for target TPRs of 0.999, 0.995, 0.99, 0.98 and 0.97.
(
    deepsad_threshold_09990,
    deepsad_threshold_09950,
    deepsad_threshold_09900,
    deepsad_threshold_09800,
    deepsad_threshold_09700,
) = [
    get_threshold(y_test.to_numpy(), deepsad_soft, target)
    for target in (0.9990, 0.9950, 0.9900, 0.9800, 0.9700)
]

In [None]:
# Calibrate against the ground-truth labels. The previous version passed
# 'Predicted Label' (the model's own thresholded output) as y_true, so the
# ROC described agreement with itself rather than with reality, unlike the
# DeepSAD and TabNet thresholds, which use the true test labels.
autonn_y_true = an_df['Actual Label'].to_numpy().astype(int)

autonn_threshold_09990 = get_threshold(autonn_y_true, autonn_soft, 0.9990)
autonn_threshold_09950 = get_threshold(autonn_y_true, autonn_soft, 0.9950)
autonn_threshold_09900 = get_threshold(autonn_y_true, autonn_soft, 0.9900)
autonn_threshold_09800 = get_threshold(autonn_y_true, autonn_soft, 0.9800)
autonn_threshold_09700 = get_threshold(autonn_y_true, autonn_soft, 0.9700)

In [None]:
# TabNet score cut-offs for target TPRs of 0.999, 0.995, 0.99, 0.98 and 0.97.
(
    tabnet_threshold_09990,
    tabnet_threshold_09950,
    tabnet_threshold_09900,
    tabnet_threshold_09800,
    tabnet_threshold_09700,
) = [
    get_threshold(tabnet_model.y_test.to_numpy(), tabnet_soft, target)
    for target in (0.9990, 0.9950, 0.9900, 0.9800, 0.9700)
]

In [None]:
def get_recall(y_true, y_pred, threshold):
    """Return the recall (true-positive rate) of `y_pred > threshold`.

    Recall is TP / (TP + FN): the fraction of actual positives that are
    flagged. The previous version divided by (FP + TP), which is precision.

    Parameters
    ----------
    y_true : array-like of binary labels, where 1 marks the positive class.
    y_pred : array-like of scores; a row is predicted positive when its
        score is strictly greater than `threshold`.
    threshold : score cut-off.

    Returns
    -------
    float
        Recall in [0, 1], or NaN when `y_true` contains no positive rows.
    """
    actual_positive = np.asarray(y_true) == 1
    predicted_positive = np.asarray(y_pred) > threshold

    n_positive = np.count_nonzero(actual_positive)
    if n_positive == 0:
        # Recall is undefined without positives; avoid a division by zero.
        return float('nan')

    true_positive = np.count_nonzero(actual_positive & predicted_positive)
    return float(true_positive / n_positive)

In [None]:
# Hard 0/1 labels from each model's score using its 0.99-TPR threshold.
deepsad_hard, autonn_hard, tabnet_hard = [
    (scores > cutoff).astype(int)
    for scores, cutoff in (
        (deepsad_soft, deepsad_threshold_09900),
        (autonn_soft, autonn_threshold_09900),
        (tabnet_soft, tabnet_threshold_09900),
    )
]

In [None]:
deepsad_hard = (deepsad_soft > deepsad_threshold_09990).astype(int)
autonn_hard = (autonn_soft > autonn_threshold_09990).astype(int)
tabnet_hard = (tabnet_soft > tabnet_threshold_09990).astype(int)
or_pred_09990 = (np.stack([deepsad_hard, autonn_hard, tabnet_hard]).sum(axis = 0) != 0).astype(int)
ConfusionMatrixDisplay.from_predictions(y_test.to_numpy(), or_pred_09990, normalize = 'true', cmap = plt.cm.Blues, text_kw = {'fontsize': 'x-large'}, values_format = '.5f')

In [None]:
deepsad_hard = (deepsad_soft > deepsad_threshold_09950).astype(int)
autonn_hard = (autonn_soft > autonn_threshold_09950).astype(int)
tabnet_hard = (tabnet_soft > tabnet_threshold_09950).astype(int)
or_pred_09950 = (np.stack([deepsad_hard, autonn_hard, tabnet_hard]).sum(axis = 0) != 0).astype(int)
ConfusionMatrixDisplay.from_predictions(y_test.to_numpy(), or_pred_09950, normalize = 'true', cmap = plt.cm.Blues, text_kw = {'fontsize': 'x-large'}, values_format = '.5f')

In [None]:
deepsad_hard = (deepsad_soft > deepsad_threshold_09900).astype(int)
autonn_hard = (autonn_soft > autonn_threshold_09900).astype(int)
tabnet_hard = (tabnet_soft > tabnet_threshold_09900).astype(int)
or_pred_09900 = (np.stack([deepsad_hard, autonn_hard, tabnet_hard]).sum(axis = 0) != 0).astype(int)
ConfusionMatrixDisplay.from_predictions(y_test.to_numpy(), or_pred_09900, normalize = 'true', cmap = plt.cm.Blues, text_kw = {'fontsize': 'x-large'}, values_format = '.5f')

In [None]:
deepsad_hard = (deepsad_soft > deepsad_threshold_09800).astype(int)
autonn_hard = (autonn_soft > autonn_threshold_09800).astype(int)
tabnet_hard = (tabnet_soft > tabnet_threshold_09800).astype(int)
or_pred_09800 = (np.stack([deepsad_hard, autonn_hard, tabnet_hard]).sum(axis = 0) != 0).astype(int)
ConfusionMatrixDisplay.from_predictions(y_test.to_numpy(), or_pred_09800, normalize = 'true', cmap = plt.cm.Blues, text_kw = {'fontsize': 'x-large'}, values_format = '.5f')

In [None]:
deepsad_hard = (deepsad_soft > deepsad_threshold_09700).astype(int)
autonn_hard = (autonn_soft > autonn_threshold_09700).astype(int)
tabnet_hard = (tabnet_soft > tabnet_threshold_09700).astype(int)
or_pred_09700 = (np.stack([deepsad_hard, autonn_hard, tabnet_hard]).sum(axis = 0) != 0).astype(int)
ConfusionMatrixDisplay.from_predictions(y_test.to_numpy(), or_pred_09700, normalize = 'true', cmap = plt.cm.Blues, text_kw = {'fontsize': 'x-large'}, values_format = '.5f')

In [None]:
# Display the DeepSAD scores computed by decision_function on the test split.
deepsad_soft

In [None]:
# Sweep 100 target TPRs from 0.999 down to 0.90. For each target, find every
# model's threshold, build the hard predictions and the three ensemble votes,
# and record the false-positive rate of each.
x = np.linspace(0.999, 0.90, 100)
deepsad_ratio = []
autonn_ratio = []
tabnet_ratio = []
majority_ratio = []
or_ratio = []
soft_ratio = []


def false_positive_rate(y_true, y_pred):
    """Return FP / (TN + FP), read from the first row of the confusion matrix."""
    matrix = confusion_matrix(y_true, y_pred)
    return matrix[0][1] / (matrix[0][0] + matrix[0][1])


y_test_np = y_test.to_numpy()
# Ground-truth labels for threshold calibration. The previous version passed
# 'Predicted Label' (the AutoEncoder + NN model's own output) as y_true, so
# its threshold was not tied to the real TPR like the other two models'.
autonn_y_true = an_df['Actual Label'].to_numpy().astype(int)
tabnet_y_true = tabnet_model.y_test.to_numpy()

for target_tpr in tqdm(x):
    deepsad_threshold = get_threshold(y_test_np, deepsad_soft, target_tpr)
    autonn_threshold = get_threshold(autonn_y_true, autonn_soft, target_tpr)
    tabnet_threshold = get_threshold(tabnet_y_true, tabnet_soft, target_tpr)

    deepsad_hard = (deepsad_soft > deepsad_threshold).astype(int)
    autonn_hard = (autonn_soft > autonn_threshold).astype(int)
    tabnet_hard = (tabnet_soft > tabnet_threshold).astype(int)

    # Loop-specific names are used for the votes so that the earlier
    # majority_pred / or_pred / soft_pred results are not overwritten.
    hard_votes = np.stack([deepsad_hard, autonn_hard, tabnet_hard]).sum(axis = 0)
    majority_vote = (hard_votes > 1.5).astype(int)     # at least two of three
    or_vote = (hard_votes != 0).astype(int)            # at least one of three
    # Summed scores compared against the summed per-model thresholds.
    soft_vote = (
        np.stack([deepsad_soft, autonn_soft, tabnet_soft]).sum(axis = 0)
        > deepsad_threshold + autonn_threshold + tabnet_threshold
    ).astype(int)

    deepsad_ratio.append(false_positive_rate(y_test_np, deepsad_hard))
    autonn_ratio.append(false_positive_rate(y_test_np, autonn_hard))
    tabnet_ratio.append(false_positive_rate(y_test_np, tabnet_hard))
    majority_ratio.append(false_positive_rate(y_test_np, majority_vote))
    or_ratio.append(false_positive_rate(y_test_np, or_vote))
    soft_ratio.append(false_positive_rate(y_test_np, soft_vote))

In [None]:
# FPR of each ensemble vote against the target TPR (x axis inverted).
tpr_axis = np.linspace(0.999, 0.90, 100)

# plt.plot(tpr_axis, deepsad_ratio, label = 'DeepSAD')
# plt.plot(tpr_axis, autonn_ratio, label = 'Autoencoder + MLP')
# plt.plot(tpr_axis, tabnet_ratio, label = 'TabNet')
for vote_ratio, vote_label in (
    (majority_ratio, 'Majority Vote'),
    (or_ratio, 'OR Vote'),
    (soft_ratio, 'Soft Vote'),
):
    plt.plot(tpr_axis, vote_ratio, label = vote_label)

plt.xlabel('TPR')
plt.ylabel('FPR')
plt.gca().invert_xaxis()
plt.legend()

plt.show()

In [None]:
prohibit_ratio = soft_ratio
danger_ratio = [x - y for x, y in zip(majority_ratio, soft_ratio)]
caution_ratio = [x - y for x, y in zip(or_ratio, majority_ratio)]
normal_ratio = [1 - x for x in or_ratio]

In [None]:
elapsed_time = [n * 1 + c * 3 + d * 5 + p * 10 for n, c, d, p in zip(normal_ratio, caution_ratio, danger_ratio, prohibit_ratio)]

In [None]:
# Expected elapsed time against the target TPR (x axis inverted).
elapsed_tpr_axis = np.linspace(0.999, 0.90, 100)
plt.plot(elapsed_tpr_axis, elapsed_time, label = 'elapsed time (s)')

plt.xlabel('TPR')
plt.ylabel('expected elapsed time (s)')
plt.gca().invert_xaxis()
plt.legend()

plt.show()

In [None]:
# Locate the position of TPR 0.999 in the sweep grid.
# NOTE(review): this is an exact float comparison; it relies on linspace
# returning its start value unchanged - np.isclose would be more robust.
np.where(np.linspace(0.999, 0.90, 100) == 0.9990)

In [None]:
# Expected elapsed time at the first sweep point.
elapsed_time[0]

In [None]:
# 'Prohibit' tier ratio (soft-vote FPR) at the first sweep point.
prohibit_ratio[0]