In [1]:
import pandas as pd
import numpy as np
import torch 
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm import tqdm
import time

# Location of the NGIDS host-log CSV.
NGIDS_path = './dataset/NGIDS_host_log_1-99.csv'

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of failing
# at the first `.to(device)` call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

batch_size = 256          # mini-batch size used by every DataLoader in this notebook
slide_window_size = 50    # passed to NGIDS_dataset.NGIDS_get when building the splits
learning_rate = 0.001     # forwarded to models.run
max_epochs = 150          # forwarded to models.run
hidden_size = 8           # second constructor argument of GRU_AutoEncoder
hhidden_size = 6          # third constructor argument of GRU_AutoEncoder
num_layers = 1            # fourth constructor argument of GRU_AutoEncoder
early_stop = 20           # forwarded to models.run; presumably the early-stopping patience - confirm

In [2]:
import NGIDS_dataset

# Fetch the six arrays returned by NGIDS_get and unpack them into the
# train / validation / test feature-label pairs.
splits = NGIDS_dataset.NGIDS_get(slide_window_size)
X_train, y_train, X_vali, y_vali, X_test, y_test = splits

# Sanity check: print the number of samples and labels in each split.
for features, labels in ((X_train, y_train), (X_vali, y_vali), (X_test, y_test)):
    print(len(features), len(labels))


KeyboardInterrupt: 

In [None]:
import gensim

def save_path(vector_size, window, data_name="NGIDS_path_w2v"):
    """Return the file location of a saved embedding model.

    The file name encodes the two hyper-parameters followed by `data_name`,
    and always lives under ./dataset/PathSystem/.
    """
    directory = "./dataset/PathSystem/"
    prefix = f"vectorsize{vector_size}_window{window}_"
    return directory + prefix + data_name

def save_sys(vector_size, window, data_name = "NGIDS_vector"):
    """Return the file location of a saved embedding model.

    Same layout as the sibling path helper, but with a different default
    `data_name`: ./dataset/PathSystem/vectorsize<V>_window<W>_<data_name>.
    """
    directory = "./dataset/PathSystem/"
    prefix = f"vectorsize{vector_size}_window{window}_"
    return directory + prefix + data_name


# Word2Vec hyper-parameters; they select which saved model files are loaded.
vector_size = 10
window = 3
# The auto-encoder input width is set equal to the embedding width.
input_size = vector_size

NGIDS_sys_model = gensim.models.Word2Vec.load(save_sys(vector_size, window))
# Load the path model from its own file. The previous call overrode data_name
# with "NGIDS_vector", which made save_path() return exactly the same file as
# save_sys(), so both variables held the same model.
# NOTE(review): confirm that the "..._NGIDS_path_w2v" file exists on disk.
NGIDS_path_model = gensim.models.Word2Vec.load(save_path(vector_size, window))

In [None]:
# Token -> embedding-row lookups taken from the two Word2Vec vocabularies.
s2i = NGIDS_sys_model.wv.key_to_index
p2i = NGIDS_path_model.wv.key_to_index

# Wrap the training and validation splits in the project's Dataset class.
NGIDS_trainset = NGIDS_dataset.NGIDS_Dataset(X_train, y_train, p2i, s2i)
NGIDS_valiset = NGIDS_dataset.NGIDS_Dataset(X_vali, y_vali, p2i, s2i)

# Both loaders draw shuffled mini-batches of `batch_size` samples.
train_loader = DataLoader(
    NGIDS_trainset,
    batch_size=batch_size,
    shuffle=True,
)
vali_loader = DataLoader(
    NGIDS_valiset,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
%reload_ext autoreload
%autoreload 2

import models
from models import GRU_AutoEncoder

# Embedding matrices of the two Word2Vec models, handed to the auto-encoder.
path_vectors = NGIDS_path_model.wv.vectors
sys_vectors = NGIDS_sys_model.wv.vectors

model = GRU_AutoEncoder(
    input_size,
    hidden_size,
    hhidden_size,
    num_layers,
    path_vectors,
    sys_vectors,
    device,
)
model.to(device)

# models.run returns the model object that is kept from here on.
model = models.run(
    model,
    train_loader,
    vali_loader,
    learning_rate,
    max_epochs,
    early_stop,
)

# Persist the whole model object (not just a state_dict); a later cell
# reloads it with torch.load.
torch.save(model, "positive_trainingAutoEncoder.model")

In [None]:
from sklearn.ensemble import IsolationForest

from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import roc_curve, roc_auc_score
import numpy as np
import torch 
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

In [None]:
def print_result(isfo, X_test, y_test):
    """Evaluate a fitted Isolation Forest on a test split and print the metrics.

    The test samples are pushed through the notebook-level `model`; the last
    time step of its first output is used as the feature vector for `isfo`.
    Accuracy, recall, precision and ROC-AUC are printed.

    Relies on the notebook globals `model`, `p2i`, `s2i`, `slide_window_size`,
    `batch_size` and the `NGIDS_dataset` module.

    Args:
        isfo: an already fitted estimator exposing `predict` and
            `score_samples` (e.g. sklearn's IsolationForest).
        X_test: test inputs, forwarded to `NGIDS_dataset.NGIDS_Dataset`.
        y_test: test labels, forwarded to `NGIDS_dataset.NGIDS_Dataset`;
            assumed to use 1 for anomalous samples, since outlier predictions
            are mapped to 1 below - TODO confirm.

    Raises:
        ValueError: if the test set yields no batches.
    """
    # The class lives in the NGIDS_dataset module; the bare name NGIDS_Dataset
    # is never imported in this notebook.
    # NOTE(review): the train/validation datasets are built without the
    # trailing slide_window_size argument - confirm the constructor signature.
    NGIDS_testset = NGIDS_dataset.NGIDS_Dataset(X_test, y_test, p2i, s2i, slide_window_size)
    # Iterate in mini-batches instead of one batch holding the whole test set,
    # so inference memory stays bounded. Order is irrelevant for the metrics.
    test_loader = DataLoader(NGIDS_testset, batch_size=batch_size, shuffle=False)

    model.eval()
    feature_chunks = []
    label_chunks = []
    with torch.no_grad():  # inference only: do not build autograd graphs
        for batch in test_loader:
            _, label = batch
            output, _ = model(batch)
            # Keep only the last time step of the model's first output.
            feature_chunks.append(output[:, -1, :].detach().cpu().numpy())
            label_chunks.append(np.asarray(label))

    if not feature_chunks:
        raise ValueError("print_result: the test set is empty")

    features = np.concatenate(feature_chunks)
    label = np.concatenate(label_chunks)

    # predict() marks outliers with -1; recode to 1 = anomaly, 0 = normal.
    y_pred = np.where(isfo.predict(features) == -1, 1, 0)
    # score_samples() is lower for more abnormal points, so it is negated
    # before being used as an anomaly score for ROC-AUC.
    y_score_sample = isfo.score_samples(features)

    print("accuracy score :", accuracy_score(label, y_pred))
    print("recall score :", recall_score(label, y_pred))
    print("precision score :", precision_score(label, y_pred))
    print("roc_auc :", roc_auc_score(label, -y_score_sample))

In [None]:
def ISFO_result(model, n_estimators = 100, max_samples="auto", contamination = 'auto', max_features = 1.0):
    """Fit an Isolation Forest on the auto-encoder features of the training set.

    Every training batch is passed through `model`; the last time step of its
    first output is collected, and the forest is fitted once on all collected
    features.

    Relies on the notebook globals `NGIDS_trainset` and `batch_size`.

    Args:
        model: trained network; called as `model(batch)` and expected to
            return a pair whose first element is indexable as [:, -1, :].
        n_estimators, max_samples, contamination, max_features: forwarded to
            the IsolationForest constructor (previously accepted but ignored;
            the defaults here equal sklearn's own defaults).

    Returns:
        The fitted IsolationForest.

    Raises:
        ValueError: if the training set yields no batches.
    """
    isfo = IsolationForest(
        n_estimators=n_estimators,
        max_samples=max_samples,
        contamination=contamination,
        max_features=max_features,
    )

    model.eval()

    # Use the training Dataset object; NGIDS_dataset is the module, not a dataset.
    train_loader = DataLoader(NGIDS_trainset, batch_size=batch_size, shuffle=False)
    train_iterator = tqdm(train_loader, total=len(train_loader), position=0, leave=True, desc="training")

    feature_chunks = []
    with torch.no_grad():  # inference only: do not build autograd graphs
        for batch in train_iterator:
            output, _ = model(batch)
            feature_chunks.append(output[:, -1, :].detach().cpu().numpy())

    if not feature_chunks:
        raise ValueError("ISFO_result: the training set is empty")

    # fit() rebuilds the forest from scratch on every call, so it must be
    # called once on the full feature matrix rather than once per batch.
    isfo.fit(np.concatenate(feature_chunks))

    return isfo


In [None]:
# NOTE: torch.load unpickles the whole model object; only load files you
# trust. Recent PyTorch releases may require weights_only=False to load a
# fully pickled model - confirm against the installed version.
model = torch.load("positive_trainingAutoEncoder.model", map_location=device)

isfo = IsolationForest()

model.eval()

# Use the training Dataset object; NGIDS_dataset is the module, not a dataset.
train_loader = DataLoader(NGIDS_trainset, batch_size=batch_size, shuffle=False)
train_iterator = tqdm(train_loader, total=len(train_loader), position=0, leave=True, desc="training")

train_feature_chunks = []
with torch.no_grad():  # inference only: do not build autograd graphs
    for batch in train_iterator:
        batch_output, _unused = model(batch)
        # Keep only the last time step of the model's first output.
        train_feature_chunks.append(batch_output[:, -1, :].detach().cpu().numpy())

if not train_feature_chunks:
    raise ValueError("the training set is empty; cannot fit the Isolation Forest")

# fit() rebuilds the forest from scratch on every call, so it is called once
# on the full feature matrix rather than once per batch.
isfo.fit(np.concatenate(train_feature_chunks))

In [None]:
# The class lives in the NGIDS_dataset module; the bare name NGIDS_Dataset is
# never imported in this notebook.
# NOTE(review): the train/validation datasets are built without the trailing
# slide_window_size argument - confirm the constructor signature.
NGIDS_testset = NGIDS_dataset.NGIDS_Dataset(X_test, y_test, p2i, s2i, slide_window_size)
# Iterate in mini-batches instead of one batch holding the whole test set, so
# inference memory stays bounded. Order is irrelevant for the metrics.
test_loader = DataLoader(NGIDS_testset, batch_size=batch_size, shuffle=False)

model.eval()
test_feature_chunks = []
test_label_chunks = []
with torch.no_grad():  # inference only: do not build autograd graphs
    for batch in test_loader:
        batch_label = batch[1]
        batch_output, _unused = model(batch)
        # Keep only the last time step of the model's first output.
        test_feature_chunks.append(batch_output[:, -1, :].detach().cpu().numpy())
        test_label_chunks.append(np.asarray(batch_label))

if not test_feature_chunks:
    raise ValueError("the test set is empty; nothing to evaluate")

test_features = np.concatenate(test_feature_chunks)
label = np.concatenate(test_label_chunks)

# predict() marks outliers with -1; recode to 1 = anomaly, 0 = normal.
y_pred = np.where(isfo.predict(test_features) == -1, 1, 0)
# score_samples() is lower for more abnormal points, so it is negated before
# being used as an anomaly score.
y_score_sample = isfo.score_samples(test_features)

fpr, tpr, thresholds = roc_curve(label, -y_score_sample)

print("accuracy score :", accuracy_score(label, y_pred))
print("recall score :", recall_score(label, y_pred))
print("precision score :", precision_score(label, y_pred))
print("roc_auc :", roc_auc_score(label, -y_score_sample))

plt.plot(fpr, tpr, color="navy", linestyle="--")
plt.title("ROC curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()