In [2]:
import pandas as pd
import numpy as np
import torch 
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm import tqdm


# Location of the NGIDS host-log CSV, relative to the notebook's working directory.
NGIDS_path = './dataset/NGIDS_host_log_1-99.csv'

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data-loading hyper-parameters.
batch_size = 1024          # sliding windows per mini-batch
slide_window_size = 30     # consecutive records per sliding window

# Optimisation hyper-parameters.
learning_rate = 0.001
max_epochs = 100

# Model hyper-parameters.
input_size = 10            # LSTM input width; the model reshapes the embedded batch to this width
hidden_size = 50
num_layers = 2


In [3]:
# Read the raw host log, then keep only the rows in which every column used
# downstream (path, system call, label) is present.
NGIDS = pd.read_csv(NGIDS_path)
dropna_NGIDS = NGIDS.dropna(subset=['path', 'sys_call', 'label'])

# Pull the three columns out as NumPy arrays (built from plain Python lists).
path, syscall, label = (
    np.array(dropna_NGIDS[column].to_list())
    for column in ('path', 'sys_call', 'label')
)

def data_split(data, label, shuffle=False):
    """Split ``data`` / ``label`` into train, validation and test parts.

    40 % of the records are held out first and that hold-out is then halved,
    giving roughly a 60 / 20 / 20 split.

    Args:
        data: sequence of records.
        label: sequence of labels aligned with ``data``.
        shuffle: whether to shuffle the records before splitting. Defaults to
            ``False`` because the sliding-window dataset in this notebook
            slices *consecutive* records; shuffling would turn every window
            into an arbitrary mix of unrelated events. Pass ``True`` to get
            the previous randomised split (seeded with 42).

    Returns:
        ``(X_train, y_train, X_vali, y_vali, X_test, y_test)``.
    """
    X_train, X_rest, y_train, y_rest = train_test_split(
        data, label, test_size=0.4, random_state=42, shuffle=shuffle)
    # Halve the hold-out into validation and test parts.
    X_vali, X_test, y_vali, y_test = train_test_split(
        X_rest, y_rest, test_size=0.5, random_state=42, shuffle=shuffle)

    return X_train, y_train, X_vali, y_vali, X_test, y_test

# Each record handed to the splitter is a (path, sys_call) pair.
X_train, y_train, X_vali, y_vali, X_test, y_test = data_split(list(zip(path, syscall)), label)

In [4]:
import gensim

def save_path(vector_size, window, data_name="NGIDS_path_w2v"):
    """Build the file path of the path Word2Vec model for the given settings.

    The file name encodes ``vector_size`` and ``window`` followed by
    ``data_name``, inside the ``./dataset/path/`` directory.
    """
    prefix = f"vectorsize{vector_size}_window{window}_"
    return "./dataset/path/" + prefix + data_name

def save_sys(vector_size, window, data_name = "NGIDS_vector"):
    """Build the file path of the system-call Word2Vec model for the given settings.

    The file name encodes ``vector_size`` and ``window`` followed by
    ``data_name``, inside the ``./dataset/SystemCall/`` directory.
    """
    directory = "./dataset/SystemCall/"
    file_stem = f"vectorsize{vector_size}_window{window}_"
    return directory + file_stem + data_name


# Word2Vec settings; they select which stored model files are loaded.
vector_size, window = 10, 1

# Load the system-call and path Word2Vec models from the paths built by
# save_sys / save_path for those settings.
NGIDS_sys_model = gensim.models.Word2Vec.load(
    save_sys(vector_size=vector_size, window=window)
)
NGIDS_path_model = gensim.models.Word2Vec.load(
    save_path(vector_size=vector_size, window=window)
)



In [5]:
class NGIDS_Dataset(torch.utils.data.Dataset):
    """Sliding-window dataset over ``(path, sys_call)`` records.

    Item ``i`` is the window of ``slide_window_size`` consecutive records
    starting at position ``i`` (as vocabulary indices) together with the
    labels of those same records.
    """

    def __init__(self, data, label, p2i, s2i, slide_window_size):
        """Index every record once so that windows can be sliced cheaply.

        Args:
            data: iterable of ``(path, sys_call)`` pairs.
            label: sliceable sequence of per-record labels aligned with ``data``.
            p2i: mapping from a path to its integer index.
            s2i: mapping from a system call to its integer index.
            slide_window_size: number of consecutive records per item.

        Raises:
            KeyError: if a path or system call is missing from its mapping.
        """
        self.label = label
        self.slide_size = slide_window_size

        # One (path_index, sys_index) tuple per record, in the original order.
        self.data = [(p2i[path_key], s2i[sys_key]) for path_key, sys_key in data]

    def __len__(self):
        """Number of full windows; 0 when there are fewer records than one window."""
        # Without the clamp a short dataset would yield a negative length,
        # which makes len() raise ValueError.
        return max(0, len(self.data) - self.slide_size + 1)

    def __getitem__(self, i):
        """Return ``(window, labels)`` for the window starting at record ``i``.

        ``window`` has shape ``(1, slide_size, 2)`` and ``labels`` has shape
        ``(1, slide_size)`` with dtype long. Both are moved to the
        notebook-level ``device``.
        """
        stop = i + self.slide_size
        window = torch.tensor(self.data[i:stop]).reshape(1, self.slide_size, 2)
        labels = torch.tensor(self.label[i:stop], dtype=torch.long).reshape(1, self.slide_size)
        return window.to(device), labels.to(device)

In [6]:
# Vocabulary lookups (token -> row index) taken from the two Word2Vec models.
p2i, s2i = NGIDS_path_model.wv.key_to_index, NGIDS_sys_model.wv.key_to_index

# Sliding-window dataset over the training split, served in shuffled mini-batches.
NGIDS_dataset = NGIDS_Dataset(
    data=X_train,
    label=y_train,
    p2i=p2i,
    s2i=s2i,
    slide_window_size=slide_window_size,
)
train_loader = DataLoader(dataset=NGIDS_dataset, batch_size=batch_size, shuffle=True)

In [56]:
class Encoder(nn.Module):
    """LSTM encoder that exposes only its final hidden and cell states."""

    def __init__(self, input_size, hidden_size, num_layers, dropout_p=0.2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout_p,
        )

    def forward(self, batch):
        """Run the LSTM over ``batch`` (batch-first) and return ``(hidden, cell)``."""
        # The per-step outputs are not needed, only the final states.
        _, final_state = self.lstm(batch)
        hidden, cell = final_state
        return hidden, cell

class Decoder(nn.Module):
    """LSTM decoder followed by a linear projection of every step's output."""

    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout_p=0.2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout_p,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden):
        """Advance the LSTM from state ``hidden`` over ``x``.

        Returns the projected outputs and the new ``(hidden, cell)`` state.
        """
        lstm_out, next_state = self.lstm(x, hidden)
        hidden_next, cell_next = next_state
        return self.fc(lstm_out), (hidden_next, cell_next)

In [57]:
class LSTM_AutoEncoder(nn.Module):
    """LSTM autoencoder that reconstructs an embedded event window in reverse order.

    Every record is a ``(path index, sys-call index)`` pair. The two indices
    are looked up in frozen pretrained embedding tables and the resulting
    vectors are summed. The encoder compresses the window into its final LSTM
    state, and the decoder unrolls from that state to rebuild the window
    back-to-front. ``forward`` returns the mean-squared reconstruction error.
    """

    def __init__(self, input_size, hidden_size, num_layers, path_vecs, sys_vecs):
        """
        Args:
            input_size: width of the embedding vectors; also the LSTM input
                and the decoder output width.
            hidden_size: LSTM hidden width.
            num_layers: number of stacked LSTM layers.
            path_vecs: pretrained path embedding matrix (one row per index).
            sys_vecs: pretrained system-call embedding matrix (one row per index).
        """
        super(LSTM_AutoEncoder, self).__init__()

        # The tables are created on the CPU and follow the module when the
        # caller does ``model.to(device)``; forcing ``.cuda()`` here would
        # crash on machines without a GPU.
        self.path_emb = nn.Embedding.from_pretrained(torch.tensor(path_vecs, dtype=torch.float), freeze=True)
        self.sys_emb = nn.Embedding.from_pretrained(torch.tensor(sys_vecs, dtype=torch.float), freeze=True)

        self.encoder = Encoder(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
        self.reconstruct_decoder = Decoder(input_size=input_size, hidden_size=hidden_size, output_size=input_size, num_layers=num_layers)
        self.input_size = input_size

        self.criterion = nn.MSELoss()

    def forward(self, batch):
        """Return the reconstruction loss for one mini-batch.

        Args:
            batch: ``(windows, labels)`` pair. ``windows`` is a 4-D index
                tensor whose last axis holds ``(path index, sys-call index)``;
                it is unpacked as ``(batch, _, sequence, _)``. ``labels`` is
                ignored.

        Returns:
            Scalar MSE between the decoder output and the embedded window
            reversed along the sequence axis.
        """
        windows, _ = batch
        batch_size, _, sequence_length, _ = windows.size()
        vector_size = self.input_size

        # Embed both index channels, sum them, and collapse to (batch, sequence, vector).
        embedded = self.path_emb(windows[:, :, :, 0]) + self.sys_emb(windows[:, :, :, 1])
        embedded = embedded.reshape(batch_size, sequence_length, vector_size)

        hidden = self.encoder(embedded)

        # Unroll the decoder one step at a time, starting from an all-zero
        # input and feeding every prediction back in as the next input.
        step_input = torch.zeros((batch_size, 1, vector_size), dtype=torch.float, device=embedded.device)
        steps = []
        for _ in range(sequence_length):
            step_input, hidden = self.reconstruct_decoder(step_input, hidden)
            steps.append(step_input)
        reconstruct_output = torch.cat(steps, dim=1)

        # The reconstruction target is the input window in reverse time order.
        target = torch.flip(embedded, dims=[1])
        reconstruct_loss = self.criterion(reconstruct_output, target)

        return reconstruct_loss

In [58]:
def run(model, train_loader):
    """Train ``model`` on ``train_loader`` and return it.

    Uses the notebook-level ``learning_rate`` and ``max_epochs`` settings and
    an Adam optimiser. ``model(batch)`` must return the scalar loss to
    minimise. The mean loss of every finished epoch is shown on the outer
    progress bar.

    Args:
        model: module whose forward pass returns a scalar loss.
        train_loader: sized iterable of training mini-batches.

    Returns:
        The same ``model`` instance after training.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    epochs = tqdm(range(max_epochs))
    for _ in epochs:
        model.train()
        train_iterator = tqdm(train_loader, total=len(train_loader), desc="training")

        # Accumulate the detached loss so it is converted to a Python float
        # only once per epoch rather than on every step.
        loss_sum = 0.0
        batch_count = 0
        for batch_data in train_iterator:
            loss = model(batch_data)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_sum = loss_sum + loss.detach()
            batch_count += 1

        # An empty loader yields no batches; skip the report instead of dividing by zero.
        if batch_count:
            epochs.set_postfix(loss=float(loss_sum) / batch_count)

    return model

In [59]:
# Build the autoencoder on top of the two pretrained Word2Vec embedding
# matrices, place it on the configured device, then train it.
model = LSTM_AutoEncoder(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    path_vecs=NGIDS_path_model.wv.vectors,
    sys_vecs=NGIDS_sys_model.wv.vectors,
)
model.to(device)

model = run(model=model, train_loader=train_loader)

TypeError: __init__() missing 1 required positional argument: 'output_size'