In [6]:
import pandas as pd
import numpy as np
import torch 
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm import tqdm


# Location of the NGIDS host-log CSV read by the data-loading cell.
NGIDS_path = './dataset/NGIDS_host_log_1-99.csv'


# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook does not fail on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 1024          # windows per DataLoader batch
slide_window_size = 200    # consecutive events per sliding window
learning_rate = 0.001      # Adam step size
max_epochs = 100           # number of passes over the training loader
input_size = 10            # feature width fed to the encoder (the loaded Word2Vec models use vector_size = 10)
hidden_size = 50           # width of the first encoder GRU / first decoder GRU
num_layers = 2             # forwarded to the model constructor


In [7]:
# Columns consumed by the model; rows missing any of them are discarded.
required_columns = ['path', 'sys_call', 'label']

NGIDS = pd.read_csv(NGIDS_path)
dropna_NGIDS = NGIDS.dropna(subset=required_columns)

# Only the first 1% of the cleaned rows is kept for this experiment.
ngids_len = len(dropna_NGIDS) // 100

print(ngids_len)

# Materialise each column as a NumPy array and truncate it to the subsample.
path, syscall, label = (
    np.array(dropna_NGIDS[column].to_list())[:ngids_len]
    for column in required_columns
)

def data_split(data, label, shuffle=False):
    """Split events and labels into train / validation / test parts (60% / 20% / 20%).

    The split is contiguous by default: ``train_test_split`` shuffles rows unless
    told otherwise, and the dataset built from these partitions slices windows of
    *consecutive* rows, so a shuffled split would produce windows of unrelated
    events.  (This presumes the rows arrive in recorded order -- TODO confirm
    against the CSV.)

    Args:
        data: Sequence of events (one entry per row).
        label: Sequence of labels aligned with ``data``.
        shuffle: When True, rows are shuffled before each split using the fixed
            seed 42; when False (default) the original row order is preserved.

    Returns:
        Tuple ``(X_train, y_train, X_vali, y_vali, X_test, y_test)``.
    """
    # A seed is only meaningful when shuffling is requested.
    seed = 42 if shuffle else None

    # First cut: 60% train, 40% held out.
    X_train, X_rest, y_train, y_rest = train_test_split(
        data, label, test_size=0.4, shuffle=shuffle, random_state=seed)
    # Second cut: the held-out part is halved into validation and test.
    X_vali, X_test, y_vali, y_test = train_test_split(
        X_rest, y_rest, test_size=0.5, shuffle=shuffle, random_state=seed)

    return X_train, y_train, X_vali, y_vali, X_test, y_test

# Each event is a (path, sys_call) pair; labels stay aligned by position.
events = list(zip(path, syscall))
X_train, y_train, X_vali, y_vali, X_test, y_test = data_split(events, label)

900542


In [8]:
import gensim

def save_path(vector_size, window, data_name="NGIDS_path_w2v"):
    """Build the file location of the path Word2Vec model for the given hyper-parameters.

    Args:
        vector_size: Embedding width encoded into the file name.
        window: Context window size encoded into the file name.
        data_name: Trailing part of the file name.

    Returns:
        The location as a string under ``./dataset/path/``.
    """
    file_name = f"vectorsize{vector_size}_window{window}_" + data_name
    return "./dataset/path/" + file_name

def save_sys(vector_size, window, data_name = "NGIDS_vector"):
    """Build the file location of the system-call Word2Vec model for the given hyper-parameters.

    Args:
        vector_size: Embedding width encoded into the file name.
        window: Context window size encoded into the file name.
        data_name: Trailing part of the file name.

    Returns:
        The location as a string under ``./dataset/SystemCall/``.
    """
    file_name = f"vectorsize{vector_size}_window{window}_" + data_name
    return "./dataset/SystemCall/" + file_name


# Hyper-parameters that identify which saved Word2Vec models to load.
vector_size = 10
window = 1

sys_model_file = save_sys(vector_size, window)
path_model_file = save_path(vector_size, window)

# NOTE(review): gensim model files are pickle-based; only load files from a trusted source.
NGIDS_sys_model = gensim.models.Word2Vec.load(sys_model_file)
NGIDS_path_model = gensim.models.Word2Vec.load(path_model_file)



In [9]:
class NGIDS_Dataset(torch.utils.data.Dataset):
    """Sliding-window dataset over a stream of (path, sys_call) events.

    Every event is mapped to a pair of integer indices through ``p2i`` / ``s2i``.
    Item ``i`` is the window of ``slide_window_size`` consecutive events starting
    at row ``i`` together with the labels of those rows.

    Args:
        data: Iterable of ``(path, sys_call)`` pairs.
        label: Sequence of numeric labels aligned with ``data``.
        p2i: Mapping from a path value to its integer index.
        s2i: Mapping from a system-call value to its integer index.
        slide_window_size: Number of consecutive events per item.
        target_device: Device the returned tensors are placed on.  When omitted,
            the module-level ``device`` is used.

    Raises:
        KeyError: If an event contains a path or system call missing from the mappings.
    """

    def __init__(self, data, label, p2i, s2i, slide_window_size, target_device=None):
        self.label = label
        self.slide_size = slide_window_size
        self.target_device = target_device

        # Same structure as before: a list of (path_index, sys_call_index) tuples.
        self.data = [(p2i[path], s2i[sys]) for path, sys in data]

        # Convert once here instead of re-building a tensor from a Python list on
        # every __getitem__ call.  The explicit row count keeps the reshape valid
        # for an empty dataset.
        self._data_tensor = torch.tensor(self.data, dtype=torch.long).reshape(len(self.data), 2)
        self._label_tensor = torch.as_tensor(label, dtype=torch.long)

    def __len__(self):
        # Number of full windows; never negative, so len() works on short inputs.
        return max(0, len(self.data) - self.slide_size + 1)

    def __getitem__(self, i):
        """Return ``(indices, labels)`` shaped ``(1, window, 2)`` and ``(1, window)``."""
        target = self.target_device if self.target_device is not None else device
        stop = i + self.slide_size

        window = self._data_tensor[i:stop].reshape(1, self.slide_size, 2)
        labels = self._label_tensor[i:stop].reshape(1, self.slide_size)

        # copy=True hands out independent tensors, so callers cannot modify the
        # cached data through a returned item.
        return window.to(target, copy=True), labels.to(target, copy=True)

In [10]:
# Vocabulary lookups (token -> row index) taken from the loaded Word2Vec models.
p2i = NGIDS_path_model.wv.key_to_index
s2i = NGIDS_sys_model.wv.key_to_index

NGIDS_dataset = NGIDS_Dataset(
    data=X_train,
    label=y_train,
    p2i=p2i,
    s2i=s2i,
    slide_window_size=slide_window_size,
)
train_loader = DataLoader(dataset=NGIDS_dataset, batch_size=batch_size, shuffle=True)

In [11]:
class Encoder(nn.Module):
    """Two stacked batch-first GRUs with dropout in between.

    The first GRU maps ``input_size`` features to ``hidden_size``; the second maps
    ``hidden_size`` to ``hhidden_size``.

    Args:
        input_size: Feature width of the incoming sequence.
        hidden_size: Output width of the first GRU.
        hhidden_size: Output width of the second GRU.
        num_layers: Accepted for signature compatibility; not used by this class.
        dropout_p: Dropout probability applied between the two GRUs.
    """

    def __init__(self, input_size, hidden_size, hhidden_size, num_layers, dropout_p=0.5):
        super().__init__()
        self.gru1 = nn.GRU(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_p)
        self.gru2 = nn.GRU(input_size=hidden_size, hidden_size=hhidden_size, batch_first=True)

    def forward(self, batch):
        """Return ``(outputs, hidden)`` of the second GRU."""
        # Only the per-step outputs of the first GRU are passed on.
        first_stage = self.gru1(batch)[0]
        return self.gru2(self.dropout(first_stage))

class Decoder(nn.Module):
    """Mirror of the encoder: two stacked batch-first GRUs with dropout in between.

    The first GRU maps ``hhidden_size`` features to ``hidden_size``; the second maps
    ``hidden_size`` back to ``input_size``.

    Args:
        input_size: Output width of the second GRU.
        hidden_size: Output width of the first GRU.
        hhidden_size: Feature width of the incoming sequence.
        num_layers: Accepted for signature compatibility; not used by this class.
        dropout_p: Dropout probability applied between the two GRUs.
    """

    def __init__(self, input_size, hidden_size, hhidden_size, num_layers, dropout_p=0.5):
        super().__init__()
        self.gru1 = nn.GRU(input_size=hhidden_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_p)
        self.gru2 = nn.GRU(input_size=hidden_size, hidden_size=input_size, batch_first=True)

    def forward(self, batch):
        """Return ``(output, hidden)`` of the second GRU."""
        # Only the per-step outputs of the first GRU are passed on.
        first_stage = self.gru1(batch)[0]
        return self.gru2(self.dropout(first_stage))

In [12]:
class CNN_AutoEncoder(nn.Module):
    """Sequence auto-encoder over summed path and system-call embeddings.

    Each event is a ``(path_index, sys_call_index)`` pair.  Both indices are looked
    up in frozen, pre-trained embedding tables, the two vectors are added, and the
    resulting sequence is encoded and then reconstructed.  The loss is the mean
    squared error between the reconstruction and the embedded input.

    Args:
        input_size: Width of the embedded sequence fed to the encoder.
        hidden_size: Intermediate width used by the encoder and decoder.
        hhidden_size: Width of the encoder output.
        num_layers: Forwarded to the encoder and decoder.
        path_vecs: Pre-trained path embedding matrix (one row per index).
        sys_vecs: Pre-trained system-call embedding matrix (one row per index).
    """

    def __init__(self, input_size, hidden_size, hhidden_size, num_layers, path_vecs, sys_vecs):
        super().__init__()

        # The tables are built on the CPU; device placement is left to the usual
        # ``module.to(...)`` call, so construction also works without CUDA.
        self.path_emb = nn.Embedding.from_pretrained(torch.tensor(path_vecs, dtype=torch.float), freeze=True)
        self.sys_emb = nn.Embedding.from_pretrained(torch.tensor(sys_vecs, dtype=torch.float), freeze=True)

        self.encoder = Encoder(input_size=input_size, hidden_size=hidden_size, hhidden_size=hhidden_size, num_layers=num_layers)
        self.reconstruct_decoder = Decoder(input_size=input_size, hidden_size=hidden_size, hhidden_size=hhidden_size, num_layers=num_layers)
        self.input_size = input_size

        self.criterion = nn.MSELoss()

    def forward(self, batch):
        """Reconstruct a batch of index windows.

        Args:
            batch: Pair ``(indices, labels)``.  ``indices`` is an integer tensor
                whose last two dimensions are ``(sequence, 2)``; any leading
                dimensions are merged into the batch dimension, so both
                ``(B, S, 2)`` and ``(B, 1, S, 2)`` are accepted.  ``labels`` is
                ignored.

        Returns:
            Tuple ``(outputs, reconstruct_loss)``: the reconstructed sequence and
            the scalar MSE loss.
        """
        indices, _ = batch

        # Follow the module's own placement rather than a notebook-level global.
        indices = indices.to(self.path_emb.weight.device)

        # Collapse any extra leading dimension (e.g. a per-item window axis added
        # by the dataset) so the GRUs always see (batch, sequence, features).
        sequence_length = indices.size(-2)
        indices = indices.reshape(-1, sequence_length, 2)
        batch_size = indices.size(0)

        path_batch = self.path_emb(indices[:, :, 0])
        sys_batch = self.sys_emb(indices[:, :, 1])

        embedded = (path_batch + sys_batch).reshape(batch_size, sequence_length, self.input_size)

        encoded, _ = self.encoder(embedded)
        outputs, _ = self.reconstruct_decoder(encoded)

        reconstruct_loss = self.criterion(outputs, embedded)

        return outputs, reconstruct_loss

In [13]:
def run(model, train_loader):
    """Train ``model`` on ``train_loader`` and checkpoint it after every epoch.

    Uses Adam with the notebook-level ``learning_rate`` for ``max_epochs`` epochs.
    After each epoch the whole model object is saved to
    ``model/Auto_encoder_epoch{epoch}.model``.

    Args:
        model: Module whose call returns ``(outputs, loss)`` for a batch.
        train_loader: Iterable of batches that also supports ``len()``.

    Returns:
        The trained model (the same object that was passed in).
    """
    import os

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # torch.save does not create missing directories.
    os.makedirs("model", exist_ok=True)

    epochs = tqdm(range(max_epochs))

    for epoch in epochs:

        model.train()
        train_iterator = tqdm(train_loader, total=len(train_loader), desc="training")

        loss_sum = 0.0
        step_count = 0
        for batch_data in train_iterator:

            # The model returns (reconstruction, loss); only the loss drives training.
            _, loss = model(batch_data)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate as a tensor so no host/device sync happens per step.
            loss_sum = loss_sum + loss.detach()
            step_count += 1

        if step_count:
            epochs.set_postfix(loss=float(loss_sum) / step_count)

        torch.save(model, f"model/Auto_encoder_epoch{epoch}.model")

    return model

In [14]:
# The model constructor takes six arguments; the encoder output width was missing
# from the original call.  Half of hidden_size is an assumed bottleneck width --
# TODO confirm the intended value.
hhidden_size = hidden_size // 2

model = CNN_AutoEncoder(
    input_size=input_size,
    hidden_size=hidden_size,
    hhidden_size=hhidden_size,
    num_layers=num_layers,
    path_vecs=NGIDS_path_model.wv.vectors,
    sys_vecs=NGIDS_sys_model.wv.vectors,
)
model.to(device)

model = run(model, train_loader)

training: 100%|██████████| 528/528 [02:42<00:00,  3.25it/s]
training: 100%|██████████| 528/528 [02:40<00:00,  3.28it/s]
training: 100%|██████████| 528/528 [02:41<00:00,  3.28it/s]
training: 100%|██████████| 528/528 [02:40<00:00,  3.29it/s]
training: 100%|██████████| 528/528 [02:39<00:00,  3.30it/s]
training: 100%|██████████| 528/528 [02:40<00:00,  3.28it/s]
training: 100%|██████████| 528/528 [02:40<00:00,  3.28it/s]
training: 100%|██████████| 528/528 [02:41<00:00,  3.27it/s]
training: 100%|██████████| 528/528 [02:41<00:00,  3.26it/s]
training: 100%|██████████| 528/528 [02:39<00:00,  3.31it/s]
training: 100%|██████████| 528/528 [02:40<00:00,  3.30it/s]
training: 100%|██████████| 528/528 [02:39<00:00,  3.30it/s]
training: 100%|██████████| 528/528 [02:40<00:00,  3.30it/s]
training: 100%|██████████| 528/528 [02:39<00:00,  3.31it/s]
training: 100%|██████████| 528/528 [02:41<00:00,  3.27it/s]
training: 100%|██████████| 528/528 [02:42<00:00,  3.25it/s]
training: 100%|██████████| 528/528 [02:4

In [15]:
# Persist the fully trained model object next to the notebook.
final_model_file = "AutoEncoder.model"
torch.save(model, final_model_file)