In [9]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader
import os
from tqdm import tqdm
# from tqdm import tqdm
from ast import literal_eval
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# --- Paths -------------------------------------------------------------
file_dir = 'data_supervised'
model_dir = 'model'

# --- Input shape -------------------------------------------------------
# Feature width of one time step: passed to nn.LSTM and used to reshape
# batches (presumably the BERT embedding size - confirm against the cache).
input_size = 768
window_size = 10

# --- Network -----------------------------------------------------------
num_classes = 2   # label 0 = normal session, label 1 = abnormal session
hidden_size = 64  # hidden units per LSTM direction
num_layers = 2    # stacked LSTM layers

# --- Optimisation ------------------------------------------------------
num_epochs = 20
batch_size = 50

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")

In [3]:
def _load_unique_sequences(csv_path, max_len):
    """Return the distinct event sequences of at most ``max_len`` events in ``csv_path``.

    The CSV needs an ``EventSequence`` column whose cells are Python list
    literals; each cell is parsed with ``ast.literal_eval``.
    """
    frame = pd.read_csv(csv_path, engine='c', na_filter=False, memory_map=True)
    sequences = frame['EventSequence'].apply(literal_eval).tolist()
    unique = set()
    for events in tqdm(sequences, "loading data"):
        # Sequences that would not fit in the padded window are dropped.
        if len(events) <= max_len:
            unique.add(tuple(events))
    return unique


def _embed_sequences(sequences, eventId_to_bert, max_len, desc, vector_cache):
    """Right-pad each sequence with event id 0 to ``max_len`` and map ids to vectors.

    ``vector_cache`` memoises the tensor -> ndarray conversion per event id so
    each distinct event is converted once instead of once per occurrence.
    """
    embedded = []
    for events in tqdm(sequences, desc):
        padded = list(events) + [0] * (max_len - len(events))
        vectors = []
        for event_id in padded:
            vector = vector_cache.get(event_id)
            if vector is None:
                vector = eventId_to_bert[event_id][1][0].cpu().numpy()
                vector_cache[event_id] = vector
            vectors.append(vector)
        embedded.append(tuple(vectors))
    return embedded


def _to_float_tensor(sessions):
    """Stack a list of sessions into one float tensor via a single ndarray."""
    # Going through one contiguous ndarray avoids the very slow element-wise
    # conversion torch.tensor performs on nested lists of arrays.
    return torch.tensor(np.asarray(sessions), dtype=torch.float)


def generate_bert_data(file_dir, bert_cache_path, max_len=50, test_size=0.3, random_state=None):
    """Build train/test datasets of embedded log sessions.

    Reads ``train.csv`` (normal sessions, label 0) and ``abnormal.csv``
    (abnormal sessions, label 1) from ``file_dir``, de-duplicates the event
    sequences within each file, drops sequences longer than ``max_len``,
    right-pads the rest with event id 0, and replaces every event id by the
    vector stored at ``eventId_to_bert[id][1][0]`` in the cache.

    Note that de-duplication is per file: a sequence present in both files
    is kept once with each label.

    Args:
        file_dir: Directory containing ``train.csv`` and ``abnormal.csv``.
        bert_cache_path: Path of the ``torch.save``-d event-id -> embedding cache.
        max_len: Maximum (and padded) session length. Defaults to 50.
        test_size: Fraction of sessions held out for testing. Defaults to 0.3.
        random_state: Seed forwarded to ``train_test_split``; ``None`` keeps
            the split random, pass an int for a reproducible split.

    Returns:
        ``(train_data, test_data, train_x, test_x, train_y, test_y)`` where the
        first two are ``TensorDataset`` objects and the rest are the raw
        lists returned by ``train_test_split``.
    """
    # NOTE(review): torch.load unpickles the file - only load caches you trust.
    eventId_to_bert = torch.load(bert_cache_path)
    # Event id 0 is the padding id: give it an all-zero vector shaped like the
    # vector of event 5 (the cache is assumed to contain event 5 - confirm).
    padding = torch.zeros_like(eventId_to_bert[5][1][0])
    eventId_to_bert[0] = [[], [padding]]

    normal_data = _load_unique_sequences(os.path.join(file_dir, 'train.csv'), max_len)
    abnormal_data = _load_unique_sequences(os.path.join(file_dir, 'abnormal.csv'), max_len)

    vector_cache = {}
    normal_sessions = _embed_sequences(normal_data, eventId_to_bert, max_len, "normal:", vector_cache)
    abnormal_sessions = _embed_sequences(abnormal_data, eventId_to_bert, max_len, "abnormal:", vector_cache)
    sessions = normal_sessions + abnormal_sessions
    labels = [0] * len(normal_sessions) + [1] * len(abnormal_sessions)

    print('Number of sessions({}): {}'.format(file_dir, len(sessions)))
    print('Number of normal sessions: {}'.format(len(normal_data)))
    print('Number of abnormal sessions: {}'.format(len(abnormal_data)))

    train_x, test_x, train_y, test_y = train_test_split(
        sessions, labels, test_size=test_size, random_state=random_state
    )
    train_data = TensorDataset(_to_float_tensor(train_x), torch.tensor(train_y))
    test_data = TensorDataset(_to_float_tensor(test_x), torch.tensor(test_y))
    return train_data, test_data, train_x, test_x, train_y, test_y

In [10]:
# Build the train/test datasets (plus the raw split lists) from the CSV
# directory and the cached event embeddings.
train_data, test_data, train_x, test_x, train_y, test_y = generate_bert_data(
    file_dir="data/lstm/dataset/",
    bert_cache_path="./data/lstm/bert_cache.pth",
)

loading data: 100%|████████████████████████████████████████████████████████████| 5582/5582 [00:00<00:00, 278943.98it/s]
loading data: 100%|██████████████████████████████████████████████████████████| 16838/16838 [00:00<00:00, 244075.35it/s]
normal:: 100%|███████████████████████████████████████████████████████████████████████| 909/909 [00:19<00:00, 45.79it/s]
abnormal:: 100%|███████████████████████████████████████████████████████████████████| 4111/4111 [01:22<00:00, 49.86it/s]


Number of sessions(data/lstm/dataset/): 5020
Number of normal sessions: 909
Number of abnormal sessions: 4111


In [11]:
# Wrap both datasets in mini-batch loaders; only the training loader shuffles.
# The names are rebound from dataset to loader, so this cell must run exactly
# once after the datasets are (re)built.
train_data, test_data = (
    DataLoader(train_data, batch_size=batch_size, shuffle=True),
    DataLoader(test_data, batch_size=batch_size),
)

In [12]:
class Model(nn.Module):
    """Bidirectional LSTM classifier over a sequence of feature vectors.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden units per LSTM direction.
        num_layers: Number of stacked LSTM layers.
        num_keys: Number of output logits (classes).
    """

    def __init__(self, input_size, hidden_size, num_layers, num_keys):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        # Forward and backward outputs are concatenated, hence 2 * hidden_size.
        self.fc = nn.Linear(2 * hidden_size, num_keys)

    def forward(self, x):
        """Return logits of shape (batch, num_keys) for ``x`` of shape (batch, seq_len, input_size)."""
        # Create the zero initial states on the input's own device and dtype
        # so the module does not depend on a notebook-global ``device``.
        state_shape = (2 * self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        out, _ = self.lstm(x, (h0, c0))
        # NOTE(review): only the last time step is classified. For right-padded
        # inputs that step is padding, and the backward direction has seen just
        # that one step - confirm this is intended.
        return self.fc(out[:, -1, :])

In [14]:
# Instantiate the classifier from the config values, then move it to the
# selected device.
model = Model(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_keys=num_classes,
)
model = model.to(device)

In [15]:
# Adam with its default hyper-parameters over all model weights, and
# cross-entropy over the class logits as the training objective.
optimizer = optim.Adam(params=model.parameters())
criterion = nn.CrossEntropyLoss()

In [16]:
def train(model, dataloader, criterion, optimizer, current_epoch=0, num_epochs=10, input_size=1):
    """Train ``model`` for ``num_epochs`` epochs and print the mean loss per epoch.

    Args:
        model: Module mapping a (batch, seq_len, input_size) tensor to logits.
        dataloader: Sized iterable yielding ``(seq, label)`` batches.
        criterion: Loss callable taking ``(output, label)``.
        optimizer: Optimizer over ``model``'s parameters.
        current_epoch: Number of epochs already completed; only offsets the
            epoch counter shown in the log line.
        num_epochs: Number of epochs to run in this call.
        input_size: Size of the last dimension each batch is reshaped to.

    Returns:
        None. Progress and the elapsed wall-clock time are printed.
    """
    # Train on whatever device the model already lives on instead of relying
    # on a notebook-global ``device``; parameter-less models fall back to CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    total_step = len(dataloader)
    start_time = time.time()
    # Make sure layers such as dropout are in training mode even if the model
    # was switched to eval() by an earlier cell.
    model.train()
    for epoch in range(current_epoch, current_epoch + num_epochs):
        train_loss = 0.0
        for seq, label in dataloader:
            # Forward pass (reshape also accepts non-contiguous batches).
            seq = seq.detach().reshape(-1, seq.shape[1], input_size).to(target_device)
            output = model(seq)
            loss = criterion(output, label.to(target_device))

            # Backward pass and parameter update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        # An empty dataloader reports nan instead of raising ZeroDivisionError.
        mean_loss = train_loss / total_step if total_step else float('nan')
        print('Epoch [{}/{}], train_loss: {:.4f}'.format(epoch + 1, current_epoch + num_epochs, mean_loss))
    elapsed_time = time.time() - start_time
    print('elapsed_time: {:.3f}s'.format(elapsed_time))

In [None]:
# Run 10 epochs on the training loader, counting epochs from zero.
train(
    model=model,
    dataloader=train_data,
    criterion=criterion,
    optimizer=optimizer,
    current_epoch=0,
    num_epochs=10,
    input_size=input_size,
)

In [None]:
    def _evaluate(model, dataloader, criterion, input_size):
        """Return ``(mean_loss, accuracy, output, labels)`` for one pass over ``dataloader``.

        ``mean_loss`` is the mean of the per-batch losses and ``accuracy`` the
        fraction of samples whose arg-max logit equals the label; both are
        0-dim tensors. ``output`` holds the concatenated logits and ``labels``
        the concatenated labels as yielded by the loader.

        Raises:
            ValueError: If ``dataloader`` yields no batches.
        """
        # Evaluate on the device the model lives on; parameter-less models use CPU.
        first_param = next(model.parameters(), None)
        eval_device = first_param.device if first_param is not None else torch.device("cpu")

        was_training = model.training
        model.eval()
        batch_outputs, batch_labels = [], []
        loss_sum = torch.zeros((), device=eval_device)
        try:
            with torch.no_grad():
                for seq, label in dataloader:
                    seq = seq.detach().reshape(-1, seq.shape[1], input_size).to(eval_device)
                    batch_output = model(seq)
                    loss_sum = loss_sum + criterion(batch_output, label.to(eval_device))
                    batch_outputs.append(batch_output)
                    batch_labels.append(label)
        finally:
            # Leave the model in the mode it was in before evaluation.
            model.train(was_training)

        if not batch_outputs:
            raise ValueError("cannot evaluate on an empty dataloader")

        # Concatenate once after the loop instead of growing tensors per batch.
        output = torch.cat(batch_outputs, 0)
        labels = torch.cat(batch_labels, 0)
        mean_loss = loss_sum / len(batch_outputs)
        # NOTE(review): the original cell called an ``accuracy`` helper that is
        # not defined in the visible notebook; plain classification accuracy
        # is assumed here - confirm.
        predictions = output.argmax(dim=1).to(labels.device)
        accuracy_value = (predictions == labels).float().mean()
        return mean_loss, accuracy_value, output, labels

    # Metrics on the training loader.
    epoch_loss, epoch_accuracy, output, labels = _evaluate(model, train_data, criterion, input_size)
    print('loss: ', round(epoch_loss.item(), 3), 'accuracy: ', round(epoch_accuracy.item(), 3))

    # Metrics on the held-out loader; the loss is averaged over the test
    # batches (the original divided by the number of training batches).
    epoch_loss, epoch_accuracy, output, labels = _evaluate(model, test_data, criterion, input_size)
    print('test_loss: ', round(epoch_loss.item(), 3), 'test_accuracy: ', round(epoch_accuracy.item(), 3))