In [1]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader
import os
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
num_classes = 2
num_epochs = 20
batch_size = 50
input_size = 768
model_dir = 'model'
window_size = 10
num_layers = 2
hidden_size = 64
file_dir = 'data_supervised'

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
eventId_to_bert = torch.load("../bert/bert_raw_data.pth")

In [4]:
padding = torch.zeros_like(eventId_to_bert['E5'][1][0])

In [5]:
eventId_to_bert['E0'] = [[],[padding]]

In [7]:
def generate_bert_dataset(file_dir):
    eventId_to_bert = torch.load("../bert/bert_raw_data.pth")
    padding = torch.zeros_like(eventId_to_bert['E5'][1][0])
    eventId_to_bert['E0'] = [[],[padding]]
    sessions = []
    labels = []
    max_len = 50
    normal_data = set()
    with open(file_dir+'/normal.csv', 'r') as f:
        for ln in f.readlines():
            ln = list(map(lambda n: n, map(int, ln.strip().split())))
            if len(ln)>50:
                continue
    #             max_len = max(max_len,len(ln))
            normal_data.add(tuple(ln))
    abnormal_data = set()
    with open(file_dir+'/abnormal.csv', 'r') as f:
        for ln in f.readlines():
            ln =list(map(lambda n: n, map(int, ln.strip().split())))
            if len(ln)>50:
                continue
    #             max_len = max(max_len,len(ln))
            abnormal_data.add(tuple(ln))
    print(max_len)
    for line in tqdm(normal_data, "normal:"):
        line = list(line) + [0]*(max_len-len(line))
        bert_input = []
        for id in line:
            bert_input.append(eventId_to_bert['E'+str(id)][1][0].cpu().numpy())
        sessions.append(tuple(bert_input))
        labels.append(0)
    for line in tqdm(abnormal_data, "abnormal:"):
        line = list(line) + [0]*(max_len-len(line))
        bert_input = []
        for id in line:
            bert_input.append(eventId_to_bert['E'+str(id)][1][0].cpu().numpy())
        sessions.append(tuple(bert_input))
        labels.append(1)
    print('Number of sessions({}): {}'.format(file_dir, len(sessions)))
    print('Number of normal sessions: {}'.format(len(normal_data)))
    print('Number of abnormal sessions: {}'.format(len(abnormal_data)))
    train_x, test_x, train_y, test_y = train_test_split(sessions, labels,test_size=0.3 )
    train_data = TensorDataset(torch.tensor(train_x, dtype=torch.float), torch.tensor(train_y))
    train_data= DataLoader(train_data, batch_size = batch_size, shuffle = True)
    test_data = TensorDataset(torch.tensor(test_x, dtype=torch.float), torch.tensor(test_y))
    test_data = DataLoader(test_data, batch_size = batch_size)

50


In [None]:
sessions = []
labels = []
max_len = 50
normal_data = set()
with open(file_dir+'/normal.csv', 'r') as f:
    for ln in f.readlines():
        ln = list(map(lambda n: n, map(int, ln.strip().split())))
        if len(ln)>50:
            continue
#             max_len = max(max_len,len(ln))
        normal_data.add(tuple(ln))
abnormal_data = set()
with open(file_dir+'/abnormal.csv', 'r') as f:
    for ln in f.readlines():
        ln =list(map(lambda n: n, map(int, ln.strip().split())))
        if len(ln)>50:
            continue
#             max_len = max(max_len,len(ln))
        abnormal_data.add(tuple(ln))

In [8]:
for line in tqdm(normal_data, "normal:"):
    line = list(line) + [0]*(max_len-len(line))
    bert_input = []
    for id in line:
        bert_input.append(eventId_to_bert['E'+str(id)][1][0].cpu().numpy())
    sessions.append(tuple(bert_input))
    labels.append(0)

normal:: 100%|█████████████████████████████████████████████████████████████████████| 3977/3977 [01:07<00:00, 59.31it/s]


In [9]:
for line in tqdm(abnormal_data, "abnormal:"):
    line = list(line) + [0]*(max_len-len(line))
    bert_input = []
    for id in line:
        bert_input.append(eventId_to_bert['E'+str(id)][1][0].cpu().numpy())
    sessions.append(tuple(bert_input))
    labels.append(1)

abnormal:: 100%|███████████████████████████████████████████████████████████████████| 4110/4110 [01:09<00:00, 59.21it/s]


In [11]:
import gc
gc.collect()

3773

In [12]:
print('Number of sessions({}): {}'.format(file_dir, len(sessions)))
print('Number of normal sessions: {}'.format(len(normal_data)))
print('Number of abnormal sessions: {}'.format(len(abnormal_data)))
train_x, test_x, train_y, test_y = train_test_split(sessions, labels,test_size=0.3 )
train_data = TensorDataset(torch.tensor(train_x, dtype=torch.float), torch.tensor(train_y))
train_data= DataLoader(train_data, batch_size = batch_size, shuffle = True)
test_data = TensorDataset(torch.tensor(test_x, dtype=torch.float), torch.tensor(test_y))
test_data = DataLoader(test_data, batch_size = batch_size)

Number of sessions(data_supervised): 8087
Number of normal sessions: 3977
Number of abnormal sessions: 4110


In [7]:
def generate_dataset(file_dir):
#     eventId_to_bert = torch.load("../bert/bert_raw_data.pth")
#     padding = torch.zeros_like(eventId_to_bert['E5'][1][0])
#     eventId_to_bert['E0'] = [[],[padding]]
    sessions = []
    normal_data = set()
    labels = []
    max_len = 50
    with open(file_dir+'/test_normal', 'r') as f:
        for ln in f.readlines():
            ln = list(map(lambda n: n, map(int, ln.strip().split())))
            if len(ln)>50:
                continue
    #             max_len = max(max_len,len(ln))
            normal_data.add(tuple(ln))
    abnormal_data = set()
    with open(file_dir+'/abnormal', 'r') as f:
        for ln in f.readlines():
            ln =list(map(lambda n: n, map(int, ln.strip().split())))
            if len(ln)>50:
                continue
    #             max_len = max(max_len,len(ln))
            abnormal_data.add(tuple(ln))
    print(max_len)
    for line in tqdm(normal_data, "normal:"):
        line = list(line) + [0]*(max_len-len(line))
        bert_input = []
        for id in line:
            bert_input.append(eventId_to_bert['E'+str(id)][1][0].cpu().numpy())
        sessions.append(tuple(bert_input))
        labels.append(0)
    for line in tqdm(abnormal_data, "abnormal:"):
        line = list(line) + [0]*(max_len-len(line))
        bert_input = []
        for id in line:
            bert_input.append(eventId_to_bert['E'+str(id)][1][0].cpu().numpy())
        sessions.append(tuple(bert_input))
        labels.append(1)
    print('Number of sessions({}): {}'.format(file_dir, len(sessions)))
    print('Number of normal sessions: {}'.format(len(normal_data)))
    print('Number of abnormal sessions: {}'.format(len(abnormal_data)))
    train_x, test_x, train_y, test_y = train_test_split(sessions, labels,test_size=0.2 )
    train_data = TensorDataset(torch.tensor(train_x, dtype=torch.float), torch.tensor(train_y))
    train_data= DataLoader(train_data, batch_size = batch_size, shuffle = True)
    test_data = TensorDataset(torch.tensor(test_x, dtype=torch.float), torch.tensor(test_y))
    test_data = DataLoader(test_data, batch_size = batch_size)
    return train_data,test_data,train_x,train_y,test_x,test_y

In [17]:
class Model(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_keys):
        super(Model, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True,bidirectional=True)
        self.fc = nn.Linear(2*hidden_size, num_keys)

    def forward(self, x):
        h0 = torch.zeros(2*self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(2*self.num_layers, x.size(0), self.hidden_size).to(device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [18]:
train_data

<torch.utils.data.dataloader.DataLoader at 0x20f40cd24a8>

In [9]:
train_data,test_data,train_x,train_y,test_x,test_y= generate_dataset(file_dir)

normal::   0%|                                                                       | 3/14161 [00:00<08:34, 27.52it/s]

50


normal::  86%|█████████████████████████████████████████████████████████▍         | 12149/14161 [03:13<00:23, 84.00it/s]

KeyError: 'E265'

normal::  86%|█████████████████████████████████████████████████████████▍         | 12149/14161 [03:30<00:23, 84.00it/s]

In [18]:
model = Model(input_size, hidden_size, num_layers, num_classes).to(device)

In [19]:
model

Model(
  (lstm): LSTM(768, 64, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

In [20]:
# writer = Summary# writer(log_dir='log/' + log)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [22]:
total_step = len(train_data)
start_time = time.time()
num_epochs=20
for epoch in range(num_epochs):  # Loop over the dataset multiple times
    train_loss = 0
    for step, (seq, label) in enumerate(train_data):
        # Forward pass
        seq = seq.clone().detach().view(-1, seq.shape[1], input_size).to(device)
        output = model(seq)
        loss = criterion(output, label.to(device))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
#         writer.add_graph(model, seq)
    print('Epoch [{}/{}], train_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / total_step))
#     writer.add_scalar('train_loss'traabsrain_loss / total_step, epoch + 1)
elapsed_time = time.time() - start_time
print('elapsed_time: {:.3f}s'.format(elapsed_time))

Epoch [1/20], train_loss: 0.1982
Epoch [2/20], train_loss: 0.2042
Epoch [3/20], train_loss: 0.2179
Epoch [4/20], train_loss: 0.1867
Epoch [5/20], train_loss: 0.1856
Epoch [6/20], train_loss: 0.1768
Epoch [7/20], train_loss: 0.1703
Epoch [8/20], train_loss: 0.1925
Epoch [9/20], train_loss: 0.1678
Epoch [10/20], train_loss: 0.2059
Epoch [11/20], train_loss: 0.1969
Epoch [12/20], train_loss: 0.1767
Epoch [13/20], train_loss: 0.1590
Epoch [14/20], train_loss: 0.1691
Epoch [15/20], train_loss: 0.1558
Epoch [16/20], train_loss: 0.1546
Epoch [17/20], train_loss: 0.1526
Epoch [18/20], train_loss: 0.1753
Epoch [19/20], train_loss: 0.1588
Epoch [20/20], train_loss: 0.1442
elapsed_time: 115.371s


In [23]:
def accuracy(y_pred, y_true):
    return (np.argmax(y_pred.cpu().numpy(),1) == y_true.numpy()).astype('int').mean()

In [24]:
train_x = torch.tensor(train_x, dtype=torch.float).reshape(-1,50,input_size)
test_x = torch.tensor(test_x, dtype=torch.float).reshape(-1,50,input_size)
train_y = torch.tensor(train_y)
test_y = torch.tensor(test_y)

In [37]:
test_output.shape

torch.Size([10, 2])

In [44]:
with torch.no_grad():
    epoch_loss = 0
    for step, (seq, label) in enumerate(train_data):
        seq = seq.clone().detach().view(-1, seq.shape[1], input_size).to(device)
        test_output = model(seq.to(device))
        if step==0:
            output = test_output
            labels = label
        else:
            labels = torch.cat([labels,label],0)
            output = torch.cat([output,test_output],0)
        epoch_loss += criterion(test_output, label.to(device)).data
    epoch_accuracy = accuracy(output, labels)
    epoch_loss = epoch_loss/len(train_data)
    print('loss: ', round(epoch_loss.item(), 3), 'accuracy: ', round(epoch_accuracy.item(), 3))


loss:  0.151 accuracy:  0.943


In [43]:
epoch_accuracy = accuracy(output, labels)
epoch_loss = epoch_loss/len(train_data)
print('loss: ', round(epoch_loss.item(), 3), 'accuracy: ', round(epoch_accuracy.item(), 3))


loss:  0.15 accuracy:  0.943


In [40]:
with torch.no_grad():
    for step, (seq, label) in enumerate(train_data):
        seq = seq.clone().detach().view(-1, seq.shape[1], input_size).to(device)
        test_output = model(seq.to(device))
        epoch_loss = criterion(test_output, label.to(device)).data
        epoch_accuracy = accuracy(test_output, label)
#         epoch_loss = criterion(train_output, train_y.to(device)).data
#         test_output = model(test_x.to(device))
#         epoch_test_accuracy = accuracy(test_output, test_y)
#         epoch_test_loss = criterion(test_output, test_y.to(device)).data
#         print('epoch: ', epoch, 'loss: ', round(epoch_loss.item(), 3), 'accuracy: ', round(epoch_accuracy.item(), 3),
        print('test_loss: ', round(epoch_loss.item(), 3), 'test_accuracy: ', round(epoch_accuracy.item(), 3))


test_loss:  0.331 test_accuracy:  0.88
test_loss:  0.117 test_accuracy:  0.96
test_loss:  0.151 test_accuracy:  0.94
test_loss:  0.126 test_accuracy:  0.96
test_loss:  0.327 test_accuracy:  0.88
test_loss:  0.154 test_accuracy:  0.94
test_loss:  0.125 test_accuracy:  0.96
test_loss:  0.167 test_accuracy:  0.96
test_loss:  0.094 test_accuracy:  0.96
test_loss:  0.166 test_accuracy:  0.92
test_loss:  0.134 test_accuracy:  0.96
test_loss:  0.079 test_accuracy:  0.98
test_loss:  0.14 test_accuracy:  0.96
test_loss:  0.177 test_accuracy:  0.94
test_loss:  0.233 test_accuracy:  0.92
test_loss:  0.259 test_accuracy:  0.92
test_loss:  0.073 test_accuracy:  0.96
test_loss:  0.16 test_accuracy:  0.92
test_loss:  0.148 test_accuracy:  0.94
test_loss:  0.192 test_accuracy:  0.92
test_loss:  0.104 test_accuracy:  0.98
test_loss:  0.067 test_accuracy:  1.0
test_loss:  0.104 test_accuracy:  0.96
test_loss:  0.123 test_accuracy:  0.98
test_loss:  0.225 test_accuracy:  0.9
test_loss:  0.235 test_accura

In [28]:
with torch.no_grad():
    
    train_output = model(train_x.to(device))
    epoch_accuracy = accuracy(train_output, train_y)
    epoch_loss = criterion(train_output, train_y.to(device)).data
    test_output = model(test_x.to(device))
    epoch_test_accuracy = accuracy(test_output, test_y)
    epoch_test_loss = criterion(test_output, test_y.to(device)).data
    print('epoch: ', epoch, 'loss: ', round(epoch_loss.item(), 3), 'accuracy: ', round(epoch_accuracy.item(), 3),
    'test_loss: ', round(epoch_test_loss.item(), 3), 'test_accuracy: ', round(epoch_test_accuracy.item(), 3))

RuntimeError: CUDA out of memory. Tried to allocate 1.94 GiB (GPU 0; 4.00 GiB total capacity; 1.80 GiB already allocated; 1.11 GiB free; 1.82 GiB reserved in total by PyTorch)