In [7]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader
import os
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
num_classes = 2
num_epochs = 20
batch_size = 50
input_size = 1
model_dir = 'model'
window_size = 10
num_layers = 2
hidden_size = 64
file_dir = 'data_supervised'

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [25]:
def generate_dataset(file_dir):
    eventId_to_bert = torch.load("../bert/bert_raw_data.pth")
    sessions = []
    normal_data = set()
    labels = []
    max_len = 50
    with open(file_dir+'/test_normal', 'r') as f:
        for ln in f.readlines():
            ln = [0] + list(map(lambda n: n, map(int, ln.strip().split()))) + [30]
            if len(ln)>50:
                continue
    #             max_len = max(max_len,len(ln))
            normal_data.add(tuple(ln))
    abnormal_data = set()
    with open(file_dir+'/abnormal', 'r') as f:
        for ln in f.readlines():
            ln = [0] + list(map(lambda n: n, map(int, ln.strip().split()))) + [30]
            if len(ln)>50:
                continue
    #             max_len = max(max_len,len(ln))
            abnormal_data.add(tuple(ln))
    print(max_len)
    for line in tqdm(normal_data, "normal:"):
        line = list(line) + [-1]*(max_len-len(line))
        sessions.append(tuple(line))
        labels.append(0)
    for line in tqdm(abnormal_data, "abnormal:"):
        line = list(line) + [-1]*(max_len-len(line))
        sessions.append(tuple(line))
        labels.append(1)
    print('Number of sessions({}): {}'.format(file_dir, len(sessions)))
    print('Number of normal sessions: {}'.format(len(normal_data)))
    print('Number of abnormal sessions: {}'.format(len(abnormal_data)))
    train_x, test_x, train_y, test_y = train_test_split(sessions, labels,test_size=0.2 )
    train_data = TensorDataset(torch.tensor(train_x, dtype=torch.float), torch.tensor(train_y))
    train_data= DataLoader(train_data, batch_size = batch_size, shuffle = True)
    test_data = TensorDataset(torch.tensor(test_x, dtype=torch.float), torch.tensor(test_y))
    test_data = DataLoader(test_data, batch_size = batch_size)
    return train_data,test_data,train_x,train_y,test_x,test_y

In [4]:
class Model(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_keys):
        super(Model, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True,bidirectional=True)
        self.fc = nn.Linear(2*hidden_size, num_keys)

    def forward(self, x):
        h0 = torch.zeros(2*self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(2*self.num_layers, x.size(0), self.hidden_size).to(device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [18]:
train_data

<torch.utils.data.dataloader.DataLoader at 0x20f40cd24a8>

In [26]:
train_data,test_data,train_x,train_y,test_x,test_y= generate_dataset(file_dir)

normal:: 100%|████████████████████████████████████████████████████████████████| 14159/14159 [00:00<00:00, 90560.94it/s]
abnormal::   0%|                                                                              | 0/4109 [00:00<?, ?it/s]

50


abnormal:: 100%|███████████████████████████████████████████████████████████████| 4109/4109 [00:00<00:00, 146176.84it/s]


Number of sessions(data_supervised): 18268
Number of normal sessions: 14159
Number of abnormal sessions: 4109


In [9]:
model = Model(input_size, hidden_size, num_layers, num_classes).to(device)

In [11]:
# writer = Summary# writer(log_dir='log/' + log)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [21]:
total_step = len(train_data)
start_time = time.time()
num_epochs=20
for epoch in range(num_epochs):  # Loop over the dataset multiple times
    train_loss = 0
    for step, (seq, label) in enumerate(train_data):
        # Forward pass
        seq = seq.clone().detach().view(-1, seq.shape[1], input_size).to(device)
        output = model(seq)
        loss = criterion(output, label.to(device))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
#         writer.add_graph(model, seq)
    print('Epoch [{}/{}], train_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / total_step))
#     writer.add_scalar('train_loss'traabsrain_loss / total_step, epoch + 1)
elapsed_time = time.time() - start_time
print('elapsed_time: {:.3f}s'.format(elapsed_time))

Epoch [1/20], train_loss: 0.0952
Epoch [2/20], train_loss: 0.0925
Epoch [3/20], train_loss: 0.1001
Epoch [4/20], train_loss: 0.0613
Epoch [5/20], train_loss: 0.0557
Epoch [6/20], train_loss: 0.0695
Epoch [7/20], train_loss: 0.1290
Epoch [8/20], train_loss: 0.0575
Epoch [9/20], train_loss: 0.0612
Epoch [10/20], train_loss: 0.0603
Epoch [11/20], train_loss: 0.0531
Epoch [12/20], train_loss: 0.1004
Epoch [13/20], train_loss: 0.0504
Epoch [14/20], train_loss: 0.0482
Epoch [15/20], train_loss: 0.0527
Epoch [16/20], train_loss: 0.0482
Epoch [17/20], train_loss: 0.0530
Epoch [18/20], train_loss: 0.0551
Epoch [19/20], train_loss: 0.0462
Epoch [20/20], train_loss: 0.0477
elapsed_time: 212.323s


In [27]:
def accuracy(y_pred, y_true):
    return (np.argmax(y_pred.cpu().numpy(),1) == y_true.numpy()).astype('int').mean()

In [28]:
train_x = torch.tensor(train_x, dtype=torch.float).reshape(-1,50,1)
test_x = torch.tensor(test_x, dtype=torch.float).reshape(-1,50,1)
train_y = torch.tensor(train_y)
test_y = torch.tensor(test_y)

In [31]:
train_output.shape

torch.Size([6468, 2])

In [32]:
with torch.no_grad():
    for step, (seq, label) in enumerate(train_data):
        seq = seq.clone().detach().view(-1, seq.shape[1], input_size).to(device)
        train_output = model(seq.to(device))
        epoch_accuracy = accuracy(train_output, label)
#         epoch_loss = criterion(train_output, train_y.to(device)).data
        test_output = model(test_x.to(device))
        epoch_test_accuracy = accuracy(test_output, test_y)
        epoch_test_loss = criterion(test_output, test_y.to(device)).data
        print('epoch: ', epoch, 'loss: ', round(epoch_loss.item(), 3), 'accuracy: ', round(epoch_accuracy.item(), 3),
              'test_loss: ', round(epoch_test_loss.item(), 3), 'test_accuracy: ', round(epoch_test_accuracy.item(), 3))


epoch:  19 loss:  0.045 accuracy:  0.98 test_loss:  0.06 test_accuracy:  0.985
epoch:  19 loss:  0.045 accuracy:  1.0 test_loss:  0.06 test_accuracy:  0.985
epoch:  19 loss:  0.045 accuracy:  1.0 test_loss:  0.06 test_accuracy:  0.985
epoch:  19 loss:  0.045 accuracy:  0.98 test_loss:  0.06 test_accuracy:  0.985
epoch:  19 loss:  0.045 accuracy:  1.0 test_loss:  0.06 test_accuracy:  0.985
epoch:  19 loss:  0.045 accuracy:  0.96 test_loss:  0.06 test_accuracy:  0.985
epoch:  19 loss:  0.045 accuracy:  0.96 test_loss:  0.06 test_accuracy:  0.985
epoch:  19 loss:  0.045 accuracy:  1.0 test_loss:  0.06 test_accuracy:  0.985
epoch:  19 loss:  0.045 accuracy:  0.96 test_loss:  0.06 test_accuracy:  0.985
epoch:  19 loss:  0.045 accuracy:  0.98 test_loss:  0.06 test_accuracy:  0.985
epoch:  19 loss:  0.045 accuracy:  1.0 test_loss:  0.06 test_accuracy:  0.985
epoch:  19 loss:  0.045 accuracy:  0.98 test_loss:  0.06 test_accuracy:  0.985
epoch:  19 loss:  0.045 accuracy:  0.96 test_loss:  0.06 

KeyboardInterrupt: 

In [29]:
with torch.no_grad():
    
    train_output = model(train_x.to(device))
    epoch_accuracy = accuracy(train_output, train_y)
    epoch_loss = criterion(train_output, train_y.to(device)).data
    test_output = model(test_x.to(device))
    epoch_test_accuracy = accuracy(test_output, test_y)
    epoch_test_loss = criterion(test_output, test_y.to(device)).data
    print('epoch: ', epoch, 'loss: ', round(epoch_loss.item(), 3), 'accuracy: ', round(epoch_accuracy.item(), 3),
          'test_loss: ', round(epoch_test_loss.item(), 3), 'test_accuracy: ', round(epoch_test_accuracy.item(), 3))

RuntimeError: CUDA out of memory. Tried to allocate 3.83 GiB (GPU 0; 4.00 GiB total capacity; 1.72 GiB already allocated; 252.77 MiB free; 2.48 GiB reserved in total by PyTorch)