## LSTM Autoencoder

The idea of this approach is that the model has to be able to represent a sequence of logs after being transfered into a low dimension. Then, if we train the model with the normal logs (the majority of logs in our dataset) we would have trained the model to represent the normal logs. Therefore, if an anomalous sequence of logs is presented, the model would work badly, and here is where we can identify the anomaly.

In [1]:
# Imports:

import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from skipgram import *
from torch.nn.utils.rnn import pad_sequence

### Step 1: Data preprocessing

In [2]:
logs_df = pd.read_csv('../data/sitges_access_prepared_whole_set_but_last.csv')

### Step 2: Dataset

In [3]:
# Custom Dataset class to create sequences on-the-fly
class SequenceDataset(Dataset):
    def __init__(self, data, seq_length):
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data) - self.seq_length + 1

    def __getitem__(self, idx):
        sequence = self.data[idx:idx + self.seq_length]
        return torch.tensor(sequence, dtype=torch.float32)

In [4]:
# Function to create sequences
def create_sequences(data, seq_length):
    sequences = []
    for i in range(len(data) - seq_length + 1):
        sequence = data[i:i+seq_length]
        sequences.append(sequence)
    return np.array(sequences)

In [5]:
# Split the data into train, validation, and test sets
X_train, X_temp = train_test_split(logs_df, test_size=0.4, random_state=42)
X_val, X_test = train_test_split(X_temp, test_size=0.5, random_state=42)

sequence_length=20

train_dataset = SequenceDataset(X_train, sequence_length)
train_loader = DataLoader(train_dataset, batch_size=1000, shuffle=False)

val_dataset = SequenceDataset(X_val, sequence_length)
val_loader = DataLoader(val_dataset, batch_size=1000, shuffle=False)

test_dataset = SequenceDataset(X_test, sequence_length)
test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False)


### Step 3: Model

In [None]:
class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super(LSTMAutoencoder, self).__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.decoder = nn.LSTM(hidden_dim, input_dim, num_layers, batch_first=True)
    
    def forward(self, x):
        batch_size, seq_len, _ = x.size()
        # Encoder
        _, (hidden, _) = self.encoder(x)
        # Repeat the last hidden state seq_len times
        hidden = hidden[-1].repeat(seq_len, 1, 1).transpose(0, 1)
        # Decoder
        decoded, _ = self.decoder(hidden)
        return decoded

# Example parameters (these should be tuned according to your dataset)
input_dim = len(logs_df.columns)  # Number of features in your log data
hidden_dim = input_dim//4
num_layers = 3

model = LSTMAutoencoder(input_dim, hidden_dim, num_layers)

# Print model summary
print(model)

LSTMAutoencoder(
  (encoder): LSTM(115, 28, num_layers=3, batch_first=True)
  (decoder): LSTM(28, 115, num_layers=3, batch_first=True)
)


### Step 4: Training

In [None]:
# Function for training the model
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    #model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch in tqdm(train_loader):
            #batch=batch.to(device)
            
            optimizer.zero_grad()

            output = model(batch)
            loss = criterion(output, batch)
            
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader)}")

In [None]:
# Training parameters
num_epochs = 30
learning_rate = 0.001

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

train_loss = train_model(model, train_loader, criterion, optimizer, num_epochs)

  return torch.tensor(sample)
100%|██████████| 80/80 [00:21<00:00,  3.80it/s]


Epoch [1/20], Loss: 0.15259287040680647


100%|██████████| 80/80 [00:20<00:00,  3.86it/s]


Epoch [2/20], Loss: 0.12568646743893624


100%|██████████| 80/80 [00:20<00:00,  3.81it/s]


Epoch [3/20], Loss: 0.12557480279356242


100%|██████████| 80/80 [00:21<00:00,  3.80it/s]


Epoch [4/20], Loss: 0.1255526153370738


100%|██████████| 80/80 [00:21<00:00,  3.65it/s]


Epoch [5/20], Loss: 0.12554365592077374


100%|██████████| 80/80 [00:25<00:00,  3.15it/s]


Epoch [6/20], Loss: 0.1255388718098402


100%|██████████| 80/80 [00:20<00:00,  3.83it/s]


Epoch [7/20], Loss: 0.12553582303225994


100%|██████████| 80/80 [00:20<00:00,  3.87it/s]


Epoch [8/20], Loss: 0.1255336867645383


100%|██████████| 80/80 [00:22<00:00,  3.63it/s]


Epoch [9/20], Loss: 0.1255320582538843


100%|██████████| 80/80 [00:22<00:00,  3.53it/s]


Epoch [10/20], Loss: 0.12553033782169223


100%|██████████| 80/80 [00:22<00:00,  3.62it/s]


Epoch [11/20], Loss: 0.12552684564143418


100%|██████████| 80/80 [00:22<00:00,  3.63it/s]


Epoch [12/20], Loss: 0.12534217918291687


100%|██████████| 80/80 [00:22<00:00,  3.55it/s]


Epoch [13/20], Loss: 0.12339397510513664


100%|██████████| 80/80 [00:22<00:00,  3.57it/s]


Epoch [14/20], Loss: 0.12166460892185568


100%|██████████| 80/80 [00:21<00:00,  3.70it/s]


Epoch [15/20], Loss: 0.12052223645150661


100%|██████████| 80/80 [00:21<00:00,  3.73it/s]


Epoch [16/20], Loss: 0.11907444037497043


100%|██████████| 80/80 [00:21<00:00,  3.71it/s]


Epoch [17/20], Loss: 0.11724910149350762


100%|██████████| 80/80 [00:22<00:00,  3.63it/s]


Epoch [18/20], Loss: 0.11506083672866226


100%|██████████| 80/80 [00:22<00:00,  3.63it/s]


Epoch [19/20], Loss: 0.11345840450376272


100%|██████████| 80/80 [00:21<00:00,  3.70it/s]

Epoch [20/20], Loss: 0.11206605564802885





### Step 5: Test

In [None]:
# Test the model
def test_model(model, test_loader, criterion):

    model.eval()  # Set model to evaluation mode

    test_loss = 0.0

    with torch.no_grad():
        for inputs in test_loader:
            #inputs = inputs.to(device)

            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), inputs.squeeze())

            test_loss += loss.item()

    avg_test_loss = test_loss / len(test_loader)
    print(f"Average Test Loss: {avg_test_loss}")

In [None]:
val_loss = test_model(model, val_loader, criterion)

  return torch.tensor(sample)


Average Test Loss: 0.11228703403914417


In [None]:
test_loss = test_model(model, test_loader, criterion)

  return torch.tensor(sample)


Average Test Loss: 0.11038957784573238


### Step 6: Upload the model

In [None]:
# Save the model state dictionary 
torch.save(model.state_dict(), '../models/normalAutoencoder.pt')