## Normal Autoencoder

The idea of this approach is that the model has to be able to represent the same log after being transfered into a low dimension. Then, if we train the model with the normal logs (the majority of logs in our dataset) we would have trained the model to represent the normal logs. Therefore, if an anomalous log is presented, the model would work badly, and here is where we can identify the anomaly.

In [1]:
# imports:

import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


from skipgram import *
import os

from torch.nn.utils.rnn import pad_sequence
from torch.nn.functional import cosine_similarity

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
ROOT_DIR = os.path.dirname(os.path.abspath(""))

### Step 1: Data preprocessing

In [2]:
# Load the CSV file into a pandas DataFrame
logs_df = pd.read_csv('../data/sitges_access_prepared_whole_set_but_last.csv')

### Step 2: Dataset


In [3]:
# Split the data into train, validation, and test sets
X_train, X_temp = train_test_split(logs_df, test_size=0.4, random_state=42)
X_val, X_test = train_test_split(X_temp, test_size=0.5, random_state=42)

In [7]:
class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe.values.astype(np.float32) # Assuming dataframe is a pandas DataFrame

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample)

In [8]:
train_dataset = CustomDataset(X_train)
train_loader = DataLoader(train_dataset, batch_size=1000, shuffle=False)

val_dataset = CustomDataset(X_val)
val_loader = DataLoader(val_dataset, batch_size=1000, shuffle=False)

test_dataset = CustomDataset(X_test)
test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False)

### Step 3: Model

In [9]:
# Define Autoencoder Model
class LogAnomalyDetector(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LogAnomalyDetector, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size[0]),
            nn.ReLU(),
            nn.Linear(hidden_size[0], hidden_size[1]),
            nn.ReLU(),
            nn.Linear(hidden_size[1], hidden_size[2])
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_size[2], hidden_size[1]),
            nn.ReLU(),
            nn.Linear(hidden_size[1], hidden_size[0]),
            nn.ReLU(),
            nn.Linear(hidden_size[0], input_size)
        )

    def forward(self, x):
        emb = self.encoder(x)
        #print("Latent space:", emb)
        x = self.decoder(emb)
        return x

input_size=len(logs_df.columns)
hidden_size=[50, 20, 5] 

# Create an instance of the model
model = LogAnomalyDetector(input_size=input_size, hidden_size=hidden_size)
model.to(device)

# Print model summary
print(model)

LogAnomalyDetector(
  (encoder): Sequential(
    (0): Linear(in_features=115, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=5, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=5, out_features=20, bias=True)
    (1): ReLU()
    (2): Linear(in_features=20, out_features=50, bias=True)
    (3): ReLU()
    (4): Linear(in_features=50, out_features=115, bias=True)
  )
)


### Step 4: Training

In [10]:
# Function for training the model
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, inputs in tqdm(enumerate(train_loader)):
            inputs = inputs.to(device)

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), inputs.squeeze())

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader)}")



In [11]:
# Define the criterion (loss function)
criterion = nn.MSELoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adjust learning rate as needed

# Train the model
num_epochs = 20

train_loss = train_model(model, train_loader, criterion, optimizer, num_epochs=num_epochs)

534it [00:05, 92.27it/s]


Epoch [1/20], Loss: 0.08764286366490166


534it [00:06, 86.35it/s] 


Epoch [2/20], Loss: 0.049868681297152676


534it [00:05, 104.31it/s]


Epoch [3/20], Loss: 0.047958169718471805


534it [00:05, 90.56it/s]


Epoch [4/20], Loss: 0.04538564170511921


534it [00:05, 96.07it/s] 


Epoch [5/20], Loss: 0.043962398090500954


534it [00:05, 106.57it/s]


Epoch [6/20], Loss: 0.04331939338884327


534it [00:06, 83.38it/s] 


Epoch [7/20], Loss: 0.04264342656519529


534it [00:05, 105.40it/s]


Epoch [8/20], Loss: 0.04213571413123652


534it [00:06, 88.39it/s]


Epoch [9/20], Loss: 0.041813429047384956


534it [00:05, 96.86it/s] 


Epoch [10/20], Loss: 0.04158894991160332


534it [00:05, 105.04it/s]


Epoch [11/20], Loss: 0.04142337720771407


534it [00:06, 85.38it/s] 


Epoch [12/20], Loss: 0.04129487549544274


534it [00:05, 100.27it/s]


Epoch [13/20], Loss: 0.04118955597867457


534it [00:06, 82.35it/s]


Epoch [14/20], Loss: 0.04109313778495521


534it [00:05, 102.32it/s]


Epoch [15/20], Loss: 0.04100658329489749


534it [00:05, 97.75it/s]


Epoch [16/20], Loss: 0.040924259318450416


534it [00:06, 86.61it/s] 


Epoch [17/20], Loss: 0.04084469608209106


534it [00:05, 104.16it/s]


Epoch [18/20], Loss: 0.04076496230556947


534it [00:06, 82.06it/s] 


Epoch [19/20], Loss: 0.04067158990640765


534it [00:05, 104.61it/s]

Epoch [20/20], Loss: 0.040562229740262475





### Step 5: Test

In [12]:
# Test the model
def test_model(model, test_loader, criterion):

    model.eval()  # Set model to evaluation mode

    test_loss = 0.0

    with torch.no_grad():
        for inputs in test_loader:
            inputs = inputs.to(device)

            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), inputs.squeeze())

            test_loss += loss.item()

    avg_test_loss = test_loss / len(test_loader)
    print(f"Average Test Loss: {avg_test_loss}")

In [13]:
val_loss = test_model(model, val_loader, criterion)

Average Test Loss: 0.04044552532474646


In [14]:
test_loss = test_model(model, test_loader, criterion)

Average Test Loss: 0.04053921034831679


### Step 6: Upload the model

In [20]:
# Save the model state dictionary 
torch.save(model.state_dict(), '../models/normalAutoencoder.pt')