##Simple Example of LSTM

The idea of this approach is that the model has to be able to represent the same log after being transfered into a low dimension. Then, if we train the model with the normal logs (the majority of logs in our dataset) we would have trained the model to represent the normal logs. Therefore, if an anomalous log is presented, the model would work badly, and here is where we can identify the anomaly.

In [25]:
# imports:

import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


###Step 1: Preprocessing

In [26]:
# Assuming you have a CSV file named 'logs.csv' with columns ['date', 'petition', 'URL', 'status', 'referer', 'user-agent']
# Load the CSV file into a pandas DataFrame
logs_df = pd.read_csv('sitges_access_prepared.csv')
logs_df.head()

Unnamed: 0,bytes,elapsed,IP_oct0,IP_oct1,IP_oct2,IP_oct3,month_sin,month_cos,day_sin,day_cos,...,petition_-,petition_GET,petition_HEAD,petition_POST,petition_other,status_1,status_2,status_3,status_4,status_5
0,-0.597348,0.0,0.427313,-0.342599,-0.744068,1.620095,0.5,0.866025,-0.998717,-0.050649,...,0,1,0,0,0,False,1,0,0,
1,0.418239,1.94591,-1.820773,-1.11011,2.453028,-0.878825,0.5,0.866025,-0.998717,-0.050649,...,0,1,0,0,0,False,0,0,1,
2,1.804745,6.2106,-1.283917,1.65293,-0.635692,-0.46464,0.5,0.866025,-0.998717,-0.050649,...,0,1,0,0,0,False,1,0,0,
3,0.808723,0.0,-1.065819,0.24071,0.104878,1.482033,0.5,0.866025,-0.998717,-0.050649,...,0,1,0,0,0,False,1,0,0,
4,1.747754,0.0,-0.981935,1.00822,2.145962,-1.582941,0.5,0.866025,-0.998717,-0.050649,...,0,1,0,0,0,False,1,0,0,


In [27]:
print(logs_df[["status_1", "status_5"]].head())

   status_1  status_5
0     False       NaN
1     False       NaN
2     False       NaN
3     False       NaN
4     False       NaN


In [28]:
# Divide the data into features (X)
print(logs_df.columns)

# Convert boolean column to integers
logs_df['status_1'] = logs_df['status_1'].astype(int)

# Replace NaN values with 0 in the 'column_with_nan'
logs_df['status_5'] = logs_df['status_5'].fillna(0)

print(logs_df[["status_1", "status_5"]].head())

#logs_df = logs_df.drop(["status_1", "status_5"], axis=1)
# Split the data into train, validation, and test sets
X_train, X_test = train_test_split(logs_df, test_size=0.2, random_state=42)
X_train.head()

Index(['bytes', 'elapsed', 'IP_oct0', 'IP_oct1', 'IP_oct2', 'IP_oct3',
       'month_sin', 'month_cos', 'day_sin', 'day_cos', 'weekday_sin',
       'weekday_cos', 'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos',
       'petition_-', 'petition_GET', 'petition_HEAD', 'petition_POST',
       'petition_other', 'status_1', 'status_2', 'status_3', 'status_4',
       'status_5'],
      dtype='object')
   status_1  status_5
0         0       0.0
1         0       0.0
2         0       0.0
3         0       0.0
4         0       0.0


Unnamed: 0,bytes,elapsed,IP_oct0,IP_oct1,IP_oct2,IP_oct3,month_sin,month_cos,day_sin,day_cos,...,petition_-,petition_GET,petition_HEAD,petition_POST,petition_other,status_1,status_2,status_3,status_4,status_5
41781,-2.630782,0.0,-1.770443,-0.127696,1.224765,-0.920244,0.5,0.866025,-0.998717,-0.050649,...,0,1,0,0,0,0,0,1,0,0.0
89004,1.818868,0.0,1.165491,-1.278962,-0.057686,0.211864,0.5,0.866025,-0.998717,-0.050649,...,0,1,0,0,0,0,1,0,0,0.0
16853,0.808723,0.0,-0.998712,-1.555266,2.200151,-0.851213,0.5,0.866025,-0.998717,-0.050649,...,0,1,0,0,0,0,1,0,0,0.0
43588,-0.440833,0.0,0.880286,0.194659,0.303568,1.56487,0.5,0.866025,-0.998717,-0.050649,...,0,1,0,0,0,0,1,0,0,0.0
38249,0.67289,7.798933,-1.720113,-0.987308,1.116389,-1.348236,0.5,0.866025,-0.998717,-0.050649,...,0,1,0,0,0,0,1,0,0,0.0


In [29]:
class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe.values.astype(np.float32) # Assuming dataframe is a pandas DataFrame

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return torch.tensor(sample)

In [30]:
train_dataset = CustomDataset(X_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = CustomDataset(X_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

###Step 3: Model

In [31]:
# Define LSTM Autoencoder Model
class LogAnomalyDetector(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(LogAnomalyDetector, self).__init__()
        self.encoder = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.decoder = nn.LSTM(input_size=hidden_size, hidden_size=input_size, num_layers=num_layers, batch_first=True)

    def forward(self, x):
        _, (hidden, cell) = self.encoder(x)
        output, _ = self.decoder(hidden)
        return output[-1] #Return just the last layer

input_size=len(logs_df.columns)
hidden_size=input_size//4
num_layers=3

# Create an instance of the model
model = LogAnomalyDetector(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)

# Print model summary
print(model)

LogAnomalyDetector(
  (encoder): LSTM(26, 6, num_layers=3, batch_first=True)
  (decoder): LSTM(6, 26, num_layers=3, batch_first=True)
)


###Step 4: Train the model

In [36]:
# Function for training the model
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, inputs in tqdm(enumerate(train_loader, 0)):
            inputs = inputs.to(device)

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), inputs.squeeze())

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader)}")



In [33]:
# Sample log entry
log_entry = [-2.630782, -2.630782, -2.630782, -2.630782, -2.630782, -2.630782, 0.000000, -1.770443, -0.127696, 1.224765, -0.920244, 0.5, 0.866025, -0.998717, -0.050649, 0, 1, 0, 0, 0, 0, 1, 0]

# Convert the log entry to a tensor
log_tensor = torch.tensor(log_entry).unsqueeze(0)  # Add batch dimension

# Pass the log tensor through the model
output_log_tensor = model(log_tensor)

# Convert the output tensor back to a numpy array
#output_log_entry = output_log_tensor.squeeze().detach().numpy()

# Output the reconstructed log entry
print("Reconstructed Log Entry:")
print(output_log_tensor)
print(log_tensor)

Reconstructed Log Entry:
tensor([-0.0170, -0.0134,  0.0100,  0.0140,  0.0920,  0.0436, -0.0111,  0.0225,
         0.0649,  0.0248,  0.0347,  0.0638,  0.0835,  0.1617,  0.1700,  0.0633,
        -0.0106, -0.0038,  0.0894,  0.0599,  0.0501, -0.0260,  0.1218,  0.1616,
        -0.1232, -0.1718], grad_fn=<SelectBackward0>)
tensor([[-2.6308, -2.6308, -2.6308, -2.6308, -2.6308, -2.6308,  0.0000, -1.7704,
         -0.1277,  1.2248, -0.9202,  0.5000,  0.8660, -0.9987, -0.0506,  0.0000,
          1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000]])


In [37]:
# Define the criterion (loss function)
criterion = nn.MSELoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.01)  # Adjust learning rate as needed

# Train the model
num_epochs = 20

train_loss = train_model(model, train_loader, criterion, optimizer, num_epochs=num_epochs)

3325it [00:12, 266.10it/s]


Epoch [1/20], Loss: 0.39923158237360473


3325it [00:12, 276.69it/s]


Epoch [2/20], Loss: 0.3992271705095033


3325it [00:13, 252.18it/s]


Epoch [3/20], Loss: 0.3993775367871263


3325it [00:12, 272.59it/s]


Epoch [4/20], Loss: 0.399202653389228


3325it [00:12, 274.61it/s]


Epoch [5/20], Loss: 0.3991735292511775


3325it [00:13, 254.29it/s]


Epoch [6/20], Loss: 0.3993849283008647


3325it [00:12, 267.31it/s]


Epoch [7/20], Loss: 0.3992680248550903


3325it [00:12, 266.71it/s]


Epoch [8/20], Loss: 0.3993122111899512


3325it [00:12, 270.45it/s]


Epoch [9/20], Loss: 0.39924472778363335


3325it [00:15, 219.10it/s]


Epoch [10/20], Loss: 0.39927182684267376


3325it [00:13, 253.33it/s]


Epoch [11/20], Loss: 0.39916962575643583


3325it [00:12, 265.45it/s]


Epoch [12/20], Loss: 0.3991554999082608


3325it [00:15, 220.87it/s]


Epoch [13/20], Loss: 0.3993976580738125


3325it [00:14, 229.18it/s]


Epoch [14/20], Loss: 0.3990990041476443


3325it [00:13, 247.86it/s]


Epoch [15/20], Loss: 0.3992182392867884


3325it [00:14, 232.45it/s]


Epoch [16/20], Loss: 0.3992101382985151


3325it [00:13, 255.73it/s]


Epoch [17/20], Loss: 0.39914701779982203


3325it [00:13, 239.42it/s]


Epoch [18/20], Loss: 0.39920004202907244


3325it [00:13, 244.52it/s]


Epoch [19/20], Loss: 0.3992282909782309


3325it [00:12, 270.15it/s]

Epoch [20/20], Loss: 0.3991897730137173





### Step 5: Test the model

In [38]:
# Test the model
def test_model(model, test_loader, criterion):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()  # Set model to evaluation mode
    model.to(device)
    test_loss = 0.0

    with torch.no_grad():
        for inputs in test_loader:
            inputs = inputs.to(device)

            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), inputs.squeeze())

            test_loss += loss.item()

    avg_test_loss = test_loss / len(test_loader)
    print(f"Average Test Loss: {avg_test_loss}")

In [39]:
test_loss = test_model(model, test_loader, criterion)

Average Test Loss: 0.4015493282976632


  return F.mse_loss(input, target, reduction=self.reduction)


In [None]:
# To get the scoring system, compute the similarity between the output vector and the input vector. (This is just an idea)
# An anomaly should give low similarity (This is if the model performs accuratelly the task of autoencode the same vector)