# Set Up Environment and Read Data

In [None]:
import torch
import numpy as np
import pandas as pd
import pickle
import copy
from tqdm import trange,tqdm
import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, classification_report, confusion_matrix

## Download the dataset

In [None]:
# Fetch the labeled-anomalies CSV from the telemanom GitHub repository into the working directory.
!wget https://raw.githubusercontent.com/khundman/telemanom/master/labeled_anomalies.csv

--2025-04-21 14:04:03--  https://raw.githubusercontent.com/khundman/telemanom/master/labeled_anomalies.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3956 (3.9K) [text/plain]
Saving to: ‘labeled_anomalies.csv’


2025-04-21 14:04:04 (32.3 MB/s) - ‘labeled_anomalies.csv’ saved [3956/3956]



In [None]:
# Drive folder used by the shell cells below: kaggle.json is read from it and the
# Kaggle dataset is downloaded and unpacked into it.
%env DRIVE_PATH=/content/drive/MyDrive/Colab Notebooks/ELTE/DSLAB/

env: DRIVE_PATH=/content/drive/MyDrive/Colab Notebooks/ELTE/DSLAB/


In [None]:
# Create the Kaggle config directory; -p makes the cell safe to re-run when the
# directory already exists (and creates missing parent directories).
!mkdir -p "/root/.config/kaggle"

In [None]:
# Copy the Kaggle API token from the Drive folder into the config directory,
# then restrict it to owner read/write only.
!cp "$DRIVE_PATH/kaggle.json" "/root/.config/kaggle"
!chmod 600 "/root/.config/kaggle/kaggle.json"

In [None]:
# Inside the Drive folder: download the SMAP/MSL archive, unzip it (overwriting existing
# files), delete the zip, and flatten the nested data/data directory into data/.
# NOTE(review): the later cells read from ServerMachineDataset/, not from this archive —
# confirm this download is still needed.
!cd "$DRIVE_PATH" && kaggle datasets download -d patrickfleith/nasa-anomaly-detection-dataset-smap-msl && mv nasa-anomaly-detection-dataset-smap-msl.zip data.zip && unzip -o data.zip && rm data.zip && mv data/data tmp && rm -r data && mv tmp data

Dataset URL: https://www.kaggle.com/datasets/patrickfleith/nasa-anomaly-detection-dataset-smap-msl
License(s): copyright-authors
Archive:  data.zip
  inflating: data/data/2018-05-19_15.00.10/models/A-1.h5  
  inflating: data/data/2018-05-19_15.00.10/models/A-2.h5  
  inflating: data/data/2018-05-19_15.00.10/models/A-3.h5  
  inflating: data/data/2018-05-19_15.00.10/models/A-4.h5  
  inflating: data/data/2018-05-19_15.00.10/models/A-5.h5  
  inflating: data/data/2018-05-19_15.00.10/models/A-6.h5  
  inflating: data/data/2018-05-19_15.00.10/models/A-7.h5  
  inflating: data/data/2018-05-19_15.00.10/models/A-8.h5  
  inflating: data/data/2018-05-19_15.00.10/models/A-9.h5  
  inflating: data/data/2018-05-19_15.00.10/models/B-1.h5  
  inflating: data/data/2018-05-19_15.00.10/models/C-1.h5  
  inflating: data/data/2018-05-19_15.00.10/models/C-2.h5  
  inflating: data/data/2018-05-19_15.00.10/models/D-1.h5  
  inflating: data/data/2018-05-19_15.00.10/models/D-11.h5  
  inflating: data/data/20

## Set up the dataset

In [None]:
# Locations of the ServerMachineDataset files for one machine.
DRIVE = "drive/MyDrive/Colab Notebooks/ELTE/DSLAB/ServerMachineDataset/"
MACHINE = "machine-1-1.txt"
TRAIN_DATASET = f"{DRIVE}train/{MACHINE}"
TEST_DATASET = f"{DRIVE}test/{MACHINE}"
TEST_LABEL_DATASET = f"{DRIVE}test_label/{MACHINE}"

# All three files are read without a header row, so columns are numbered positionally.
metric, metric_test = (
    pd.read_csv(csv_path, header=None) for csv_path in (TRAIN_DATASET, TEST_DATASET)
)
# Column 0 of the label file, flattened to a NumPy array.
label_frame = pd.read_csv(TEST_LABEL_DATASET, header=None)
true_anomalies = label_frame[0].to_numpy()

In [None]:
# Display the training metrics frame loaded above.
metric

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
0,0.032258,0.039195,0.027871,0.024390,0.0,0.915385,0.343691,0.0,0.020011,0.000122,...,0.0,0.004298,0.029993,0.022131,0.000000,0.000045,0.034677,0.034747,0.0,0.0
1,0.043011,0.048729,0.033445,0.025552,0.0,0.915385,0.344633,0.0,0.019160,0.001722,...,0.0,0.004298,0.030041,0.028821,0.000000,0.000045,0.035763,0.035833,0.0,0.0
2,0.043011,0.034958,0.032330,0.025552,0.0,0.915385,0.344633,0.0,0.020011,0.000122,...,0.0,0.004298,0.026248,0.021101,0.000000,0.000045,0.033012,0.033082,0.0,0.0
3,0.032258,0.028602,0.030100,0.024390,0.0,0.912821,0.342750,0.0,0.021289,0.000000,...,0.0,0.004298,0.030169,0.025733,0.000000,0.000022,0.035112,0.035182,0.0,0.0
4,0.032258,0.019068,0.026756,0.023229,0.0,0.912821,0.342750,0.0,0.018734,0.000000,...,0.0,0.004298,0.027240,0.022645,0.000000,0.000034,0.033447,0.033517,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28474,0.075269,0.046610,0.071349,0.076655,0.0,0.928205,0.269303,0.0,0.031649,0.000244,...,0.0,0.008596,0.068980,0.049408,0.000386,0.000034,0.064504,0.064572,0.0,0.0
28475,0.086022,0.070975,0.075808,0.077816,0.0,0.930769,0.269303,0.0,0.029946,0.000244,...,0.0,0.008596,0.073029,0.055584,0.000386,0.000034,0.067690,0.067757,0.0,0.0
28476,0.086022,0.065678,0.073579,0.076655,0.0,0.935897,0.270245,0.0,0.030372,0.000244,...,0.0,0.008596,0.070516,0.048893,0.000386,0.000034,0.064866,0.064934,0.0,0.0
28477,0.086022,0.056144,0.068004,0.074332,0.0,0.933333,0.271186,0.0,0.032643,0.000244,...,0.0,0.008596,0.070308,0.055069,0.000386,0.000045,0.067111,0.067178,0.0,0.0


# Preprocess the Dataset

In [None]:
# Rescale every input metric with a MinMaxScaler fitted on the training frame.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(metric)

# Keep the scaled data both as a float32 tensor and as a DataFrame that
# preserves the original index and column labels.
metric_tensor = torch.tensor(scaled_values, dtype=torch.float32)
metric_scaled = pd.DataFrame(
    scaled_values,
    index=metric.index,
    columns=metric.columns,
)
metric_scaled

host,R740-3-1,R740-3-1,R740-3-1,R740-3-1,R740-3-1,R740-3-1,R740-3-1,R740-3-1,R740-3-1,R740-3-1,...,R740-3-5,R740-3-5,R740-3-5,R740-3-5,R740-3-5,R740-3-5,R740-3-5,R740-3-5,R740-3-5,R740-3-5
metric,cpu.mean_usage_idle,cpu.mean_usage_iowait,cpu.mean_usage_softirq,cpu.mean_usage_system,cpu.mean_usage_user,disk.mean_used_percent,diskio.io_time,diskio.mean_iops_in_progress,diskio.read_speed,diskio.write_speed,...,mem.mean_free,net.recieved,net.sent,netstat.mean_tcp_time_wait,processes.mean_blocked,processes.mean_running,processes.mean_total,system.mean_load1,system.mean_load15,system.mean_load5
0,0.950674,0.196214,0.038462,0.003521,0.001633,0.024794,0.139161,0.004854,0.000000,0.000203,...,0.026584,0.000183,0.098977,0.334012,0.086957,0.0,0.526316,0.362028,0.459178,0.429102
1,0.977175,0.088424,0.038382,0.002268,0.001958,0.024931,0.109986,0.002555,0.000000,0.000200,...,0.055531,0.000192,0.098971,0.329939,0.260870,0.0,0.526316,0.362323,0.466216,0.431143
2,0.189643,0.041885,0.077036,0.010180,0.799099,0.025091,0.124771,0.002555,0.001839,0.000221,...,0.054637,0.000267,0.098979,0.295316,0.173913,0.0,0.526316,0.378685,0.476351,0.438938
3,0.068136,0.005882,0.038393,0.008718,0.929881,0.025243,0.113868,0.002810,0.000022,0.000264,...,0.053539,0.000350,0.198149,0.407332,0.260870,0.0,0.513158,0.415979,0.490428,0.453601
4,0.068392,0.003118,0.038481,0.010484,0.930056,0.025374,0.079193,0.003577,0.000000,0.000197,...,0.039764,0.000086,0.099212,0.403259,0.130435,0.0,0.526316,0.415979,0.500563,0.460653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8635,0.993020,0.019076,0.038593,0.004625,0.002683,0.070769,0.016220,0.000000,0.000000,0.000148,...,0.015193,0.086387,0.066898,0.073320,0.000000,0.0,0.131579,0.174676,0.366554,0.201745
8636,0.996937,0.004855,0.046169,0.005066,0.002408,0.070892,0.007291,0.000000,0.000000,0.000052,...,0.014681,0.064340,0.004154,0.061100,0.000000,0.0,0.131579,0.175708,0.365991,0.203972
8637,0.993645,0.013547,0.069352,0.002353,0.004042,0.071016,0.011761,0.000000,0.000000,0.000131,...,0.019500,0.100352,0.084221,0.048880,0.000000,0.0,0.131579,0.179245,0.365428,0.206570
8638,0.992556,0.018716,0.077052,0.006154,0.003006,0.071135,0.005991,0.000000,0.000000,0.000418,...,0.011368,0.021138,0.191083,0.054990,0.000000,0.0,0.131579,0.176150,0.364302,0.207869


### Scaled

In [None]:
# Build train / validation / test dataloaders from the scaled metrics.

# Fill missing values: interpolate first, then back-fill whatever is still missing.
# Rebinding (instead of inplace=True) keeps the cell idempotent on re-run.
metric_scaled = metric_scaled.interpolate().bfill()
metric_tensor = metric_scaled.values

sequence_length = 30
# Overlapping sliding windows of shape (sequence_length, n_features), the same layout
# the non-scaled cell produces. The former reshape relied on NUM_OF_COMPONENTS, which
# is not defined anywhere in this notebook and raised a NameError.
n_windows = metric_tensor.shape[0] - sequence_length + 1
sequences = [metric_tensor[start:start + sequence_length] for start in range(n_windows)]

# 70% train, 30% held out; the held-out part is then split evenly into validation and test.
# The first split must produce temp_data, otherwise the second split has nothing to work on.
train_data, temp_data = train_test_split(sequences, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

batch_size = 32
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Non-Scaled

LSTM

In [None]:
# Build the train / validation / test dataloaders from the unscaled metrics.
sequence_length = 30
batch_size = 32

# Fill missing values in place (interpolate, then back-fill) and grab the raw arrays.
metric.interpolate(inplace=True)
metric.bfill(inplace=True)
metric_tensor = metric.values

metric_test.interpolate(inplace=True)
metric_test.bfill(inplace=True)
metric_test_tensor = metric_test.values

# Overlapping sliding windows over the training data, one per start position.
n_train_windows = metric_tensor.shape[0] - sequence_length + 1
sequences = [metric_tensor[start:start + sequence_length] for start in range(n_train_windows)]

# 70% of the training windows for fitting, 30% for validation.
train_data, val_data = train_test_split(sequences, test_size=0.3, random_state=42)

# The same windowing applied to the separate test file, kept in chronological order.
n_test_windows = metric_test_tensor.shape[0] - sequence_length + 1
test_sequences = [
    metric_test_tensor[start:start + sequence_length] for start in range(n_test_windows)
]

train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_sequences, batch_size=batch_size, shuffle=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Sanity check: shape of a single sliding window.
sequences[0].shape

(30, 38)

# Define the Network

## LSTM

In [None]:
class LSTMEncoder(nn.Module):
    """Encode a sequence into the mean and log-variance of a latent distribution.

    An LSTM reads the (batch-first) input; the final hidden state of its last
    layer is fed through two separate linear heads.
    """

    def __init__(self, input_dim, hidden_dim, latent_dim, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc_mean = nn.Linear(in_features=hidden_dim, out_features=latent_dim)
        self.fc_logvar = nn.Linear(in_features=hidden_dim, out_features=latent_dim)

    def forward(self, x):
        """Return ``(mean, logvar)`` derived from the last layer's final hidden state."""
        # final_hidden: (num_layers, batch, hidden_dim)
        _outputs, (final_hidden, _final_cell) = self.lstm(x)
        last_layer_state = final_hidden[-1]
        latent_mean = self.fc_mean(last_layer_state)
        latent_logvar = self.fc_logvar(last_layer_state)
        return latent_mean, latent_logvar


class LSTMDecoder(nn.Module):
    """Decode a latent vector into a sequence of ``sequence_length`` timesteps."""

    def __init__(self, latent_dim, hidden_dim, output_dim, sequence_length, num_layers=1):
        super().__init__()
        self.sequence_length = sequence_length
        self.latent_to_hidden = nn.Linear(in_features=latent_dim, out_features=hidden_dim)
        self.lstm = nn.LSTM(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.output_layer = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, z):
        """Project ``z``, repeat it along a new time axis, and run it through the LSTM."""
        projected = self.latent_to_hidden(z)
        # The same projected latent vector is the LSTM input at every timestep.
        per_step_input = projected.unsqueeze(1).repeat(1, self.sequence_length, 1)
        lstm_out, _state = self.lstm(per_step_input)
        return self.output_layer(lstm_out)


class LSTMVAE(nn.Module):
    """Variational autoencoder assembled from an LSTM encoder and an LSTM decoder."""

    def __init__(self, input_dim, hidden_dim, latent_dim, sequence_length, num_layers=1, device='cpu'):
        super().__init__()
        encoder = LSTMEncoder(input_dim, hidden_dim, latent_dim, num_layers)
        decoder = LSTMDecoder(latent_dim, hidden_dim, input_dim, sequence_length, num_layers)
        # Both halves are moved to the requested device on construction.
        self.encoder = encoder.to(device)
        self.decoder = decoder.to(device)

    def reparameterize(self, mean, logvar):
        """Sample ``mean + eps * std`` with ``eps`` drawn from a standard normal."""
        std = (0.5 * logvar).exp()
        noise = torch.randn_like(std)
        return mean + noise * std

    def forward(self, x):
        """Return ``(reconstruction, mean, logvar)`` for the input batch ``x``."""
        mean, logvar = self.encoder(x)
        latent_sample = self.reparameterize(mean, logvar)
        reconstruction = self.decoder(latent_sample)
        return reconstruction, mean, logvar

In [None]:
# Model hyperparameters.
# The feature count is taken from the data instead of being hard-coded, so the
# model stays in sync when a different machine or dataset is loaded.
input_dim = metric_tensor.shape[1]
hidden_dim = 128
latent_dim = 32
num_layers = 1

model = LSTMVAE(input_dim=input_dim,
                hidden_dim=hidden_dim,
                latent_dim=latent_dim,
                sequence_length=sequence_length,
                num_layers=num_layers,
                device=device).to(device)
optimizer = Adam(model.parameters(), lr=1e-3)

## Support functions

In [None]:
def loss_function(x, x_hat, mean, log_var):
    """VAE objective: summed squared reconstruction error plus the KL term.

    Args:
        x: original input.
        x_hat: reconstruction of ``x``.
        mean: latent mean.
        log_var: latent log-variance.

    Returns:
        Scalar tensor; both terms are summed over every element they cover.
    """
    squared_error = nn.functional.mse_loss(input=x_hat, target=x, reduction='sum')

    # Closed-form KL divergence between N(mean, exp(log_var)) and a standard normal.
    kl_inner = 1 + log_var - mean.pow(2) - log_var.exp()
    kl_divergence = -0.5 * kl_inner.sum()

    return squared_error + kl_divergence

In [None]:
def save_model(model, path='vae.pth'):
    """Save the model weights together with the hyperparameters needed to rebuild it.

    The dimensions are read from the model itself; the previous hard-coded values
    (26 / 26 / 200) did not describe the model trained in this notebook.

    Args:
        model: a model exposing ``encoder.lstm``, ``encoder.fc_mean`` and
            ``decoder.sequence_length`` (as ``LSTMVAE`` does).
        path: destination file; defaults to ``'vae.pth'``.
    """
    encoder_lstm = model.encoder.lstm
    model_state = {
        'input_dim': encoder_lstm.input_size,
        'latent_dim': model.encoder.fc_mean.out_features,
        'hidden_dim': encoder_lstm.hidden_size,
        # Extra keys so the checkpoint alone is enough to re-create the architecture.
        'num_layers': encoder_lstm.num_layers,
        'sequence_length': model.decoder.sequence_length,
        'state_dict': model.state_dict(),
    }
    torch.save(model_state, path)

# Train

## LSTM

In [None]:
# Release unused cached GPU memory before training starts.
torch.cuda.empty_cache()

# Multiply the learning rate by 0.1 when the monitored (minimised) value stops
# improving, with a patience of 5 epochs.
# NOTE(review): `verbose` is reported as deprecated in newer PyTorch releases —
# confirm against the installed version.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

# Hyperparameter-tuning ideas (not implemented in this notebook):
# - SPO optimizer / Optuna
# - Bayesian hyperparameter tuning
# - Grid search (slow for deep learning)

def train_model(model, train_loader, val_loader, optimizer, loss_fn, scheduler,
                num_epochs=10, early_stop_tolerant=10, device=None, checkpoint_fn=None):
    """Train ``model`` with early stopping and return the per-epoch loss histories.

    After training, the model holds the weights of the epoch with the lowest
    validation loss (previously the best weights were copied but never restored).

    Args:
        model: module whose forward pass returns ``(reconstruction, mean, logvar)``.
        train_loader: iterable of training batches; must support ``len()``.
        val_loader: iterable of validation batches; must support ``len()``.
        optimizer: optimizer over ``model``'s parameters.
        loss_fn: called as ``loss_fn(x, x_hat, mean, logvar)``.
        scheduler: stepped once per epoch with the validation loss.
        num_epochs: maximum number of epochs.
        early_stop_tolerant: stop after this many consecutive epochs without a
            new best validation loss.
        device: where batches are moved; defaults to the device of the model's
            first parameter.
        checkpoint_fn: called with the model whenever the validation loss
            improves; defaults to ``save_model``.

    Returns:
        ``(train_losses, val_losses)``: per-epoch losses averaged over batches.
    """
    if device is None:
        device = next(model.parameters()).device
    if checkpoint_fn is None:
        checkpoint_fn = save_model

    train_losses = []
    val_losses = []

    best_loss = float('inf')
    best_model_wts = None
    early_stop_tolerant_count = 0

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            # as_tensor accepts arrays and tensors alike and avoids the copy-construct
            # UserWarning that torch.tensor(tensor) raises.
            batch = torch.as_tensor(batch, dtype=torch.float32).to(device)

            optimizer.zero_grad()
            recon_batch, mean, logvar = model(batch)

            # Argument order follows the loss signature: original first, reconstruction second.
            loss = loss_fn(batch, recon_batch, mean, logvar)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)  # average over batches
        train_losses.append(train_loss)

        # ---- validation pass ----
        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                batch = torch.as_tensor(batch, dtype=torch.float32).to(device)
                recon_batch, mean, logvar = model(batch)
                valid_loss += loss_fn(batch, recon_batch, mean, logvar).item()

        valid_loss /= len(val_loader)  # average over batches
        val_losses.append(valid_loss)

        scheduler.step(valid_loss)

        # Report before the early-stopping check so the final epoch is logged too.
        print(f"Epoch {epoch:04d}: train loss {train_loss:.4f}, valid loss {valid_loss:.4f}")

        if valid_loss < best_loss:
            best_loss = valid_loss
            early_stop_tolerant_count = 0
            best_model_wts = copy.deepcopy(model.state_dict())
            # Checkpoint only on improvement, so the saved file is always the best model.
            checkpoint_fn(model)
        else:
            early_stop_tolerant_count += 1
            if early_stop_tolerant_count >= early_stop_tolerant:
                print("Early stopping triggered")
                break

    # Roll back to the best epoch; the last epoch is not necessarily the best one.
    if best_model_wts is not None:
        model.load_state_dict(best_model_wts)

    print('Finished Training')
    return train_losses, val_losses

# Train for up to 100 epochs and keep the per-epoch loss histories.
train_losses, val_losses = train_model(model, train_loader, val_loader, optimizer, loss_function, scheduler, num_epochs=100)

  batch = torch.tensor(batch, dtype=torch.float32).to(device)
  batch = torch.tensor(batch, dtype=torch.float32).to(device)


Epoch 0000: train loss 123.4685, valid loss 80.8567
Epoch 0001: train loss 74.0084, valid loss 65.1036
Epoch 0002: train loss 64.4923, valid loss 64.3526
Epoch 0003: train loss 63.7101, valid loss 63.7205
Epoch 0004: train loss 63.7187, valid loss 66.0969
Epoch 0005: train loss 63.3852, valid loss 64.7285
Epoch 0006: train loss 63.2681, valid loss 63.3437
Epoch 0007: train loss 63.1036, valid loss 66.2087
Epoch 0008: train loss 62.8658, valid loss 64.2517
Epoch 0009: train loss 62.1194, valid loss 63.6555
Epoch 0010: train loss 62.5832, valid loss 63.2468
Epoch 0011: train loss 62.3666, valid loss 63.1829
Epoch 0012: train loss 62.4743, valid loss 64.0407
Epoch 0013: train loss 62.1222, valid loss 64.0149
Epoch 0014: train loss 61.9344, valid loss 62.8611
Epoch 0015: train loss 61.8461, valid loss 62.0486
Epoch 0016: train loss 61.9262, valid loss 61.9313
Epoch 0017: train loss 61.9506, valid loss 62.7810
Epoch 0018: train loss 61.9946, valid loss 62.4230
Epoch 0019: train loss 61.6737

# Evaluate

In [None]:
def evaluate_lstm(model, test_loader, device, percentile_threshold=90):
    """Score each test sequence with the VAE loss and flag the highest-scoring ones.

    A sequence's score is its summed squared reconstruction error plus its KL
    term, i.e. the same terms ``loss_function`` uses, computed per sequence.
    Whole batches are scored in one forward pass instead of one pass per sequence.

    Args:
        model: module whose forward pass returns ``(reconstruction, mean, logvar)``.
        test_loader: iterable of batches; returned indices follow its iteration
            order, so it should not shuffle.
        device: device the batches are moved to.
        percentile_threshold: percentile of the scores used as the cut-off. The
            threshold comes from the scored data itself, so roughly
            ``100 - percentile_threshold`` percent of the sequences are flagged.

    Returns:
        List of integer positions of the sequences whose score is strictly above
        the threshold; empty when ``test_loader`` yields nothing.
    """
    model.eval()
    per_batch_scores = []

    with torch.no_grad():
        for batch in test_loader:
            # as_tensor avoids the copy-construct UserWarning of torch.tensor(tensor).
            batch = torch.as_tensor(batch, dtype=torch.float32).to(device)
            recon_batch, mean, logvar = model(batch)

            # Reduce over everything except the batch axis: one score per sequence.
            recon_error = (recon_batch - batch).pow(2).flatten(start_dim=1).sum(dim=1)
            kld = -0.5 * (1 + logvar - mean.pow(2) - logvar.exp()).flatten(start_dim=1).sum(dim=1)
            per_batch_scores.append((recon_error + kld).cpu().numpy())

    if not per_batch_scores:
        # Nothing was scored, so there is no percentile to compute.
        return []

    anomaly_scores = np.concatenate(per_batch_scores)

    # Cut-off at the requested percentile of the observed scores.
    threshold = np.percentile(anomaly_scores, percentile_threshold)

    return np.flatnonzero(anomaly_scores > threshold).tolist()
# Flag the test windows whose score lies above the 90th percentile.
anomalies = evaluate_lstm(model, test_loader, device, 90)

  batch = torch.tensor(batch, dtype=torch.float32).to(device)


In [None]:
def calculate_f1_score(anomaly_indices, true_anomalies):
    """Turn predicted anomaly indices into a 0/1 vector and score it with F1.

    Args:
        anomaly_indices: positions predicted to be anomalous.
        true_anomalies: ground-truth 0/1 labels; the prediction vector copies
            its shape and dtype.

    Returns:
        ``(f1, predicted_anomalies)``: the F1 score and the binary prediction vector.
    """
    predicted_anomalies = np.zeros_like(true_anomalies)

    # Mark every predicted position, skipping indices past the end of the labels.
    candidate_idx = np.asarray(anomaly_indices, dtype=int)
    within_bounds = candidate_idx[candidate_idx < len(predicted_anomalies)]
    predicted_anomalies[within_bounds] = 1

    return f1_score(true_anomalies, predicted_anomalies), predicted_anomalies

# Score the flagged indices against the ground-truth labels.
# NOTE(review): `anomalies` holds sliding-window positions while `true_anomalies`
# has one label per row of the test file — confirm the intended alignment.
f1, predicted_anomalies = calculate_f1_score(anomalies, true_anomalies)
print(f"F1 Score: {f1}")

F1 Score: 0.6163567430944213


In [None]:
# Per-class precision, recall and F1 of the predictions.
print(classification_report(true_anomalies, predicted_anomalies))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96     25785
           1       0.60      0.63      0.62      2694

    accuracy                           0.93     28479
   macro avg       0.78      0.79      0.79     28479
weighted avg       0.93      0.93      0.93     28479



In [None]:
# Confusion matrix of the predictions against the true labels.
print(confusion_matrix(true_anomalies, predicted_anomalies))

[[24647  1138]
 [  987  1707]]
