In [1]:
import torch
import numpy as np
import pandas as pd
import pickle
import copy
from tqdm import trange,tqdm
import torch.nn as nn
from torch.optim import Adam
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

In [2]:
DRIVE = "drive/MyDrive/Colab Notebooks/ELTE/DSLAB/dataset/"
DATASET = DRIVE + "MBD.csv"
TOPOLOGY = DRIVE + "MBD_topology.pk"

data = pd.read_csv(DATASET, header=[0,1])

metric = data.drop(['date', 'label'], axis = 1)
metric.columns.names = ['host','metric']
tempm = metric.swaplevel('metric','host',axis=1).stack()
tempm = (tempm-tempm.mean())/(tempm.std())
metric = tempm.unstack().swaplevel('metric','host',axis=1).stack().unstack()
metric_tensor = torch.tensor(metric.values, dtype=torch.float32)

with open(TOPOLOGY, 'rb') as f:
    edge_index = pickle.load(f)

  metric = data.drop(['date', 'label'], axis = 1)
  tempm = metric.swaplevel('metric','host',axis=1).stack()
  metric = tempm.unstack().swaplevel('metric','host',axis=1).stack().unstack()


In [6]:
metric

host,R740-3-1,R740-3-1,R740-3-1,R740-3-1,R740-3-1,R740-3-1,R740-3-1,R740-3-1,R740-3-1,R740-3-1,...,R740-3-5,R740-3-5,R740-3-5,R740-3-5,R740-3-5,R740-3-5,R740-3-5,R740-3-5,R740-3-5,R740-3-5
metric,cpu.mean_usage_idle,cpu.mean_usage_iowait,cpu.mean_usage_softirq,cpu.mean_usage_system,cpu.mean_usage_user,disk.mean_used_percent,diskio.io_time,diskio.mean_iops_in_progress,diskio.read_speed,diskio.write_speed,...,mem.mean_free,net.recieved,net.sent,netstat.mean_tcp_time_wait,processes.mean_blocked,processes.mean_running,processes.mean_total,system.mean_load1,system.mean_load15,system.mean_load5
0,1.297715,-0.632874,-0.703865,-0.812009,-0.960151,-1.912235,-0.125818,-0.394307,-0.484756,-0.549166,...,-0.600163,-0.729554,0.131946,3.001606,0.021189,-0.109772,0.070849,1.693814,0.910688,1.767932
1,1.386064,-0.747534,-0.703948,-0.834344,-0.958793,-1.912233,-0.149738,-0.426539,-0.484756,-0.549179,...,-0.496413,-0.729449,0.131897,2.956195,1.096132,-0.109772,0.070849,1.696252,0.950638,1.783531
2,-1.239377,-0.797039,-0.664087,-0.693319,2.380106,-1.912231,-0.137616,-0.426539,-0.483687,-0.549084,...,-0.499616,-0.728573,0.131962,2.570205,0.558660,-0.109772,0.070849,1.831540,1.008166,1.843091
3,-1.644451,-0.835336,-0.703936,-0.719389,2.927902,-1.912229,-0.146556,-0.422957,-0.484743,-0.548879,...,-0.503554,-0.727601,0.957520,3.818996,1.096132,-0.109772,0.034244,2.139900,1.088067,1.955121
4,-1.643598,-0.838276,-0.703845,-0.687902,2.928635,-1.912227,-0.174985,-0.412214,-0.484756,-0.549193,...,-0.552925,-0.730687,0.133904,3.773586,0.289925,-0.109772,0.070849,2.139900,1.145595,2.009008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8635,1.438887,-0.821302,-0.703730,-0.792337,-0.955754,-1.911606,-0.226616,-0.462351,-0.484756,-0.549426,...,-0.640990,0.278332,-0.135095,0.095328,-0.516282,-0.109772,-1.027300,0.144702,0.384942,0.030765
8636,1.451945,-0.836429,-0.695918,-0.784464,-0.956904,-1.911604,-0.233937,-0.462351,-0.484756,-0.549877,...,-0.642823,0.020555,-0.657424,-0.040904,-0.516282,-0.109772,-1.027300,0.153234,0.381746,0.047782
8637,1.440971,-0.827184,-0.672011,-0.832834,-0.950063,-1.911602,-0.230273,-0.462351,-0.484756,-0.549508,...,-0.625552,0.441609,0.009106,-0.177136,-0.516282,-0.109772,-1.027300,0.182485,0.378550,0.067635
8638,1.437342,-0.821685,-0.664070,-0.765072,-0.954403,-1.911601,-0.235003,-0.462351,-0.484756,-0.548154,...,-0.654699,-0.484553,0.898695,-0.109020,-0.516282,-0.109772,-1.027300,0.156890,0.372158,0.077562


In [3]:
# create train and test dataloaders
metric.interpolate(inplace=True)
metric.bfill(inplace=True)
metric_tensor = metric.values
sequence_length = 30
sequences = [metric_tensor[i:i + sequence_length].reshape(sequence_length, 5, -1) for i in range(metric_tensor.shape[0] - sequence_length + 1)]

train_data, temp_data = train_test_split(sequences, test_size=0.3, random_state=42) # 70% train, 30% temp
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42) # 50% of temp for val and test

batch_size = 32
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(Encoder, self).__init__()

        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.mean = nn.Linear(hidden_dim, latent_dim)
        self.var = nn.Linear (hidden_dim, latent_dim)
        self.LeakyReLU = nn.LeakyReLU(0.2)
        self.training = True

    def forward(self, x):
        x = self.linear1(x)
        x = self.LeakyReLU(x)
        x = self.linear2(x)
        x = self.LeakyReLU(x)

        mean = self.mean(x)
        log_var = self.var(x)
        return mean, log_var

In [5]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hidden_dim, latent_dim):
        super(Decoder, self).__init__()

        self.linear2 = nn.Linear(latent_dim, hidden_dim)
        self.linear1 = nn.Linear(hidden_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)
        self.LeakyReLU = nn.LeakyReLU(0.2)

    def forward(self, x):
        x = self.linear2(x)
        x = self.LeakyReLU(x)
        x = self.linear1(x)
        x = self.LeakyReLU(x)

        x_hat = torch.sigmoid(self.output(x))
        return x_hat

In [6]:
class VAE(nn.Module):
    def __init__(self, input_dim, hidden_dim=200, latent_dim=26, device=device):
        super(VAE, self).__init__()

        self.encoder = Encoder(input_dim, hidden_dim, latent_dim).to(device)
        self.decoder = Decoder(input_dim, hidden_dim, latent_dim).to(device)

    def reparameterization(self, mean, var):
        epsilon = torch.randn_like(var).to(device)
        z = mean + var*epsilon
        return z

    def forward(self, x):
        mean, log_var = self.encoder(x)
        z = self.reparameterization(mean, torch.exp(0.5 * log_var))
        x_hat = self.decoder(z)
        return x_hat, mean, log_var

In [7]:
model = VAE(input_dim=26).to(device)
optimizer = Adam(model.parameters(), lr=1e-3)

In [8]:
def loss_function(x, x_hat, mean, log_var):
    reproduction_loss = nn.functional.mse_loss(x_hat, x, reduction='sum')
    KLD = - 0.5 * torch.sum(1+ log_var - mean.pow(2) - log_var.exp())

    return reproduction_loss + KLD

In [9]:
def save_model(model):
    model_state = {
        'input_dim':26,
        'latent_dim':26,
        'hidden_dim':200,
        'state_dict':model.state_dict()
    }
    torch.save(model_state,'vae.pth')

In [10]:
torch.cuda.empty_cache()

def train_model(model, train_loader, val_loader, optimizer, loss_fn, num_epochs=10):
    train_losses = []
    val_losses = []

    early_stop_tolerant_count = 0
    early_stop_tolerant = 10
    best_loss = float('inf')
    for epoch in range(num_epochs):  # loop over the dataset multiple times
        train_loss = 0.0
        model.train()
        for batch_idx, x in enumerate(train_loader):
            x = x.to(device).type(torch.float32)

            # zero the parameter gradients
            optimizer.zero_grad()

            x_hat, mean, log_var = model(x)

            # forward + loss + backward + optimize
            loss = loss_fn(x, x_hat, mean, log_var)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)  # Divide by number of batches
        train_losses.append(train_loss)

        # Validation
        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for batch_idx, x in enumerate(val_loader):
                x = x.to(device).type(torch.float32)

                x_hat, mean, log_var = model(x)

                loss = loss_fn(x, x_hat, mean, log_var)
                valid_loss += loss.item()

        valid_loss /= len(val_loader)  # Divide by number of batches
        val_losses.append(valid_loss)

        early_stop_tolerant_count += 1
        if valid_loss < best_loss:
            early_stop_tolerant_count = 0
            best_loss = valid_loss
            best_model_wts = copy.deepcopy(model.state_dict())

        if early_stop_tolerant_count >= early_stop_tolerant:
            break

        print(f"Epoch {epoch:04d}: train loss {train_loss:.4f}, valid loss {valid_loss:.4f}")
        save_model(model)

    print('Finished Training')
    return train_losses, val_losses

train_losses, val_losses = train_model(model, train_loader, val_loader, optimizer, loss_function, num_epochs=15)

Epoch 0000: train loss 117309.8983, valid loss 103601.7243
Epoch 0001: train loss 105589.9977, valid loss 95966.7241
Epoch 0002: train loss 98063.4099, valid loss 91289.6240
Epoch 0003: train loss 96749.8733, valid loss 90847.3352
Epoch 0004: train loss 96385.3544, valid loss 90653.8497
Epoch 0005: train loss 96152.8689, valid loss 90410.3136
Epoch 0006: train loss 95919.2327, valid loss 90143.5594
Epoch 0007: train loss 95723.2424, valid loss 89968.8533
Epoch 0008: train loss 95595.2550, valid loss 89931.6208
Epoch 0009: train loss 95735.0082, valid loss 89794.1332
Epoch 0010: train loss 95445.6148, valid loss 89621.8967
Epoch 0011: train loss 95329.7978, valid loss 89612.3198
Epoch 0012: train loss 95301.5898, valid loss 89799.8289
Epoch 0013: train loss 95233.8225, valid loss 89515.4358
Epoch 0014: train loss 95204.0273, valid loss 89509.5138
Finished Training


In [11]:
def interpret(model, data_loader, nodes_num=5, sampling_num=5, delay=5):
        model.eval()

        scores_sum = []
        outputs = []

        for (i,ts_batch) in enumerate(tqdm(data_loader)):
            ts_batch = ts_batch.to(device).type(torch.float32)
            x_hat, mean, log_var = model(ts_batch)

            sample_scores_sum = []

            for j in range(sampling_num):
                error_origin = torch.div((x_hat - ts_batch) ** 2, log_var.exp()) + log_var

                sample_score_sum = torch.sum(error_origin, 3)

                sample_scores_sum.append(sample_score_sum)

            score_sum = torch.mean(torch.stack(sample_scores_sum,3),3)
            #print(score_sum)

            scores_sum.append(score_sum.data.cpu().numpy())
            outputs.append(mean.data.cpu().numpy())

        scores_sum = np.concatenate(scores_sum)
        #print(scores_sum)
        outputs = np.concatenate(outputs)


        # lattice = np.full((delay, len(sequences)+delay-1, nodes_num), np.nan)
        # for i, score in enumerate(scores_sum):
        #     lattice[i % delay, i:i + delay] = score[-delay:]
        # scores_sum = np.nanmean(lattice, axis=0)
        # print(scores_sum)

        scores_argsort = np.argsort(-scores_sum)

        return scores_sum, outputs, scores_argsort

In [12]:
scores_sum, outputs, scores_argsort = interpret(model, test_loader)

100%|██████████| 41/41 [00:00<00:00, 244.06it/s]


In [13]:
threshold = np.percentile(scores_sum, 95)
anomalies = scores_sum > threshold

print("Anomaly Indices:", np.where(anomalies)[0])

Anomaly Indices: [   1    1    1 ... 1291 1291 1291]


In [14]:
print(threshold)

68.88581962585442


In [15]:
print(len(np.where(anomalies)[0]))

9690


In [16]:
print(len(scores_sum))

1292
