In [1]:
from merlion.utils import TimeSeries

In [28]:
# Set the train, validation, and test data
data_file = {'feat_num': 38, 
             'num_classes': 2, 
             'train_data': ['machine-1-1_train_features.hdf5',
                            'machine-2-1_train_features.hdf5',
                            'machine-3-2_train_features.hdf5',
                            'machine-3-7_train_features.hdf5'], 
             
             'val_data': ['machine-1-1_val_features.hdf5',
                          'machine-2-1_val_features.hdf5',
                          'machine-3-2_val_features.hdf5',
                          'machine-3-7_val_features.hdf5'],
             
             'test_data': ['machine-1-1_features.hdf5',
                           'machine-2-1_features.hdf5', 
                           'machine-3-2_features.hdf5',
                           'machine-3-7_features.hdf5']}

In [29]:
import pickle 

data_info = 'data/SMD_data_info.pkl'
with open(data_info, 'wb') as f:
    pickle.dump(data_file, f)
        
with open(data_info, 'rb') as f:
    loaded_dict = pickle.load(f)

In [30]:
loaded_dict

{'feat_num': 38,
 'num_classes': 2,
 'train_data': ['machine-1-1_train_features.hdf5',
  'machine-2-1_train_features.hdf5',
  'machine-3-2_train_features.hdf5',
  'machine-3-7_train_features.hdf5'],
 'val_data': ['machine-1-1_val_features.hdf5',
  'machine-2-1_val_features.hdf5',
  'machine-3-2_val_features.hdf5',
  'machine-3-7_val_features.hdf5'],
 'test_data': ['machine-1-1_features.hdf5',
  'machine-2-1_features.hdf5',
  'machine-3-2_features.hdf5',
  'machine-3-7_features.hdf5']}

In [9]:
import h5py
import numpy as np
import pandas as pd

In [10]:
file_path = 'data/machine-1-1_val_features.hdf5'
f = h5py.File(file_path, "r")
data = np.array(f['data'])
labels = np.array(f['labels'])
f.close()

In [11]:
print(np.shape(data))
print(np.shape(labels))

(500, 38)
(500, 2)


In [12]:
sel_feats = ['value-' + str(i) for i in range(38)]
data_ts = TimeSeries.from_pd(pd.DataFrame(data, columns=sel_feats))

In [13]:
len(data_ts)

500

In [14]:
from merlion.models.anomaly.vae import VAE
from merlion.models.utils.rolling_window_dataset import RollingWindowDataset
from merlion.utils.misc import ProgressBar

In [19]:
sequence_len = 1
batch_size = 64

loader = RollingWindowDataset(
            data_ts,
            target_seq_index=None,
            shuffle=True,
            flatten=True,
            n_past=2,
            n_future=0,
            batch_size=batch_size,
        )

In [20]:
len(data_ts)

500

In [21]:
for i, (batch, _, _, _) in enumerate(loader):
    print(i, np.shape(batch))

0 (64, 76)
1 (64, 76)
2 (64, 76)
3 (64, 76)
4 (64, 76)
5 (64, 76)
6 (64, 76)
7 (51, 76)


In [121]:
np.shape(batch)

(62, 40)

In [48]:
len(data_ts)

500

In [58]:
batch

array([[6.45160e-02, 6.88560e-02, 7.91530e-02, 7.31710e-02, 0.00000e+00,
        8.94872e-01, 2.60829e-01, 0.00000e+00, 2.35590e-02, 0.00000e+00,
        2.09117e-01, 8.10810e-02, 2.73970e-02, 5.53330e-02, 1.04241e-01,
        2.22437e-01, 0.00000e+00, 0.00000e+00, 9.78640e-02, 7.65820e-02,
        8.19470e-02, 6.44480e-02, 9.64688e-01, 1.08330e-02, 2.76370e-02,
        1.00250e-02, 0.00000e+00, 6.92300e-02, 0.00000e+00, 8.59600e-03,
        6.03850e-02, 4.37470e-02, 0.00000e+00, 3.40000e-05, 6.23330e-02,
        6.23280e-02, 0.00000e+00, 0.00000e+00],
       [1.07527e-01, 1.40890e-01, 1.01449e-01, 9.05920e-02, 0.00000e+00,
        9.28205e-01, 2.69303e-01, 0.00000e+00, 2.82430e-02, 1.22000e-04,
        1.16831e-01, 6.75680e-02, 2.73970e-02, 6.24680e-02, 1.18906e-01,
        1.32584e-01, 0.00000e+00, 0.00000e+00, 1.23528e-01, 1.08873e-01,
        1.22797e-01, 9.86610e-02, 9.61171e-01, 1.28530e-02, 4.58280e-02,
        1.20190e-02, 0.00000e+00, 1.02782e-01, 0.00000e+00, 1.14610e-02,
   

In [98]:
import h5py
import numpy as np
import math

from utils import unfold_label, shuffle_data
from merlion.models.utils.rolling_window_dataset import RollingWindowDataset


class BatchDataGenerator:
    def __init__(self, stage, file_path, batch_size):

        if stage not in ['train', 'val', 'test']:
            assert ValueError('invalid stage!')

        self.configuration(stage, file_path)
        self.load_data()
        self.batch_size = batch_size

    def configuration(self, stage, file_path):
        self.current_index = -1
        self.file_path = file_path
        self.stage = stage
        self.shuffled = False

    def load_data(self):
        file_path = self.file_path
        f = h5py.File(file_path, "r")
        self.data = np.array(f['data'])
        self.labels = np.array(f['labels'])
        f.close()
        self.file_num_train = len(self.labels)
        print('data num loaded:', self.file_num_train)
    
    def get_data_labels_batch(self):

        data, labels = [], []
        labels = []
        for index in range(self.batch_size):
            self.current_index += 1

            # Void overflow
            if self.current_index > self.file_num_train - 1:
                self.current_index %= self.file_num_train
                self.data, self.labels = shuffle_data(samples=self.data, labels=self.labels, seed=self.current_index)

            # Slice the data and label
            data.append(self.data[self.current_index])
            labels.append(self.labels[self.current_index])

        data = np.stack(data)
        labels = np.stack(labels)
        return data, labels


In [59]:
file_path = 'data/machine-1-1_features.hdf5'

In [60]:
f = h5py.File(file_path, "r")
data = np.array(f['data'])
labels = np.array(f['labels'])

In [61]:
print(np.shape(data))
print(np.shape(labels))

(28478, 38)
(28478, 2)


In [96]:
feat_num = 38
num_classes = 2
batch_size = 1000
inner_loops = 451
k = 1

iter = inner_loops

cols = ['value-' + str(i) for i in range(feat_num)] + ['label-' + str(i) for i in range(num_classes)]

In [105]:
if iter * batch_size > len(data):
    rep_num = math.ceil(iter * batch_size / len(data)) - 1

    data, labels = [data], [labels]
    for rep in range(rep_num): 
        shuffled_data, shuffled_labels = shuffle_data(data, labels, seed=rep)
        data.append(shuffled_data)
        labels.append(shuffled_labels)

data_lab = np.concatenate([data, labels], axis=1)
data_ts = TimeSeries.from_pd(pd.DataFrame(data_lab, columns=cols))

loader = RollingWindowDataset(
            data_ts,
            target_seq_index=None,
            shuffle=True,
            flatten=True,
            n_past=k,
            n_future=0,
            batch_size=batch_size,
        )

batch_data = [b[0][:, :-num_classes] for b in loader]
batch_labels = [b[0][:, -num_classes:] for b in loader]

In [116]:
batch_data = [b[0][:, :-num_classes] for b in loader]
batch_labels = [b[0][:, -num_classes:] for b in loader]

In [117]:
np.shape(batch_data[0])

(1000, 38)

In [118]:
np.shape(batch_labels[0])

(1000, 2)

In [82]:
for i, (batch, _, _, _) in enumerate(loader):
    print(i, np.shape(batch))

0 (1000, 40)
1 (1000, 40)
2 (1000, 40)
3 (1000, 40)
4 (1000, 40)
5 (1000, 40)
6 (1000, 40)
7 (1000, 40)
8 (1000, 40)
9 (1000, 40)
10 (1000, 40)
11 (1000, 40)
12 (1000, 40)
13 (1000, 40)
14 (1000, 40)
15 (1000, 40)
16 (1000, 40)
17 (1000, 40)
18 (1000, 40)
19 (1000, 40)
20 (1000, 40)
21 (1000, 40)
22 (1000, 40)
23 (1000, 40)
24 (1000, 40)
25 (1000, 40)
26 (1000, 40)
27 (1000, 40)
28 (478, 40)


In [None]:
class MLVAE(VAE):
    """
    """

    def _getVAE(self): 
        self.model = self._build_model(train_data.shape[1]).to(self.device)
        self.data_dim = train_data.shape[1]
    
    def _train(self, train_data: pd.DataFrame, train_config=None) -> pd.DataFrame:
        # self.model = self._build_model(train_data.shape[1]).to(self.device)
        # self.data_dim = train_data.shape[1]

        # bar = ProgressBar(total=self.num_epochs)
        self.model.train()

        for epoch in range(self.num_epochs):
            # total_loss = 0
            for i, (batch, _, _, _) in enumerate(loader):
                x = torch.tensor(batch, dtype=torch.float, device=self.device)
                recon_x, mu, log_var, _ = self.model(x, None)
                recon_loss = loss_func(x, recon_x)
                kld_loss = -0.5 * torch.mean(torch.sum(1 + log_var - mu**2 - log_var.exp(), dim=1), dim=0)
                loss = recon_loss + kld_loss * self.kld_weight

In [None]:
self.model = self._build_model(train_data.shape[1]).to(self.device)
self.data_dim = train_data.shape[1]

loader = RollingWindowDataset(
    train_data,
    target_seq_index=None,
    shuffle=True,
    flatten=True,
    n_past=self.k,
    n_future=0,
    batch_size=self.batch_size,
)
optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
loss_func = nn.MSELoss()
bar = ProgressBar(total=self.num_epochs)

self.model.train()
for epoch in range(self.num_epochs):
    total_loss = 0
    for i, (batch, _, _, _) in enumerate(loader):
        x = torch.tensor(batch, dtype=torch.float, device=self.device)
        recon_x, mu, log_var, _ = self.model(x, None)
        recon_loss = loss_func(x, recon_x)
        kld_loss = -0.5 * torch.mean(torch.sum(1 + log_var - mu**2 - log_var.exp(), dim=1), dim=0)
        loss = recon_loss + kld_loss * self.kld_weight
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss
    if bar is not None:
        bar.print(epoch + 1, prefix="", suffix="Complete, Loss {:.4f}".format(total_loss / len(train_data)))
return self._get_anomaly_score(train_data)

In [24]:
from merlion.models.base import NormalizingConfig
from merlion.models.anomaly.base import DetectorBase, DetectorConfig

In [25]:
class VAEConfig(DetectorConfig, NormalizingConfig):
    """
    Configuration class for VAE. The normalization is inherited from `NormalizingConfig`.
    The input data will be standardized automatically.
    """

    _default_threshold = AggregateAlarms(alm_threshold=2.5, abs_score=True)

    @initializer
    def __init__(
        self,
        encoder_hidden_sizes: Sequence[int] = (25, 10, 5),
        decoder_hidden_sizes: Sequence[int] = (5, 10, 25),
        latent_size: int = 5,
        sequence_len: int = 1,
        kld_weight: float = 1.0,
        dropout_rate: float = 0.0,
        num_eval_samples: int = 10,
        lr: float = 1e-3,
        batch_size: int = 1024,
        num_epochs: int = 10,
        **kwargs
    ):
        """
        :param encoder_hidden_sizes: The hidden layer sizes of the MLP encoder
        :param decoder_hidden_sizes: The hidden layer sizes of the MLP decoder
        :param latent_size: The latent size
        :param sequence_len: The input series length, e.g., input = [x(t-sequence_len+1)...,x(t-1),x(t)]
        :param kld_weight: The regularization weight for the KL divergence term
        :param dropout_rate: The dropout rate for the encoder and decoder
        :param num_eval_samples: The number of sampled latent variables during prediction
        :param lr: The learning rate during training
        :param batch_size: The batch size during training
        :param num_epochs: The number of training epochs
        """
        super().__init__(**kwargs)

{'num_epochs': 10,
 'batch_size': 1024,
 'lr': 0.001,
 'num_eval_samples': 10,
 'dropout_rate': 0.0,
 'kld_weight': 1.0,
 'sequence_len': 1,
 'latent_size': 5,
 'decoder_hidden_sizes': (5, 10, 25),
 'encoder_hidden_sizes': (25, 10, 5),
 'transform': {'name': 'Identity'},
 'normalize': {'name': 'MeanVarNormalize',
  'bias': None,
  'scale': None,
  'normalize_bias': True,
  'normalize_scale': True},
 'dim': None,
 'enable_threshold': True,
 'enable_calibrator': True,
 'calibrator': {'max_score': 1000,
  'abs_score': True,
  'anchors': None,
  'name': 'AnomScoreCalibrator'},
 'threshold': {'alm_threshold': 2.5,
  'abs_score': True,
  'min_alm_in_window': 2,
  'alm_window_minutes': 60,
  'alm_suppress_minutes': 120,
  'name': 'AggregateAlarms'}}

In [33]:
config, kwargs = VAEConfig().from_dict({}, return_unused_kwargs=True)

In [41]:
config

<__main__.VAEConfig at 0x21a228eab60>

In [35]:
kwargs

{}

In [38]:
from merlion.models.base import Config

In [39]:
assert isinstance(VAEConfig, Config)

AssertionError: 