In [1]:
import os
import csv
import numpy as np
import matplotlib.pylab as plt
from matplotlib.pyplot import plot, ion, show, savefig, cla, figure

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
cd /content/drive/MyDrive/TX2-VAE-LSTM-for-anomaly-detection/datasets/  

/content/drive/MyDrive/TX2-VAE-LSTM-for-anomaly-detection/datasets


In [4]:
ls

[0m[01;34mNAB-known-anomaly[0m/  Tx2-dataset-preprocessing.ipynb


## Helper functions to load and process original csv files

In [6]:
# this function load one .cvs (a sequence)
def load_data(dataset, csv_folder='./NAB-known-anomaly/csv-files/'):
    if dataset == 'tx2sys3':
        data_file = os.path.join(csv_folder, 'tx2sys3.csv')
        anomalies = ['2022-05-20 00:20:08', '2022-05-20 00:10:13']
        t_unit = 'sec'
    elif dataset == 'tx2usr3':
        data_file = os.path.join(csv_folder, 'tx2usr3.csv')
        anomalies = ['2022-05-20 00:20:08', '2022-05-20 00:10:13']
        t_unit = 'sec'
    # elif dataset == 'tx2read':
    #     data_file = os.path.join(csv_folder, 'tx2read.csv')
    #     anomalies = ['2022-05-20 00:10:01', '2022-05-20 00:10:13']
    #     t_unit = 'sec'
    elif dataset == 'tx2write3':
        data_file = os.path.join(csv_folder, 'tx2write3.csv')
        anomalies = ['2022-05-20 00:20:08', '2022-05-20 00:10:13']
        t_unit = 'sec'
    # elif dataset == 'rogue_agent_key_hold':
    #     data_file = os.path.join(csv_folder, 'rogue_agent_key_hold.csv')
    #     anomalies = ['2014-07-15 08:30:00', '2014-07-17 09:50:00']
    #     t_unit = '5 min'
    # elif dataset == 'rogue_agent_key_updown':
    #     data_file = os.path.join(csv_folder, 'rogue_agent_key_updown.csv')
    #     anomalies = ['2014-07-15 04:00:00', '2014-07-17 08:50:00']
    #     t_unit = '5 min'
    # elif dataset == 'nyc_taxi':
    #     data_file = os.path.join(csv_folder, 'nyc_taxi.csv')
    #     anomalies = ['2014-11-01 19:00:00', '2014-11-27 15:30:00', '2014-12-25 15:00:00', '2015-01-01 01:00:00', 
    #                  '2015-01-27 00:00:00']
    #     t_unit = '30 min'
    
    t = []
    readings = []
    idx_anomaly = []
    i = 0
    with open(data_file) as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        print("\n--> Anomalies occur at:")
        for row in readCSV:
            if i > 0:
                t.append(i)
                readings.append(float(row[1]))
                for j in range(len(anomalies)):
                    if row[0] == anomalies[j]:
                        idx_anomaly.append(i)
                        print("  timestamp #{}: {}".format(j, row[0]))
            i = i + 1
    t = np.asarray(t)
    readings = np.asarray(readings)
    print("\nOriginal csv file contains {} timestamps.".format(t.shape))
    print("Processed time series contain {} readings.".format(readings.shape))
    print("Anomaly indices are {}".format(idx_anomaly))
    
    return t, t_unit, readings, idx_anomaly

In [7]:
# This function plots a dataset with the train/test split and known anomalies
# Relies on helper function load_data()

def process_and_save_specified_dataset(dataset, idx_split, y_scale=5,save_file = True):
    t, t_unit, readings, idx_anomaly = load_data(dataset)
    
    # split into training and test sets
    training = readings[idx_split[0]:idx_split[1]]
    t_train = t[idx_split[0]:idx_split[1]]
    
    # normalise by training mean and std 
    train_m = np.mean(training)
    train_std = np.std(training)
    print("\nTraining set mean is {}".format(train_m))
    print("Training set std is {}".format(train_std))
    readings_normalised = (readings - train_m) / train_std
    
    training = readings_normalised[idx_split[0]:idx_split[1]]
    if idx_split[0] == 0:
        test = readings_normalised[idx_split[1]:]
        t_test = t[idx_split[1]:] - idx_split[1]
        idx_anomaly_test = np.asarray(idx_anomaly) - idx_split[1]
    else:
        test = [readings_normalised[:idx_split[0]], readings_normalised[idx_split[1]:]]
        t_test = [t[:idx_split[0]], t[idx_split[1]:] - idx_split[1]]
        idx_anomaly_split = np.squeeze(np.argwhere(np.asarray(idx_anomaly)>idx_split[0]))
        idx_anomaly_test = [np.asarray(idx_anomaly[:idx_anomaly_split[0]]), 
                            np.asarray(idx_anomaly[idx_anomaly_split[0]:]) - idx_split[1]]
    print("Anomaly indices in the test set are {}".format(idx_anomaly_test))
    # print(t)
    # print(readings)
    # print(training)
    # test = np.asarray(test)
    print(test[0].shape)
    # print(t_train)
    # print(t_test)
    # print(idx_anomaly)
    # print(idx_anomaly_test)
    
    if save_file:
      save_dir = './NAB-known-anomaly/'
      np.savez(save_dir+dataset+'.npz', t=t, t_unit=t_unit, readings=readings, idx_anomaly=idx_anomaly,
                  idx_split=idx_split, training=training, test=test, train_m=train_m, train_std=train_std,
                  t_train=t_train, t_test=t_test, idx_anomaly_test=idx_anomaly_test)
      print("\nProcessed time series are saved at {}".format(save_dir+dataset+'.npz'))
    else:
        print("\nProcessed time series are not saved.")
    
    # plot the whole normalised sequence
    fig, axs = plt.subplots(1, 1, figsize=(18, 4), edgecolor='k')
    fig.subplots_adjust(hspace=.4, wspace=.4)
    # axs = axs.ravel()
    # for i in range(4):
    axs.plot(t, readings_normalised)
    if idx_split[0] == 0:
        axs.plot(idx_split[1]*np.ones(20), np.linspace(-y_scale,y_scale,20), 'b--')
    else:
        for i in range(2):
            axs.plot(idx_split[i]*np.ones(20), np.linspace(-y_scale,y_scale,20), 'b--')
    for j in range(len(idx_anomaly)):
        axs.plot(idx_anomaly[j]*np.ones(20), np.linspace(-y_scale,y_scale,20), 'r--')
    #     axs.plot(data[:,1])
    axs.grid(True)
    axs.set_xlim(0, len(t))
    axs.set_ylim(-y_scale, y_scale)
    axs.set_xlabel("timestamp (every {})".format(t_unit))
    axs.set_ylabel("normalised readings")
    axs.set_title("{} dataset\n(normalised by train mean {:.2f} and std {:.2f})".format(dataset, train_m, train_std))
    axs.legend(('data', 'train test set split', 'anomalies'))
    
    return t, readings_normalised

## Example on ambient temperature series

In [9]:
dataset = 'tx2write3'
idx_split = [1000,4500] #array 2개들어 있어서 이거인가봐, 0으로 설정하면 안돼.

t, readings_normalised = process_and_save_specified_dataset(dataset, idx_split)


--> Anomalies occur at:
  timestamp #1: 2022-05-20 00:10:13
  timestamp #0: 2022-05-20 00:20:08

Original csv file contains (6076,) timestamps.
Processed time series contain (6076,) readings.
Anomaly indices are [614, 1209]

Training set mean is 625136.2011428572
Training set std is 7117643.395865344


IndexError: ignored

In [8]:
data_dir = '../datasets/NAB-known-anomaly/'
data = np.load(data_dir + dataset + '.npz')