# Imports

In [1]:
import os
import sys
from math import ceil

import dill as pickle
import pandas as pd
# Put the parent directory on Python's import search path. This affects imports only
# (nothing is saved through it); presumably it is what makes the project's `src`
# package importable from this notebook - confirm against the repository layout.
module_path = '..'
if module_path not in sys.path:
    sys.path.insert(1, module_path)
# Raise pandas' display limits so wide/long frames are not truncated in the notebook.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 300

In [2]:
import logging
import numpy as np
import matplotlib.pyplot as plt
from src.outlier_model import OutlierModel

import matplotlib
# Switch matplotlib to the PGF backend and configure it for LaTeX output:
# pdflatex as the TeX system, serif fonts, LaTeX text rendering, and no font
# setup taken from the rc parameters.
matplotlib.use("pgf")
matplotlib.rcParams["pgf.texsystem"] = "pdflatex"
matplotlib.rcParams['font.family'] = 'serif'
matplotlib.rcParams['text.usetex'] = True
matplotlib.rcParams['pgf.rcfonts'] = False

# Show up to 200 columns and 300 rows when a frame is rendered in the notebook.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 300

# temporarily remove deprecation warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Root logging format: "<timestamp>: <message>" with a 12-hour month/day/year timestamp.
logging.basicConfig(
    datefmt='%m/%d/%Y %I:%M:%S %p',
    format='%(asctime)s: %(message)s',
)
# Named logger for this experiment, emitting INFO and above.
logger = logging.getLogger("TimeSeries")
logger.setLevel(logging.INFO)

# NOTE(review): not referenced in the cells visible here - confirm it is still needed.
output_dir = 'beth_output'

# Dataset

In [3]:
# Load the two labelled BETH host logs used in this experiment and stack them into one frame.
base_path = '#datasets'
df1 = pd.read_csv(f'{base_path}/BETH/labelled_2021may-ip-10-100-1-4.csv',index_col=False)
df2 = pd.read_csv(f'{base_path}/BETH/labelled_2021may-ip-10-100-1-105.csv',index_col=False) # Some evil
# Other captures in the same directory, each annotated "No evil" by the author and
# left out of this run:
#   labelled_2021may-ip-10-100-1-26.csv
#   labelled_2021may-ip-10-100-1-95.csv
#   labelled_2021may-ip-10-100-1-186.csv
#   labelled_2021may-ubuntu.csv
frames = [df1, df2]
# ignore_index=True discards the per-file row labels and renumbers the rows 0..N-1.
df = pd.concat(frames, ignore_index=True)

# Quick overview plot with each column drawn in its own subplot. The trailing semicolon
# keeps the Legend object's repr out of the cell output.
df.plot(subplots=True, figsize=(16, 16)); plt.legend(loc='best');

<matplotlib.legend.Legend at 0x10a5684c0>

In [4]:
# Column of the dataset that is monitored for outliers.
outlier_key = "userId"
# Work on a copy so the loaded frame `df` stays untouched.
data_test = df.copy()

# Detector settings. Their exact meaning is defined by OutlierModel (not visible in this
# notebook); they are kept as named variables so they are easy to tune in one place.
m = 1000
preload_size = 10000
std_dev_mult = 3
range_mult = 2
recent_mult = 2

# The first `preload_size` rows of the monitored column, cast to float64, are handed to
# the constructor as the initial time series; later rows are consumed one by one.
outlier_model = OutlierModel(
    m=m,
    std_dev_mult=std_dev_mult,
    range_mult=range_mult,
    recent_mult=recent_mult,
    time_series=data_test[:preload_size][outlier_key].astype(np.float64),
    egress=True,
)

In [5]:
# Feed every row after the preload window to the model, one observation at a time.
# Only the monitored column is read, so iterate over that column's (index, value)
# pairs directly instead of DataFrame.iterrows(), which constructs a full Series
# object for every row of the frame.
fault = False
for index, value in data_test[preload_size:][outlier_key].items():
    outlier_model.train_one(value)
    # The result of the most recent prediction is kept in `fault`.
    fault = outlier_model.predict_one(index)
    # Lightweight progress marker every 10,000 rows.
    if index % 10000 == 0:
        print(f"Current Global index: {index}")

Current Global index: 10000


04/22/2022 11:56:43 AM:  Anomaly at Global index: 11000
04/22/2022 11:56:43 AM: max_mp: 13172.8523, metric:7156.672199999999: metric-max_mp: 6016.180100000001 range: 4241.051600000001


Current Global index: 20000
Current Global index: 30000
Current Global index: 40000
Current Global index: 50000
Current Global index: 60000
Current Global index: 70000
Current Global index: 80000
Current Global index: 90000
Current Global index: 100000
Current Global index: 110000
Current Global index: 120000
Current Global index: 130000
Current Global index: 140000
Current Global index: 150000
Current Global index: 160000
Current Global index: 170000
Current Global index: 180000
Current Global index: 190000
Current Global index: 200000
Current Global index: 210000


04/22/2022 11:57:58 AM:  Anomaly at Global index: 213090
04/22/2022 11:57:58 AM: max_mp: 2716.2331, metric:1071.7787: metric-max_mp: 1644.4543999999999 range: 1708.8236


Current Global index: 220000
Current Global index: 230000


04/22/2022 11:58:07 AM:  Anomaly at Global index: 237204
04/22/2022 11:58:07 AM: max_mp: 4256.8639, metric:2466.7595: metric-max_mp: 1790.1044000000002 range: 2442.4585


Current Global index: 240000
Current Global index: 250000
Current Global index: 260000
Current Global index: 270000
Current Global index: 280000
Current Global index: 290000
Current Global index: 300000
Current Global index: 310000
Current Global index: 320000
Current Global index: 330000
Current Global index: 340000
Current Global index: 350000
Current Global index: 360000
Current Global index: 370000
Current Global index: 380000
Current Global index: 390000
Current Global index: 400000
Current Global index: 410000
Current Global index: 420000
Current Global index: 430000
Current Global index: 440000
Current Global index: 450000
Current Global index: 460000
Current Global index: 470000
Current Global index: 480000


04/22/2022 11:59:42 AM:  Anomaly at Global index: 485354
04/22/2022 11:59:42 AM: max_mp: 5386.6102, metric:2327.0199: metric-max_mp: 3059.5903000000003 range: 3386.1303


Current Global index: 490000
Current Global index: 500000
Current Global index: 510000
Current Global index: 520000
Current Global index: 530000
Current Global index: 540000
Current Global index: 550000
Current Global index: 560000
Current Global index: 570000
Current Global index: 580000
Current Global index: 590000
Current Global index: 600000
Current Global index: 610000
Current Global index: 620000
Current Global index: 630000
Current Global index: 640000
Current Global index: 650000
Current Global index: 660000
Current Global index: 670000
Current Global index: 680000
Current Global index: 690000
Current Global index: 700000
Current Global index: 710000
Current Global index: 720000
Current Global index: 730000
Current Global index: 740000
Current Global index: 750000
Current Global index: 760000
Current Global index: 770000
Current Global index: 780000


04/22/2022 12:01:36 PM:  Anomaly at Global index: 787618
04/22/2022 12:01:36 PM: max_mp: 6206.1965, metric:2062.7696: metric-max_mp: 4143.4269 range: 4714.8508


Current Global index: 790000
Current Global index: 800000
Current Global index: 810000


04/22/2022 12:01:47 PM:  Anomaly at Global index: 816542
04/22/2022 12:01:47 PM: max_mp: 9845.8241, metric:3266.2714: metric-max_mp: 6579.5527 range: 6627.806699999999


Current Global index: 820000
Current Global index: 830000
Current Global index: 840000
Current Global index: 850000
Current Global index: 860000
Current Global index: 870000
Current Global index: 880000
Current Global index: 890000


In [6]:
# Persist the fitted model and the concatenated dataset into the sibling
# masters-thesis-graphing tree. Both files share one target directory; create it up
# front so a missing folder cannot discard the results of the long streaming run with
# a FileNotFoundError.
pickle_dir = "../masters-thesis-graphing/_data/BETH"
os.makedirs(pickle_dir, exist_ok=True)

with open(f"{pickle_dir}/model.pkl", 'wb') as handle:
    pickle.dump(outlier_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(f"{pickle_dir}/data.pkl", 'wb') as handle:
    pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)