# Imports

In [1]:
import logging
from src import common
from src.outlier_model import OutlierModel
import dill as pickle
import matplotlib

# Root logging format: "<timestamp>: <message>", timestamp as MM/DD/YYYY with a 12-hour clock.
logging.basicConfig(
    datefmt='%m/%d/%Y %I:%M:%S %p',
    format='%(asctime)s: %(message)s',
)

# Named logger for this notebook; records at INFO level and above pass through.
logger = logging.getLogger("TimeSeries")
logger.setLevel(logging.INFO)

# Output directory name — presumably where results for this dataset go; not referenced in the cells shown.
output_dir = 'pec_output'

# Dataset preprocessing

In [2]:
# Column roles (target, identifier, categorical columns, measurement label) come from a JSON descriptor.
column_types_loc = "#datasets/PEC/column_types.json"
column_types = common.json_load(column_types_loc)

target = column_types["target"]
identifier = column_types["identifier"]
cat_cols = column_types["categorical"]
measurement_label = column_types["measurement_label"]

# NOTE(review): unpickling can execute arbitrary code — only load dataset files from a trusted source.
with open("#datasets/PEC/dataset.pkl", "rb") as fh:
    data = pickle.load(fh)

# Give every distinct measurement label its own 1-based code. Rows of that measurement
# whose target is 1 are re-labelled with the code; fault_dict keeps the label -> code
# mapping for easier analysis.
fault_dict = {}
for fault_code, label in enumerate(data[measurement_label].unique(), start=1):
    data.loc[(data[measurement_label] == label) & (data[target] == 1), target] = fault_code
    fault_dict[label] = fault_code

# The measurement label is now encoded in the target column, so drop it.
data = data.drop(columns=[measurement_label])

# Fill NA - 'NA' for categorical, 0 for numerical
# categorical: fill, then force string dtype
data[cat_cols] = data[cat_cols].fillna("NA").astype(str)
# non-categorical: every column that is neither categorical nor the identifier
non_cat_cols = data.drop(columns=cat_cols + [identifier]).columns.tolist()
data[non_cat_cols] = data[non_cat_cols].fillna(0)


# Outlier_model
* outlier_model code that implements the outlier_model.predict() function

In [3]:
# Row window of the dataset used for this run.
start_index, end_index = 0, 1000000
data_test = data[start_index:end_index]

# Column whose values are handed to the outlier model.
outlier_key = "f_c"
# Number of leading rows passed to the constructor as the initial time series.
preload_size = 5000

# Model hyper-parameters, forwarded unchanged to OutlierModel.
m = 250
std_dev_mult, range_mult, recent_mult = 4, 0, 1

outlier_model = OutlierModel(
    time_series=data_test[:preload_size][outlier_key],
    m=m,
    std_dev_mult=std_dev_mult,
    range_mult=range_mult,
    recent_mult=recent_mult,
    egress=True,
)


# Simulate stream of data
## Inverse transform test dataset for evaluation

In [4]:
# Display the first rows of the slice selected for this run.
data_test.head()

Unnamed: 0,f_c,P,m_d,m_q,theta,P_ref,V_DC,V_phaseA,V_phaseB,V_phaseC,I_phaseA,I_phaseB,I_phaseC,fault,Unnamed: 15
0,50.0,2499.997221,311.0,0.0,312.486512,2500.0,800.0,2.487576,-270.569073,268.081497,-0.392875,-4.447507,4.840382,0,0
1,50.0,2499.99725,311.0,0.0,312.50222,2500.0,800.0,7.372088,-272.944264,265.572176,-0.308598,-4.494417,4.803015,0,1
2,50.0,2499.99728,311.0,0.0,312.517928,2500.0,800.0,12.254782,-275.252112,262.99733,-0.224245,-4.540218,4.764463,0,2
3,50.0,2499.99731,311.0,0.0,312.533636,2500.0,800.0,17.134452,-277.492044,260.357593,-0.139837,-4.584899,4.724735,0,3
4,50.0,2499.997341,311.0,0.0,312.549344,2500.0,800.0,22.009894,-279.66351,257.653616,-0.055394,-4.628448,4.683842,0,4


* Loading the whole DL model with preprocessors in each iteration is wasteful, but I wanted to send Alex at least some initial code.


In [5]:
# Simulate a stream: feed the records after the preload window to the model one by one.
# Only the outlier_key column is consumed, so iterate that single column instead of
# materialising a full row Series per record with iterrows().
# `fault` keeps the result of the most recent predict_one call
# (NOTE(review): return semantics of predict_one are not visible here — confirm).
fault = False
for index, value in data_test[preload_size:][outlier_key].items():
    outlier_model.train_one(value)
    fault = outlier_model.predict_one(index)
    # Progress marker every 10,000 index values.
    if index % 10000 == 0:
        print(f"Current Global index: {index}")


Current Global index: 10000
Current Global index: 20000
Current Global index: 30000


04/21/2022 02:47:47 PM:  Anomaly at Global index: 40341
04/21/2022 02:47:47 PM: max_mp: 1689.7735, metric:98.4065: metric-max_mp: 1591.367 range: 1689.7735


Current Global index: 40000


04/21/2022 02:47:49 PM:  Anomaly at Global index: 45783
04/21/2022 02:47:49 PM: max_mp: 46498.2142, metric:35662.835600000006: metric-max_mp: 10835.378599999996 range: 139.84790000000066


Current Global index: 50000
Current Global index: 60000
Current Global index: 70000
Current Global index: 80000
Current Global index: 90000
Current Global index: 100000
Current Global index: 110000


04/21/2022 02:48:16 PM:  Anomaly at Global index: 120038
04/21/2022 02:48:16 PM: max_mp: 0.1243, metric:0.0072: metric-max_mp: 0.1171 range: 0.1243


Current Global index: 120000
Current Global index: 130000
Current Global index: 140000
Current Global index: 150000


04/21/2022 02:48:33 PM:  Anomaly at Global index: 160053
04/21/2022 02:48:33 PM: max_mp: 0.1243, metric:0.0072: metric-max_mp: 0.1171 range: 0.1243


Current Global index: 160000
Current Global index: 170000
Current Global index: 180000
Current Global index: 190000


04/21/2022 02:48:48 PM:  Anomaly at Global index: 200098
04/21/2022 02:48:48 PM: max_mp: 0.0112, metric:0.0008: metric-max_mp: 0.0104 range: 0.0112


Current Global index: 200000
Current Global index: 210000
Current Global index: 220000
Current Global index: 230000


04/21/2022 02:49:03 PM:  Anomaly at Global index: 240054
04/21/2022 02:49:03 PM: max_mp: 1.6339, metric:0.09509999999999999: metric-max_mp: 1.5388 range: 1.6339


Current Global index: 240000
Current Global index: 250000
Current Global index: 260000
Current Global index: 270000


04/21/2022 02:49:17 PM:  Anomaly at Global index: 280395
04/21/2022 02:49:17 PM: max_mp: 7215.7619, metric:420.2196: metric-max_mp: 6795.5423 range: 7215.7619


Current Global index: 280000


04/21/2022 02:49:19 PM:  Anomaly at Global index: 285395
04/21/2022 02:49:19 PM: max_mp: 57649.2738, metric:47016.6066: metric-max_mp: 10632.667200000004 range: 75.36929999999847


Current Global index: 290000
Current Global index: 300000
Current Global index: 310000
Current Global index: 320000


In [6]:
# Serialize outlier_model to disk using the highest available pickle protocol.
model_path = "../masters-thesis-graphing/_data/PEC/outlier_model.pkl"
with open(model_path, 'wb') as out_file:
    pickle.dump(outlier_model, out_file, protocol=pickle.HIGHEST_PROTOCOL)
