In [1]:
###load example training data

import pandas as pd
import zipfile

# Read the first member of the training archive as a CSV file.
# The nested `with` blocks guarantee that both the member stream and the
# archive are closed even if opening the member or parsing the CSV raises.
with zipfile.ZipFile('./data/SWaT_train.zip', "r") as z_tr:
    with z_tr.open(z_tr.namelist()[0]) as f_tr:
        train_df = pd.read_csv(f_tr)

In [2]:
# Preview the first rows of the training frame as loaded; at this point it
# still contains the 'Timestamp' and 'Normal/Attack' columns.
train_df.head()

Unnamed: 0,Timestamp,FIT101,LIT101,MV101,P101,P102,AIT201,AIT202,AIT203,FIT201,...,P501,P502,PIT501,PIT502,PIT503,FIT601,P601,P602,P603,Normal/Attack
0,22/12/2015 4:00:00 PM,2.470294,261.5804,2,2,1,244.3284,8.19008,306.101,2.471278,...,1,1,10.02948,0.0,4.277749,0.000256,1,1,1,Normal
1,22/12/2015 4:00:05 PM,2.425456,260.0495,2,2,1,244.5847,8.19008,306.101,2.465127,...,1,1,10.02948,0.0,4.277749,0.000256,1,1,1,Normal
2,22/12/2015 4:00:10 PM,2.630753,261.7766,2,2,1,244.809,8.19008,306.101,2.470894,...,1,1,10.02948,0.0,4.277749,0.000256,1,1,1,Normal
3,22/12/2015 4:00:15 PM,2.61602,260.8346,2,2,1,245.0333,8.19008,305.8703,2.474097,...,1,1,10.02948,0.0,4.277749,0.000256,1,1,1,Normal
4,22/12/2015 4:00:20 PM,2.556769,261.6589,2,2,1,245.4499,8.19008,305.8703,2.471663,...,1,1,10.02948,0.0,4.277749,0.000256,1,1,1,Normal


In [3]:
### drop the columns that must not be turned into features
unused_columns = ['Timestamp', 'Normal/Attack']
train_df = train_df.drop(columns=unused_columns)
train_df.head()

Unnamed: 0,FIT101,LIT101,MV101,P101,P102,AIT201,AIT202,AIT203,FIT201,MV201,...,FIT504,P501,P502,PIT501,PIT502,PIT503,FIT601,P601,P602,P603
0,2.470294,261.5804,2,2,1,244.3284,8.19008,306.101,2.471278,2,...,0.0,1,1,10.02948,0.0,4.277749,0.000256,1,1,1
1,2.425456,260.0495,2,2,1,244.5847,8.19008,306.101,2.465127,2,...,0.0,1,1,10.02948,0.0,4.277749,0.000256,1,1,1
2,2.630753,261.7766,2,2,1,244.809,8.19008,306.101,2.470894,2,...,0.0,1,1,10.02948,0.0,4.277749,0.000256,1,1,1
3,2.61602,260.8346,2,2,1,245.0333,8.19008,305.8703,2.474097,2,...,0.0,1,1,10.02948,0.0,4.277749,0.000256,1,1,1
4,2.556769,261.6589,2,2,1,245.4499,8.19008,305.8703,2.471663,2,...,0.0,1,1,10.02948,0.0,4.277749,0.000256,1,1,1


In [4]:
### define features and the PARAnomalyExplainer
from pars import NumericFeature, CategoricFeature, PARAnomalyExplainer

# Build one feature descriptor per training column.
# A column with at most 5 distinct values is described as categoric (with its
# observed values); any other column is described as numeric (with its
# min / max / mean / std taken from the training data).
features = []
for name in train_df.columns:
    column = train_df[name]
    distinct_values = column.unique()
    if len(distinct_values) <= 5:
        feature = CategoricFeature(name, values=distinct_values.tolist())
    else:
        feature = NumericFeature(
            name,
            min_value=column.min(),
            max_value=column.max(),
            mean_value=column.mean(),
            std_value=column.std(),
        )
    features.append(feature)

parexp = PARAnomalyExplainer(features)

In [5]:
### train the PARAnomalyExplainer on the training frame
# Keyword options are collected in one place so they are easy to spot and tune.
training_options = {
    'max_predicts4rule_mining': 75,
    'max_times4rule_mining': 5,
    'set_seed': False,
}
parexp.train(train_df, **training_options)

### the trained PARAnomalyExplainer can also be saved to files
parexp.save_model('./pars4swat')

In [6]:
#### load the test data
# Read the first member of the test archive as a CSV file.
# The nested `with` blocks guarantee that both the member stream and the
# archive are closed even if opening the member or parsing the CSV raises.
with zipfile.ZipFile('./data/SWaT_test.zip', "r") as z_tr:
    with z_tr.open(z_tr.namelist()[0]) as f_tr:
        test_df = pd.read_csv(f_tr)

test_df.head()

Unnamed: 0,Timestamp,FIT101,LIT101,MV101,P101,P102,AIT201,AIT202,AIT203,FIT201,...,P502,PIT501,PIT502,PIT503,FIT601,P601,P602,P603,Normal/Attack,label
0,28/12/2015 10:00:00 AM,2.427057,522.8467,2,2,1,262.0161,8.396437,328.6337,2.445391,...,1,250.8652,1.649953,189.5988,0.000128,1,1,1,Normal,0
1,28/12/2015 10:00:05 AM,2.609294,523.8673,2,2,1,262.0161,8.394514,328.6337,2.44411,...,1,250.753,1.649953,189.5027,0.000128,1,1,1,Normal,0
2,28/12/2015 10:00:10 AM,2.630433,524.1028,2,2,1,262.0161,8.394514,328.6337,2.441803,...,1,251.1055,1.649953,189.8231,0.000128,1,1,1,Normal,0
3,28/12/2015 10:00:15 AM,2.567979,522.6505,2,2,1,262.0161,8.390669,328.6337,2.441803,...,1,251.1856,1.649953,189.9994,0.000128,1,1,1,Normal,0
4,28/12/2015 10:00:20 AM,2.489191,522.729,2,2,1,262.0161,8.390669,328.3773,2.439881,...,1,251.1856,1.649953,189.8872,0.000128,1,1,1,Normal,0


In [7]:
### keep only the test rows labelled as anomalies (label == 1)
is_anomaly = test_df['label'] == 1
test_df = test_df[is_anomaly]

# Project the anomalous rows onto the feature columns, in the same order as
# the `features` list, and hand them over as a plain array.
featnames = [feat.name for feat in features]
anomalies = test_df.loc[:, featnames].values

In [8]:
### Load pretrained PARAnomalyExplainer from files
# Create an explainer without passing a feature list, then load the model
# from './pars4swat' -- the same path that parexp.save_model() was given.
# NOTE(review): the following cells keep calling `parexp`, not `pars2load`,
# so the reloaded model is never exercised here -- confirm this is intended.
pars2load = PARAnomalyExplainer()
pars2load.load_model('./pars4swat')

<pars.anomaly_explainer.PARAnomalyExplainer at 0x1a07203c1c8>

In [9]:
### PARAnomalyExplainer can report the top-k violated PARs for a single anomaly
# Here: the first extracted anomaly, limited to the 5 top rules.
first_anomaly = anomalies[0]
rules = parexp.find_violated_pars(first_anomaly, topk=5)

print('Violated PARs:')
for rule in rules:
    # One line per rule: the rule itself followed by its support and confidence.
    print(f'{rule}', f'sup: {rule.support}', f'conf: {rule.conf}', sep=', ')

Violated PARs:
PIT501<252.15489959716797 and 262.59295654296875<=AIT201<265.79725646972656 and AIT503<263.9547424316406 ---> P203=2, sup: 0.0133756038647343, conf: 0.9984973703981967
PIT501<252.15489959716797 and MV301=1 and AIT503<263.9547424316406 and 262.59295654296875<=AIT201<265.79725646972656 ---> P203=2, sup: 0.012811996779388084, conf: 0.9984313725490197
AIT503<263.9547424316406 and 262.59295654296875<=AIT201<265.79725646972656 and 11.362474918365479<=AIT504<13.015894889831543 ---> P203=2, sup: 0.01252012882447665, conf: 0.9983948635634028
AIT503<263.9547424316406 and 0.3034942001104355<=FIT504<0.30887484550476074 and 262.59295654296875<=AIT201<265.79725646972656 and 11.362474918365479<=AIT504<13.015894889831543 ---> P203=2, sup: 0.0125, conf: 0.9983922829581995
PIT501<252.15489959716797 and AIT503<263.9547424316406 and 0.3034942001104355<=FIT504<0.30887484550476074 and 262.59295654296875<=AIT201<265.79725646972656 ---> P205=2, sup: 0.012489935587761675, conf: 0.998390989541432

In [10]:
### a summarized explanation can also be computed for a list of anomalies
explanation = parexp.explain_anomalies(anomalies[0:20])

# Each summary item is a tuple whose elements are, in order:
# (anomalous feature, probability, violated rule, rule confidence,
#  rule support, violated locations, related features)
# The labels below are printed in that same positional order.
summary_labels = (
    'anomalous feature',
    'probability',
    'representive violated PAR',
    'confidence of representive PAR',
    'support of the representive PAR',
    'violated locations',
    'related features in the representive PAR',
)
for exp_item in explanation.summary():
    for idx, label in enumerate(summary_labels):
        print(f'{label}: {exp_item[idx]}')
    print()

anomalous feature: P203
probability: 0.28112548742994664
representive violated PAR: PIT501<252.15489959716797 and 262.59295654296875<=AIT201<265.79725646972656 and AIT503<263.9547424316406 ---> P203=2
confidence of representive PAR: 0.9984973703981967
support of the representive PAR: 0.0133756038647343
violated locations: [0, 4, 5, 9, 10, 13, 14, 15]
related features in the representive PAR: ['P203', 'PIT501', 'AIT201', 'AIT503']

anomalous feature: FIT301
probability: 0.2403296129010238
representive violated PAR: FIT101>=2.537071943283081 and P101=1 and AIT503<263.9547424316406 ---> FIT301<1.940236508846283
confidence of representive PAR: 1.0
support of the representive PAR: 0.015780998389694042
violated locations: [1, 2, 3, 6, 7, 8, 11, 12]
related features in the representive PAR: ['FIT301', 'FIT101', 'P101', 'AIT503']

anomalous feature: DPIT301
probability: 0.16022001308567668
representive violated PAR: FIT101>=2.537071943283081 and P101=1 and AIT503<263.9547424316406 ---> 0.64824