#1 - Imports

In [None]:
# To install scikit learn 1.1.1
!pip install scikit-learn==1.1.1

Collecting scikit-learn==1.1.1
  Downloading scikit_learn-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading scikit_learn-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.4/30.4 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.2
    Uninstalling scikit-learn-1.3.2:
      Successfully uninstalled scikit-learn-1.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 1.13.0 requires scikit-learn>=1.2.2, but you have scikit-learn 1.1.1 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.1.1


In [None]:
# Reproducibility: seed every pseudo-random source used by this notebook.
import os
import random

import numpy as np

# Single seed shared by all generators below
seed_value = 42

# 1. Export the hash seed through the `PYTHONHASHSEED` environment variable.
#    NOTE(review): the running interpreter fixed its hash seed at start-up, so
#    this presumably only affects processes launched afterwards -- confirm.
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 2. Python's built-in `random` module
random.seed(seed_value)

# 3. NumPy's global (legacy) generator
np.random.seed(seed_value)

In [None]:
import pandas as pd
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import auc, roc_curve, accuracy_score, balanced_accuracy_score, f1_score, recall_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.svm import OneClassSVM
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [None]:
!pip install pyarrow



In [None]:
# Quick look at the test split: load it, then print column names, dtypes and
# non-null counts.
test = pd.read_parquet(path="data/test.parquet")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59435 entries, 0 to 59434
Data columns (total 68 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Protocol                  59435 non-null  float32
 1   Flow Duration             59435 non-null  float64
 2   Total Fwd Packets         59435 non-null  float64
 3   Total Backward Packets    59435 non-null  float64
 4   Fwd Packets Length Total  59435 non-null  float64
 5   Bwd Packets Length Total  59435 non-null  float64
 6   Fwd Packet Length Max     59435 non-null  float32
 7   Fwd Packet Length Min     59435 non-null  float32
 8   Fwd Packet Length Mean    59435 non-null  float32
 9   Fwd Packet Length Std     59435 non-null  float32
 10  Bwd Packet Length Max     59435 non-null  float32
 11  Bwd Packet Length Min     59435 non-null  float32
 12  Bwd Packet Length Mean    59435 non-null  float32
 13  Bwd Packet Length Std     59435 non-null  float32
 14  Flow B

#2 - Load Test Data

*   Load data from parquet or csv;
*   Map 'Heartbleed' and 'Infiltration' attack classes to 'Unknown';

In [None]:
# Load the test split (Parquet by default; the CSV line is the alternative source).
test = pd.read_parquet(path="data/test.parquet")
# test = pd.read_csv("data/test.csv")

# Labels: fold the 'Heartbleed' and 'Infiltration' classes into a single "Unknown" class.
y = test["Y"].replace(to_replace=["Heartbleed", "Infiltration"], value="Unknown")
# Features: every column except the label column.
x = test.drop(labels=['Y'], axis="columns")

# Class distribution after the relabelling (displayed as the cell output).
y.value_counts()

Unnamed: 0_level_0,count
Y,Unnamed: 1_level_1
Benign,56468
(D)DOS,584
Port Scan,584
Botnet,584
Brute Force,584
Web Attack,584
Unknown,47


#3 - Load Models


*   the pipelines with feature scaler and optimized model combined for binary detection and multi-class classification;
*   the individual feature scalers and optimized models;
*   Random Forest (RF) optimized baseline model and feature scaler;
*   Optimized models following Bovenzi et al. for comparative analysis.




In [None]:
 # Load every persisted pipeline, model and scaler used in the rest of the notebook.
def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    The file is opened in a `with` block so the handle is closed even when
    unpickling raises.

    SECURITY: `pickle.load` can execute arbitrary code embedded in the file;
    only load model files that come from a trusted source.
    """
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Optimized pipelines
stage1 = _load_pickle("models/stage1_ocsvm.p")
stage2 = _load_pickle("models/stage2_rf.p")

# My Work
stage1_iforest = _load_pickle("models/stage1_IsolationForest.p")
# with Scaler Transformer
stage1_iforestst = _load_pickle("models/stage1_IsolationForest_ST.p")

# Individual feature scalers and classification models
stage1_model = _load_pickle("models/stage1_ocsvm_model.p")
stage1_scaler = _load_pickle("models/stage1_ocsvm_scaler.p")
stage2_model = _load_pickle("models/stage2_rf_model.p")
stage2_scaler = _load_pickle("models/stage2_rf_scaler.p")

# RF baseline model and feature scaler
baseline_rf = _load_pickle("models/baseline_rf.p")
baseline_rf_scaler = _load_pickle("models/baseline_rf_scaler.p")

# Optimized models for Bovenzi et al.
# (stage 1 is a Keras model stored as HDF5, stage 2 is a pickled estimator)
from tensorflow import keras
sota_stage1 = keras.models.load_model("models/sota_stage1.h5")
sota_stage2 = _load_pickle("models/sota_stage2.p")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
# Display the stage-1 pipeline loaded from models/stage1_ocsvm.p.
stage1

In [None]:
# Display the stage-1 alternative loaded from models/stage1_IsolationForest.p.
stage1_iforest

In [None]:
# Display the stage-1 alternative loaded from models/stage1_IsolationForest_ST.p.
stage1_iforestst

In [None]:
# Display the stage-2 pipeline loaded from models/stage2_rf.p.
stage2

In [None]:
# Display the baseline model loaded from models/baseline_rf.p.
baseline_rf

In [None]:
# Display the Keras model loaded from models/sota_stage1.h5.
sota_stage1

<Functional name=model_600, built=True>

In [None]:
# Display the estimator loaded from models/sota_stage2.p.
sota_stage2

#3 -  Thresholds $\tau_B$, $\tau_M$ and $\tau_U$

These balanced thresholds were obtained experimentally; see the full paper for more details.

In [None]:
# Balanced operating point used by the multi-stage predictors:
#   tau_b -- stage-1 anomaly-score cut-off separating "Benign" from "Attack"
#   tau_m -- minimum stage-2 class probability required to keep a class label
#   tau_u -- anomaly-score cut-off below which an "Unknown" sample is relabelled "Benign"
tau_b, tau_m, tau_u = (
    -0.0002196942507948895,
    0.98,
    0.0040588613744241275,
)

#4 - Evaluation of Time Complexity

##4.1 - Definitions

In [None]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
 def hids_predict(x, tau_b, tau_m, tau_u):
    """Label every row of `x` with the multi-stage detector (`stage1` + `stage2`).

    Steps:
      1. The negated `stage1.decision_function` output is used as an anomaly
         score. Rows scoring below `tau_b` are labelled "Benign"; all other
         rows are provisionally labelled "Attack".
      2. `stage2.predict_proba` is applied to the "Attack" rows. A row keeps
         its most probable class only when that probability exceeds `tau_m`;
         otherwise it becomes "Unknown".
      3. "Unknown" rows whose anomaly score is below `tau_u` are relabelled
         "Benign".

    Args:
        x: Feature matrix accepted by both pipelines. It must support
            boolean-mask row selection (`x[mask]`).
        tau_b: Stage-1 anomaly-score threshold (benign vs. attack).
        tau_m: Minimum stage-2 class probability needed to accept a class.
        tau_u: Anomaly-score threshold used to send "Unknown" rows back to
            "Benign".

    Returns:
        1-D object ndarray with one label per row: "Benign", "Unknown" or one
        of `stage2.classes_`.
    """
    # invert sign to act as anomaly score
    anomaly_score = -stage1.decision_function(x)
    pred_1 = np.where(anomaly_score < tau_b, "Benign", "Attack").astype(object)

    attack_mask = pred_1 == "Attack"
    if not attack_mask.any():
        # Nothing was flagged: skip stage 2, because scikit-learn estimators
        # raise on an input with zero rows.
        return pred_1

    proba_2 = stage2.predict_proba(x[attack_mask])
    pred_2 = np.where(
        np.max(proba_2, axis=1) > tau_m,
        stage2.classes_[np.argmax(proba_2, axis=1)],
        "Unknown").astype(object)

    # Extension stage: low-scoring "Unknown" rows are treated as benign after all.
    unknown_mask = pred_2 == "Unknown"
    pred_2[unknown_mask] = np.where(
        anomaly_score[attack_mask][unknown_mask] < tau_u, "Benign", "Unknown")

    pred_1[attack_mask] = pred_2
    return pred_1

In [None]:
 def hids_predictIF(x, tau_b, tau_m, tau_u):
    """Label every row of `x` using `stage1_iforest` as first stage and `stage2` as second.

    Steps:
      1. The negated `stage1_iforest.decision_function` output is used as an
         anomaly score. Rows scoring above `tau_b` are provisionally labelled
         "Attack"; all other rows are labelled "Benign".
      2. `stage2.predict_proba` is applied to the "Attack" rows. A row keeps
         its most probable class only when that probability exceeds `tau_m`;
         otherwise it becomes "Unknown".
      3. "Unknown" rows whose anomaly score is below `tau_u` are relabelled
         "Benign".

    Args:
        x: Feature matrix accepted by both models. It must support
            boolean-mask row selection (`x[mask]`).
        tau_b: Stage-1 anomaly-score threshold (benign vs. attack).
        tau_m: Minimum stage-2 class probability needed to accept a class.
        tau_u: Anomaly-score threshold used to send "Unknown" rows back to
            "Benign".

    Returns:
        1-D object ndarray with one label per row: "Benign", "Unknown" or one
        of `stage2.classes_`.
    """
    # invert sign to act as anomaly score
    anomaly_score = -stage1_iforest.decision_function(x)
    pred_1 = np.where(anomaly_score > tau_b, "Attack", "Benign").astype(object)

    attack_mask = pred_1 == "Attack"
    if not attack_mask.any():
        # Nothing was flagged: skip stage 2, because scikit-learn estimators
        # raise on an input with zero rows.
        return pred_1

    proba_2 = stage2.predict_proba(x[attack_mask])
    pred_2 = np.where(
        np.max(proba_2, axis=1) > tau_m,
        stage2.classes_[np.argmax(proba_2, axis=1)],
        "Unknown").astype(object)

    # Extension stage: low-scoring "Unknown" rows are treated as benign after all.
    unknown_mask = pred_2 == "Unknown"
    pred_2[unknown_mask] = np.where(
        anomaly_score[attack_mask][unknown_mask] < tau_u, "Benign", "Unknown")

    pred_1[attack_mask] = pred_2
    return pred_1

In [None]:
 def hids_predictIFST(x, tau_b, tau_m, tau_u):
    """Label every row of `x` using `stage1_iforestst` as first stage and `stage2` as second.

    Steps:
      1. The negated `stage1_iforestst.decision_function` output is used as an
         anomaly score. Rows scoring above `tau_b` are provisionally labelled
         "Attack"; all other rows are labelled "Benign".
      2. `stage2.predict_proba` is applied to the "Attack" rows. A row keeps
         its most probable class only when that probability exceeds `tau_m`;
         otherwise it becomes "Unknown".
      3. "Unknown" rows whose anomaly score is below `tau_u` are relabelled
         "Benign".

    Args:
        x: Feature matrix accepted by both models. It must support
            boolean-mask row selection (`x[mask]`).
        tau_b: Stage-1 anomaly-score threshold (benign vs. attack).
        tau_m: Minimum stage-2 class probability needed to accept a class.
        tau_u: Anomaly-score threshold used to send "Unknown" rows back to
            "Benign".

    Returns:
        1-D object ndarray with one label per row: "Benign", "Unknown" or one
        of `stage2.classes_`.
    """
    # invert sign to act as anomaly score
    anomaly_score = -stage1_iforestst.decision_function(x)
    pred_1 = np.where(anomaly_score > tau_b, "Attack", "Benign").astype(object)

    attack_mask = pred_1 == "Attack"
    if not attack_mask.any():
        # Nothing was flagged: skip stage 2, because scikit-learn estimators
        # raise on an input with zero rows.
        return pred_1

    proba_2 = stage2.predict_proba(x[attack_mask])
    pred_2 = np.where(
        np.max(proba_2, axis=1) > tau_m,
        stage2.classes_[np.argmax(proba_2, axis=1)],
        "Unknown").astype(object)

    # Extension stage: low-scoring "Unknown" rows are treated as benign after all.
    unknown_mask = pred_2 == "Unknown"
    pred_2[unknown_mask] = np.where(
        anomaly_score[attack_mask][unknown_mask] < tau_u, "Benign", "Unknown")

    pred_1[attack_mask] = pred_2
    return pred_1

In [None]:
 def hids_sota_predict(x, tau_b, tau_m):
    """Label every row of `x` with the two-stage comparison pipeline (`sota_stage1` + `sota_stage2`).

    Steps:
      1. `x` is scaled with `stage1_scaler` and passed through
         `sota_stage1.predict`. The per-row sum of squared differences between
         the scaled input and that output is the anomaly score. Rows scoring
         below `tau_b` are labelled "Benign"; all other rows are provisionally
         labelled "Attack".
      2. `x` is scaled with `stage2_scaler`, and `sota_stage2.predict_proba`
         is applied to the "Attack" rows. A row keeps its most probable class
         only when that probability exceeds `tau_m`; otherwise it becomes
         "Unknown".

    Args:
        x: Feature matrix accepted by both scalers.
        tau_b: Anomaly-score threshold (benign vs. attack).
        tau_m: Minimum stage-2 class probability needed to accept a class.

    Returns:
        1-D object ndarray with one label per row: "Benign", "Unknown" or a
        class label taken from `stage2.classes_`.
    """
    x_s1 = stage1_scaler.transform(x)
    x_pred = sota_stage1.predict(x_s1)
    # Per-row squared error between scaled input and model output
    proba_1 = np.sum((x_s1 - x_pred) ** 2, axis=1)
    pred_1 = np.where(proba_1 < tau_b, "Benign", "Attack").astype(object)

    attack_mask = pred_1 == "Attack"
    if not attack_mask.any():
        # Nothing was flagged: skip stage 2, because scikit-learn estimators
        # raise on an input with zero rows.
        return pred_1

    x_s2 = stage2_scaler.transform(x)
    proba_2 = sota_stage2.predict_proba(x_s2[attack_mask])
    # NOTE(review): labels are looked up in `stage2.classes_`, not in
    # `sota_stage2.classes_`. This is only correct if both classifiers use the
    # same class order -- confirm.
    pred_1[attack_mask] = np.where(
        np.max(proba_2, axis=1) > tau_m,
        stage2.classes_[np.argmax(proba_2, axis=1)],
        "Unknown")
    return pred_1

##4.2 - Max F-score thresholds

In [None]:
 %%timeit -r3 -n3 -p6
tau_b = -0.0002196942507948895
tau_m = 0.98
tau_u = 0.004530129828299084
y = hids_predict(x, tau_b, tau_m, tau_u)

6.76616 s ± 407.545 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


##4.3 - Max bACC thresholds

In [None]:
 %%timeit -r3 -n3 -p6
tau_b = -0.0004064190600459828
tau_m = 0.98
tau_u = 0.0006590265510403005
y = hids_predict(x, tau_b, tau_m, tau_u)

6.69243 s ± 294.02 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


##4.4 - Best "balanced" thresholds

In [None]:
%%timeit -r3 -n3 -p6
tau_b = -0.0002196942507948895
tau_m = 0.98
tau_u = 0.0040588613744241275
y = hids_predict(x, tau_b, tau_m, tau_u)

6.60928 s ± 230.043 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


##4.5 - Baseline RF

In [None]:
 threshold = 0.43  # baseline RF: top-class probability must exceed this, otherwise the sample is labelled 'Unknown'

In [None]:
%%timeit -r3 -n3 -p6
x_s = baseline_rf_scaler.transform(x)
y_proba = baseline_rf.predict_proba(x_s)
y_pred = np.where(np.max(y_proba, axis=1) > threshold, baseline_rf.classes_[np.argmax(y_proba, axis=1)], 'Unknown')

1.27515 s ± 55.0791 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


##4.6 - Bovenzi et al.

In [None]:
# Thresholds experimentally optimized for the Bovenzi et al. pipeline.
# NOTE: this rebinds the notebook-level tau_b and tau_m defined earlier;
# re-run the earlier threshold cell before reusing those values.
tau_b, tau_m = 0.7580776764761945, 0.98

In [None]:
%%timeit -r3 -n3 -p6
y = hids_sota_predict(x, tau_b, tau_m)

[1m1858/1858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
[1m1858/1858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
[1m1858/1858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
[1m1858/1858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
[1m1858/1858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
[1m1858/1858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
[1m1858/1858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
[1m1858/1858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
[1m1858/1858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
6.45925 s ± 376.207 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


##4.7 - Isolation Forest in First Stage

In [None]:
%%timeit -r3 -n3 -p6
tau_b = -0.180834
tau_m = 0.98
tau_u = 0.08210494936373211
y = hids_predictIF(x, tau_b, tau_m, tau_u)

3.62471 s ± 68.2477 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


##4.8 - Isolation Forest with Scaler Transformer

In [None]:
%%timeit -r3 -n3 -p6
tau_b = -0.122349
tau_m = 0.98
tau_u = 0.06766017781741557
y = hids_predictIFST(x, tau_b, tau_m, tau_u)

4.4663 s ± 24.4387 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


Checking the approximate time spent on the data transformation...

In [None]:
%%timeit -r3 -n3 -p6
scaler = QuantileTransformer(output_distribution='normal')
x_s = scaler.fit_transform(x)

1.12134 s ± 131.077 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)
