In [1]:
import json
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from efc import EnergyBasedFlowClassifier
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

from anomaly_flow.utils.binary_processing import split_flag_columns

In [2]:
# Dataset the models are fitted on; interpolated into
# "./datasets/{TRAIN_DATASET}-downsample.csv.gz" when the training frame is loaded.
TRAIN_DATASET = "NF-CSE-CIC-IDS2018-v2-DDoS"
# Dataset used for the cross-evaluation; interpolated into the same path pattern
# and into the name of the JSON results file.
# NOTE(review): this is an empty string, so the cross-dataset path resolves to
# "./datasets/-downsample.csv.gz" -- confirm the intended dataset name.
CROSS_DATASET = ""

In [3]:
# Column names covered by the dtype mapping handed to `pd.read_csv`.
_NETFLOW_COLUMNS = (
    "IPV4_SRC_ADDR", "L4_SRC_PORT", "IPV4_DST_ADDR", "L4_DST_PORT",
    "PROTOCOL", "L7_PROTO", "IN_BYTES", "IN_PKTS", "OUT_BYTES", "OUT_PKTS",
    "TCP_FLAGS", "CLIENT_TCP_FLAGS", "SERVER_TCP_FLAGS",
    "FLOW_DURATION_MILLISECONDS", "DURATION_IN", "DURATION_OUT",
    "MIN_TTL", "MAX_TTL", "LONGEST_FLOW_PKT", "SHORTEST_FLOW_PKT",
    "MIN_IP_PKT_LEN", "MAX_IP_PKT_LEN",
    "SRC_TO_DST_SECOND_BYTES", "DST_TO_SRC_SECOND_BYTES",
    "RETRANSMITTED_IN_BYTES", "RETRANSMITTED_IN_PKTS",
    "RETRANSMITTED_OUT_BYTES", "RETRANSMITTED_OUT_PKTS",
    "SRC_TO_DST_AVG_THROUGHPUT", "DST_TO_SRC_AVG_THROUGHPUT",
    "NUM_PKTS_UP_TO_128_BYTES", "NUM_PKTS_128_TO_256_BYTES",
    "NUM_PKTS_256_TO_512_BYTES", "NUM_PKTS_512_TO_1024_BYTES",
    "NUM_PKTS_1024_TO_1514_BYTES",
    "TCP_WIN_MAX_IN", "TCP_WIN_MAX_OUT",
    "ICMP_TYPE", "ICMP_IPV4_TYPE",
    "DNS_QUERY_ID", "DNS_QUERY_TYPE", "DNS_TTL_ANSWER",
    "FTP_COMMAND_RET_CODE", "Attack", "Label",
)

# Every column is parsed as float32 unless it is listed here.
_NETFLOW_DTYPE_OVERRIDES = {
    "IPV4_SRC_ADDR": "object",
    "IPV4_DST_ADDR": "object",
    "Attack": "object",
    "L7_PROTO": "float64",
    "SRC_TO_DST_SECOND_BYTES": "float64",
    "DST_TO_SRC_SECOND_BYTES": "float64",
    "TCP_FLAGS": "int32",
    "CLIENT_TCP_FLAGS": "int32",
    "SERVER_TCP_FLAGS": "int32",
}

# Column name -> dtype string, used as the `dtype=` argument when reading the CSVs.
dtypes_netflow = {
    column: _NETFLOW_DTYPE_OVERRIDES.get(column, "float32")
    for column in _NETFLOW_COLUMNS
}

In [4]:
# Columns removed from both frames before any model sees them.
FEATURES_TO_DROP = [
    # endpoint addresses
    "IPV4_SRC_ADDR",
    "IPV4_DST_ADDR",
    # application protocol and port numbers
    "L7_PROTO",
    "L4_SRC_PORT",
    "L4_DST_PORT",
    "FTP_COMMAND_RET_CODE",
    # textual attack name (the numeric `Label` column is kept as the target)
    "Attack",
]

In [5]:
# Load the down-sampled training flows, parsing each column with its declared dtype.
train_csv_path = f"./datasets/{TRAIN_DATASET}-downsample.csv.gz"
df = pd.read_csv(train_csv_path, dtype=dtypes_netflow)

In [6]:
# Show column names, non-null counts and dtypes of the freshly loaded training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478055 entries, 0 to 478054
Data columns (total 45 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   IPV4_SRC_ADDR                478055 non-null  object 
 1   L4_SRC_PORT                  478055 non-null  float32
 2   IPV4_DST_ADDR                478055 non-null  object 
 3   L4_DST_PORT                  478055 non-null  float32
 4   PROTOCOL                     478055 non-null  float32
 5   L7_PROTO                     478055 non-null  float64
 6   IN_BYTES                     478055 non-null  float32
 7   IN_PKTS                      478055 non-null  float32
 8   OUT_BYTES                    478055 non-null  float32
 9   OUT_PKTS                     478055 non-null  float32
 10  TCP_FLAGS                    478055 non-null  int32  
 11  CLIENT_TCP_FLAGS             478055 non-null  int32  
 12  SERVER_TCP_FLAGS             478055 non-null  int32  
 13 

In [7]:
# Load the down-sampled cross-evaluation flows with the same dtype mapping as the training frame.
cross_csv_path = f"./datasets/{CROSS_DATASET}-downsample.csv.gz"
cross_df = pd.read_csv(cross_csv_path, dtype=dtypes_netflow)

In [8]:
# Largest finite float32 value; cells at or above it are masked out during cleaning.
threshold = np.finfo("float32").max

In [9]:
# Turn infinities into NaN and discard every row that contains a missing value.
df = df.replace([np.inf, -np.inf], np.nan).dropna()
# Report the class balance while the `Attack` column is still present.
print(df['Attack'].value_counts())
df = df.drop(columns=FEATURES_TO_DROP)
# Cells that are not strictly below `threshold` become NaN here and are removed
# by the final dropna, after the flag columns have been split.
df = split_flag_columns(df.where(df < threshold))
df = df.dropna()

Attack
Benign            459044
Exploits            6310
Fuzzers             4462
Generic             3312
Reconnaissance      2556
DoS                 1159
Analysis             460
Backdoor             434
Shellcode            285
Worms                 33
Name: count, dtype: int64
Using cached file: 2e1b49bdf7ef775ccf86409f36645a01.


In [10]:
# Turn infinities into NaN and discard every row that contains a missing value.
cross_df = cross_df.replace([np.inf, -np.inf], np.nan).dropna()
# Report the class balance while the `Attack` column is still present.
print(cross_df['Attack'].value_counts())
cross_df = cross_df.drop(columns=FEATURES_TO_DROP)
# Cells that are not strictly below `threshold` become NaN here and are removed
# by the final dropna, after the flag columns have been split.
cross_df = split_flag_columns(cross_df.where(cross_df < threshold))
cross_df = cross_df.dropna()

Attack
Benign                    4990670
DDOS attack-HOIC           324257
DDoS attacks-LOIC-HTTP      92190
DDOS attack-LOIC-UDP          634
Name: count, dtype: int64
Using cached file: 38e466053581bb706ccdb35435c58a35.


In [11]:
# Separate the feature matrix from the `Label` target for both datasets.
# NOTE(review): 'Unnamed: 0' is dropped only from the cross frame; this raises
# KeyError if that column is absent -- confirm both CSVs expose the same
# feature columns, since the scaler below is fitted on the training features.
cross_x, cross_y = cross_df.drop(['Label', 'Unnamed: 0'], axis=1), cross_df['Label']

X, y = df.drop(['Label'], axis=1), df['Label']

# Stratified hold-out split of the training dataset (33% kept for testing).
X_train, X_test, y_train, y_test = train_test_split(
                                        X, y, stratify=y,
                                        test_size=0.33, random_state=42
                                   )
# Stratified split of the cross dataset; 90% of it forms the cross test set.
cross_x_train, cross_x_test, cross_y_train, cross_y_test = train_test_split(
                                                                cross_x, cross_y,
                                                                stratify=cross_y,
                                                                test_size=0.9,
                                                                random_state=42
                                                           )

# Rescale the features: the scaler is fitted on the training split only and
# then applied unchanged to every other split.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

cross_x_train = scaler.transform(cross_x_train)
# Fixed: the method is `transform` (the previous `transfor` raised AttributeError).
cross_x_test = scaler.transform(cross_x_test)

In [14]:
# Hyper-parameter search spaces, one per algorithm.

# Random forest
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Logistic regression ('l1' needs a solver that supports it, e.g. liblinear or saga)
param_grid_lr = dict(
    C=[0.1, 1, 10],
    penalty=['l1', 'l2'],
)

# Isolation forest
param_grid_if = dict(
    n_estimators=[50, 100, 150],
    contamination=[0.01, 0.05, 0.1, 0.2],
)

# XGBoost
param_grid_xgb = dict(
    n_estimators=[50, 100, 150],
    max_depth=[3, 6, 9],
    learning_rate=[0.1, 0.01, 0.001],
)

# Multilayer perceptron
param_grid_mlp = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 25), (100, 50)],
    activation=['relu', 'tanh'],
    max_iter=[200, 300, 400],
)

# Support vector machine
param_grid_svm = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto', 0.1, 1],
)

# Local outlier factor
param_grid_lof = dict(
    contamination=[0.01, 0.02, 0.03, 0.04, 0.05],
)

# The energy-based flow classifier is searched with an empty grid.
param_grid_efc = dict()

In [15]:
# Base estimators handed to the grid search. Stochastic estimators are seeded so
# that re-running the notebook gives comparable results.
rf_classifier = RandomForestClassifier(random_state=42)
# 'saga' supports both the 'l1' and 'l2' penalties searched in the grid;
# 'lbfgs' rejects 'l1', which made those candidates fail to fit.
lr_classifier = LogisticRegression(solver='saga', max_iter=1000, random_state=42)
if_classifier = IsolationForest(random_state=42)
xgb_classifier = xgb.XGBClassifier(random_state=42)
mlp_classifier = MLPClassifier(random_state=42)
svm_classifier = SVC(random_state=42)
# novelty=True is required for LocalOutlierFactor to predict on unseen data.
lof_classifier = LocalOutlierFactor(novelty=True)
efc_classifier = EnergyBasedFlowClassifier()


In [16]:
def anomaly_roc_auc(estimator, X, y):
    """ROC AUC scorer for anomaly detectors, usable as a GridSearchCV `scoring` callable.

    `decision_function` of an outlier detector such as IsolationForest is lower
    for more abnormal samples, so the scores are negated before ranking. This is
    the same sign convention the evaluation loop applies to the test sets; the
    plain 'roc_auc' scorer would rank candidates on the inverted scores.

    Assumes the positive label marks the anomalous class -- TODO confirm.
    """
    return roc_auc_score(y, -estimator.decision_function(X))


# Map each display name to (estimator, parameter grid, GridSearchCV scoring,
# score_based). `score_based` is True when the model is evaluated on continuous
# anomaly scores (ROC/PR AUC) instead of predicted labels.
classifiers = {
    'Random Forest': (rf_classifier, param_grid_rf, 'f1', False),
    'Logistic Regression': (lr_classifier, param_grid_lr, 'f1', False),
    'Isolation Forest': (if_classifier, param_grid_if, anomaly_roc_auc, True),
    'eXtreme Gradient Boosting': (xgb_classifier, param_grid_xgb, 'f1', False),
    'Multilayer Perceptron': (mlp_classifier, param_grid_mlp, 'f1', False),
    # 'Support Vector Machine': (svm_classifier, param_grid_svm, 'f1', False),
    # 'Local Outlier Factor': (lof_classifier, param_grid_lof, 'f1', False),
    'Energy Based Flow Classifier': (efc_classifier, param_grid_efc, 'f1', False),
}

In [17]:
# Results per classifier, later serialized to JSON:
# {name: {'best_params': ..., 'local_evaluation': {...}, 'cross_evaluation': {...}}}
result_file = dict()

# Evaluate each classifier using GridSearchCV
for classifier_name, (classifier, param_grid, metric, probability) in classifiers.items():

    grid_search = GridSearchCV(
        classifier,
        param_grid,
        cv=3,
        scoring=metric,
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train, y_train)

    # Print the best parameters found by the search
    print(f"Best parameters for {classifier_name}: {grid_search.best_params_}")

    # Create the nested result entry up front; writing to
    # result_file[classifier_name][...] without it raised KeyError.
    local_evaluation = {}
    cross_evaluation = {}
    result_file[classifier_name] = {
        'best_params': grid_search.best_params_,
        'local_evaluation': local_evaluation,
        'cross_evaluation': cross_evaluation,
    }

    if not probability:
        # Label-based evaluation: predict on both test sets
        y_pred = grid_search.predict(X_test)
        cross_pred = grid_search.predict(cross_x_test)

        # Evaluate the performance on the same silo
        accuracy_value = float(accuracy_score(y_test, y_pred))
        f1_value = float(f1_score(y_test, y_pred))
        f2_value = float(fbeta_score(y_test, y_pred, beta=2))

        print(f"Accuracy on the test set: {accuracy_value:.4f}")
        print(f"F1-Score on the test set: {f1_value:.4f}")
        print(f"F2-Score on the test set: {f2_value:.4f}")

        local_evaluation['accuracy'] = accuracy_value
        local_evaluation['f1_value'] = f1_value
        local_evaluation['f2_value'] = f2_value

        # Evaluate the performance on cross silos approach
        accuracy_cross = float(accuracy_score(cross_y_test, cross_pred))
        f1_cross = float(f1_score(cross_y_test, cross_pred))
        f2_cross = float(fbeta_score(cross_y_test, cross_pred, beta=2))

        print(f"Accuracy on the cross-evaluation set: {accuracy_cross:.4f}")
        print(f"F1-Score on the cross-evaluation set: {f1_cross:.4f}")
        print(f"F2-Score on the cross-evaluation set: {f2_cross:.4f}")

        cross_evaluation['accuracy'] = accuracy_cross
        cross_evaluation['f1_value'] = f1_cross
        cross_evaluation['f2_value'] = f2_cross

    else:
        # Score-based evaluation: the decision function is negated before
        # being compared against the labels.
        anomaly_scores = grid_search.decision_function(X_test)

        # Evaluate the performance on the local data samples
        roc_auc = float(roc_auc_score(y_test, -anomaly_scores))
        pr_auc = float(average_precision_score(y_test, -anomaly_scores))
        print(f"ROC AUC Score on the test set: {roc_auc:.4f}")
        print(f"PR AUC Score on the test set: {pr_auc:.4f}")

        local_evaluation['roc_auc'] = roc_auc
        local_evaluation['pr_auc'] = pr_auc

        # Evaluate the performance on cross silos approach
        anomaly_scores_cross = grid_search.decision_function(cross_x_test)
        roc_auc = float(roc_auc_score(cross_y_test, -anomaly_scores_cross))
        pr_auc = float(average_precision_score(cross_y_test, -anomaly_scores_cross))
        print(f"ROC AUC Score on the cross set: {roc_auc:.4f}")
        print(f"PR AUC Score on the cross set: {pr_auc:.4f}")

        cross_evaluation['roc_auc'] = roc_auc
        cross_evaluation['pr_auc'] = pr_auc

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END max_depth=None, min_samples_split=2, n_estimators=50; total time= 1.3min
[CV] END max_depth=None, min_samples_split=2, n_estimators=50; total time= 1.7min
[CV] END max_depth=None, min_samples_split=2, n_estimators=50; total time= 2.0min
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time= 2.4min
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time= 4.5min
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time= 3.5min
[CV] END max_depth=None, min_samples_split=5, n_estimators=50; total time= 2.0min
[CV] END max_depth=None, min_samples_split=5, n_estimators=50; total time= 1.5min
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time= 4.6min
[CV] END max_depth=None, min_samples_split=5, n_estimators=50; total time= 1.3min
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time= 8.3min
[CV] END max_depth=None, min_sam

In [None]:
# Persist the collected metrics as JSON, one file per (train, cross) dataset pair.
directory_path = 'simple_models_results'

if os.path.isdir(directory_path):
    print(f"Directory '{directory_path}' already exists.")
else:
    # exist_ok guards against the directory appearing between the check and the call.
    os.makedirs(directory_path, exist_ok=True)
    print(f"Directory '{directory_path}' created.")

# Build the path from `directory_path` so the file always lands in the
# directory that was just ensured to exist.
file_path = os.path.join(directory_path, f"{TRAIN_DATASET}-{CROSS_DATASET}.json")

with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(result_file, json_file, indent=4)