In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score


from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb

from anomaly_flow.utils.binary_processing import split_flag_columns
from anomaly_flow.model.autoencoder_flow_nids import AutoEncoder





In [2]:
# Experiment switches. NOTE(review): none of these four names is read by the
# cells visible in this notebook — confirm whether they are still needed.
APPLY_SYNTHETIC, APPLY_REAL = False, False
SAME_SCALE = True
# Dataset name; the later cells pass dataset names as literals instead.
DATASET = "NF-ToN-IoT-v2-DDoS"

In [3]:
# Largest finite float32 value; cells whose value is not strictly below this
# bound are masked out by the `df < threshold` filters further down.
threshold = np.finfo("float32").max

In [4]:
# Columns removed from every loaded dataset before training / evaluation.
FEATURES_TO_DROP = [
    # endpoint addresses
    "IPV4_SRC_ADDR",
    "IPV4_DST_ADDR",
    # application protocol and transport ports
    "L7_PROTO",
    "L4_SRC_PORT",
    "L4_DST_PORT",
    "FTP_COMMAND_RET_CODE",
    # textual attack name (the numeric `Label` column is kept as the target)
    "Attack",
]

In [5]:
# Column name -> dtype mapping handed to `pd.read_csv(dtype=...)` for the
# NetFlow CSV files. Key order is the same as in the original literal.
dtypes_netflow = dict(
    # --- flow endpoints and protocol -------------------------------------
    IPV4_SRC_ADDR="object",
    L4_SRC_PORT="float32",
    IPV4_DST_ADDR="object",
    L4_DST_PORT="float32",
    PROTOCOL="float32",
    L7_PROTO="float64",
    # --- byte / packet counters ------------------------------------------
    IN_BYTES="float32",
    IN_PKTS="float32",
    OUT_BYTES="float32",
    OUT_PKTS="float32",
    # --- TCP flag fields (the only integer columns) ----------------------
    TCP_FLAGS="int32",
    CLIENT_TCP_FLAGS="int32",
    SERVER_TCP_FLAGS="int32",
    # --- durations and TTL -----------------------------------------------
    FLOW_DURATION_MILLISECONDS="float32",
    DURATION_IN="float32",
    DURATION_OUT="float32",
    MIN_TTL="float32",
    MAX_TTL="float32",
    # --- packet sizes ----------------------------------------------------
    LONGEST_FLOW_PKT="float32",
    SHORTEST_FLOW_PKT="float32",
    MIN_IP_PKT_LEN="float32",
    MAX_IP_PKT_LEN="float32",
    # --- per-second byte rates (kept at float64) -------------------------
    SRC_TO_DST_SECOND_BYTES="float64",
    DST_TO_SRC_SECOND_BYTES="float64",
    # --- retransmissions and throughput ----------------------------------
    RETRANSMITTED_IN_BYTES="float32",
    RETRANSMITTED_IN_PKTS="float32",
    RETRANSMITTED_OUT_BYTES="float32",
    RETRANSMITTED_OUT_PKTS="float32",
    SRC_TO_DST_AVG_THROUGHPUT="float32",
    DST_TO_SRC_AVG_THROUGHPUT="float32",
    # --- packet-size histogram -------------------------------------------
    NUM_PKTS_UP_TO_128_BYTES="float32",
    NUM_PKTS_128_TO_256_BYTES="float32",
    NUM_PKTS_256_TO_512_BYTES="float32",
    NUM_PKTS_512_TO_1024_BYTES="float32",
    NUM_PKTS_1024_TO_1514_BYTES="float32",
    # --- TCP window, ICMP, DNS, FTP --------------------------------------
    TCP_WIN_MAX_IN="float32",
    TCP_WIN_MAX_OUT="float32",
    ICMP_TYPE="float32",
    ICMP_IPV4_TYPE="float32",
    DNS_QUERY_ID="float32",
    DNS_QUERY_TYPE="float32",
    DNS_TTL_ANSWER="float32",
    FTP_COMMAND_RET_CODE="float32",
    # --- targets ---------------------------------------------------------
    Attack="object",
    Label="float32",
)

In [6]:
"""
    Function used to load the Anomaly-flow Synthetic Data 

    @args: dataset_name: String 
    @output: synthetic_x: np.nd_array, synthetic_y: np.nd_array
    
"""
def load_synthetic_dataset(dataset_name: str, apply_scaler: bool = False): 
    synthetic_df = pd.read_parquet(
        f"./datasets/{dataset_name}"
    )
    synthetic_df["Label"] = 0
    synthetic_x, synthetic_y = synthetic_df.drop(['Label'], axis=1), synthetic_df['Label']
    synthetic_x = synthetic_x.to_numpy()

    if (apply_scaler is True): 
        synthetic_scaler = MinMaxScaler()
        synthetic_x = synthetic_scaler.fit_transform(synthetic_x)
    
    return synthetic_x, synthetic_y

In [7]:
# Load the synthetic flows, min-max scaled with their own scaler.
synthetic_x, synthetic_y = load_synthetic_dataset(
    dataset_name="Anomaly-Flow-Synthetic.parquet",
    apply_scaler=True,
)

In [8]:
# Estimators compared in this notebook. The two forests use warm_start=True so
# that a later fit() with a larger n_estimators extends the existing ensemble.
# random_state=42 is now also set on the random forest (it was the only
# scikit-learn estimator left unseeded), so its scores are reproducible.
rf_classifier = RandomForestClassifier(
    random_state=42,
    warm_start=True,
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
)
if_classifier = IsolationForest(
    random_state=42,
    warm_start=True,
    n_estimators=50,
    contamination=0.01,
)
xgb_classifier = xgb.XGBClassifier(n_estimators=150, max_depth=9, learning_rate=0.1)
mlp_classifier = MLPClassifier(
    random_state=42,
    warm_start=False,
    hidden_layer_sizes=(50, 25),
    activation="tanh",
    max_iter=200,
)
# The auto-encoder input width follows the synthetic feature matrix.
ae_classifier = AutoEncoder(num_features=synthetic_x.shape[1])





In [9]:
print("Training Started...")

# First training stage: every model is fitted on the synthetic data only.
rf_classifier.fit(X=synthetic_x, y=synthetic_y)
if_classifier.fit(X=synthetic_x, y=synthetic_y)
xgb_classifier.fit(X=synthetic_x, y=synthetic_y)

# partial_fit needs the full label set up front because the synthetic labels
# produced by load_synthetic_dataset are all 0.
mlp_classifier.partial_fit(X=synthetic_x, y=synthetic_y, classes=[0, 1])

# The project AutoEncoder takes the (features, labels) pair as one tuple.
ae_classifier.fit(
    (synthetic_x, synthetic_y),
    epochs=10,
    batch_size=128,
    shuffle=True,
)

print("Training Finished...")

Training Started...
> Loaded Unknown Dataset  | Trainset: (100000, 52)
> Train samples: 100000
Epoch 1/10

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
>>> Threshold: 0.014781216159462929
Training Finished...


In [10]:
# Load the down-sampled NF-UNSW-NB15-v2 flows with explicit per-column dtypes.
# NOTE(review): the DATASET constant names a different dataset and is not
# used here — confirm which one is intended.
df = pd.read_csv("./datasets/NF-UNSW-NB15-v2-downsample.csv.gz", dtype=dtypes_netflow)

In [11]:
# Show information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() appended a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478055 entries, 0 to 478054
Data columns (total 45 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   IPV4_SRC_ADDR                478055 non-null  object 
 1   L4_SRC_PORT                  478055 non-null  float32
 2   IPV4_DST_ADDR                478055 non-null  object 
 3   L4_DST_PORT                  478055 non-null  float32
 4   PROTOCOL                     478055 non-null  float32
 5   L7_PROTO                     478055 non-null  float64
 6   IN_BYTES                     478055 non-null  float32
 7   IN_PKTS                      478055 non-null  float32
 8   OUT_BYTES                    478055 non-null  float32
 9   OUT_PKTS                     478055 non-null  float32
 10  TCP_FLAGS                    478055 non-null  int32  
 11  CLIENT_TCP_FLAGS             478055 non-null  int32  
 12  SERVER_TCP_FLAGS             478055 non-null  int32  
 13 

In [12]:
# Treat +/-inf as missing and discard incomplete rows.
# NOTE: this cell is not re-runnable without reloading `df`: 'Attack' is in
# FEATURES_TO_DROP, so the value_counts() below fails on a second run.
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# Class distribution, printed before the 'Attack' column is removed.
print(df['Attack'].value_counts())

df = df.drop(columns=FEATURES_TO_DROP)

# Values not strictly below `threshold` become NaN here; the rows holding them
# are removed by the final dropna().
df = df[df < threshold]
df = split_flag_columns(df)
df = df.dropna()

Attack
Benign            459044
Exploits            6310
Fuzzers             4462
Generic             3312
Reconnaissance      2556
DoS                 1159
Analysis             460
Backdoor             434
Shellcode            285
Worms                 33
Name: count, dtype: int64
Using cached file: 2e1b49bdf7ef775ccf86409f36645a01.


In [13]:
# Separate the target column from the feature matrix.
y = df['Label']
X = df.drop(columns=['Label'])

# Stratified hold-out split: 67% train / 33% test, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

# Rescale the features: the scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
print("Incremental Training Started...")

# Second training stage on the real training split. Both forests were built
# with warm_start=True, so n_estimators is raised before calling fit() again.
rf_classifier.n_estimators = rf_classifier.n_estimators + 100
rf_classifier.fit(X=X_train, y=y_train)

if_classifier.n_estimators = if_classifier.n_estimators + 50
if_classifier.fit(X=X_train, y=y_train)

# The already-fitted classifier is passed back in as `xgb_model`.
# NOTE(review): the first fit saw only label 0 — verify that continuing from
# that model is intended.
xgb_classifier.fit(X=X_train, y=y_train, xgb_model=xgb_classifier)

mlp_classifier.partial_fit(X=X_train, y=y_train)

ae_classifier.fit(
    (X_train, y_train),
    epochs=10,
    batch_size=128,
    shuffle=True,
)
print("Incremental Training Finished...")

Incremental Training Started...
> Loaded Unknown Dataset  | Trainset: (320296, 52)
> Train samples: 320296
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
>>> Threshold: 0.0007913918816484511
Incremental Training Finished...


In [15]:
def evaluate_models(X_test, y_test, rf_classifier, if_classifier, xgb_classifier, mlp_classifier, ae_classifier):
    """Print the F1 score of each fitted model on one test set.

    Args:
        X_test: Feature matrix passed to every model's ``predict``.
        y_test: Ground-truth labels compared against the predictions.
        rf_classifier: Fitted random-forest model.
        if_classifier: Fitted isolation forest; its 1 / -1 predictions are
            remapped to 0 / 1 before scoring.
        xgb_classifier: Fitted XGBoost model.
        mlp_classifier: Fitted MLP model.
        ae_classifier: Auto-encoder; its own ``evaluate`` is called with the
            ``(X_test, y_test)`` tuple and does its own reporting.

    Returns:
        None. Results are printed.
    """
    rf_labels = rf_classifier.predict(X_test)

    # Isolation-forest output: 1 becomes class 0 and -1 becomes class 1.
    if_raw = if_classifier.predict(X_test)
    if_labels = np.where(if_raw == -1, 1, 0)

    xgb_labels = xgb_classifier.predict(X_test)
    mlp_labels = mlp_classifier.predict(X_test)

    # Heading strings carry their own tab padding.
    report = (
        ("Random Forest:\t\t", rf_labels),
        ("Isolation Forest:\t", if_labels),
        ("XGB:\t\t", xgb_labels),
        ("MLP:\t\t", mlp_labels),
    )
    for heading, predicted in report:
        print(f"{heading}{f1_score(y_test, predicted)}")

    ae_classifier.evaluate((X_test, y_test))

In [16]:
# Score every model on the 33% hold-out split of the training dataset.
evaluate_models(
    X_test, y_test,
    rf_classifier, if_classifier, xgb_classifier, mlp_classifier,
    ae_classifier=ae_classifier,
)

Random Forest:		0.9655876807039598
Isolation Forest:	0.017904425671415963
XGB:		0.07649680247267318
MLP:		0.9407186999313343
> Loaded Unknown Dataset  | Testset: (157759, 52)
> Test samples: Label, 0.0    151485, 1.0      6274
Test Results:
{'acc': 0.039769521865630486, 'rec': 1.0, 'prec': 0.039769521865630486, 'f1': 0.07649680247267318, 'mcc': 0.0, 'missrate': 0.0, 'fallout': 1.0, 'auc': 0.5, 'f2-score': 0.17155669793005388}


In [24]:
def load_cross_df(dataset_name, scaler=None, test_size=0.9, random_state=42):
    """Load, clean and split one down-sampled dataset for cross-dataset tests.

    The file ``./datasets/{dataset_name}-downsample.csv.gz`` is read with the
    notebook-level ``dtypes_netflow`` mapping and cleaned with the same steps
    as the training dataset: infinities and NaNs removed, ``FEATURES_TO_DROP``
    dropped, values not below ``threshold`` masked, flag columns expanded by
    ``split_flag_columns``.

    Args:
        dataset_name: Dataset name used to build the CSV path.
        scaler: Optional object exposing ``transform``; when given it is
            applied to both feature splits (it is not refitted here).
        test_size: Fraction of rows placed in the test split (default 0.9).
        random_state: Seed for the stratified split (default 42).

    Returns:
        tuple: ``(cross_x_train, cross_y_train, cross_x_test, cross_y_test)``.
        The feature splits are whatever ``scaler.transform`` returns when a
        scaler is given, otherwise pandas DataFrames.
    """
    print(f"Loading the dataset: {dataset_name}")
    cross_df = pd.read_csv(
        f"./datasets/{dataset_name}-downsample.csv.gz",
        dtype=dtypes_netflow
    )

    cross_df = cross_df.replace([np.inf, -np.inf], np.nan).dropna()
    # Class distribution, printed before 'Attack' is dropped.
    print(cross_df['Attack'].value_counts())
    cross_df = cross_df.drop(columns=FEATURES_TO_DROP)
    # Out-of-range values become NaN and are removed by the dropna() below.
    cross_df = cross_df[cross_df < threshold]
    cross_df = split_flag_columns(cross_df)
    cross_df = cross_df.dropna()

    cross_y = cross_df['Label']
    # 'Unnamed: 0' is only present in some of the CSV exports; dropping it
    # unconditionally raised KeyError for files without it.
    cross_x = (
        cross_df
        .drop(columns=['Label'])
        .drop(columns=['Unnamed: 0'], errors='ignore')
    )

    cross_x_train, cross_x_test, cross_y_train, cross_y_test = train_test_split(
        cross_x,
        cross_y,
        stratify=cross_y,
        test_size=test_size,
        random_state=random_state,
    )

    if scaler is not None:
        cross_x_train = scaler.transform(cross_x_train)
        cross_x_test = scaler.transform(cross_x_test)

    return cross_x_train, cross_y_train, cross_x_test, cross_y_test


In [25]:
# Cross-dataset evaluation on NF-ToN-IoT-v2-DDoS; only the test part of the
# split is scored, using the scaler fitted on the training dataset.
_, _, cross_x_test, cross_y_test = load_cross_df(
    dataset_name="NF-ToN-IoT-v2-DDoS",
    scaler=scaler,
)
evaluate_models(
    X_test=cross_x_test,
    y_test=cross_y_test,
    rf_classifier=rf_classifier,
    if_classifier=if_classifier,
    xgb_classifier=xgb_classifier,
    mlp_classifier=mlp_classifier,
    ae_classifier=ae_classifier,
)

Loading the dataset: NF-ToN-IoT-v2-DDoS
Attack
Benign    811289
ddos      598938
Name: count, dtype: int64
Using cached file: 4eaaa3e4e07e7f8fa7cab94728e15582.
Random Forest:		0.0002889611674031163
Isolation Forest:	0.00013347075878126366
XGB:		0.5962114353752369
MLP:		0.001511192035649389
> Loaded Unknown Dataset  | Testset: (1269187, 52)
> Test samples: Label, 0.0    730143, 1.0    539044
Test Results:
{'acc': 0.42471597959953894, 'rec': 1.0, 'prec': 0.42471597959953894, 'f1': 0.5962114353752369, 'mcc': 0.0, 'missrate': 0.0, 'fallout': 1.0, 'auc': 0.5, 'f2-score': 0.7868421536637139}


In [26]:
# Cross-dataset evaluation on NF-CSE-CIC-IDS2018-v2-DDoS; only the test part
# of the split is scored, using the scaler fitted on the training dataset.
_, _, cross_x_test, cross_y_test = load_cross_df(
    dataset_name="NF-CSE-CIC-IDS2018-v2-DDoS",
    scaler=scaler,
)
evaluate_models(
    X_test=cross_x_test,
    y_test=cross_y_test,
    rf_classifier=rf_classifier,
    if_classifier=if_classifier,
    xgb_classifier=xgb_classifier,
    mlp_classifier=mlp_classifier,
    ae_classifier=ae_classifier,
)

Loading the dataset: NF-CSE-CIC-IDS2018-v2-DDoS
Attack
Benign                    4990670
DDOS attack-HOIC           324257
DDoS attacks-LOIC-HTTP      92190
DDOS attack-LOIC-UDP          634
Name: count, dtype: int64
Using cached file: 38e466053581bb706ccdb35435c58a35.
Random Forest:		2.1017322477185697e-06
Isolation Forest:	0.0
XGB:		0.1432055197462168
MLP:		0.0
> Loaded Unknown Dataset  | Testset: (4866909, 52)
> Test samples: Label, 0.0    4491548, 1.0     375361
Test Results:
{'acc': 0.07712513219375994, 'rec': 1.0, 'prec': 0.07712513219375994, 'f1': 0.1432055197462168, 'mcc': 0.0, 'missrate': 0.0, 'fallout': 1.0, 'auc': 0.5, 'f2-score': 0.29470806659115784}


In [27]:
# Cross-dataset evaluation on NF-BoT-IoT-v2-DDoS; only the test part of the
# split is scored, using the scaler fitted on the training dataset.
_, _, cross_x_test, cross_y_test = load_cross_df(
    dataset_name="NF-BoT-IoT-v2-DDoS",
    scaler=scaler,
)
evaluate_models(
    X_test=cross_x_test,
    y_test=cross_y_test,
    rf_classifier=rf_classifier,
    if_classifier=if_classifier,
    xgb_classifier=xgb_classifier,
    mlp_classifier=mlp_classifier,
    ae_classifier=ae_classifier,
)

Loading the dataset: NF-BoT-IoT-v2-DDoS
Attack
DDoS      5499554
Benign      40511
Name: count, dtype: int64
Using cached file: e5195c9e25f808ae61dba0f520d09999.
Random Forest:		0.8552305217240332
Isolation Forest:	0.0
XGB:		0.996330388988832
MLP:		0.0
> Loaded Unknown Dataset  | Testset: (4986059, 52)
> Test samples: Label, 1.0    4949599, 0.0      36460
Test Results:
{'acc': 0.9926876115986594, 'rec': 1.0, 'prec': 0.9926876115986594, 'f1': 0.996330388988832, 'mcc': 0.0, 'missrate': 0.0, 'fallout': 1.0, 'auc': 0.5, 'f2-score': 0.9985289166132563}
