In [1]:
# Forked from https://github.com/hihey54/dummy-ML_NIDS
# Updated for my project
# Ahmed Bedair

import pandas as pd
import numpy as np
import pickle
import os, time
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

In [2]:
print("scikit-learn version: {}".format(sklearn.__version__))
print("Pandas version: {}".format(pd.__version__))
print("NumPy version: {}".format(np.__version__))

scikit-learn version: 1.4.1.post1
Pandas version: 2.2.1
NumPy version: 1.26.4


In [3]:
# Reading CSV files, and merging all of them into a single DataFrame
root_folder = "/home/grassfed37/6CCS3PRJ/dummy-ML_NIDS/CICIDS2017ML"
df_list = []
for f in os.listdir(root_folder):
    file_path = os.path.join(root_folder, f)
    if os.path.isfile(file_path):
        print("Reading: ", f)
        df_list.append(pd.read_csv(file_path))

df = pd.concat(df_list, ignore_index=True)

Reading:  Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX_Relabeled.csv
Reading:  Friday-WorkingHours-Morning.pcap_ISCX_Relabeled.csv
Reading:  Wednesday-workingHours.pcap_ISCX_Relabeled.csv
Reading:  Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX_Relabeled.csv
Reading:  Monday-WorkingHours.pcap_ISCX_Relabeled.csv
Reading:  Friday-WorkingHours-Afternoon-DDos.pcap_ISCX_Relabeled.csv
Reading:  Tuesday-WorkingHours.pcap_ISCX_Relabeled.csv
Reading:  Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX_Relabeled.csv


In [4]:
# QUICK PREPROCESSING. 
# Some classifiers do not like "infinite" (inf) or "null" (NaN) values.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
print("Columns with problematic values: ", list(df.columns[df.isna().any()]))
df.dropna(inplace=True)

Columns with problematic values:  [' Flow Packets/s']


In [5]:
# Show all columns (we need to see which column is the 'Ground Truth' of each sample, and which will be the features used to describe each sample)
df.columns

Index([' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min',
       ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min',
       ' Bwd PSH Flags', ' Fwd Header Length', ' Bwd Header Length',
       ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length',
       ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance',
       ' SYN Flag Count', ' RST Flag Count', ' ACK Flag Count',
       ' Down/Up Ratio', ' Average Packet Size', ' Avg Fwd Segment Size',
       ' Avg Bwd Segment Size', ' Init_Win_bytes_backward',
       ' act_data_pkt_fwd', 

In [6]:
# This is the ground truth column. Let's show which classes contains
df[' Label'].unique()

array(['BENIGN', 'PortScan', 'Bot', 'DoS slowloris', 'DoS Slowhttptest',
       'DoS Hulk', 'DoS GoldenEye', 'Heartbleed',
       'Web Attack � Brute Force', 'Web Attack � XSS',
       'Web Attack � Sql Injection', 'DDoS', 'FTP-Patator', 'SSH-Patator',
       'Infiltration'], dtype=object)

In [7]:
# Create a new column that unifies all malicious classes into a single class for binary classification
df['GT'] = np.where(df[' Label']=='BENIGN', 'Benign', 'Malicious')

In [8]:
# Simple split
train, test = train_test_split(df, test_size=0.5)

In [9]:
# Define the features used by the classifier
features = pd.Index([
    ' Flow Duration',
    ' Total Fwd Packets',
    ' Total Backward Packets',
    ' Total Length of Bwd Packets',
    ' Fwd Packet Length Max',
    ' Fwd Packet Length Min',
    ' Fwd Packet Length Mean',
    ' Fwd Packet Length Std',
    ' Bwd Packet Length Min',
    ' Bwd Packet Length Mean',
    ' Bwd Packet Length Std',
    ' Flow Packets/s',
    ' Flow IAT Mean',
    ' Flow IAT Std',
    ' Flow IAT Max',
    ' Flow IAT Min',
    ' Fwd IAT Mean',
    ' Fwd IAT Std',
    ' Fwd IAT Max',
    ' Fwd IAT Min',
    ' Bwd IAT Mean',
    ' Bwd IAT Std',
    ' Bwd IAT Max',
    ' Bwd IAT Min',
    ' Bwd PSH Flags',
    ' Fwd Header Length',
    ' Bwd Header Length',
    ' Bwd Packets/s',
    ' Min Packet Length',
    ' Max Packet Length',
    ' Packet Length Mean',
    ' Packet Length Std',
    ' Packet Length Variance',
    ' SYN Flag Count',
    ' RST Flag Count',
    ' ACK Flag Count',
    ' Down/Up Ratio',
    ' Average Packet Size',
    ' Avg Fwd Segment Size',
    ' Avg Bwd Segment Size',
    ' Init_Win_bytes_backward',
    ' act_data_pkt_fwd',
    ' Active Std',
    ' Active Max',
    ' Active Min',
    ' Idle Std',
    ' Idle Max',
    ' Idle Min'
])

In [10]:
# Train and test a (binary) RandomForestClassifier, printing some basic performance scores, training time, and confusion matrix
start = time.time()
rfClf_bin = RandomForestClassifier(n_jobs = -2)
rfClf_bin.fit(train[features], train['GT'])
end = time.time() - start
print("Training time: ", end)

# Save the binary RandomForestClassifier model
with open('rfClf_bin.pkl', 'wb') as file:
    pickle.dump(rfClf_bin, file)

predictions_bin = rfClf_bin.predict(test[features])
print("Acc: {:3f}".format(accuracy_score(test['GT'], predictions_bin)))
print("F1-score: {:3f}".format(f1_score(test['GT'], predictions_bin, pos_label = 'Malicious')))
pd.crosstab(test['GT'], predictions_bin, rownames=['True'], colnames=['Pred'])

Training time:  80.30926084518433
Acc: 0.989750
F1-score: 0.974278


Pred,Benign,Malicious
True,Unnamed: 1_level_1,Unnamed: 2_level_1
Benign,1124966,10844
Malicious,3649,274479


In [11]:
# Train and test a (multiclass) RandomForestClassifier, printing some basic performance scores, training time, and confusion matrix
start = time.time()
rfClf_multi = RandomForestClassifier(n_jobs = -2)
rfClf_multi.fit(train[features], train[' Label'])
end = time.time() - start
print("Training time: ", end)

# Save the multiclass RandomForestClassifier model
with open('rfClf_multi.pkl', 'wb') as file:
    pickle.dump(rfClf_multi, file)
    
predictions_multi = rfClf_multi.predict(test[features])
print("Acc: {:3f}".format(accuracy_score(test[' Label'], predictions_multi)))
print("F1-score: {:3f}".format(f1_score(test[' Label'], predictions_multi, average='macro')))
pd.crosstab(test[' Label'], predictions_multi, rownames=['True'], colnames=['Pred'])

Training time:  76.69220423698425
Acc: 0.989486
F1-score: 0.818029


Pred,BENIGN,Bot,DDoS,DoS GoldenEye,DoS Hulk,DoS Slowhttptest,DoS slowloris,FTP-Patator,Heartbleed,Infiltration,PortScan,SSH-Patator,Web Attack � Brute Force,Web Attack � Sql Injection,Web Attack � XSS
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
BENIGN,1124935,77,72,25,9946,109,6,9,0,0,481,130,19,0,1
Bot,537,494,0,0,0,0,0,0,0,0,0,0,0,0,0
DDoS,78,0,64055,0,0,0,0,0,0,0,0,0,0,0,0
DoS GoldenEye,59,0,0,5128,2,2,0,0,0,0,0,0,0,0,0
DoS Hulk,2660,0,0,1,112211,0,0,0,0,0,2,0,0,0,0
DoS Slowhttptest,32,0,0,1,0,2598,7,0,0,0,0,0,0,0,0
DoS slowloris,16,0,0,1,0,5,2916,0,0,0,0,0,1,0,0
FTP-Patator,63,0,0,0,0,0,0,3884,0,0,0,0,0,0,0
Heartbleed,1,0,0,0,0,0,0,0,5,0,0,0,0,0,0
Infiltration,10,0,0,0,0,0,0,0,0,9,0,0,0,0,0


**Where to go from here?**
Here are some ways that can be used to kickstart some research on ML-NIDS by using the code above.

- **Deal with __inf__ or __NaN__ values.** In the notebook, I removed all of these samples. You may want to keep them by, e.g., assigning them a fixed value 
- **Tinker with the features.** In the notebook, I used all features available. Some features may be excessively correlated to a given class, which may not be realistic (perhaps a rule-based NIDS, instead of a ML one, can be applied to detect that specific attack.) Some may be useless, and can be removed. In some cases, some features will be 'categorical', and you must choose how to deal with them (e.g., factorize, or onehotencoding).
- **Change the train:test split.** In the notebook, I simply randomly split the initial dataset. You may want to do this on a "class" basis (e.g., take 80% of benign samples and 20% of malicious samples for train, and put the rest in test). You may even want to see what happens as less data is provided in the training set.
- **Use Validation partition for parameter optimization.** In the notebook, I simply split data into train and test, and fed such data to a RandomForestClassifier using default parameters. You may want to optimize the performance of such classifier, but to do it fairly you must **not** use the test set. Doing this requires to split the train set into two distinct partitions: a "sub_train" and a "validation" partition. 
- **Use grid-search for automatic parameter tuning, or cross-validation (or repeated random samplings) to increase the confidence of the results.** The notebook only trains (and tests) a ML model once. The resulting performance can be biased (e.g., it can be due to a lucky sampling for train or test). To derive more statistically significant results, more trials should be done.
- **Explore different Classifiers and Architectures.** The notebook only uses a classifier based on the Random Forest algorithm. There are many more classifiers available on scikit-learn. You can even, e.g., devise ensembles of classifiers (consider looking into the [mlxtend](http://rasbt.github.io/mlxtend/) library), each focused on a single attack.
- **Consider deep learning.** The code above uses scikit-learn. You can move everything to TensorFlow and use Deep Neural Networks (warning: do this only if you have a GPU!)
- **Choose a different dataset**. The experiments on this notebook only apply to the CICIDS17 dataset. Given that network environments are very diverse, I strongly suggest repeating other experiments on a different dataset and see if the resulting performance is comparable. Alternatively, you can consider subsets of CICIDS17 (e.g., only one day)
- **Visualizations!** The code above only prints the results and corresponding confusion matrix. You may want to visualize the results with proper graphs (via e.g., matplotlib, or seaborn). 


**Tip**: to avoid wasting time, always save your results and also consider saving your ML models (or datasets) as pickle files! Nothing is more painful than doing a bunch of experiments and then losing everything!
