In [2]:
import pandas as pd
import numpy as np
import pickle
import os, time
import cuml
import shap
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [3]:
# Reading CSV files, and merging all of them into a single DataFrame
CISIDS2017_folder = "/home/grassfed37/6CCS3PRJ/dummy-ML_NIDS/CICIDS2017ML"
CTU13_folder = "/home/grassfed37/6CCS3PRJ/dummy-ML_NIDS/CTU13ML"

CICIDS2017_df_list = []
CTU13_df_list = []

# Reading CICIDS2017 CSV files into a single DataFrame
for f in os.listdir(CISIDS2017_folder):
    file_path = os.path.join(CISIDS2017_folder, f)
    if os.path.isfile(file_path):
        print("Reading: ", f)
        CICIDS2017_df_list.append(pd.read_csv(file_path))

# Reading CTU13 CSV files into a single DataFrame        
for f in os.listdir(CTU13_folder):
    file_path = os.path.join(CTU13_folder, f)
    if os.path.isfile(file_path):
        print("Reading: ", f)
        CTU13_df_list.append(pd.read_csv(file_path))
        
CICIDS2017_df, CTU13_df = pd.concat(CICIDS2017_df_list, ignore_index=True), pd.concat(CTU13_df_list, ignore_index=True)

Reading:  Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX_Relabeled.csv
Reading:  Friday-WorkingHours-Morning.pcap_ISCX_Relabeled.csv
Reading:  Wednesday-workingHours.pcap_ISCX_Relabeled.csv
Reading:  Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX_Relabeled.csv
Reading:  Monday-WorkingHours.pcap_ISCX_Relabeled.csv
Reading:  Friday-WorkingHours-Afternoon-DDos.pcap_ISCX_Relabeled.csv
Reading:  Tuesday-WorkingHours.pcap_ISCX_Relabeled.csv
Reading:  Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX_Relabeled.csv
Reading:  CTU13_Normal_Traffic_Relabeled.csv
Reading:  CTU13_Attack_Traffic_Relabeled.csv


In [4]:
# QUICK PREPROCESSING. 
# Some classifiers do not like "infinite" (inf) or "null" (NaN) values.
CICIDS2017_df.replace([np.inf, -np.inf], np.nan, inplace=True)
print("Columns with problematic values: ", list(CICIDS2017_df.columns[CICIDS2017_df.isna().any()]))
CICIDS2017_df.dropna(inplace=True)

CTU13_df.replace([np.inf, -np.inf], np.nan, inplace=True)
print("Columns with problematic values: ", list(CTU13_df.columns[CTU13_df.isna().any()]))
CTU13_df.dropna(inplace=True)

Columns with problematic values:  [' Flow Packets/s']
Columns with problematic values:  []


In [5]:
CICIDS2017_df[' Label'].unique()

array(['BENIGN', 'PortScan', 'Bot', 'DoS slowloris', 'DoS Slowhttptest',
       'DoS Hulk', 'DoS GoldenEye', 'Heartbleed',
       'Web Attack � Brute Force', 'Web Attack � XSS',
       'Web Attack � Sql Injection', 'DDoS', 'FTP-Patator', 'SSH-Patator',
       'Infiltration'], dtype=object)

In [6]:
CTU13_df[' Label'].unique()

array(['BENIGN', 'Bot'], dtype=object)

In [7]:
# Create a new column that unifies all malicious classes into a single class for binary classification
CICIDS2017_df['GT'] = np.where(CICIDS2017_df[' Label']=='BENIGN', 'Benign', 'Malicious')
CTU13_df['GT'] = np.where(CTU13_df[' Label']=='BENIGN', 'Benign', 'Malicious')

# Fit and transform 'GT' column for CICIDS2017_df
CICIDS2017_df['GT'] = LabelEncoder().fit_transform(CICIDS2017_df['GT'])

# Fit and transform 'GT' column for CTU13_df 
CTU13_df['GT'] = LabelEncoder().fit_transform(CTU13_df['GT'])

# Perform label encoding on the 'Label' column for each DataFrame separately
CICIDS2017_df[' Label'] = LabelEncoder().fit_transform(CICIDS2017_df[' Label'])
CTU13_df[' Label'] = LabelEncoder().fit_transform(CTU13_df[' Label'])

In [8]:
features = pd.Index([
    ' Flow Duration',
    ' Total Fwd Packets',
    ' Total Backward Packets',
    ' Total Length of Bwd Packets',
    ' Fwd Packet Length Max',
    ' Fwd Packet Length Min',
    ' Fwd Packet Length Mean',
    ' Fwd Packet Length Std',
    ' Bwd Packet Length Min',
    ' Bwd Packet Length Mean',
    ' Bwd Packet Length Std',
    ' Flow Packets/s',
    ' Flow IAT Mean',
    ' Flow IAT Std',
    ' Flow IAT Max',
    ' Flow IAT Min',
    ' Fwd IAT Mean',
    ' Fwd IAT Std',
    ' Fwd IAT Max',
    ' Fwd IAT Min',
    ' Bwd IAT Mean',
    ' Bwd IAT Std',
    ' Bwd IAT Max',
    ' Bwd IAT Min',
    ' Bwd PSH Flags',
    ' Fwd Header Length',
    ' Bwd Header Length',
    ' Bwd Packets/s',
    ' Min Packet Length',
    ' Max Packet Length',
    ' Packet Length Mean',
    ' Packet Length Std',
    ' Packet Length Variance',
    ' SYN Flag Count',
    ' RST Flag Count',
    ' ACK Flag Count',
    ' Down/Up Ratio',
    ' Average Packet Size',
    ' Avg Fwd Segment Size',
    ' Avg Bwd Segment Size',
    ' Init_Win_bytes_backward',
    ' act_data_pkt_fwd',
    ' Active Std',
    ' Active Max',
    ' Active Min',
    ' Idle Std',
    ' Idle Max',
    ' Idle Min'
])

In [9]:
# Encode feature columns to float32
CTU13_df[features] = CTU13_df[features].astype('float32')
CTU13_df['GT'] = CTU13_df['GT'].astype('float32')
CICIDS2017_df[features] = CICIDS2017_df[features].astype('float32')
CICIDS2017_df['GT'] = CICIDS2017_df['GT'].astype('float32')

In [10]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT'])
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

start = time.time()
svmClf_bin_ctu13 = cuml.svm.SVC()
svmClf_bin_ctu13.fit(train_ctu13[features].values, train_ctu13['GT'].values)
end = time.time() - start
print("Training time (CTU13 Binary): ", end)

# Save the binary SVM model trained on CTU13 dataset
with open('../Pickle Files/SVMBinaryCTU13.pkl', 'wb') as file:
    pickle.dump(svmClf_bin_ctu13, file)
    print("Model saved to: ", file.name)

Training time (CTU13 Binary):  1.1096422672271729
Model saved to:  ../Pickle Files/SVMBinaryCTU13.pkl


In [11]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT'])
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

start = time.time()  
svmClf_bin_cicids = cuml.svm.SVC()
svmClf_bin_cicids.fit(train_cicids[features].values, train_cicids['GT'].values)
end = time.time() - start
print("Training time (CICIDS2017 Binary): ", end)

# Save the binary SVM model trained on CICIDS2017 dataset  
with open('../Pickle Files/SVMBinaryCICIDS2017.pkl', 'wb') as file:
    pickle.dump(svmClf_bin_cicids, file)
    print("Model saved to: ", file.name)

Training time (CICIDS2017 Binary):  185.79507637023926
Model saved to:  ../Pickle Files/SVMBinaryCICIDS2017.pkl


In [12]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT'])
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

# Train a Multiclass SVM model on CICIDS2017 dataset    
start = time.time()
svmClf_multi_cicids = cuml.svm.SVC()
svmClf_multi_cicids.fit(train_cicids[features].values, train_cicids[' Label'].values)
end = time.time() - start
print("Training time (CICIDS2017 Multiclass): ", end)

# Save the multiclass SVM model trained on CICIDS2017 dataset
with open('../Pickle Files/SVMMulticlassCICIDS2017.pkl', 'wb') as file:  
    pickle.dump(svmClf_multi_cicids, file)
    print("Model saved to: ", file.name)

Training time (CICIDS2017 Multiclass):  159.04099297523499
Model saved to:  ../Pickle Files/SVMMulticlassCICIDS2017.pkl


In [None]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT'])
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

# Load the binary SVM model trained on CTU13 dataset
with open('../Pickle Files/SVMBinaryCTU13.pkl', 'rb') as file:
    svmClf_bin_ctu13 = pickle.load(file)
    print("Model loaded from: ", file.name)

# Create a SHAP explainer object for the binary SVM model trained on CTU13 dataset

explainer_ctu13 = cuml.explainer.KernelExplainer(model=svmClf_bin_ctu13.predict, data=train_ctu13[features].values)

print("SHAP explainer created")

# Get SHAP values against the CTU13 test set
shap_values_ctu13 = explainer_ctu13.shap_values(test_ctu13[features].values)
print("SHAP values calculated against CTU13 test set")

# Save the SHAP values to a .pkl file
with open('../Pickle Files/shap_values_SVMBinaryCTU13_CTU13.pkl', 'wb') as file:
    pickle.dump(shap_values_ctu13, file)
    print("SHAP values saved to: ", file.name)

# Get SHAP values against the CICIDS2017 test set
shap_values_cicids = explainer_ctu13.shap_values(test_cicids[features].values)
print("SHAP values calculated against CICIDS2017 test set")

# Save the SHAP values to a .pkl file
with open('../Pickle Files/shap_values_SVMBinaryCTU13_CICIDS2017.pkl', 'wb') as file:
    pickle.dump(shap_values_cicids, file)
    print("SHAP values saved to: ", file.name)

In [None]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT'])
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

# Load the binary SVM model trained on CICIDS2017 dataset
with open('../Pickle Files/SVMBinaryCICIDS2017.pkl', 'rb') as file:
    svmClf_bin_cicids = pickle.load(file)
    print("Model loaded from: ", file.name)

# Create a SHAP explainer object for the binary SVM model trained on CICIDS2017 dataset

explainer_cicids = cuml.explainer.KernelExplainer(model=svmClf_bin_cicids.predict, data=train_cicids[features].values)
print("SHAP explainer created")

# Get SHAP values against the CTU13 test set
shap_values_ctu13 = explainer_cicids.shap_values(test_ctu13[features].values)
print("SHAP values calculated against CTU13 test set")

# Save the SHAP values to a .pkl file
with open('../Pickle Files/shap_values_SVMBinaryCICIDS2017_CTU13.pkl', 'wb') as file:
    pickle.dump(shap_values_ctu13, file)
    print("SHAP values saved to: ", file.name)

# Get SHAP values against the CICIDS2017 test set
shap_values_cicids = explainer_cicids.shap_values(test_cicids[features].values)
print("SHAP values calculated against CICIDS2017 test set")

# Save the SHAP values to a .pkl file
with open('../Pickle Files/shap_values_SVMBinaryCICIDS2017_CICIDS2017.pkl', 'wb') as file:
    pickle.dump(shap_values_cicids, file)
    print("SHAP values saved to: ", file.name)

In [13]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT'])
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

# Load the binary SVM model trained on CTU13 dataset
with open('../Pickle Files/SVMBinaryCTU13.pkl', 'rb') as file:
    svmClf_bin_ctu13 = pickle.load(file)
    print("Model loaded from: ", file.name)
    
# Test on CTU13 dataset    
predictions_bin_ctu13 = svmClf_bin_ctu13.predict(test_ctu13[features])

# Now compute the metrics
print("Acc (CTU13 Binary) SVM: {:.3f}".format(accuracy_score(test_ctu13['GT'], predictions_bin_ctu13)))  
print("Precision (CTU13 Binary) SVM: {:.3f}".format(precision_score(test_ctu13['GT'], predictions_bin_ctu13, pos_label=1)))
print("Recall (CTU13 Binary) SVM: {:.3f}".format(recall_score(test_ctu13['GT'], predictions_bin_ctu13, pos_label=1)))
print("F1-score (CTU13 Binary) SVM: {:.3f}".format(f1_score(test_ctu13['GT'], predictions_bin_ctu13, pos_label=1)))

# Confusion Matrix
pd.crosstab(test_ctu13['GT'], predictions_bin_ctu13, rownames=['True'], colnames=['Pred']) 


Model loaded from:  ../Pickle Files/SVMBinaryCTU13.pkl
Acc (CTU13 Binary) SVM: 0.727
Precision (CTU13 Binary) SVM: 0.950
Recall (CTU13 Binary) SVM: 0.372
F1-score (CTU13 Binary) SVM: 0.534


Pred,0.0,1.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,21024,302
1.0,9775,5784


In [None]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT']) 
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

# Load the binary SVM model trained on CICIDS2017 dataset
with open('../Pickle Files/SVMBinaryCICIDS2017.pkl', 'rb') as file:  
    svmClf_bin_cicids = pickle.load(file)
    print("Model loaded from: ", file.name)

# Test on CICIDS2017 dataset
predictions_bin_cicids = svmClf_bin_cicids.predict(test_cicids[features])  
print("Acc (CICIDS2017 Binary) SVM: {:3f}".format(accuracy_score(test_cicids['GT'], predictions_bin_cicids)))
print("Precision (CICIDS2017 Binary) SVM: {:3f}".format(precision_score(test_cicids['GT'], predictions_bin_cicids, pos_label=1)))  
print("Recall (CICIDS2017 Binary) SVM: {:3f}".format(recall_score(test_cicids['GT'], predictions_bin_cicids, pos_label=1)))
print("F1-score (CICIDS2017 Binary) SVM: {:3f}".format(f1_score(test_cicids['GT'], predictions_bin_cicids, pos_label=1)))
pd.crosstab(test_cicids['GT'], predictions_bin_cicids, rownames=['True'], colnames=['Pred'])


In [14]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT']) 
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

# Load the multiclass SVM model trained on CICIDS2017 dataset
with open('../Pickle Files/SVMMulticlassCICIDS2017.pkl', 'rb') as file:
    svmClf_multi_cicids = pickle.load(file)  
    print("Model loaded from: ", file.name)

# Test on CICIDS2017 dataset
predictions_multi_cicids = svmClf_multi_cicids.predict(test_cicids[features])
print("Acc (CICIDS2017 Multiclass) SVM: {:3f}".format(accuracy_score(test_cicids[' Label'], predictions_multi_cicids)))
print("Precision (CICIDS2017 Multiclass) SVM: {:3f}".format(precision_score(test_cicids[' Label'], predictions_multi_cicids, average='macro')))  
print("Recall (CICIDS2017 Multiclass) SVM: {:3f}".format(recall_score(test_cicids[' Label'], predictions_multi_cicids, average='macro'))) 
print("F1-score (CICIDS2017 Multiclass) SVM: {:3f}".format(f1_score(test_cicids[' Label'], predictions_multi_cicids, average='macro')))
pd.crosstab(test_cicids[' Label'], predictions_multi_cicids, rownames=['True'], colnames=['Pred'])  


Model loaded from:  ../Pickle Files/SVMMulticlassCICIDS2017.pkl
Acc (CICIDS2017 Multiclass) SVM: 0.891704
Precision (CICIDS2017 Multiclass) SVM: 0.420886


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall (CICIDS2017 Multiclass) SVM: 0.320705
F1-score (CICIDS2017 Multiclass) SVM: 0.355117


Pred,0,2,3,4,5,6,11
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,258306,12731,916,14741,491,218,328
1,708,35,6,49,2,0,0
3,3061,153,13,156,4,3,6
4,82380,4138,275,4658,144,66,120
5,2008,101,6,122,5,3,5
6,2053,92,10,116,1,1,4
8,3,2,0,0,0,0,0
10,56961,2845,186,3298,130,44,71


In [15]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT']) 
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

# Load the binary SVM model trained on CTU13 dataset
with open('../Pickle Files/SVMBinaryCTU13.pkl', 'rb') as file:
    svmClf_bin_ctu13 = pickle.load(file)  
    print("Model loaded from: ", file.name)

# Test on CICIDS2017 dataset
predictions_bin_cicids = svmClf_bin_ctu13.predict(test_cicids[features])
print("Acc (CTU13 to CICIDS2017) SVM: {:3f}".format(accuracy_score(test_cicids['GT'], predictions_bin_cicids))) 
print("Precision (CTU13 to CICIDS2017) SVM: {:3f}".format(precision_score(test_cicids['GT'], predictions_bin_cicids, pos_label=1)))
print("Recall (CTU13 to CICIDS2017) SVM: {:3f}".format(recall_score(test_cicids['GT'], predictions_bin_cicids, pos_label=1)))
print("F1-score (CTU13 to CICIDS2017) SVM: {:3f}".format(f1_score(test_cicids['GT'], predictions_bin_cicids, pos_label=1)))
pd.crosstab(test_cicids['GT'], predictions_bin_cicids, rownames=['True'], colnames=['Pred'])


Model loaded from:  ../Pickle Files/SVMBinaryCTU13.pkl
Acc (CTU13 to CICIDS2017) SVM: 0.828847
Precision (CTU13 to CICIDS2017) SVM: 0.577034
Recall (CTU13 to CICIDS2017) SVM: 0.488274
F1-score (CTU13 to CICIDS2017) SVM: 0.528956


Pred,0.0,1.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,828850,79678
1.0,113922,108701


In [16]:
# Train test splits for both datasets 
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT'])
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT']) 

# Load the binary SVM model trained on CICIDS2017
with open('../Pickle Files/SVMBinaryCICIDS2017.pkl', 'rb') as file:  
    svmClf_bin = pickle.load(file)
    print("Model loaded from: ", file.name)

# Test on CTU13 dataset  
predictions_bin = svmClf_bin.predict(test_ctu13[features])
print("Acc (CICIDS2017 Binary to CTU13) SVM: {:3f}".format(accuracy_score(test_ctu13['GT'], predictions_bin)))
print("Precision (CICIDS2017 Binary to CTU13) SVM: {:3f}".format(precision_score(test_ctu13['GT'], predictions_bin, pos_label=1)))
print("Recall (CICIDS2017 Binary to CTU13) SVM: {:3f}".format(recall_score(test_ctu13['GT'], predictions_bin, pos_label=1)))
print("F1-score (CICIDS2017 Binary to CTU13) SVM: {:3f}".format(f1_score(test_ctu13['GT'], predictions_bin, pos_label=1)))
pd.crosstab(test_ctu13['GT'], predictions_bin, rownames=['True'], colnames=['Pred'])

Model loaded from:  ../Pickle Files/SVMBinaryCICIDS2017.pkl
Acc (CICIDS2017 Binary to CTU13) SVM: 0.581049
Precision (CICIDS2017 Binary to CTU13) SVM: 0.675497
Recall (CICIDS2017 Binary to CTU13) SVM: 0.013111
F1-score (CICIDS2017 Binary to CTU13) SVM: 0.025723


Pred,0.0,1.0
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,21228,98
1.0,15355,204
