In [16]:
import pandas as pd
import numpy as np
import pickle
import os, time
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import shap

In [17]:
# Reading CSV files, and merging all of them into a single DataFrame
CISIDS2017_folder = "/home/grassfed37/6CCS3PRJ/dummy-ML_NIDS/CICIDS2017ML"
CTU13_folder = "/home/grassfed37/6CCS3PRJ/dummy-ML_NIDS/CTU13ML"

CICIDS2017_df_list = []
CTU13_df_list = []

# Reading CICIDS2017 CSV files into a single DataFrame
for f in os.listdir(CISIDS2017_folder):
    file_path = os.path.join(CISIDS2017_folder, f)
    if os.path.isfile(file_path):
        print("Reading: ", f)
        CICIDS2017_df_list.append(pd.read_csv(file_path))

# Reading CTU13 CSV files into a single DataFrame
for f in os.listdir(CTU13_folder):
    file_path = os.path.join(CTU13_folder, f)
    if os.path.isfile(file_path):
        print("Reading: ", f)
        CTU13_df_list.append(pd.read_csv(file_path))

CICIDS2017_df, CTU13_df = pd.concat(CICIDS2017_df_list, ignore_index=True), pd.concat(CTU13_df_list, ignore_index=True)

Reading:  Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX_Relabeled.csv
Reading:  Friday-WorkingHours-Morning.pcap_ISCX_Relabeled.csv
Reading:  Wednesday-workingHours.pcap_ISCX_Relabeled.csv
Reading:  Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX_Relabeled.csv
Reading:  Monday-WorkingHours.pcap_ISCX_Relabeled.csv


KeyboardInterrupt: 

In [None]:
# QUICK PREPROCESSING. 
# Some classifiers do not like "infinite" (inf) or "null" (NaN) values.
CICIDS2017_df.replace([np.inf, -np.inf], np.nan, inplace=True)
print("Columns with problematic values: ", list(CICIDS2017_df.columns[CICIDS2017_df.isna().any()]))
CICIDS2017_df.dropna(inplace=True)

CTU13_df.replace([np.inf, -np.inf], np.nan, inplace=True)
print("Columns with problematic values: ", list(CTU13_df.columns[CTU13_df.isna().any()]))
CTU13_df.dropna(inplace=True)

Columns with problematic values:  [' Flow Packets/s']
Columns with problematic values:  []


In [None]:
CICIDS2017_df[' Label'].unique()

array(['BENIGN', 'PortScan', 'Bot', 'DoS slowloris', 'DoS Slowhttptest',
       'DoS Hulk', 'DoS GoldenEye', 'Heartbleed',
       'Web Attack � Brute Force', 'Web Attack � XSS',
       'Web Attack � Sql Injection', 'DDoS', 'FTP-Patator', 'SSH-Patator',
       'Infiltration'], dtype=object)

In [None]:
CTU13_df[' Label'].unique()

array(['BENIGN', 'Bot'], dtype=object)

In [None]:
# Create a new column that unifies all malicious classes into a single class for binary classification
CICIDS2017_df['GT'] = np.where(CICIDS2017_df[' Label']=='BENIGN', 'Benign', 'Malicious')
CTU13_df['GT'] = np.where(CTU13_df[' Label']=='BENIGN', 'Benign', 'Malicious')

In [None]:
features = pd.Index([
    ' Flow Duration',
    ' Total Fwd Packets',
    ' Total Backward Packets',
    ' Total Length of Bwd Packets',
    ' Fwd Packet Length Max',
    ' Fwd Packet Length Min',
    ' Fwd Packet Length Mean',
    ' Fwd Packet Length Std',
    ' Bwd Packet Length Min',
    ' Bwd Packet Length Mean',
    ' Bwd Packet Length Std',
    ' Flow Packets/s',
    ' Flow IAT Mean',
    ' Flow IAT Std',
    ' Flow IAT Max',
    ' Flow IAT Min',
    ' Fwd IAT Mean',
    ' Fwd IAT Std',
    ' Fwd IAT Max',
    ' Fwd IAT Min',
    ' Bwd IAT Mean',
    ' Bwd IAT Std',
    ' Bwd IAT Max',
    ' Bwd IAT Min',
    ' Bwd PSH Flags',
    ' Fwd Header Length',
    ' Bwd Header Length',
    ' Bwd Packets/s',
    ' Min Packet Length',
    ' Max Packet Length',
    ' Packet Length Mean',
    ' Packet Length Std',
    ' Packet Length Variance',
    ' SYN Flag Count',
    ' RST Flag Count',
    ' ACK Flag Count',
    ' Down/Up Ratio',
    ' Average Packet Size',
    ' Avg Fwd Segment Size',
    ' Avg Bwd Segment Size',
    ' Init_Win_bytes_backward',
    ' act_data_pkt_fwd',
    ' Active Std',
    ' Active Max',
    ' Active Min',
    ' Idle Std',
    ' Idle Max',
    ' Idle Min'
])

In [None]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT'])
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

start = time.time()
svmClf_bin_ctu13 = LinearSVC()
svmClf_bin_ctu13.fit(train_ctu13[features], train_ctu13['GT'])
end = time.time() - start
print("Training time (CTU13 Binary): ", end)

# Save the binary SVM model trained on CTU13 dataset
with open('../Pickle Files/SVMBinaryCTU13.pkl', 'wb') as file:
    pickle.dump(svmClf_bin_ctu13, file)

In [None]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT'])
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

start = time.time()
svmClf_bin_cicids = LinearSVC()
svmClf_bin_cicids.fit(train_cicids[features], train_cicids['GT'])
end = time.time() - start
print("Training time (CICIDS2017 Binary): ", end)

# Save the binary SVM model trained on CICIDS2017 dataset
with open('../Pickle Files/SVMBinaryCICIDS2017.pkl', 'wb') as file:
    pickle.dump(svmClf_bin_cicids, file)

In [None]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT'])
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

# Train a Multiclass SVM model on CICIDS2017 dataset
start = time.time()
svmClf_multi_cicids = LinearSVC()
svmClf_multi_cicids.fit(train_cicids[features], train_cicids[' Label'])
end = time.time() - start
print("Training time (CICIDS2017 Multiclass): ", end)

# Save the multiclass SVM model trained on CICIDS2017 dataset
with open('../Pickle Files/SVMMulticlassCICIDS2017.pkl', 'wb') as file:
    pickle.dump(svmClf_multi_cicids, file)

In [None]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT'])
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

# Load the binary SVM model trained on CTU13 dataset
with open('../Pickle Files/SVMBinaryCTU13.pkl', 'rb') as file:
    svmClf_bin_ctu13 = pickle.load(file)
    
# Test on CTU13 dataset
predictions_bin_ctu13 = svmClf_bin_ctu13.predict(test_ctu13[features])
print("Acc (CTU13 Binary): {:3f}".format(accuracy_score(test_ctu13['GT'], predictions_bin_ctu13)))
print("F1-score (CTU13 Binary): {:3f}".format(f1_score(test_ctu13['GT'], predictions_bin_ctu13, pos_label='Malicious')))
pd.crosstab(test_ctu13['GT'], predictions_bin_ctu13, rownames=['True'], colnames=['Pred'])

Acc (CTU13 Binary): 0.722136
F1-score (CTU13 Binary): 0.612792


Pred,Benign,Malicious
True,Unnamed: 1_level_1,Unnamed: 2_level_1
Benign,18526,2800
Malicious,7449,8110


In [None]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT'])
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

# Load the binary SVM model trained on CICIDS2017 dataset
with open('../Pickle Files/SVMBinaryCICIDS2017.pkl', 'rb') as file:
    svmClf_bin_cicids = pickle.load(file)

# Test on CICIDS2017 dataset
predictions_bin_cicids = svmClf_bin_cicids.predict(test_cicids[features])
print("Acc (CICIDS2017 Binary): {:3f}".format(accuracy_score(test_cicids['GT'], predictions_bin_cicids)))
print("F1-score (CICIDS2017 Binary): {:3f}".format(f1_score(test_cicids['GT'], predictions_bin_cicids, pos_label='Malicious')))
pd.crosstab(test_cicids['GT'], predictions_bin_cicids, rownames=['True'], colnames=['Pred'])

Acc (CICIDS2017 Binary): 0.734591
F1-score (CICIDS2017 Binary): 0.433686


Pred,Benign,Malicious
True,Unnamed: 1_level_1,Unnamed: 2_level_1
Benign,715979,192549
Malicious,107669,114954


In [None]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT'])
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

# Load the multiclass SVM model trained on CICIDS2017 dataset
with open('../Pickle Files/SVMMulticlassCICIDS2017.pkl', 'rb') as file:
    svmClf_multi_cicids = pickle.load(file)

# Test on CICIDS2017 dataset
predictions_multi_cicids = svmClf_multi_cicids.predict(test_cicids[features])
print("Acc (CICIDS2017 Multiclass): {:3f}".format(accuracy_score(test_cicids[' Label'], predictions_multi_cicids)))
print("F1-score (CICIDS2017 Multiclass): {:3f}".format(f1_score(test_cicids[' Label'], predictions_multi_cicids, average='macro')))
pd.crosstab(test_cicids[' Label'], predictions_multi_cicids, rownames=['True'], colnames=['Pred'])

Acc (CICIDS2017 Multiclass): 0.791722
F1-score (CICIDS2017 Multiclass): 0.242217


Pred,BENIGN,Bot,DDoS,DoS GoldenEye,DoS Hulk,DoS Slowhttptest,DoS slowloris,FTP-Patator,Heartbleed,Infiltration,PortScan,SSH-Patator,Web Attack � Brute Force,Web Attack � Sql Injection,Web Attack � XSS
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
BENIGN,782037,2101,26419,11313,25984,2459,9373,556,7557,492,39593,372,119,92,61
Bot,446,69,0,0,187,0,0,0,0,0,74,0,0,0,0
DDoS,5063,0,42291,1119,15,2058,0,0,16,0,4,393,2,0,0
DoS GoldenEye,1605,3,6,2213,140,13,134,0,1,0,0,0,37,2,0
DoS Hulk,22450,5,57401,2529,8711,377,28,0,4,0,723,56,0,0,0
DoS Slowhttptest,402,10,0,384,73,1244,29,0,58,0,0,0,0,0,0
DoS slowloris,1049,0,39,54,432,682,68,0,0,0,34,1,0,0,0
FTP-Patator,2193,0,0,0,0,0,0,904,0,0,3,0,0,0,0
Heartbleed,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Infiltration,10,0,0,1,0,0,1,1,0,0,0,0,0,0,0


In [None]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT'])
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

# Load the binary SVM model trained on CTU13 dataset
with open('../Pickle Files/SVMBinaryCTU13.pkl', 'rb') as file:
    svmClf_bin_ctu13 = pickle.load(file)

# Test on CICIDS2017 dataset
predictions_bin_cicids = svmClf_bin_ctu13.predict(test_cicids[features])
print("Acc (CTU13 to CICIDS2017): {:3f}".format(accuracy_score(test_cicids['GT'], predictions_bin_cicids)))
print("F1-score (CTU13 to CICIDS2017): {:3f}".format(f1_score(test_cicids['GT'], predictions_bin_cicids, pos_label='Malicious')))
pd.crosstab(test_cicids['GT'], predictions_bin_cicids, rownames=['True'], colnames=['Pred'])

Acc (CTU13 to CICIDS2017): 0.452950
F1-score (CTU13 to CICIDS2017): 0.267690


Pred,Benign,Malicious
True,Unnamed: 1_level_1,Unnamed: 2_level_1
Benign,399257,509271
Malicious,109525,113098


In [None]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT'])
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

# Load the binary SVM model trained on CICIDS2017
with open('../Pickle Files/SVMBinaryCICIDS2017.pkl', 'rb') as file:
    svmClf_bin = pickle.load(file)

# Test on CTU13 dataset
predictions_bin = svmClf_bin.predict(test_ctu13[features])
print("Acc (CICIDS2017 Binary to CTU13): {:3f}".format(accuracy_score(test_ctu13['GT'], predictions_bin)))
print("F1-score (CICIDS2017 Binary to CTU13): {:3f}".format(f1_score(test_ctu13['GT'], predictions_bin, pos_label='Malicious')))
pd.crosstab(test_ctu13['GT'], predictions_bin, rownames=['True'], colnames=['Pred'])

Acc (CICIDS2017 Binary to CTU13): 0.657910
F1-score (CICIDS2017 Binary to CTU13): 0.542727


Pred,Benign,Malicious
True,Unnamed: 1_level_1,Unnamed: 2_level_1
Benign,16779,4547
Malicious,8071,7488


In [None]:
# Train test splits for both datasets
train_ctu13, test_ctu13 = train_test_split(CTU13_df, test_size=0.4, stratify=CTU13_df['GT'])
train_cicids, test_cicids = train_test_split(CICIDS2017_df, test_size=0.4, stratify=CICIDS2017_df['GT'])

# Load the multiclass SVM model trained on CICIDS2017
with open('../Pickle Files/SVMMulticlassCICIDS2017.pkl', 'rb') as file:
    svmClf_multi = pickle.load(file)

# Test on CTU13 dataset
predictions_multi = svmClf_multi.predict(test_ctu13[features])
print("Acc (CICIDS2017 Multi-class to CTU13): {:3f}".format(accuracy_score(test_ctu13[' Label'], predictions_multi)))
print("F1-score (CICIDS2017 Multi-class to CTU13): {:3f}".format(f1_score(test_ctu13[' Label'], predictions_multi, average='macro')))
pd.crosstab(test_ctu13[' Label'], predictions_multi, rownames=['True'], colnames=['Pred'])

Acc (CICIDS2017 Multi-class to CTU13): 0.539027
F1-score (CICIDS2017 Multi-class to CTU13): 0.051402


Pred,BENIGN,Bot,DDoS,DoS GoldenEye,DoS Hulk,DoS Slowhttptest,DoS slowloris,FTP-Patator,Heartbleed,Infiltration,PortScan,SSH-Patator,Web Attack � Brute Force,Web Attack � Sql Injection,Web Attack � XSS
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
BENIGN,19812,27,386,187,586,17,46,19,112,2,117,12,3,0,0
Bot,10856,70,120,94,800,86,456,27,161,4,2826,13,41,1,4


In [23]:
# Create a sample input for SHAP
sample_input = pd.DataFrame(columns=features, data=[[0] * len(features)])

# Create a reference dataset for SVM explainers using the normalized training data
reference_data = shap.sample(train_cicids[features], 100)

# Generate SHAP explanations for SVM models
explainer_svmClf_bin_cicids = shap.KernelExplainer(svmClf_bin_cicids.predict, reference_data)
shap_values_svmClf_bin_cicids = explainer_svmClf_bin_cicids.shap_values(sample_input.values)

explainer_svmClf_multi_cicids = shap.KernelExplainer(svmClf_multi_cicids.predict_proba, reference_data)
shap_values_svmClf_multi_cicids = explainer_svmClf_multi_cicids.shap_values(sample_input.values)

# Print the SHAP values graph for SVM classifiers
print("\nSVM Binary Classifier - SHAP Values:")
shap.summary_plot(shap_values_svmClf_bin_cicids, sample_input.values, plot_type='bar', feature_names=features)

print("\nSVM Multiclass Classifier - SHAP Values:")
shap.summary_plot(shap_values_svmClf_multi_cicids, sample_input.values, plot_type='bar', feature_names=features, class_names=svmClf_multi_cicids.classes_)

TypeError: can't multiply sequence by non-int of type 'float'