In [2]:
################################################################################
# Load dataset and split it into training and test set
################################################################################

import pandas as pd
import os
from tabulate import tabulate

sample_size = 10000

# Load dataset (the path is resolved relative to the current working directory)
df = pd.read_csv(os.path.join(os.getcwd(), 'data', f'sample-{sample_size}-2.csv'))

# Collapse the labels into a binary problem: 'BenignTraffic' -> 'normal',
# every other label -> 'attack'.
# `.assign` returns new frames instead of writing into the boolean-mask slices
# of `df`, which avoids pandas' SettingWithCopyWarning.
normal_df = df[df['label'] == 'BenignTraffic'].assign(label='normal')
attack_df = df[df['label'] != 'BenignTraffic'].assign(label='attack')

# Split each class 80/20 separately so both classes appear in the training and
# test sets in the same proportion. The test rows are whatever was not sampled
# for training (matched by index label).
normal_df_train = normal_df.sample(frac=0.8, random_state=42)
normal_df_test = normal_df.drop(normal_df_train.index)
attack_df_train = attack_df.sample(frac=0.8, random_state=42)
attack_df_test = attack_df.drop(attack_df_train.index)

# Build each split once, then separate the features from the label column
train_df = pd.concat([normal_df_train, attack_df_train])
test_df = pd.concat([normal_df_test, attack_df_test])

# Sanity check: every loaded row ends up in exactly one of the two splits
assert len(train_df) + len(test_df) == len(df), "train/test split lost or duplicated rows"

X_train = train_df.drop(columns=['label'])
y_train = train_df['label']
X_test = test_df.drop(columns=['label'])
y_test = test_df['label']

print("Training set size: ", X_train.shape[0])
print("Test set size: ", X_test.shape[0])
print(X_train.columns)

Training set size:  8000
Test set size:  2000
Index(['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate',
       'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count',
       'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'SSH',
       'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
       'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue', 'Radius',
       'Covariance', 'Variance', 'Weight'],
      dtype='object')


In [3]:
################################################################################
# Predict from Decision Tree model
################################################################################

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Create the Decision Tree model. A fixed random_state makes the fitted tree,
# and therefore the reported metrics, reproducible across runs.
model = DecisionTreeClassifier(random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict the labels for the test data
y_true = y_test
y_pred = model.predict(X_test)

# Evaluate the model
c_report = classification_report(y_true, y_pred)
c_matrix = confusion_matrix(y_true, y_pred)

# open(..., "w") does not create missing directories, so make sure it exists
os.makedirs("results", exist_ok=True)
with open(f"results/result-dt-{sample_size}-2.txt", "w") as f:
    f.write(f"Classication Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

print(c_report)
print(c_matrix)

              precision    recall  f1-score   support

      attack       0.99      0.99      0.99      1000
      normal       0.99      0.99      0.99      1000

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000

[[986  14]
 [ 10 990]]


In [7]:
################################################################################
# Feature Importance - Decision Tree 
################################################################################

from sklearn.tree import DecisionTreeClassifier

# Number of independent fits to average the importances over
n = 100

# feature name -> list of importances, one entry per fitted tree
feature_importances = {}
for i in range(n):
    # Seed each fit with its run index: the fits still differ from one another,
    # but the averaged ranking is identical every time this cell is executed.
    model = DecisionTreeClassifier(random_state=i)
    model.fit(X_train, y_train)
    # Visiting the features in descending importance fixes the dict insertion
    # order, which is the tie-break order of the stable sort further below.
    sorted_features = sorted(zip(model.feature_importances_, model.feature_names_in_), reverse=True)
    for importance, name in sorted_features:
        feature_importances.setdefault(name, []).append(importance)

# Mean importance of each feature across all fits
average_feature_importances = {
    name: sum(importances) / len(importances)
    for name, importances in feature_importances.items()
}

# (name, mean importance) pairs, most important first
top_features = sorted(average_feature_importances.items(), key=lambda x: x[1], reverse=True)

# open(..., "a") does not create missing directories, so make sure it exists
os.makedirs("results", exist_ok=True)
with open(f"results/feature-importance-{sample_size}-dt.txt", "a") as f:
    # The file is opened in append mode, so end the record block with a newline;
    # otherwise the next run's first record is glued onto this run's last line.
    f.write("\n".join([str(feature) for feature in top_features[:5]]) + "\n")

print(top_features[:5])

[('rst_count', 0.9325711880736708), ('flow_duration', 0.016766377022216086), ('IAT', 0.013764657714541733), ('urg_count', 0.005455290152592193), ('Protocol Type', 0.0045977146366371294)]


In [8]:
################################################################################
# Predict from Random Forest model
################################################################################

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Create the Random Forest model. A fixed random_state makes the bootstrap
# samples and feature sub-sampling, and therefore the reported metrics,
# reproducible across runs.
model = RandomForestClassifier(random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict the labels for the test data
y_true = y_test
y_pred = model.predict(X_test)

# Evaluate the model
c_report = classification_report(y_true, y_pred)
c_matrix = confusion_matrix(y_true, y_pred)

# open(..., "w") does not create missing directories, so make sure it exists
os.makedirs("results", exist_ok=True)
with open(f"results/result-rf-{sample_size}-2.txt", "w") as f:
    f.write(f"Classication Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

print(c_report)
print(c_matrix)

              precision    recall  f1-score   support

      attack       1.00      0.99      0.99      1000
      normal       0.99      1.00      0.99      1000

    accuracy                           0.99      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       0.99      0.99      0.99      2000

[[ 986   14]
 [   0 1000]]


In [9]:
################################################################################
# Feature Importance - Random Forest
################################################################################

from sklearn.ensemble import RandomForestClassifier

# Number of independent fits to average the importances over
n = 100

# feature name -> list of importances, one entry per fitted forest
feature_importances = {}
for i in range(n):
    # Seed each fit with its run index: the fits still differ from one another,
    # but the averaged ranking is identical every time this cell is executed.
    model = RandomForestClassifier(random_state=i)
    model.fit(X_train, y_train)
    # Visiting the features in descending importance fixes the dict insertion
    # order, which is the tie-break order of the stable sort further below.
    sorted_features = sorted(zip(model.feature_importances_, model.feature_names_in_), reverse=True)
    for importance, name in sorted_features:
        feature_importances.setdefault(name, []).append(importance)

# Mean importance of each feature across all fits
average_feature_importances = {
    name: sum(importances) / len(importances)
    for name, importances in feature_importances.items()
}

# (name, mean importance) pairs, most important first
top_features = sorted(average_feature_importances.items(), key=lambda x: x[1], reverse=True)

# open(..., "a") does not create missing directories, so make sure it exists
os.makedirs("results", exist_ok=True)
with open(f"results/feature-importance-{sample_size}-rf.txt", "a") as f:
    # The file is opened in append mode, so end the record block with a newline;
    # otherwise the next run's first record is glued onto this run's last line.
    f.write("\n".join([str(feature) for feature in top_features[:5]]) + "\n")

print(top_features[:5])


[('rst_count', 0.17119195823251981), ('urg_count', 0.14423365507210814), ('Variance', 0.09524572706704675), ('Tot size', 0.07837678793548522), ('Magnitue', 0.0723118397394615)]


In [None]:
################################################################################
# Predict from Logistic Regression model
################################################################################

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Create the Logistic Regression model.
# The features are fed in unscaled, and with the default max_iter=100 the
# solver is likely to stop before converging on them. Standardize inside a
# pipeline (the scaler is fitted on the training data only) and allow more
# iterations.
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict the labels for the test data
y_true = y_test
y_pred = model.predict(X_test)

# Evaluate the model
c_report = classification_report(y_true, y_pred)
c_matrix = confusion_matrix(y_true, y_pred)

# open(..., "w") does not create missing directories, so make sure it exists
os.makedirs("results", exist_ok=True)
with open(f"results/result-lr-{sample_size}-2.txt", "w") as f:
    f.write(f"Classication Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

print(c_report)
print(c_matrix)

In [None]:
################################################################################
# Predict from SVM model
################################################################################

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Create the SVM model.
# SVMs are not scale invariant, and the features are fed in unscaled, so
# standardize them inside a pipeline (the scaler is fitted on the training data
# only) before the classifier sees them.
model = make_pipeline(StandardScaler(), SVC())

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict the labels for the test data
y_true = y_test
y_pred = model.predict(X_test)

# Evaluate the model
c_report = classification_report(y_true, y_pred)
c_matrix = confusion_matrix(y_true, y_pred)

# open(..., "w") does not create missing directories, so make sure it exists
os.makedirs("results", exist_ok=True)
with open(f"results/result-svm-{sample_size}-2.txt", "w") as f:
    f.write(f"Classication Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

print(c_report)
print(c_matrix)