In [1]:
#Load the web-attack flow records from the CSV file into the DataFrame df
import numpy as np
import pandas as pd
df = pd.read_csv(filepath_or_buffer='web_attacks_data.csv')

In [2]:
#Encode the Label column as binary: "BENIGN" -> 0, any other value (an attack) -> 1.
#The encoding is skipped when the column is already numeric, so re-running this cell
#does not turn the already-encoded 0 values into 1 (0 != "BENIGN" would be True).
if not pd.api.types.is_numeric_dtype(df['Label']):
    df['Label'] = df['Label'].ne('BENIGN').astype('int64')
#Show how many rows are benign (0) and how many are attacks (1)
unique, counts = np.unique(df['Label'], return_counts=True)
dict(zip(unique, counts))

{0: 5087, 1: 2180}

In [3]:
#Remove the columns judged to have a negative impact on the machine learning model.
#errors='ignore' means a listed column that is absent from df is skipped silently.
excluded = [
    'Flow ID',
    'Source IP',
    'Source Port',
    'Destination IP',
    'Destination Port',
    'Protocol',
    'Timestamp',
    'Init_Win_bytes_backward',
    'Init_Win_bytes_forward',
]
df = df.drop(excluded, axis='columns', errors='ignore')

In [4]:
#Label is the target to predict (y); every other column is an input feature (X).
X = df.drop('Label', axis='columns')
y = df['Label'].to_numpy()
print(X.shape, y.shape)

(7267, 74) (7267,)


In [5]:
#Hold out 30% of the rows as a test set; random_state fixes the shuffle so the split is reproducible.
#NOTE(review): the split is not stratified on y - consider stratify=y if class balance must match.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
#Show how many training rows are benign (0) and how many are attacks (1)
unique, counts = np.unique(y_train, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{0: 3563, 1: 1523}

In [6]:
#Train a small decision tree (at most 5 leaves) on the training set,
#then display its 10-fold cross-validation scores computed on the same training data.
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(
    max_leaf_nodes=5,
    random_state=0,
)
decision_tree.fit(X_train, y_train)
cross_val_score(decision_tree, X_train, y_train, cv=10)

array([0.94695481, 0.95284872, 0.94891945, 0.96660118, 0.94302554,
       0.96856582, 0.96062992, 0.94291339, 0.94291339, 0.96456693])

In [7]:
#Print the fitted decision tree as indented text rules, labelled with the feature column names
from sklearn.tree import export_text
r = export_text(
    decision_tree,
    feature_names=list(X_train.columns),
)
print(r)

|--- Max Packet Length <= 3.00
|   |--- Fwd IAT Std <= 2477650.38
|   |   |--- Bwd Packets/s <= 8889.06
|   |   |   |--- class: 0
|   |   |--- Bwd Packets/s >  8889.06
|   |   |   |--- class: 0
|   |--- Fwd IAT Std >  2477650.38
|   |   |--- class: 1
|--- Max Packet Length >  3.00
|   |--- Total Length of Fwd Packets <= 34839.50
|   |   |--- class: 0
|   |--- Total Length of Fwd Packets >  34839.50
|   |   |--- class: 1



In [8]:
#Evaluate the trained decision tree on the held-out test set.
#In the displayed matrix, rows are the true classes and columns are the predicted classes.
from sklearn.metrics import confusion_matrix
y_pred = decision_tree.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1513,   11],
       [  93,  564]])

In [9]:
#Print the percentage of actual attacks (label 1) in the test set that the model detected.
#This is the recall (sensitivity) of the attack class, not the specificity:
#specificity would be the share of benign rows (label 0) predicted as benign.
from sklearn import metrics
attack_recall = metrics.recall_score(y_test, y_pred, pos_label=1)
print("%",attack_recall*100)

% 85.84474885844749
