In [6]:
################################################################################
# Load dataset and split it into training and test set
################################################################################

import pandas as pd
import os
from tabulate import tabulate
from sklearn.preprocessing import LabelEncoder

sample_size = 10000

# Load dateset
df_train = pd.read_csv(os.getcwd() + f'/data/sample-{sample_size}-2_train.csv')
df_test = pd.read_csv(os.getcwd() + f'/data/sample-{sample_size}-2_test.csv')

# Encode categorical columns
label_encoder = LabelEncoder()
categorical_columns = df_train.select_dtypes(include=['object']).columns
for column in categorical_columns:
    df_train[column] = label_encoder.fit_transform(df_train[column])
    df_test[column] = label_encoder.fit_transform(df_test[column])

# Split dataset according to attack type and drop columns
normal_df_train = df_train[df_train['label'] == 0]
normal_df_test = df_test[df_test['label'] == 0]
attack_df_train = df_train[df_train['label'] == 1]
attack_df_test = df_test[df_test['label'] == 1]

X_train = pd.concat([normal_df_train, attack_df_train]).drop(columns=['attack_cat', 'label'])
y_train = pd.concat([normal_df_train, attack_df_train])['label']
X_test = pd.concat([normal_df_test, attack_df_test]).drop(columns=['attack_cat', 'label'])
y_test = pd.concat([normal_df_test, attack_df_test])['label']

print("Training set size: ", X_train.shape[0])
print("Test set size: ", X_test.shape[0])
print(X_train.columns)

Training set size:  8000
Test set size:  2000
Index(['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports'],
      dtype='object')


In [7]:
################################################################################
# Predict from Decision Tree model
################################################################################

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Create an instance of the DecisionTreeClassifier model
model = DecisionTreeClassifier()

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict the labels for the test data
y_true = y_test
y_pred = model.predict(X_test)

# Evaluate the model
c_report = classification_report(y_true, y_pred)
c_matrix = confusion_matrix(y_true, y_pred)

with open(f"results/result-dt-{sample_size}-2.txt", "w") as f:
    f.write(f"Classication Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

print(c_report)
print(c_matrix)

              precision    recall  f1-score   support

           0       0.54      0.71      0.61      1000
           1       0.58      0.39      0.47      1000

    accuracy                           0.55      2000
   macro avg       0.56      0.55      0.54      2000
weighted avg       0.56      0.55      0.54      2000

[[712 288]
 [609 391]]


In [8]:
################################################################################
# Feature Importance - Decision Tree 
################################################################################

from sklearn.tree import DecisionTreeClassifier

n = 100

feature_importances = {}
for i in range(n):
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    sorted_features = sorted(zip(model.feature_importances_, model.feature_names_in_), reverse=True)
    for importance, name in sorted_features:
        if name in feature_importances:
            feature_importances[name].append(importance)
        else:
            feature_importances[name] = [importance]

average_feature_importances = {}
for name, importances in feature_importances.items():
    average_feature_importances[name] = sum(importances) / len(importances)

top_features = sorted(average_feature_importances.items(), key=lambda x: x[1], reverse=True)

with open(f"results/feature-importance-{sample_size}-dt.txt", "a") as f:
    f.write("\n".join([str(feature) for feature in top_features[:5]]))

print(top_features[:5])

[('id', 0.8129995491608345), ('is_sm_ips_ports', 0.07031723740165942), ('sload', 0.028997315722627114), ('ct_srv_dst', 0.017523395831451433), ('ct_dst_src_ltm', 0.010367409956935983)]


In [3]:
################################################################################
# Predict from Random Forest model
################################################################################

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Create an instance of the DecisionTreeClassifier model
model = RandomForestClassifier()

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict the labels for the test data
y_true = y_test
y_pred = model.predict(X_test)

# Evaluate the model
c_report = classification_report(y_true, y_pred)
c_matrix = confusion_matrix(y_true, y_pred)

with open(f"results/result-rf-{sample_size}-2.txt", "w") as f:
    f.write(f"Classication Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

print(c_report)
print(c_matrix)

              precision    recall  f1-score   support

           0       0.67      0.75      0.71      1000
           1       0.72      0.63      0.67      1000

    accuracy                           0.69      2000
   macro avg       0.69      0.69      0.69      2000
weighted avg       0.69      0.69      0.69      2000

[[752 248]
 [369 631]]


In [9]:
################################################################################
# Feature Importance - Random Forest
################################################################################

from sklearn.ensemble import RandomForestClassifier

n = 100

feature_importances = {}
for i in range(n):
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    sorted_features = sorted(zip(model.feature_importances_, model.feature_names_in_), reverse=True)
    for importance, name in sorted_features:
        if name in feature_importances:
            feature_importances[name].append(importance)
        else:
            feature_importances[name] = [importance]

average_feature_importances = {}
for name, importances in feature_importances.items():
    average_feature_importances[name] = sum(importances) / len(importances)

top_features = sorted(average_feature_importances.items(), key=lambda x: x[1], reverse=True)

with open(f"results/feature-importance-{sample_size}-rf.txt", "a") as f:
    f.write("\n".join([str(feature) for feature in top_features[:5]]))

print(top_features[:5])

[('id', 0.23401143977770444), ('sttl', 0.11226248109954706), ('ct_state_ttl', 0.0845238981149299), ('sload', 0.05215648273790055), ('dload', 0.04459302308972417)]


In [4]:
################################################################################
# Predict from Logistic Regression model
################################################################################

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Create an instance of the DecisionTreeClassifier model
model = LogisticRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict the labels for the test data
y_true = y_test
y_pred = model.predict(X_test)

# Evaluate the model
c_report = classification_report(y_true, y_pred)
c_matrix = confusion_matrix(y_true, y_pred)

with open(f"results/result-lr-{sample_size}-2.txt", "w") as f:
    f.write(f"Classication Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

print(c_report)
print(c_matrix)

              precision    recall  f1-score   support

           0       0.69      0.51      0.59      1000
           1       0.61      0.77      0.68      1000

    accuracy                           0.64      2000
   macro avg       0.65      0.64      0.63      2000
weighted avg       0.65      0.64      0.63      2000

[[509 491]
 [231 769]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
################################################################################
# Predict from SVM model
################################################################################

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Create an instance of the DecisionTreeClassifier model
model = SVC()

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict the labels for the test data
y_true = y_test
y_pred = model.predict(X_test)

# Evaluate the model
c_report = classification_report(y_true, y_pred)
c_matrix = confusion_matrix(y_true, y_pred)

with open(f"results/result-svm-{sample_size}-2.txt", "w") as f:
    f.write(f"Classication Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

print(c_report)
print(c_matrix)

              precision    recall  f1-score   support

           0       0.68      0.71      0.69      1000
           1       0.70      0.66      0.68      1000

    accuracy                           0.69      2000
   macro avg       0.69      0.69      0.69      2000
weighted avg       0.69      0.69      0.69      2000

[[711 289]
 [340 660]]
