In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# Import a bunch of libraries.
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from google.colab import drive
import kagglehub
import zipfile

# Mount Google Drive at /content/drive/ so the notebook can read files stored there.
# force_remount=True asks Colab to mount again even if the path is already mounted.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


# Feature Column Descriptions

| Column Name     | Description     |
| :-------------- | :-------------- |
| avg_ipt | The average inter-packet time of the flow |
| label | 	The label of the flow, as decided by Tangerine. Either benign, outlier, or malicious |
| entropy | The entropy in bits per byte of the data fields within the flow. This number ranges from 0 to 8 |
| duration | The flow duration time, with microsecond precision |
| src_ip	| The source IP address associated with the flow. This feature is anonymised to the corresponding Autonomous System |
| src_port | The source port number associated with the flow |
| dest_ip	| The destination IP address associated with the flow. This feature is anonymised to the corresponding Autonomous System |
| dest_port	| The destination port number associated with the flow |
| proto | The protocol number associated with the flow. For example, TCP is 6 |
| bytes_in |	The number of bytes transmitted from source to destination |
| bytes_out	| The number of bytes transmitted from destination to source |
| num_pkts_in | The packet count from source to destination |
| num_pkts_out | The packet count from destination to source |


## Cleaning and Preparing Data

In [8]:
np.random.seed(0)  # Reproducibility: fixes the global NumPy RNG that supplies the split seed below

# Daily capture files from the LUFlow network intrusion detection data set:
# https://www.kaggle.com/datasets/mryanm/luflow-network-intrusion-detection-data-set/data
# The CSVs are read directly from the mounted Google Drive folder.
data_dir = '/content/drive/MyDrive/Capstone Project/AI Model Training'
file_names = ['2022.06.13.csv', '2022.06.14.csv', '2022.06.12.csv', '2021.02.06.csv']

# Read every file first and concatenate once. Growing a DataFrame with
# pd.concat inside the loop re-copies all previously loaded rows on each pass.
frames = []
for file_name in file_names:
  frames.append(pd.read_csv(f'{data_dir}/{file_name}'))
df = pd.concat(frames)
# NOTE(review): ignore_index is not set, so each file keeps its own row labels
# in the combined index. Pass ignore_index=True if unique labels are needed.

# Drop unnecessary columns (errors='ignore' tolerates files that lack them).
df = df.drop(columns=['time_end', 'time_start', 'total_entropy'], errors='ignore')

# Encode 'label' as a binary target: 0 for 'benign', 1 for every other value.
df['label'] = (df['label'] != 'benign').astype(int)

# Draw the split seed from the seeded global RNG so re-running this cell
# reproduces the same shuffled 70/30 train/test split.
split_seed = np.random.randint(100)
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('label', axis=1), df['label'], test_size=0.3, random_state=split_seed)
for part in (X_train, X_test, y_train, y_test):
  print(part.shape)

# display dataset
df.head(10)

(1276830, 12)
(547214, 12)
(1276830,)
(547214,)


Unnamed: 0,avg_ipt,bytes_in,bytes_out,dest_ip,dest_port,entropy,num_pkts_out,num_pkts_in,proto,src_ip,src_port,label,duration
354529,0.0,0,11584,786,9200.0,2.24163,8,2,6,786,45318.0,0,0.000279
366430,0.0,0,11584,786,9200.0,2.234462,8,2,6,786,47224.0,0,0.000218
50287,0.0,0,0,786,9200.0,0.0,1,0,6,786,47230.0,0,0.0
333301,0.0,0,0,786,45318.0,0.0,1,0,6,786,9200.0,0,0.0
222470,0.0,0,14480,786,9200.0,1.897641,10,0,6,786,45326.0,0,0.000114
2863,155.0,0,132,786,445.0,0.323657,3,2,6,786,56055.0,1,0.155032
531257,36.0,34,29,786,5900.0,5.064424,7,10,6,786,39174.0,1,0.717045
334095,0.0,0,7430,786,9200.0,2.766634,6,0,6,786,45324.0,0,0.000204
550946,144.0,89,183,786,445.0,3.611253,6,6,6,786,60005.0,1,0.667724
718914,135.5,270,191,786,445.0,4.559644,6,6,6,786,59209.0,1,1.059404


# First Iteration: Decision Tree

Random Forests and Decision Trees are well suited to classification tasks that mix numerical and categorical features (like `proto` or `src_ip`). They capture non-linear relationships, cope with high-dimensional data, and provide feature-importance estimates.

https://www.sciencedirect.com/science/article/pii/S2772918424000481#:~:text=Their%20method%20utilized%20Random%20Forest%20as%20the%20base%20classifier


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

def RandomForestBenchmark(n, random_state=None, n_jobs=None):
  """Train and evaluate a random forest with ``n`` trees on the notebook's split.

  Fits a RandomForestClassifier on the module-level ``X_train`` / ``y_train``,
  predicts ``X_test``, and prints the training duration, prediction duration,
  accuracy and classification report against ``y_test``.

  Args:
    n: Number of trees, passed as ``n_estimators``.
    random_state: Seed for the forest. When None (the default) a seed is drawn
      from the global NumPy RNG with ``np.random.randint(100)``, exactly as
      before; pass an int to make a run independent of earlier RNG draws.
    n_jobs: Forwarded to RandomForestClassifier. The default None leaves
      scikit-learn's own default in place.

  Returns:
    A dict with keys ``n``, ``random_state``, ``fit_time``, ``predict_time``
    (both in seconds) and ``accuracy``, so runs can be compared afterwards.
  """
  if random_state is None:
    # Original behaviour: the seed depends on the current global RNG state.
    random_state = np.random.randint(100)

  forest_classifier = RandomForestClassifier(
      n_estimators=n, random_state=random_state, n_jobs=n_jobs)

  # perf_counter is a monotonic, high-resolution clock intended for measuring
  # elapsed time; time.time() can jump if the system clock is adjusted.
  fit_time_start = time.perf_counter()
  forest_classifier.fit(X_train, y_train)
  fit_time = time.perf_counter() - fit_time_start
  print(f"Training duration for n = {n}: {fit_time}")

  predict_time_start = time.perf_counter()
  y_pred = forest_classifier.predict(X_test)
  predict_time = time.perf_counter() - predict_time_start
  print(f"Prediction duration for n = {n}: {predict_time}")

  accuracy = accuracy_score(y_test, y_pred)
  print(f"Accuracy for n = {n}: {accuracy}")
  print(classification_report(y_test, y_pred))

  return {
      'n': n,
      'random_state': random_state,
      'fit_time': fit_time,
      'predict_time': predict_time,
      'accuracy': accuracy,
  }

# Forest sizes (number of trees) to benchmark, in increasing order.
sizes = [50, 100, 200, 500, 1000]
# Run one full train/predict/report cycle per size; each call prints its own results.
for size in sizes:
  RandomForestBenchmark(size)

Training duration for n = 50: 104.49620079994202
Prediction duration for n = 50: 2.2365424633026123
Accuracy for n = 50: 0.9998154286988271
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    256200
           1       1.00      1.00      1.00    291014

    accuracy                           1.00    547214
   macro avg       1.00      1.00      1.00    547214
weighted avg       1.00      1.00      1.00    547214

Training duration for n = 100: 209.92143321037292
Prediction duration for n = 100: 4.50894021987915
Accuracy for n = 100: 0.9998190835760781
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    256200
           1       1.00      1.00      1.00    291014

    accuracy                           1.00    547214
   macro avg       1.00      1.00      1.00    547214
weighted avg       1.00      1.00      1.00    547214

Training duration for n = 200: 422.5612952709198
Prediction durat