In [1]:
# set up jupyter
# `Markdown` renders a string as formatted text in the cell output and
# `display` shows an object through its rich representation.
from IPython.display import display, Markdown # pretty display
from IPython.core.interactiveshell import InteractiveShell # multi output

# Show the value of every expression statement in a cell, not only the last
# one. Later cells rely on this to render a `Markdown(...)` heading that is
# followed by further statements in the same cell.
InteractiveShell.ast_node_interactivity = "all" # type: ignore

In [2]:
# import packages
import pandas as pd
from joblib import dump
from sklearn.preprocessing import MinMaxScaler

In [3]:
# load the raw records; surrounding whitespace is trimmed from every header
# name so that columns can be addressed by their clean names later on
file_name = "./dataset/raw.csv"
data = pd.read_csv(file_name).rename(columns=str.strip)
Markdown("# Raw data")
display(data)

# Raw data

Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,192.168.10.5-104.16.207.165-54865-443-6,104.16.207.165,443,192.168.10.5,54865,6,7/7/2017 3:30,3,2,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,192.168.10.5-104.16.28.216-55054-80-6,104.16.28.216,80,192.168.10.5,55054,6,7/7/2017 3:30,109,1,1,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,192.168.10.5-104.16.28.216-55055-80-6,104.16.28.216,80,192.168.10.5,55055,6,7/7/2017 3:30,52,1,1,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,192.168.10.16-104.17.241.25-46236-443-6,104.17.241.25,443,192.168.10.16,46236,6,7/7/2017 3:30,34,1,1,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,192.168.10.5-104.19.196.102-54863-443-6,104.19.196.102,443,192.168.10.5,54863,6,7/7/2017 3:30,3,2,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225740,192.168.10.15-72.21.91.29-61374-80-6,72.21.91.29,80,192.168.10.15,61374,6,7/7/2017 5:02,61,1,1,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
225741,192.168.10.15-72.21.91.29-61378-80-6,72.21.91.29,80,192.168.10.15,61378,6,7/7/2017 5:02,72,1,1,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
225742,192.168.10.15-72.21.91.29-61375-80-6,72.21.91.29,80,192.168.10.15,61375,6,7/7/2017 5:02,75,1,1,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
225743,192.168.10.15-8.41.222.187-61323-80-6,8.41.222.187,80,192.168.10.15,61323,6,7/7/2017 5:02,48,2,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [4]:
# inspect column info
Markdown("## Column info")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only appended a stray "None" line after the report.
data.info()

## Column info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225745 entries, 0 to 225744
Data columns (total 85 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Flow ID                      225745 non-null  object 
 1   Source IP                    225745 non-null  object 
 2   Source Port                  225745 non-null  int64  
 3   Destination IP               225745 non-null  object 
 4   Destination Port             225745 non-null  int64  
 5   Protocol                     225745 non-null  int64  
 6   Timestamp                    225745 non-null  object 
 7   Flow Duration                225745 non-null  int64  
 8   Total Fwd Packets            225745 non-null  int64  
 9   Total Backward Packets       225745 non-null  int64  
 10  Total Length of Fwd Packets  225745 non-null  int64  
 11  Total Length of Bwd Packets  225745 non-null  int64  
 12  Fwd Packet Length Max        225745 non-null  int64  
 13 

None

In [5]:
# convert labels to 0 and 1
# Rows labelled "DDoS" become 1, every other value becomes 0.
# An already-encoded 1 is accepted as well so the cell is idempotent: with a
# plain `== "DDoS"` test, re-running the cell on the encoded column would
# silently turn every label into 0.
data["Label"] = data["Label"].isin(["DDoS", 1]).astype("int64")
Markdown("Converted labels to `0` and `1`")

Converted labels to `0` and `1`

In [6]:
# restrict the frame to the columns of interest (ten features plus `Label`)
selected_columns = [
    "Average Packet Size",
    "Avg Bwd Segment Size",
    "Bwd Packet Length Max",
    "Bwd Packet Length Mean",
    "Bwd Packet Length Min",
    "Bwd Packet Length Std",
    "Down/Up Ratio",
    "Max Packet Length",
    "Packet Length Std",
    "Packet Length Variance",
    "Label",
]
data = data[selected_columns]
Markdown("# Feature selection")
display(data.columns)

# Feature selection

Index(['Average Packet Size', 'Avg Bwd Segment Size', 'Bwd Packet Length Max',
       'Bwd Packet Length Mean', 'Bwd Packet Length Min',
       'Bwd Packet Length Std', 'Down/Up Ratio', 'Max Packet Length',
       'Packet Length Std', 'Packet Length Variance', 'Label'],
      dtype='object')

In [7]:
# fit a min-max scaler on the current frame and persist it for later reuse
# NOTE(review): the scaler is fit on every column of `data`, which at this
# point still includes `Label`; anything that loads the saved scaler has to
# pass the same set of columns - confirm this is intended.
scaler_path = "../utils/scaler.joblib"
scaler = MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit(data)  # fit() hands back the fitted estimator
# binding the return value to `_` keeps it out of the cell output
_ = dump(scaler, scaler_path)
Markdown(f"Saved scaler to `{scaler_path}`")

Saved scaler to `../utils/scaler.joblib`

In [8]:
# rescale every column with the fitted scaler
# NOTE(review): `data` is overwritten with its scaled version, so running this
# cell a second time would scale the already-scaled values again.
scaled_values = scaler.transform(data)
# the original column names are re-attached to the transformed values
data = pd.DataFrame(data=scaled_values, columns=data.columns)
Markdown("# Data normalization")
display(data)

# Data normalization

Unnamed: 0,Average Packet Size,Avg Bwd Segment Size,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Down/Up Ratio,Max Packet Length,Packet Length Std,Packet Length Variance,Label
0,0.00356,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000514,0.0,0.0,0.0
1,0.00356,0.001034,0.000514,0.001034,0.00411,0.0,0.142857,0.000514,0.0,0.0,0.0
2,0.00356,0.001034,0.000514,0.001034,0.00411,0.0,0.142857,0.000514,0.0,0.0,0.0
3,0.00356,0.001034,0.000514,0.001034,0.00411,0.0,0.142857,0.000514,0.0,0.0,0.0
4,0.00356,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000514,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
225740,0.00356,0.001034,0.000514,0.001034,0.00411,0.0,0.142857,0.000514,0.0,0.0,0.0
225741,0.00356,0.001034,0.000514,0.001034,0.00411,0.0,0.142857,0.000514,0.0,0.0,0.0
225742,0.00356,0.001034,0.000514,0.001034,0.00411,0.0,0.142857,0.000514,0.0,0.0,0.0
225743,0.00356,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000514,0.0,0.0,0.0


In [9]:
# write the preprocessed frame to disk as CSV, leaving out the row index
file_name = "./dataset/preprocessed.csv"
data.to_csv(path_or_buf=file_name, index=False)
Markdown(f"Saved to file `{file_name}`")

Saved to file `./dataset/preprocessed.csv`