In [1]:
# set up jupyter
from IPython.display import display, Markdown # pretty display
from IPython.core.interactiveshell import InteractiveShell # multi output

# Echo the value of every top-level expression in a cell, not only the last
# one; the bare `Markdown(...)` lines in later cells rely on this to render.
InteractiveShell.ast_node_interactivity = "all" # type: ignore

In [2]:
# import packages
import pandas as pd
from joblib import dump
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Load the raw CSV and trim surrounding whitespace from every header name, so
# later cells can refer to columns by their clean names.
file_name = "./dataset/raw.csv"
data = pd.read_csv(file_name).rename(columns=str.strip)
Markdown("# Raw data")
display(data)

# Raw data

Unnamed: 0.1,Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Class
0,0,17,216631,6,0,2088.0,0.0,393.0,321.0,348.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,UDP,Attack
1,1,17,2,2,0,802.0,0.0,401.0,401.0,401.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,UDP,Attack
2,2,17,48,2,0,766.0,0.0,383.0,383.0,383.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,UDP,Attack
3,3,17,107319,4,0,1398.0,0.0,369.0,330.0,349.50000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,UDP,Attack
4,4,17,107271,4,0,1438.0,0.0,389.0,330.0,359.50000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,UDP,Attack
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431366,306196,6,550740,2,1,0.0,31.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign
431367,306197,6,337766,1,3,31.0,62.0,31.0,31.0,31.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign
431368,306198,6,228,1,2,0.0,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign
431369,306199,17,47709,2,2,86.0,194.0,43.0,43.0,43.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign,Benign


In [4]:
# inspect column info
# `DataFrame.info()` prints its report itself and returns None, so it is called
# directly; wrapping it in `display(...)` only appends a stray "None" output.
Markdown("## Column info")
data.info()

## Column info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431371 entries, 0 to 431370
Data columns (total 80 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Unnamed: 0                431371 non-null  int64  
 1   Protocol                  431371 non-null  int64  
 2   Flow Duration             431371 non-null  int64  
 3   Total Fwd Packets         431371 non-null  int64  
 4   Total Backward Packets    431371 non-null  int64  
 5   Fwd Packets Length Total  431371 non-null  float64
 6   Bwd Packets Length Total  431371 non-null  float64
 7   Fwd Packet Length Max     431371 non-null  float64
 8   Fwd Packet Length Min     431371 non-null  float64
 9   Fwd Packet Length Mean    431371 non-null  float64
 10  Fwd Packet Length Std     431371 non-null  float64
 11  Bwd Packet Length Max     431371 non-null  float64
 12  Bwd Packet Length Min     431371 non-null  float64
 13  Bwd Packet Length Mean    431371 non-null  f

None

In [5]:
# convert labels to 0 and 1
# "Attack" becomes 1 and every other value becomes 0, using a vectorised
# comparison instead of a per-row Python lambda.
# The dtype guard keeps the cell idempotent: once "Class" is already numeric,
# re-running must not compare 0/1 against "Attack" and turn every label into 0.
if not pd.api.types.is_numeric_dtype(data["Class"]):
    data["Class"] = data["Class"].eq("Attack").astype("int64")
Markdown("Converted labels to `0` and `1`")

Converted labels to `0` and `1`

In [6]:
# Narrow the frame to ten feature columns plus the `Class` label column.
selected_columns = [
    "Avg Packet Size",
    "Avg Bwd Segment Size",
    "Bwd Packet Length Max",
    "Bwd Packet Length Mean",
    "Bwd Packet Length Min",
    "Bwd Packet Length Std",
    "Down/Up Ratio",
    "Packet Length Max",
    "Packet Length Std",
    "Packet Length Variance",
    "Class",
]
data = data.loc[:, selected_columns]
Markdown("# Feature selection")
display(data.columns)

# Feature selection

Index(['Avg Packet Size', 'Avg Bwd Segment Size', 'Bwd Packet Length Max',
       'Bwd Packet Length Mean', 'Bwd Packet Length Min',
       'Bwd Packet Length Std', 'Down/Up Ratio', 'Packet Length Max',
       'Packet Length Std', 'Packet Length Variance', 'Class'],
      dtype='object')

In [7]:
# Fit a min-max scaler (target range 0..1) on the selected frame and persist
# the fitted object with joblib.
# NOTE(review): `data` still contains the `Class` label column at this point,
# so the persisted scaler is fitted on all 11 columns (features + label) --
# confirm that whatever loads this file expects that layout.
scaler_path = "../utils/scaler.joblib"
scaler = MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit(data)  # kept as an assignment so the estimator is not echoed
_ = dump(scaler, scaler_path)  # bind the return value so it is not echoed either
Markdown(f"Saved scaler to `{scaler_path}`")

Saved scaler to `../utils/scaler.joblib`

In [8]:
# Run every column through the fitted scaler, then rebuild a DataFrame from the
# result using the original column names.
feature_names = data.columns
scaled_values = scaler.transform(data)
data = pd.DataFrame(scaled_values, columns=feature_names)
Markdown("# Data normalization")
display(data)

# Data normalization

Unnamed: 0,Avg Packet Size,Avg Bwd Segment Size,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Down/Up Ratio,Packet Length Max,Packet Length Std,Packet Length Variance,Class
0,0.099732,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.010353,0.005081,0.000026,1.0
1,0.149412,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.010564,0.000000,0.000000,1.0
2,0.142705,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.010090,0.000000,0.000000,1.0
3,0.107308,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.009721,0.003228,0.000010,1.0
4,0.109792,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.010248,0.004884,0.000024,1.0
...,...,...,...,...,...,...,...,...,...,...,...
431366,0.002567,0.006186,0.000817,0.006186,0.021233,0.000000,0.000000,0.000817,0.002343,0.000005,0.0
431367,0.007700,0.004124,0.000817,0.004124,0.000000,0.002540,0.130435,0.000817,0.002095,0.000004,0.0
431368,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.086957,0.000000,0.000000,0.000000,0.0
431369,0.020058,0.019357,0.002555,0.019357,0.066438,0.000000,0.043478,0.002555,0.004470,0.000020,0.0


In [9]:
# Write the normalized frame to disk as CSV, leaving the row index out.
file_name = "./dataset/preprocessed.csv"
data.to_csv(path_or_buf=file_name, index=False)
Markdown(f"Saved to file `{file_name}`")

Saved to file `./dataset/preprocessed.csv`