In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
import pickle

## MCAD Dataset Features

The MCAD dataset contains network flow data with the following features:

**IP Address Information:**

* `src`: Source IP address
* `dst`: Destination IP address

**Flow Information:**

* `table_id`: ID of the table where the flow is stored
* `ip_bytes`: Number of bytes in the flow on the IP address level
* `ip_packet`: Number of packets in the flow on the IP address level
* `ip_duration`: Duration (in seconds) the flow has been alive on the IP address level

**Port Information:**

* `in_port`: Receiver port number
* `dl dst`: MAC address of the receiver host
* `port_bytes`: Number of bytes in the flow on the port level
* `port_packet`: Number of packets in the flow on the port level (may be a duplicate of `ip_packet`)
* `port_flow_count`: Number of flows on the port level

**Table Information:**

* `table_active_count`: Number of active entries in the table
* `table_lookup_count`: Number of packets looked up in the table
* `table_matched_count`: Number of packets that matched an entry in the table

**Packet Transfer Information:**

* `port_rx_packets`: Number of received packets on the port level
* `port_tx_packets`: Number of transmitted packets on the port level
* `port_rx_bytes`: Number of received bytes on the port level
* `port_tx_bytes`: Number of transmitted bytes on the port level

**Packet Drop and Error Information:**

* `port_rx_dropped`: Number of packets dropped by the receiver on the port level
* `port_tx_dropped`: Number of packets dropped by the transmitter on the port level
* `port_rx_errors`: Number of received errors
* `port_tx_errors`: Number of transmitted errors

**Frame Error Information:**

* `port_rx_frame_err`: Number of frame alignment errors
* `port_rx_over_err`: Number of received packets with overrun
* `port_rx_crc_err`: Number of CRC errors

**Collision Information:**

* `port_collisions`: Number of collisions

### Data Preprocessing

In [2]:
DATASET_PATH = "../../Dataset/Collected Dataset/"

# flow data
flow_files = {
    "attack": [
        "icmp_ddos_flood.csv",
        "land_attack.csv",
        "malformed_packets.csv",
        "nestea_attack.csv",
        "nmap_probe_attack.csv",
        "ping_of_death_attack.csv",
        "tcp_ddos_flood.csv",
        "udp_ddos_flood.csv"
    ],
    "normal": [
        "normal_iperf.csv",
        "normal_w3m.csv"
    ]
}

In [3]:
dfs = []

for attack_type, files in flow_files.items():
    for file in files:
        try:
            data = pd.read_csv(DATASET_PATH + file)
            data["type"] = attack_type
            dfs.append(data)
        except FileNotFoundError:
            print(f"Error: File '{file}' not found. Skipping...")

In [4]:
# Concatenate all DataFrames
combined_data = pd.concat(dfs, ignore_index=True)
print("Successfully combined data into a single DataFrame!")

Successfully combined data into a single DataFrame!


In [5]:
# Randomize data, remove previous indices
combined = combined_data.sample(frac=1).reset_index().drop('index', axis = 1)
combined.head()

Unnamed: 0,src,dst,table_id,ip_bytes,ip_packet,ip_duration,in_port,port_bytes,port_packet,port_flow_count,...,port_rx_dropped,port_tx_dropped,port_rx_errors,port_tx_errors,port_rx_frame_err,port_rx_over_err,port_rx_crc_err,port_collisions,port_duration_sec,type
0,,,0,18033839,5363,535,4,57922829,62901,7,...,0,0,0,0,0,0,0,0,577,normal
1,,,0,3748168,2564,62,1,16075194,11220,7,...,0,0,0,0,0,0,0,0,99,attack
2,,,0,49401858,33789,356,3,198171552,135643,7,...,0,0,0,0,0,0,0,0,391,attack
3,,,0,10332,246,155,2,99278,2089,7,...,0,0,0,0,0,0,0,0,174,attack
4,,,0,572009,6801,623,1,26273162,15503,7,...,0,0,0,0,0,0,0,0,771,normal


In [6]:
# Extracting relevant features
combined['ip_bytes_sec'] = combined['ip_bytes'] / combined['ip_duration']
combined['ip_packets_sec'] = combined['ip_packet'] / combined['ip_duration']
combined['ip_bytes_packet'] = combined['ip_bytes'] / combined['ip_packet']
combined['port_bytes_sec'] = combined['port_bytes'] / combined['ip_duration']
combined['port_packet_sec'] = combined['port_packet'] / combined['ip_duration']
combined['port_byte_packet'] = combined['port_bytes'] / combined['port_packet']
combined['port_flow_count_sec'] = combined['port_flow_count'] / combined['ip_duration']
combined['table_matched_lookup'] = combined['table_matched_count'] / combined['table_lookup_count']
combined['table_active_lookup'] = combined['table_active_count'] / combined['table_lookup_count']
combined['port_rx_packets_sec'] = combined['port_rx_packets'] / combined['port_duration_sec']
combined['port_tx_packets_sec'] = combined['port_tx_packets'] / combined['port_duration_sec']
combined['port_rx_bytes_sec'] = combined['port_rx_bytes'] / combined['port_duration_sec']
combined['port_tx_bytes_sec'] = combined['port_tx_bytes'] / combined['port_duration_sec']

In [7]:
# Removing identifying features
features = combined[
    [
        'ip_bytes_sec',
        'ip_packets_sec',
        'ip_bytes_packet',
        'port_bytes_sec',
        'port_packet_sec',
        'port_byte_packet',
        'port_flow_count_sec',
        'table_matched_lookup',
        'table_active_lookup',
        'port_rx_packets_sec',
        'port_tx_packets_sec',
        'port_rx_bytes_sec',
        'port_tx_bytes_sec',
        'type'
    ]
].copy()
features.describe()

Unnamed: 0,ip_bytes_sec,ip_packets_sec,ip_bytes_packet,port_bytes_sec,port_packet_sec,port_byte_packet,port_flow_count_sec,table_matched_lookup,table_active_lookup,port_rx_packets_sec,port_tx_packets_sec,port_rx_bytes_sec,port_tx_bytes_sec
count,50530.0,50530.0,49849.0,50573.0,50573.0,50573.0,50573.0,50573.0,50573.0,50573.0,50573.0,50573.0,50573.0
mean,inf,inf,494.915613,inf,inf,539.605586,inf,0.999999,0.000615,334.377167,334.484294,109532.0,106728.9
std,,,944.887709,,,668.153399,,7.9e-05,0.003125,628.565251,630.734287,229522.2,231407.2
min,0.0,0.0,42.0,18.14542,0.2145422,42.003799,0.002153625,0.9875,3e-06,0.04717,0.059514,2.842767,6.446359
25%,6.125,0.1150877,42.0,14022.97,101.6242,46.575499,0.0162413,1.0,1.2e-05,0.480519,0.967742,36.07407,93.16327
50%,997.3598,11.87286,54.0,93274.14,560.0522,55.617732,0.02508961,1.0,4.3e-05,37.783163,34.684932,2146.387,6597.053
75%,67548.16,314.0652,154.194175,410710.8,1437.216,1065.198882,0.04895105,1.0,0.000284,294.468368,287.739033,104549.7,100096.6
max,inf,inf,6221.172685,inf,inf,3174.864845,inf,1.0,0.093333,3050.359127,3049.918651,1135476.0,1135706.0


In [8]:
# Removing inf and NaN values
features.replace([np.inf, -np.inf], np.nan, inplace=True)
features.dropna(inplace=True)
features.describe()

Unnamed: 0,ip_bytes_sec,ip_packets_sec,ip_bytes_packet,port_bytes_sec,port_packet_sec,port_byte_packet,port_flow_count_sec,table_matched_lookup,table_active_lookup,port_rx_packets_sec,port_tx_packets_sec,port_rx_bytes_sec,port_tx_bytes_sec
count,49800.0,49800.0,49800.0,49800.0,49800.0,49800.0,49800.0,49800.0,49800.0,49800.0,49800.0,49800.0,49800.0
mean,93671.81,236.13131,494.886636,281928.3,907.439541,542.432774,0.066142,1.0,0.000566,339.41009,335.60413,111185.5,106797.7
std,235150.7,415.716971,945.110971,451296.1,1786.114418,670.550416,0.254826,3e-06,0.00272,632.046705,634.771152,230887.3,232471.3
min,0.2359551,0.005618,42.0,18.14542,0.214542,42.003799,0.002154,0.999696,3e-06,0.04717,0.059514,2.842767,6.446359
25%,6.786446,0.143802,42.0,13999.84,101.198653,46.581597,0.016166,1.0,1.2e-05,0.541667,0.904255,43.56643,84.9901
50%,1096.707,12.144878,54.0,92853.99,548.652665,55.620882,0.024735,1.0,4.2e-05,38.743503,33.900826,2216.779,6349.408
75%,67825.4,314.449498,154.183838,410189.3,1427.216086,1065.232359,0.04698,1.0,0.000273,297.171163,286.930287,107735.2,98284.74
max,1607856.0,3167.0,6221.172685,28185520.0,181944.0,3174.864845,9.0,1.0,0.093333,3050.359127,3049.918651,1135476.0,1135706.0


In [9]:
# Total attack and normal data
features['type'].value_counts()

type
attack    40356
normal     9444
Name: count, dtype: int64

### Train, Test splitting

In [10]:
# Splitting into dependent and independent variables
X = features.loc[ : , features.columns != 'type']
y = features['type']

# Splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

### Data scaling

In [11]:
# Scaling data using RS
X_train = pd.DataFrame(RobustScaler().fit_transform(X_train), columns=[
    'ip_bytes_sec',
    'ip_packets_sec',
    'ip_bytes_packet',
    'port_bytes_sec',
    'port_packet_sec',
    'port_byte_packet',
    'port_flow_count_sec',
    'table_matched_lookup',
    'table_active_lookup',
    'port_rx_packets_sec',
    'port_tx_packets_sec',
    'port_rx_bytes_sec',
    'port_tx_bytes_sec',
])

X_test = pd.DataFrame(RobustScaler().fit_transform(X_test), columns=[
    'ip_bytes_sec',
    'ip_packets_sec',
    'ip_bytes_packet',
    'port_bytes_sec',
    'port_packet_sec',
    'port_byte_packet',
    'port_flow_count_sec',
    'table_matched_lookup',
    'table_active_lookup',
    'port_rx_packets_sec',
    'port_tx_packets_sec',
    'port_rx_bytes_sec',
    'port_tx_bytes_sec',
])

### Model training and accuracy (No PCA)

In [12]:
# Fitting model to data
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [13]:
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8821285140562249


### Model training and accuracy (PCA applied)

In [14]:
from sklearn.decomposition import PCA

pca = PCA(n_components=5)
X_train2 = pd.DataFrame(pca.fit_transform(X_train))

In [15]:
rf = RandomForestClassifier()
rf.fit(X_train2, y_train)

In [16]:
X_test2 = pd.DataFrame(pca.transform(X_test))
y_pred = rf.predict(X_test2)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9987951807228915


### Exporting model

In [22]:
filename = 'test_model.sav'
pickle.dump(rf, open(filename, 'wb'), protocol=2)

In [23]:
filename = 'test_pca.sav'
pickle.dump(pca, open(filename, 'wb'), protocol=2)