In [5]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

## MCAD Dataset Features

The MCAD dataset contains network flow data with the following features:

**IP Address Information:**

* `src`: Source IP address
* `dst`: Destination IP address

**Flow Information:**

* `table_id`: ID of the table where the flow is stored
* `ip_bytes`: Number of bytes in the flow on the IP address level
* `ip_packet`: Number of packets in the flow on the IP address level
* `ip_duration`: Duration (in seconds) the flow has been alive on the IP address level

**Port Information:**

* `in_port`: Receiver port number
* `dl dst`: MAC address of the receiver host
* `port_bytes`: Number of bytes in the flow on the port level
* `port_packet`: Number of packets in the flow on the port level (may be a duplicate of `ip_packet`)
* `port_flow_count`: Number of flows on the port level

**Table Information:**

* `table_active_count`: Number of active entries in the table
* `table_lookup_count`: Number of packets looked up in the table
* `table_matched_count`: Number of packets that matched an entry in the table

**Packet Transfer Information:**

* `port_rx_packets`: Number of received packets on the port level
* `port_tx_packets`: Number of transmitted packets on the port level
* `port_rx_bytes`: Number of received bytes on the port level
* `port_tx_bytes`: Number of transmitted bytes on the port level

**Packet Drop and Error Information:**

* `port_rx_dropped`: Number of packets dropped by the receiver on the port level
* `port_tx_dropped`: Number of packets dropped by the transmitter on the port level
* `port_rx_errors`: Number of received errors
* `port_tx_errors`: Number of transmitted errors

**Frame Error Information:**

* `port_rx_frame_err`: Number of frame alignment errors
* `port_rx_over_err`: Number of received packets with overrun
* `port_rx_crc_err`: Number of CRC errors

**Collision Information:**

* `port_collisions`: Number of collisions

### Data Preprocessing

In [6]:
DATASET_PATH = "../../Dataset/MCAD Dataset/"

# Attack flow data
attack_files = {
    "ddos": [
        "attack_ddos_tcp.csv",
        "attack_ddos_udp.csv",
        "ddos_attack_scapy_new.csv"
    ],
    "probe": [
        "attack_os_port_scan.csv"
    ],
    "web": [
        "attack_sql_injection.csv"
    ],
    "r2l": [
        "attack_bruteforce.csv",
        "attack_cmd.csv"
    ],
    "u2r": [
        "attack_vnc.csv",
        "attack_samba.csv"
    ],
    "normal": [
        "normal_ditg.csv",
        "normal_internet1.csv",
        "normal_internet2.csv",
        "normal_internet3.csv",
        "normal_iperf.csv",
    ]
}

dfs = []

for attack_type, files in attack_files.items():
    for file in files:
        try:
            data = pd.read_csv(DATASET_PATH + file)
            data["type"] = attack_type
            dfs.append(data)
        except FileNotFoundError:
            print(f"Error: File '{file}' not found. Skipping...")


# Concatenate all DataFrames
combined_data = pd.concat(dfs, ignore_index=True)
print("Successfully combined data into a single DataFrame!")

Successfully combined data into a single DataFrame!


In [7]:
# Randomize data, remove previous indices
combined = combined_data.sample(frac=1).reset_index().drop('index', axis = 1)
combined.head()

Unnamed: 0,src,dst,table_id,ip_bytes,ip_packet,ip_duration,in_port,dl_dst,port_bytes,port_packet,...,port_rx_dropped,port_tx_dropped,port_rx_errors,port_tx_errors,port_rx_frame_err,port_rx_over_err,port_rx_crc_err,port_collisions,port_duration_sec,type
0,10.0.0.4,internet,0,12204,152,2,3,52:75:9b:69:f1:fc,14256,162,...,0,0,0,0,0,0,0,0,1064,normal
1,10.0.0.2,10.0.0.3,0,24750,375,37,2,1a:16:b6:94:29:a4,24750,375,...,0,0,0,0,0,0,0,0,52349,normal
2,192.168.221.129,192.168.18.129,0,206,3,3,1,08:00:27:c8:c8:75,11272,165,...,0,0,0,0,0,0,0,0,47354,probe
3,10.0.0.1,10.0.0.4,0,8470007292,184758,5,1,36:97:e5:0a:a1:9d,8470007292,184758,...,0,0,0,0,0,0,0,0,33443,normal
4,192.168.221.129,192.168.18.129,0,1692,8,3,1,08:00:27:c8:c8:75,15077,45,...,0,0,0,0,0,0,0,0,26325,web


In [8]:
# Extracting relevant features
combined['ip_bytes_sec'] = combined['ip_bytes'] / combined['ip_duration']
combined['ip_packets_sec'] = combined['ip_packet'] / combined['ip_duration']
combined['ip_bytes_packet'] = combined['ip_bytes'] / combined['ip_packet']
combined['port_bytes_sec'] = combined['port_bytes'] / combined['ip_duration']
combined['port_packet_sec'] = combined['port_packet'] / combined['ip_duration']
combined['port_byte_packet'] = combined['port_bytes'] / combined['port_packet']
combined['port_flow_count_sec'] = combined['port_flow_count'] / combined['ip_duration']
combined['table_matched_lookup'] = combined['table_matched_count'] / combined['table_lookup_count']
combined['table_active_lookup'] = combined['table_active_count'] / combined['table_lookup_count']
combined['port_rx_packets_sec'] = combined['port_rx_packets'] / combined['port_duration_sec']
combined['port_tx_packets_sec'] = combined['port_tx_packets'] / combined['port_duration_sec']
combined['port_rx_bytes_sec'] = combined['port_rx_bytes'] / combined['port_duration_sec']
combined['port_tx_bytes_sec'] = combined['port_tx_bytes'] / combined['port_duration_sec']

In [9]:
# Removing identifying features
features = combined[
    [
        'ip_bytes_sec',
        'ip_packets_sec',
        'ip_bytes_packet',
        'port_bytes_sec',
        'port_packet_sec',
        'port_byte_packet',
        'port_flow_count_sec',
        'table_matched_lookup',
        'table_active_lookup',
        'port_rx_packets_sec',
        'port_tx_packets_sec',
        'port_rx_bytes_sec',
        'port_tx_bytes_sec',
        'type'
    ]
].copy()
features.describe()

Unnamed: 0,ip_bytes_sec,ip_packets_sec,ip_bytes_packet,port_bytes_sec,port_packet_sec,port_byte_packet,port_flow_count_sec,table_matched_lookup,table_active_lookup,port_rx_packets_sec,port_tx_packets_sec,port_rx_bytes_sec,port_tx_bytes_sec
count,580404.0,580404.0,452277.0,597407.0,597407.0,494550.0,600639.0,600639.0,600639.0,600639.0,600639.0,600639.0,600639.0
mean,inf,inf,1202.044452,inf,inf,1041.411101,inf,0.999452,0.00019,2257.680682,2335.643851,6631135.0,6667613.0
std,,,6414.474897,,,6123.736533,,0.001358,0.001123,10219.996477,11071.044552,26038330.0,26183250.0
min,0.0,0.0,42.0,0.0,0.0,42.0,0.0,0.96671,0.0,0.075705,0.031126,5.342051,1.945194
25%,60.0,1.0,69.5,140.395,2.004526,72.095238,1.0,0.999298,7e-06,13.510969,2.672675,840.8912,245.6661
50%,556.0,5.0,81.0,5006.0,36.0,103.3,20.0,0.999996,2e-05,40.713272,7.152422,9082.562,2374.347
75%,859055.0,11400.05,211.5,1624788.0,18984.74,309.021127,87.0,1.0,0.000149,56.834448,13.714397,25227.39,13554.93
max,inf,inf,59433.840607,inf,inf,59433.840607,inf,1.0,0.117647,76046.285714,77672.248623,361362000.0,362690300.0


In [10]:
# Removing inf and NaN values
features.replace([np.inf, -np.inf], np.nan, inplace=True)
features.dropna(inplace=True)
features.describe()

Unnamed: 0,ip_bytes_sec,ip_packets_sec,ip_bytes_packet,port_bytes_sec,port_packet_sec,port_byte_packet,port_flow_count_sec,table_matched_lookup,table_active_lookup,port_rx_packets_sec,port_tx_packets_sec,port_rx_bytes_sec,port_tx_bytes_sec
count,351440.0,351440.0,351440.0,351440.0,351440.0,351440.0,351440.0,351440.0,351440.0,351440.0,351440.0,351440.0,351440.0
mean,46547300.0,3109.895009,1386.777235,47029060.0,5180.485586,1297.69058,12.271354,0.999423,8e-05,3747.721088,3907.38817,9034158.0,9101573.0
std,323254700.0,9352.464107,6996.579666,325832100.0,15300.665796,6991.41621,25.896976,0.001452,0.000349,13148.260145,14256.084923,29984900.0,30145590.0
min,13.5,0.25,42.0,13.5,0.25,42.0,7e-05,0.966762,0.0,0.075705,0.031173,5.342051,1.951717
25%,81.0,1.001032,69.5,1336.0,10.105263,70.0,0.032258,0.999284,5e-06,6.210569,4.37277,552.2635,569.466
50%,664.8889,5.5,82.25,4883.0,34.666667,100.108108,3.5,0.999995,1.4e-05,42.562171,7.80197,14633.53,7416.903
75%,7937.619,19.0,210.444444,74088.0,136.0,337.571429,16.0,1.0,4.2e-05,58.571233,23.105143,26826.65,14726.4
max,5466231000.0,102303.0,57707.308373,5909109000.0,134900.0,57707.308373,857.0,1.0,0.046875,76046.285714,77672.248623,354913800.0,289363100.0


In [11]:
# Total attack and normal data
features['type'].value_counts()

type
normal    112688
ddos       72041
u2r        56036
r2l        47482
web        43432
probe      19761
Name: count, dtype: int64

### Train, Test splitting

In [12]:
# Splitting into dependent and independent variables
X = features.loc[ : , features.columns != 'type']
y = features['type']

# Splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

### Data scaling

In [13]:
# Scaling data using RS
X_train = pd.DataFrame(RobustScaler().fit_transform(X_train), columns=[
    'ip_bytes_sec',
    'ip_packets_sec',
    'ip_bytes_packet',
    'port_bytes_sec',
    'port_packet_sec',
    'port_byte_packet',
    'port_flow_count_sec',
    'table_matched_lookup',
    'table_active_lookup',
    'port_rx_packets_sec',
    'port_tx_packets_sec',
    'port_rx_bytes_sec',
    'port_tx_bytes_sec',
])

X_test = pd.DataFrame(RobustScaler().fit_transform(X_test), columns=[
    'ip_bytes_sec',
    'ip_packets_sec',
    'ip_bytes_packet',
    'port_bytes_sec',
    'port_packet_sec',
    'port_byte_packet',
    'port_flow_count_sec',
    'table_matched_lookup',
    'table_active_lookup',
    'port_rx_packets_sec',
    'port_tx_packets_sec',
    'port_rx_bytes_sec',
    'port_tx_bytes_sec',
])

### Model training and accuracy

Without PCA

In [None]:
# Fitting model to data
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [None]:
# Getting accuracy of the model
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

With PCA

In [14]:
from sklearn.decomposition import PCA

pca = PCA(n_components=6)
X_train2 = pd.DataFrame(pca.fit_transform(X_train))

In [15]:
rf = RandomForestClassifier()
rf.fit(X_train2, y_train)

In [16]:
X_test2 = pd.DataFrame(pca.transform(X_test))

In [17]:
y_pred = rf.predict(X_test2)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.957830639653995


**Accuracy records**:
 - Accuracy: 0.9438026405645344
 - Accuracy: 0.9455383564762122
 - Accuracy: 0.9928864101980424