# Data Processing

## Imports

In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

from boruta import BorutaPy
from sklearn.feature_selection import SelectFdr, SequentialFeatureSelector, chi2
from sklearn.inspection import permutation_importance

from sklearn.decomposition import PCA, FastICA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

## Constants

In [None]:
# Root locations of the two datasets (left blank here); the loading cells
# combine them with the file names listed below.
MCAD_DATASET_PATH = ""
COLLECTED_DATASET_PATH = ""

# Traffic class -> CSV files of the MCAD dataset that receive that label.
MCAD_CLASS_FILE_MAP = {
    "ddos": ["attack_ddos_tcp.csv", "attack_ddos_udp.csv", "ddos_attack_scapy_new.csv"],
    "probe": ["attack_os_port_scan.csv"],
    "web": ["attack_sql_injection.csv"],
    "r2l": ["attack_bruteforce.csv", "attack_cmd.csv"],
    "u2r": ["attack_vnc.csv", "attack_samba.csv"],
    "normal": [
        "normal_ditg.csv",
        "normal_internet1.csv",
        "normal_internet2.csv",
        "normal_internet3.csv",
        "normal_iperf.csv",
    ],
}

# Traffic class -> CSV files of the collected dataset that receive that label.
# The class keys are the same six as in MCAD_CLASS_FILE_MAP.
COLLECTED_CLASS_FILE_MAP = {
    "ddos": ["attack_ddos_icmp.csv", "attack_ddos_tcp.csv", "attack_ddos_udp.csv"],
    "probe": ["attack_nmap_probe.csv"],
    "web": ["attack_selenium_sqli.csv", "attack_selenium_xss.csv"],
    "r2l": [
        "attack_selenium_bruteforce.csv",
        "attack_selenium_cmd.csv",
        "attack_selenium_file_upload.csv",
    ],
    "u2r": ["attack_msf_samba.csv", "attack_msf_vnc.csv"],
    "normal": [
        "normal_dns.csv",
        "normal_iperf.csv",
        "normal_ping.csv",
        "normal_telnet.csv",
        "normal_w3m.csv",
    ],
}

## Data Loading

### MCAD Dataset

In [None]:
# Read every MCAD capture listed in MCAD_CLASS_FILE_MAP and label its rows
# (column "type") with the traffic class the file is listed under.
mcad_dfs = []

for attack_class, files in MCAD_CLASS_FILE_MAP.items():
    for file in files:
        try:
            # os.path.join supplies the separator when the base path has no
            # trailing one and returns the bare file name when it is empty.
            data = pd.read_csv(os.path.join(MCAD_DATASET_PATH, file))
        except FileNotFoundError:
            # Best effort: a missing capture is reported and skipped.
            print(f"Error: File '{file}' not found. Skipping...")
            continue
        data["type"] = attack_class
        mcad_dfs.append(data)

# pd.concat on an empty list fails with an uninformative ValueError; raise one
# that names the real cause (no file could be read) instead.
if not mcad_dfs:
    raise ValueError(
        f"No MCAD files could be read from {MCAD_DATASET_PATH!r}; "
        "check MCAD_DATASET_PATH."
    )

# Concatenate all DataFrames
mcad_combined_data = pd.concat(mcad_dfs, ignore_index=True)
print("Successfully combined MCAD data into a single DataFrame!")

In [None]:
# Display the first rows of the combined MCAD frame as a quick sanity check.
mcad_combined_data.head()

In [None]:
# TODO: plot of counts per class (MCAD dataset)

### Collected Dataset

In [None]:
# Read every collected capture listed in COLLECTED_CLASS_FILE_MAP and label
# its rows (column "type") with the traffic class the file is listed under.
collected_dfs = []

for attack_class, files in COLLECTED_CLASS_FILE_MAP.items():
    for file in files:
        try:
            # os.path.join supplies the separator when the base path has no
            # trailing one and returns the bare file name when it is empty.
            data = pd.read_csv(os.path.join(COLLECTED_DATASET_PATH, file))
        except FileNotFoundError:
            # Best effort: a missing capture is reported and skipped.
            print(f"Error: File '{file}' not found. Skipping...")
            continue
        data["type"] = attack_class
        collected_dfs.append(data)

# pd.concat on an empty list fails with an uninformative ValueError; raise one
# that names the real cause (no file could be read) instead.
if not collected_dfs:
    raise ValueError(
        f"No collected files could be read from {COLLECTED_DATASET_PATH!r}; "
        "check COLLECTED_DATASET_PATH."
    )

# Concatenate all DataFrames
collected_combined_data = pd.concat(collected_dfs, ignore_index=True)
print("Successfully combined collected data into a single DataFrame!")

In [None]:
# Display the first rows of the combined collected frame as a quick sanity check.
collected_combined_data.head()

In [None]:
# TODO: plot of counts per attack class (collected dataset)

In [None]:
# TODO: plot of counts per normal class (collected dataset)

### Dataset Comparison

In [None]:
# TODO: plot comparing the counts of the MCAD and collected datasets

## Preprocessing

### Cleansing / Shuffling

In [None]:
# Shuffle the MCAD rows so they are no longer grouped in file-concatenation
# order. The fixed seed makes the shuffle repeatable across kernel restarts,
# and reset_index(drop=True) renumbers the rows without first turning the old
# index into a column that then has to be dropped.
combined = mcad_combined_data.sample(frac=1, random_state=42).reset_index(drop=True)
combined.head()

### Division Transformation

In [None]:
# Laplacian Correction: add one to every raw column that feeds the ratio
# features, so none of them is zero when it is used as a denominator
# (assumes the raw values are non-negative — TODO confirm).
# NOTE: this updates `combined` in place, so re-running the cell adds one again.
relevant_attributes = [
    'ip_bytes', 'ip_packet', 'ip_duration',
    'port_bytes', 'port_packet', 'port_flow_count',
    'table_matched_count', 'table_active_count', 'table_lookup_count',
    'port_rx_packets', 'port_tx_packets',
    'port_rx_bytes', 'port_tx_bytes',
    'port_duration_sec',
]
combined[relevant_attributes] = combined[relevant_attributes] + 1

In [None]:
# Division transformation: every derived column is the element-wise quotient
# numerator / denominator of two existing columns of `combined`.
# NOTE(review): port_bytes_sec, port_packet_sec and port_flow_count_sec are
# divided by ip_duration rather than port_duration_sec — confirm this is intended.
ratio_specs = {
    # new column:           (numerator,             denominator)
    'ip_bytes_sec':         ('ip_bytes',            'ip_duration'),
    'ip_packets_sec':       ('ip_packet',           'ip_duration'),
    'ip_bytes_packet':      ('ip_bytes',            'ip_packet'),
    'port_bytes_sec':       ('port_bytes',          'ip_duration'),
    'port_packet_sec':      ('port_packet',         'ip_duration'),
    'port_byte_packet':     ('port_bytes',          'port_packet'),
    'port_flow_count_sec':  ('port_flow_count',     'ip_duration'),
    'table_matched_lookup': ('table_matched_count', 'table_lookup_count'),
    'table_active_lookup':  ('table_active_count',  'table_lookup_count'),
    'port_rx_packets_sec':  ('port_rx_packets',     'port_duration_sec'),
    'port_tx_packets_sec':  ('port_tx_packets',     'port_duration_sec'),
    'port_rx_bytes_sec':    ('port_rx_bytes',       'port_duration_sec'),
    'port_tx_bytes_sec':    ('port_tx_bytes',       'port_duration_sec'),
}
for feature, (numerator, denominator) in ratio_specs.items():
    combined[feature] = combined[numerator] / combined[denominator]

In [None]:
# Keep only the derived ratio features plus the class label. The .copy()
# makes `data` an independent frame rather than a selection of `combined`.
feature_columns = [
    'ip_bytes_sec',
    'ip_packets_sec',
    'ip_bytes_packet',
    'port_bytes_sec',
    'port_packet_sec',
    'port_byte_packet',
    'port_flow_count_sec',
    'table_matched_lookup',
    'table_active_lookup',
    'port_rx_packets_sec',
    'port_tx_packets_sec',
    'port_rx_bytes_sec',
    'port_tx_bytes_sec',
]
data = combined[feature_columns + ['type']].copy()
data.describe()

## Feature Selection

In [None]:
# Separate the target (the 'type' label) from the feature matrix
# (every other column of `data`).
y = data['type']
X = data.drop(columns='type')

### Benjamini–Hochberg False Discovery Rate (FDR) method

In [None]:
# Univariate filter: chi-squared score per feature, keeping the features whose
# p-values pass the false-discovery-rate threshold alpha.
# chi2 requires non-negative feature values.
fdr_filter = SelectFdr(score_func=chi2, alpha=1e-8)
selector = fdr_filter.fit(X, y)

In [None]:
# Names of the features kept by the FDR filter, stored as a set so they can be
# intersected with the other selectors' results.
fdr_selected = set(selector.get_feature_names_out())

### Sequential Feature Selection  

In [None]:
# 3-nearest-neighbours classifier; it serves as the estimator that the
# sequential feature selector evaluates feature subsets with.
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Greedy wrapper selection with the KNN estimator, stopping once 7 features
# have been chosen. No direction or cv is passed, so the library defaults apply.
sfs = SequentialFeatureSelector(estimator=knn, n_features_to_select=7)
sfs.fit(X, y)

In [None]:
# Names of the features picked by sequential selection, stored as a set so
# they can be intersected with the other selectors' results.
sfs_selected = set(sfs.get_feature_names_out())

### Boruta

In [None]:
# Shallow, class-balanced random forest used as Boruta's base estimator.
# random_state pins the forest's bootstrap/feature sampling so the Boruta
# result is the same on every Restart & Run All.
rf = RandomForestClassifier(
    n_jobs=-1,
    max_depth=5,
    class_weight='balanced',
    random_state=42,
)

In [None]:
# Boruta all-relevant feature selection around the random forest.
# - random_state fixes the shadow-feature shuffling so accepted/rejected
#   features do not change between runs.
# - NumPy arrays are passed instead of pandas objects: BorutaPy documents
#   array input, and older releases index X positionally.
# max_iter=10 caps the number of Boruta iterations.
feat_selector = BorutaPy(
    rf,
    n_estimators='auto',
    verbose=1,
    max_iter=10,
    random_state=42,
)
feat_selector.fit(X.values, y.values)

In [None]:
# Print Boruta's verdict and ranking for every feature column of X.
print("\n------Support and Ranking for each feature------")
for column, passed, rank in zip(X.columns, feat_selector.support_, feat_selector.ranking_):
    if passed:
        print("Passes the test: ", column, " - Ranking: ", rank)
    else:
        print("Doesn't pass the test: ", column, " - Ranking: ", rank, " X")

In [None]:
# Names of the columns whose entry in Boruta's support_ mask is truthy.
boruta_selected = {
    column
    for column, passed in zip(X.columns, feat_selector.support_)
    if passed
}

### Intersection of Selected Features

In [None]:
# Features accepted by all three selectors (FDR, sequential selection, Boruta).
common_features = fdr_selected & sfs_selected & boruta_selected

# Walk X's columns rather than calling list() on the set: the iteration order
# of a set of strings can differ between interpreter runs (hash
# randomisation), which would reorder the model inputs and make results vary.
# NOTE: the PCA / ICA / LDA cells request 4 components, so they will fail if
# fewer than 4 features survive the intersection.
selected = [column for column in X.columns if column in common_features]
selected

## Scaling

In [None]:
# Restrict the feature matrix to the columns listed in `selected`.
# NOTE: this rebinds X, so the full feature matrix is no longer available
# under that name afterwards.
X = X[selected]

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# RobustScaler with its default settings (scikit-learn's scaler based on the
# median and interquartile range).
scaler = RobustScaler()

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the held-out rows.
train_scaled = scaler.fit_transform(X_train)
test_scaled = scaler.transform(X_test)

# Wrap the arrays back into frames with the selected column names.
# NOTE: the new frames get a fresh default index, so they line up with
# y_train / y_test by position only, not by index label.
# NOTE: X_train / X_test are rebound, so re-running this cell rescales
# already-scaled data.
X_train = pd.DataFrame(train_scaled, columns=selected)
X_test = pd.DataFrame(test_scaled, columns=selected)

## Dimensionality Reduction

### Without dimensionality reduction

In [None]:
# Baseline: random forest trained on the scaled features without any
# dimensionality reduction. The fixed seed removes run-to-run forest
# randomness, so score differences between methods reflect the inputs.
# NOTE: this rebinds `rf`, replacing the estimator used for Boruta earlier.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Score the baseline forest on the held-out rows.
y_pred = rf.predict(X_test)

# Class-weighted precision / recall / F-score; the fourth return value is discarded.
precision, recall, fscore, _ = precision_recall_fscore_support(
    y_test, y_pred, average='weighted'
)

# NOTE(review): `accuracy` is also assigned by the PCA / ICA / LDA evaluation
# cells, so this baseline value is overwritten once those cells run.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

### With PCA

In [None]:
# Fit a 4-component PCA on the training features and keep the projected
# training rows as a frame (columns are the default integer labels).
pca = PCA(n_components=4)
train_components = pca.fit_transform(X_train)
X_train_pca = pd.DataFrame(train_components)

In [None]:
# Random forest trained on the PCA-projected training rows. The fixed seed
# removes run-to-run forest randomness, so score differences between the
# reduction methods reflect the inputs rather than the seed.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_pca, y_train)

In [None]:
# Project the held-out rows with the PCA that was fitted on the training rows.
X_test_pca = pd.DataFrame(pca.transform(X_test))

In [None]:
# Score the PCA-based forest on the projected held-out rows.
y_pred = rf.predict(X_test_pca)

# Class-weighted precision / recall / F-score; the fourth return value is discarded.
pca_precision, pca_recall, pca_fscore, _ = precision_recall_fscore_support(
    y_test, y_pred, average='weighted'
)

# NOTE(review): unlike the pca_-prefixed metrics above, `accuracy` is a shared
# name; assigning it here replaces whatever value it held before.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

### With ICA

In [None]:
# Fit a 4-component FastICA on the training features and keep the projected
# training rows as a frame. FastICA starts from a random unmixing matrix, so
# random_state is fixed to make the components repeatable between runs.
ica = FastICA(n_components=4, random_state=42)
X_train_ica = pd.DataFrame(ica.fit_transform(X_train))

In [None]:
# Random forest trained on the ICA-projected training rows. The fixed seed
# removes run-to-run forest randomness, so score differences between the
# reduction methods reflect the inputs rather than the seed.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_ica, y_train)

In [None]:
# Project the held-out rows with the ICA that was fitted on the training rows.
X_test_ica = pd.DataFrame(ica.transform(X_test))

In [None]:
# Score the ICA-based forest on the projected held-out rows.
y_pred = rf.predict(X_test_ica)

# Class-weighted precision / recall / F-score; the fourth return value is discarded.
ica_precision, ica_recall, ica_fscore, _ = precision_recall_fscore_support(
    y_test, y_pred, average='weighted'
)

# NOTE(review): unlike the ica_-prefixed metrics above, `accuracy` is a shared
# name; assigning it here replaces whatever value it held before.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

### With LDA

In [None]:
# Supervised projection: fit a 4-component LDA on the training features and
# labels, then keep the projected training rows as a frame.
lda = LinearDiscriminantAnalysis(n_components=4)
X_train_lda = pd.DataFrame(lda.fit(X_train, y_train).transform(X_train))

In [None]:
# Random forest trained on the LDA-projected training rows. The fixed seed
# removes run-to-run forest randomness, so score differences between the
# reduction methods reflect the inputs rather than the seed.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_lda, y_train)

In [None]:
# Project the held-out rows with the LDA that was fitted on the training rows.
X_test_lda = pd.DataFrame(lda.transform(X_test))

In [None]:
# Score the LDA-based forest on the projected held-out rows.
y_pred = rf.predict(X_test_lda)

# Class-weighted precision / recall / F-score; the fourth return value is discarded.
lda_precision, lda_recall, lda_fscore, _ = precision_recall_fscore_support(
    y_test, y_pred, average='weighted'
)

# NOTE(review): unlike the lda_-prefixed metrics above, `accuracy` is a shared
# name; assigning it here replaces whatever value it held before.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

### Dimensionality Reduction Methods Comparison

In [None]:
# TODO: plot comparing all four approaches (no reduction, PCA, ICA, LDA) on accuracy, precision, recall and F1 score