## Cell 1: Imports & Config

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import re
import unicodedata

from rapidfuzz import process, fuzz
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM


## Cell 2: Load & Merge with Fuzzy Matching

In [2]:
def load_merge_files(folder_path):
    all_files = glob.glob(os.path.join(folder_path, "*.csv"))
    df_list = []
    for file in all_files:
        print(f"Loading: {file}")
        df = pd.read_csv(file, low_memory=False)
        df.columns = df.columns.str.strip()
        df_list.append(df)
    return pd.concat(df_list, ignore_index=True)

# Paths
path_2017 = "C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/cicids2017_dataset"
path_2018 = "C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/cicids2018_dataset"

# Load data
df_2017 = load_merge_files(path_2017)
df_2018 = load_merge_files(path_2018)

# Fuzzy match columns
def fuzzy_align_columns(cols_src, cols_target, threshold=85):
    rename_map = {}
    for col in cols_src:
        match, score, _ = process.extractOne(col, cols_target, scorer=fuzz.token_sort_ratio)
        if score >= threshold:
            rename_map[match] = col
    return rename_map

rename_dict = fuzzy_align_columns(df_2017.columns, df_2018.columns)
df_2018_renamed = df_2018.rename(columns=rename_dict)

# Keep common columns
common_cols = df_2017.columns.intersection(df_2018_renamed.columns)
df_2017 = df_2017[common_cols]
df_2018_renamed = df_2018_renamed[common_cols]


Loading: C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/cicids2017_dataset\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Loading: C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/cicids2017_dataset\Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Loading: C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/cicids2017_dataset\Friday-WorkingHours-Morning.pcap_ISCX.csv
Loading: C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/cicids2017_dataset\Monday-WorkingHours.pcap_ISCX.csv
Loading: C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/cicids2017_dataset\Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Loading: C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/cicids2017_dataset\Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Loading: C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/cicids2017_dataset\Tuesday-WorkingHours.

## Save the Merged File Locally

In [4]:
df = pd.concat([df_2017, df_2018_renamed], ignore_index=True).dropna()

# Save the merged file
df.to_csv("CICIDS2017_2018_Merged_Fuzzy.csv", index=False)
print("✅ Merged dataset saved as 'CICIDS2017_2018_Merged_Fuzzy.csv'")


✅ Merged dataset saved as 'CICIDS2017_2018_Merged_Fuzzy.csv'


## Cell 3: Clean Labels and Merge

In [3]:
def clean_label(label):
    if not isinstance(label, str):
        return 'Attack'
    label = label.strip().lower()
    label = unicodedata.normalize("NFKD", label).encode("ascii", "ignore").decode()
    label = re.sub(r"[^a-zA-Z0-9_]", "_", label)
    return label.replace("__", "_")

df_2017['Label'] = df_2017['Label'].apply(clean_label)
df_2018_renamed['Label'] = df_2018_renamed['Label'].apply(clean_label)

# Binary label for stage 1
df_2017['BinaryLabel'] = df_2017['Label'].apply(lambda x: 'Benign' if 'benign' in x else 'Malicious')
df_2018_renamed['BinaryLabel'] = df_2018_renamed['Label'].apply(lambda x: 'Benign' if 'benign' in x else 'Malicious')

df = pd.concat([df_2017, df_2018_renamed], ignore_index=True).dropna()


In [None]:
df = pd.concat([df_2017, df_2018_renamed], ignore_index=True).dropna()

# Save the merged file
df.to_csv("CICIDS2017_2018_Merged_Fuzzy.csv", index=False)
print("✅ Merged dataset saved as 'CICIDS2017_2018_Merged_Fuzzy.csv'")

## Cell 4: Feature Selection (Random Forest)

In [5]:
X = df.drop(columns=['Label', 'BinaryLabel'])
X = X.apply(pd.to_numeric, errors='coerce').dropna()
y_binary = df.loc[X.index, 'BinaryLabel']

# Encode binary labels
le_bin = LabelEncoder()
y_binary_encoded = le_bin.fit_transform(y_binary)

# Random Forest for top features
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y_binary_encoded)

# Select top 40 features
feat_imp = pd.Series(rf.feature_importances_, index=X.columns)
top_features = feat_imp.sort_values(ascending=False).head(40).index.tolist()
X = X[top_features]


ValueError: Input X contains infinity or a value too large for dtype('float32').

## 4.1

In [None]:
# Save the full merged dataset with cleaned labels
df.to_csv("CICIDS2017_2018_Merged_Cleaned.csv", index=False)

# Save the top 40 selected features with binary labels
df_selected = X.copy()
df_selected['BinaryLabel'] = y_binary
df_selected.to_csv("CICIDS2017_2018_SelectedFeatures_Binary.csv", index=False)


## file format

In [None]:
import joblib

# Save LabelEncoder
joblib.dump(le_bin, 'binary_label_encoder.pkl')


## Cell 5: Stage 1 - Binary Classification

In [None]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_binary_encoded, test_size=0.2, random_state=42, stratify=y_binary_encoded
)

# Binary Classifier (Random Forest)
rf_stage1 = RandomForestClassifier(n_estimators=100, random_state=42)
rf_stage1.fit(X_train, y_train)
binary_preds = rf_stage1.predict(X_test)

print("=== Stage 1: Binary Classification ===")
print(confusion_matrix(y_test, binary_preds))
print(classification_report(y_test, binary_preds, target_names=le_bin.classes_))


## Cell 6: Stage 2 - Multiclass Classification

# Filter malicious only
malicious_idx = np.where(binary_preds == 1)[0]
X_malicious = X_test[malicious_idx]
y_multiclass = df.loc[X.index, 'Label']
y_multiclass_encoded = LabelEncoder().fit_transform(y_multiclass)
y_mal_test = y_multiclass_encoded[y_test == 1]

# Train multiclass model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train[y_train == 1], y_multiclass_encoded[y_train == 1])

multi_preds = xgb_model.predict(X_malicious)

print("=== Stage 2: Multiclass Classification ===")
print(confusion_matrix(y_mal_test, multi_preds))
print(classification_report(y_mal_test, multi_preds))
