In [1]:
# --- Stage 1: Data Loading and Label Cleaning (dataset is already merged) ---

import pandas as pd
import numpy as np
import glob
import os
import re
import unicodedata
from rapidfuzz import process, fuzz
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
import joblib

# Read the pre-merged dataset from disk into the working DataFrame
MERGED_CSV_PATH = "CICIDS2017_2018_Merged_Fuzzy.csv"
df = pd.read_csv(MERGED_CSV_PATH)

# Clean label column again in case needed
def clean_label(label: object) -> str:
    """Normalize a raw traffic label into a lowercase ASCII identifier.

    Strings are stripped, lowercased, reduced to ASCII (NFKD decomposition,
    dropping anything that cannot be encoded), and every character outside
    ``[a-zA-Z0-9_]`` is turned into an underscore. Runs of underscores are
    then collapsed to a single one.

    Args:
        label: Raw value taken from the ``Label`` column.

    Returns:
        The normalized label, or the placeholder ``'Attack'`` when ``label``
        is not a string (e.g. a NaN from a missing cell).
    """
    if not isinstance(label, str):
        return 'Attack'
    label = label.strip().lower()
    label = unicodedata.normalize("NFKD", label).encode("ascii", "ignore").decode()
    label = re.sub(r"[^a-zA-Z0-9_]", "_", label)
    # Collapse runs of ANY length. A single str.replace("__", "_") pass turns
    # "___" into "__", so "a - b" and "a  b" would end up as different labels.
    return re.sub(r"_+", "_", label)

# Normalize every raw label, then derive the two-class target from it.
df['Label'] = df['Label'].apply(clean_label)

# Any cleaned label containing 'benign' is Benign; everything else
# (including the 'Attack' placeholder for non-string labels) is Malicious.
df['BinaryLabel'] = df['Label'].apply(lambda x: 'Benign' if 'benign' in x else 'Malicious')

# NOTE: df must not be re-read from disk after this point; doing so would
# discard the cleaned 'Label' and the 'BinaryLabel' column used downstream.


In [None]:
# --- Stage 1: Binary Classification ---

# Tunable settings for the feature-ranking forest.
N_TOP_FEATURES_BIN = 40
RF_BIN_N_ESTIMATORS = 100
RF_BIN_RANDOM_STATE = 42

# Build the feature matrix: drop the target columns, coerce everything to
# numeric (unparseable values become NaN) and treat +/-inf as missing.
X = (
    df.drop(columns=['Label', 'BinaryLabel'])
    .apply(pd.to_numeric, errors='coerce')
    .replace([np.inf, -np.inf], np.nan)
)

# A column that holds no numeric value at all is entirely NaN after coercion.
# Remove such columns first, otherwise the row-wise dropna() below would
# discard every single row.
X = X.dropna(axis=1, how='all')

# Drop rows with any remaining NaN (including previously inf values)
X = X.dropna()

# Fail early with a clear message instead of an opaque estimator error.
if X.empty:
    raise ValueError(
        "No usable rows/columns remain after numeric coercion and NaN removal."
    )

# Define the binary target aligned with the rows that survived cleaning
y_binary = df.loc[X.index, 'BinaryLabel']

# Encode labels as integers
le_bin = LabelEncoder()
y_binary_encoded = le_bin.fit_transform(y_binary)

# Rank features with a Random Forest fitted on the full cleaned matrix
rf_bin = RandomForestClassifier(
    n_estimators=RF_BIN_N_ESTIMATORS,
    random_state=RF_BIN_RANDOM_STATE,
)
rf_bin.fit(X, y_binary_encoded)

# Importance score per feature, indexed by column name
feat_imp_bin = pd.Series(rf_bin.feature_importances_, index=X.columns)

# Names of the highest-scoring features, most important first
top_features_bin = (
    feat_imp_bin.sort_values(ascending=False)
    .head(N_TOP_FEATURES_BIN)
    .index.tolist()
)

# Restrict the feature matrix to the selected features
X_stage1 = X[top_features_bin]



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the selected feature importances, most important first.
# The count comes from the list itself so the title stays truthful when
# fewer features are available than were requested.
n_plotted = len(top_features_bin)

plt.figure(figsize=(12, 10))
ax = sns.barplot(
    x=feat_imp_bin[top_features_bin].values,
    y=top_features_bin,
    # Passing `palette` without `hue` is deprecated in recent seaborn releases;
    # mapping hue to the same labels keeps one colour per bar.
    hue=top_features_bin,
    dodge=False,
    palette='viridis'
)

# A legend would only repeat the y-axis labels; remove it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

plt.title(f'Top {n_plotted} Features by Random Forest Importance (Stage 1)', fontsize=16)
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.tight_layout()
plt.show()
