## 2 stage IDS Hybrid

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

import xgboost as xgb

### Loading Merge Dataset

In [3]:
# Load dataset
file_path = 'CICIDS2017_2018_Merged_Fuzzy.csv'
df = pd.read_csv(file_path, low_memory=False)

### Cleaning NaN And Infinity Value

In [4]:
# Clean data
df = df.dropna(axis=1, how='all')  # Drop columns that are all NaN
df = df.dropna()  # Drop rows with NaN
df = df.replace([np.inf, -np.inf], np.nan).dropna()  # Drop infs
df = df.select_dtypes(include=[np.number])  # Only numeric features

# Load labels separately (from original CSV)
labels = pd.read_csv(file_path, usecols=['Label', 'BinaryLabel'])
labels = labels.loc[df.index]  # Align with filtered df

In [5]:
# Stage 1: Binary Classification with RF
y_binary = labels['BinaryLabel']
le_bin = LabelEncoder()
y_binary_encoded = le_bin.fit_transform(y_binary)

# Sample 30% for memory safety
X_sampled, _, y_sampled, _ = train_test_split(df, y_binary_encoded, test_size=0.7, random_state=42)

# Train RF
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_sampled, y_sampled)

# Top 30 features
feat_imp = pd.Series(rf.feature_importances_, index=df.columns)
top_features = feat_imp.sort_values(ascending=False).head(30).index.tolist()
print("Top 30 features used:\n", top_features)

# Plot
plt.figure(figsize=(12, 6))
sns.barplot(x=feat_imp[top_features], y=top_features)
plt.title("Top 30 Feature Importances (Stage 1 - RF)")
plt.tight_layout()
plt.show()

ValueError: at least one array or dtype is required