In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, RFE, SelectKBest
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.preprocessing import MinMaxScaler

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Preprocessing (encode categoricals + fill missing values + scale for chi2)
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # chi2 requires non-negative values

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
filter_selector = SelectKBest(score_func=chi2, k=60)  # First filter to top 60 features
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-60:]

# Wrapper Method - RFE using Random Forest
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:46]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46




Selected Features Count: 46
Selected Features: ['Weight', 'IAT', 'Rate', 'Srate', 'Duration', 'fin_count', 'flow_duration', 'Max', 'Tot sum', 'Radius', 'Std', 'AVG', 'Tot size', 'Header_Length', 'Magnitue', 'urg_count', 'rst_count', 'Min', 'Protocol Type', 'syn_count', 'Variance', 'TCP', 'ack_count', 'syn_flag_number', 'UDP', 'ack_flag_number', 'ICMP', 'rst_flag_number', 'psh_flag_number', 'fin_flag_number', 'Telnet', 'DHCP', 'Covariance', 'IRC', 'cwr_flag_number', 'Drate', 'ece_flag_number', 'Number', 'IPv', 'LLC', 'ARP', 'HTTPS', 'DNS', 'SSH', 'SMTP', 'HTTP']

Evaluation Metrics:
Accuracy: 0.9321862348178138
Precision: 0.9435307747541396
Recall: 0.9321862348178138
F1 Score: 0.9274462744983396

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       0.94      1.00      0.97       120
        DDoS-ICMP_Flood       1.00      1.00      1.00       844
      DDoS-PSHACK_Flood       1.00      1.00      1.00       495
       DDoS

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, RFE, SelectKBest
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.preprocessing import MinMaxScaler

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Preprocessing (encode categoricals + fill missing values + scale for chi2)
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # chi2 requires non-negative values

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
filter_selector = SelectKBest(score_func=chi2, k=60)  # First filter to top 60 features
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-60:]

# Wrapper Method - RFE using Random Forest
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:40]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46




Selected Features Count: 40
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'Srate', 'Covariance', 'Duration', 'flow_duration', 'Tot size', 'Max', 'Tot sum', 'urg_count', 'fin_count', 'AVG', 'Min', 'Header_Length', 'Magnitue', 'rst_count', 'syn_count', 'Protocol Type', 'Variance', 'TCP', 'ack_count', 'syn_flag_number', 'UDP', 'ack_flag_number', 'ICMP', 'rst_flag_number', 'psh_flag_number', 'fin_flag_number', 'ARP', 'IPv', 'LLC', 'DNS', 'ece_flag_number', 'Telnet', 'cwr_flag_number', 'Drate', 'HTTPS', 'DHCP']

Evaluation Metrics:
Accuracy: 0.9379676709417926
Precision: 0.9484031015063161
Recall: 0.9379676709417926
F1 Score: 0.9346536998717743

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       0.94      1.00      0.97      1748
        DDoS-ICMP_Flood       1.00      1.00      1.00     11003
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6237
       DDoS-RSTFINFlood       1.00      1.00      1.00    

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, RFE, SelectKBest
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.preprocessing import MinMaxScaler

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Preprocessing (encode categoricals + fill missing values + scale for chi2)
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # chi2 requires non-negative values

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
filter_selector = SelectKBest(score_func=chi2, k=60)  # First filter to top 60 features
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-60:]

# Wrapper Method - RFE using Random Forest
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:30]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46




Selected Features Count: 30
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'Srate', 'Covariance', 'Duration', 'flow_duration', 'Tot size', 'Max', 'Tot sum', 'urg_count', 'fin_count', 'AVG', 'Min', 'Header_Length', 'Magnitue', 'rst_count', 'syn_count', 'Protocol Type', 'Variance', 'TCP', 'ack_count', 'syn_flag_number', 'UDP', 'ack_flag_number', 'ICMP', 'rst_flag_number', 'psh_flag_number', 'fin_flag_number']

Evaluation Metrics:
Accuracy: 0.9406389921833498
Precision: 0.9507227173310414
Recall: 0.9406389921833498
F1 Score: 0.9376669634108029

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       0.94      1.00      0.97      1748
        DDoS-ICMP_Flood       1.00      1.00      1.00     11003
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6237
       DDoS-RSTFINFlood       1.00      1.00      1.00      6318
         DDoS-SYN_Flood       0.77      0.99      0.86      6195
DDoS-SynonymousIP_Flood       

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, RFE, SelectKBest
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.preprocessing import MinMaxScaler

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Preprocessing (encode categoricals + fill missing values + scale for chi2)
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # chi2 requires non-negative values

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
filter_selector = SelectKBest(score_func=chi2, k=60)  # First filter to top 60 features
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-80:]

# Wrapper Method - RFE using Random Forest
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:20]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46




Selected Features Count: 20
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'Srate', 'Covariance', 'Duration', 'flow_duration', 'Tot size', 'Max', 'Tot sum', 'urg_count', 'fin_count', 'AVG', 'Min', 'Header_Length', 'Magnitue', 'rst_count', 'syn_count', 'Protocol Type']

Evaluation Metrics:
Accuracy: 0.9534188358503453
Precision: 0.9591827140092944
Recall: 0.9534188358503453
F1 Score: 0.9503336940498361

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       0.93      1.00      0.96      1748
        DDoS-ICMP_Flood       1.00      1.00      1.00     11003
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6237
       DDoS-RSTFINFlood       1.00      1.00      1.00      6318
         DDoS-SYN_Flood       0.86      0.98      0.92      6195
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      5453
         DDoS-TCP_Flood       0.80      0.99      0.88      6875
         DDoS-UDP_Flood       1.00      1.

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, RFE, SelectKBest
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.preprocessing import MinMaxScaler

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Preprocessing (encode categoricals + fill missing values + scale for chi2)
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # chi2 requires non-negative values

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
filter_selector = SelectKBest(score_func=chi2, k=60)  # First filter to top 60 features
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-80:]

# Wrapper Method - RFE using Random Forest
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:15]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


(173777, 46)
(173777,)


  f = msb / msw


top_features_filter  [ 7 28 25 24 23 22 12 13]
top_features_rfe  [ 7 15 34 36 38 39 41]
top_features_combined  [np.int64(34), np.int64(36), np.int64(38), np.int64(7), np.int64(39), np.int64(41), np.int64(12), np.int64(13), np.int64(15), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(28)]
Selected Features: ['Min', 'AVG', 'Tot size', 'fin_flag_number', 'IAT', 'Magnitue', 'ece_flag_number', 'cwr_flag_number', 'syn_count', 'Telnet', 'SMTP', 'SSH', 'IRC', 'DHCP']
Accuracy: 0.9986764875129474
Precision: 0.9987169513691142
Recall: 0.9986764875129474
F1 Score: 0.9986889647506856
Classification Report:
                          precision    recall  f1-score   support

 DDoS-ACK_Fragmentation       0.99      0.98      0.99       441
        DDoS-HTTP_Flood       0.85      0.92      0.88        48
        DDoS-ICMP_Flood       1.00      1.00      1.00     10925
DDoS-ICMP_Fragmentation       0.98      0.99      0.99       726
      DDoS-PSHACK_Flood       1.00      1.00      1.0

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, RFE, SelectKBest
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.preprocessing import MinMaxScaler

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Preprocessing (encode categoricals + fill missing values + scale for chi2)
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # chi2 requires non-negative values

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
filter_selector = SelectKBest(score_func=chi2, k=60)  # First filter to top 60 features
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-60:]

# Wrapper Method - RFE using Random Forest
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:10]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46




Selected Features Count: 10
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'Srate', 'Covariance', 'Duration', 'flow_duration', 'Tot size', 'Max']

Evaluation Metrics:
Accuracy: 0.9314715033770965
Precision: 0.9474530517857356
Recall: 0.9314715033770965
F1 Score: 0.9281941011433205

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       0.96      1.00      0.98      1748
        DDoS-ICMP_Flood       1.00      1.00      1.00     11003
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6237
       DDoS-RSTFINFlood       0.90      1.00      0.95      6318
         DDoS-SYN_Flood       1.00      0.95      0.98      6195
DDoS-SynonymousIP_Flood       0.99      0.90      0.94      5453
         DDoS-TCP_Flood       0.66      0.98      0.79      6875
         DDoS-UDP_Flood       1.00      1.00      1.00      8430
          DoS-SYN_Flood       0.91      0.53      0.67      3047
          DoS-TCP_Flood       0.99 

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, RFE, SelectKBest
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.preprocessing import MinMaxScaler

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Preprocessing (encode categoricals + fill missing values + scale for chi2)
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # chi2 requires non-negative values

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
filter_selector = SelectKBest(score_func=chi2, k=60)  # First filter to top 60 features
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-60:]

# Wrapper Method - RFE using Random Forest
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:5]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46




Selected Features Count: 5
Selected Features: ['Number', 'IAT', 'Rate', 'Srate', 'Duration']

Evaluation Metrics:
Accuracy: 0.9519961692703657
Precision: 0.9558703326402113
Recall: 0.9519961692703657
F1 Score: 0.9426467705555313

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       1.00      1.00      1.00       448
        DDoS-ICMP_Flood       1.00      0.95      0.97      2828
      DDoS-PSHACK_Flood       1.00      1.00      1.00      1612
       DDoS-RSTFINFlood       0.77      0.99      0.86      1523
         DDoS-SYN_Flood       1.00      0.97      0.99      1549
DDoS-SynonymousIP_Flood       0.92      0.99      0.96      1419
         DDoS-TCP_Flood       1.00      1.00      1.00      1762
         DDoS-UDP_Flood       0.99      1.00      0.99      2113
          DoS-SYN_Flood       0.83      0.99      0.90       794
          DoS-TCP_Flood       0.98      1.00      0.99      1004
          DoS-UDP_Flood       1