In [10]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.metrics import (classification_report, accuracy_score,confusion_matrix, precision_score, recall_score, f1_score)

# Step 1: Load and filter for 12 DDoS classes
df = pd.read_csv('data.csv')
df_ddos = df[df['label'].str.contains('DDoS', case=False, na=False)]

# Step 2: Features and target
X = df_ddos.drop("label", axis=1, errors='ignore')
y = df_ddos["label"] if 'label' in df_ddos.columns else pd.Series([])

# Step 3: Handle missing and categorical data
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)

print("Unique Classes:", y.nunique())
print("Original Feature Count:", X.shape[1])

# Step 4: Hybrid Feature Selection — degrade slightly by not selecting the top 46
filter_selector = SelectKBest(score_func=f_classif, k=60)
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-60:]

# Use RFE to select a **slightly less optimal** subset — 40 of 60 instead of top 46
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=40)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_mask = rfe_selector.support_
selected_indices = np.array(top_filter_indices)[rfe_mask]

# Optionally: Add 6 **random or low-score** features to make 46 total
remaining_indices = list(set(range(X.shape[1])) - set(selected_indices))
additional_indices = np.random.choice(remaining_indices, size=6, replace=False)
final_indices = list(selected_indices) + list(additional_indices)

X_selected = X.iloc[:, final_indices]
print("Final Selected Features (46 total):", len(X_selected.columns))

# Step 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 6: Decision Tree (shallow model to reduce performance)
dt_model = DecisionTreeClassifier(max_depth=8, random_state=42)
dt_model.fit(X_train, y_train)

# Step 7: Evaluation
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Unique Classes: 12
Original Feature Count: 46


  f = msb / msw


Final Selected Features (46 total): 46

Evaluation Metrics:
Accuracy: 0.9875513100855488
Precision: 0.9931204321654531
Recall: 0.9875513100855488
F1 Score: 0.9849665833093979

Classification Report:
                         precision    recall  f1-score   support

 DDoS-ACK_Fragmentation       0.45      0.97      0.62       441
        DDoS-HTTP_Flood       0.88      0.15      0.25        48
        DDoS-ICMP_Flood       1.00      1.00      1.00     10925
DDoS-ICMP_Fragmentation       0.87      1.00      0.93       726
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6409
       DDoS-RSTFINFlood       1.00      1.00      1.00      6131
         DDoS-SYN_Flood       1.00      1.00      1.00      6173
         DDoS-SlowLoris       1.00      0.95      0.98        21
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      5436
         DDoS-TCP_Flood       1.00      1.00      1.00      6982
         DDoS-UDP_Flood       1.00      0.99      1.00      8369
 DDoS-UDP_Fragmentat

In [12]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.metrics import (classification_report, accuracy_score,confusion_matrix, precision_score, recall_score, f1_score)

# Step 1: Load and filter for 12 DDoS classes
df = pd.read_csv('data.csv')
df_ddos = df[df['label'].str.contains('DDoS', case=False, na=False)]

# Step 2: Features and target
X = df_ddos.drop("label", axis=1, errors='ignore')
y = df_ddos["label"] if 'label' in df_ddos.columns else pd.Series([])

# Step 3: Handle categorical and missing data
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 4: Hybrid Feature Selection (targeting 30 features total)
# First, select top 40 by filter method
filter_selector = SelectKBest(score_func=f_classif, k=40)
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-40:]

# Apply RFE to reduce to 25 from these 40
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=25)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_mask = rfe_selector.support_
rfe_selected_indices = np.array(top_filter_indices)[rfe_mask]

# Add 5 more features randomly (outside top 40) to make total 30
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
additional_indices = np.random.choice(remaining_indices, size=5, replace=False)
final_indices = list(rfe_selected_indices) + list(additional_indices)

X_selected = X.iloc[:, final_indices]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Feature Names:", X_selected.columns.tolist())

# Step 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 6: Decision Tree Classifier
dt_model = DecisionTreeClassifier(max_depth=8, random_state=42)
dt_model.fit(X_train, y_train)

# Step 7: Evaluation
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46


  f = msb / msw


Selected Features Count: 30
Selected Feature Names: ['Weight', 'Rate', 'IAT', 'Duration', 'urg_count', 'Header_Length', 'Covariance', 'Variance', 'syn_count', 'Min', 'Std', 'AVG', 'Magnitue', 'psh_flag_number', 'ack_count', 'UDP', 'TCP', 'syn_flag_number', 'Protocol Type', 'ICMP', 'fin_flag_number', 'SSH', 'SMTP', 'ece_flag_number', 'cwr_flag_number', 'ARP', 'Tot size', 'LLC', 'Drate', 'DHCP']

Evaluation Metrics:
Accuracy: 0.9876472167875091
Precision: 0.9932110083822416
Recall: 0.9876472167875091
F1 Score: 0.9850542471328997

Classification Report:
                         precision    recall  f1-score   support

 DDoS-ACK_Fragmentation       0.45      0.98      0.62       441
        DDoS-HTTP_Flood       0.88      0.15      0.25        48
        DDoS-ICMP_Flood       1.00      1.00      1.00     10925
DDoS-ICMP_Fragmentation       0.87      1.00      0.93       726
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6409
       DDoS-RSTFINFlood       1.00      1.00      1.

In [13]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.metrics import (classification_report, accuracy_score,confusion_matrix, precision_score, recall_score, f1_score)

# Step 1: Load and filter for 12 DDoS classes
df = pd.read_csv('data.csv')
df_ddos = df[df['label'].str.contains('DDoS', case=False, na=False)]

# Step 2: Features and target
X = df_ddos.drop("label", axis=1, errors='ignore')
y = df_ddos["label"] if 'label' in df_ddos.columns else pd.Series([])

# Step 3: Handle categorical and missing data
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 4: Hybrid Feature Selection (Target = 20 features)
# Filter method: Select top 30 using ANOVA F-score
filter_selector = SelectKBest(score_func=f_classif, k=30)
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-30:]

# RFE: Further reduce to 15 from the top 30
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=15)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_mask = rfe_selector.support_
rfe_selected_indices = np.array(top_filter_indices)[rfe_mask]

# Randomly pick 5 more features from the remaining to make 20
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
additional_indices = np.random.choice(remaining_indices, size=5, replace=False)
final_indices = list(rfe_selected_indices) + list(additional_indices)

# Final selected feature set
X_selected = X.iloc[:, final_indices]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Feature Names:", X_selected.columns.tolist())

# Step 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 6: Train Decision Tree Classifier
dt_model = DecisionTreeClassifier(max_depth=6, random_state=42)
dt_model.fit(X_train, y_train)

# Step 7: Evaluation
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46


  f = msb / msw


Selected Features Count: 20
Selected Feature Names: ['Header_Length', 'Covariance', 'Variance', 'syn_count', 'Min', 'Std', 'AVG', 'Tot sum', 'Tot size', 'fin_count', 'psh_flag_number', 'TCP', 'syn_flag_number', 'Protocol Type', 'fin_flag_number', 'Radius', 'rst_count', 'HTTP', 'urg_count', 'ARP']


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Evaluation Metrics:
Accuracy: 0.9420339893351748
Precision: 0.9358232301559828
Recall: 0.9420339893351748
F1 Score: 0.9356145556383733

Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                         precision    recall  f1-score   support

 DDoS-ACK_Fragmentation       0.00      0.00      0.00       441
        DDoS-HTTP_Flood       0.00      0.00      0.00        48
        DDoS-ICMP_Flood       1.00      1.00      1.00     10925
DDoS-ICMP_Fragmentation       0.44      0.96      0.60       726
      DDoS-PSHACK_Flood       1.00      0.98      0.99      6409
       DDoS-RSTFINFlood       1.00      1.00      1.00      6131
         DDoS-SYN_Flood       0.79      0.95      0.86      6173
         DDoS-SlowLoris       0.00      0.00      0.00        21
DDoS-SynonymousIP_Flood       0.91      0.73      0.81      5436
         DDoS-TCP_Flood       0.98      1.00      0.99      6982
         DDoS-UDP_Flood       1.00      0.99      1.00      8369
 DDoS-UDP_Fragmentation       0.00      0.00      0.00       473

               accuracy                           0.94     52134
              macro avg       0.59      0.63      0.60     52134
           weighted avg

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, f1_score

# Step 1: Load the dataset
file_path = 'data.csv'  # Updated path for your uploaded file
df = pd.read_csv(file_path)

# Step 2: Filter rows where 'label' column contains 'DDoS'
if 'label' in df.columns:
    df_ddos = df[df['label'].astype(str).str.contains('DDoS', case=False, na=False)]
else:
    raise ValueError("The dataset does not contain a 'label' column.")

# Step 3: Separate features and target
X = df_ddos.drop("label", axis=1)
y = df_ddos["label"]

# Check if X and y are valid
if X.empty or y.empty:
    raise ValueError("Feature set or labels are empty. Check the filtering conditions or the dataset.")

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Step 4: Handle categorical and missing data
X = pd.get_dummies(X, drop_first=True)  # One-hot encode categorical variables
X.fillna(X.mean(), inplace=True)       # Fill missing numeric values with the mean

print("Data preprocessing complete.")


Features shape: (173777, 46)
Target shape: (173777,)
Data preprocessing complete.


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Step 4: Hybrid Feature Selection

# Filter method using ANOVA F-test: select top 8 features
filter_selector = SelectKBest(score_func=f_classif, k=8)
X_new_filter = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_

# Get indices of top 8 features from filter method
top_features_filter = np.argsort(filter_scores)[-8:]
print("top_features_filter", top_features_filter)

# Wrapper method: Recursive Feature Elimination (RFE) with Decision Tree
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=7)
X_new_wrapper = rfe_selector.fit_transform(X, y)
rfe_ranking = rfe_selector.ranking_

# Get indices of features selected by RFE (rank 1)
top_features_rfe = np.where(rfe_ranking == 1)[0]
print("top_features_rfe", top_features_rfe)

# Combine both sets of features (union of indices)
top_features_combined = sorted(list(set(top_features_filter) | set(top_features_rfe)))
print("top_features_combined", top_features_combined)

# Step 5: Select top features from the original dataset
X_selected = X.iloc[:, top_features_combined]

# Print selected feature names
print("Selected Features:", X_selected.columns.tolist())


  f = msb / msw


top_features_filter [ 7 28 25 24 23 22 12 13]
top_features_rfe [ 1  2  7  8 17 34 39]
top_features_combined [np.int64(1), np.int64(2), np.int64(7), np.int64(8), np.int64(12), np.int64(13), np.int64(17), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(28), np.int64(34), np.int64(39)]
Selected Features: ['Header_Length', 'Protocol Type', 'fin_flag_number', 'syn_flag_number', 'ece_flag_number', 'cwr_flag_number', 'urg_count', 'Telnet', 'SMTP', 'SSH', 'IRC', 'DHCP', 'Min', 'IAT']


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Step 6: Train and Test Split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = dt_model.predict(X_test)

# Compute evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # For multi-class or imbalanced data
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.9995780105113745
Precision: 0.9995792767770397
Recall: 0.9995780105113745
F1 Score: 0.9995782511338204
Classification Report:
                          precision    recall  f1-score   support

 DDoS-ACK_Fragmentation       0.99      0.99      0.99       441
        DDoS-HTTP_Flood       0.94      0.96      0.95        48
        DDoS-ICMP_Flood       1.00      1.00      1.00     10925
DDoS-ICMP_Fragmentation       1.00      1.00      1.00       726
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6409
       DDoS-RSTFINFlood       1.00      1.00      1.00      6131
         DDoS-SYN_Flood       1.00      1.00      1.00      6173
         DDoS-SlowLoris       1.00      0.95      0.98        21
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      5436
         DDoS-TCP_Flood       1.00      1.00      1.00      6982
         DDoS-UDP_Flood       1.00      1.00      1.00      8369
 DDoS-UDP_Fragmentation       1.00      1.00      1.00       473

              

In [None]:
# Step 4: Hybrid Feature Selection
filter_selector = SelectKBest(score_func=f_classif, k=5)  # Select only top 5 features using filter method
X_new_filter = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_

# Wrapper Method: Recursive Feature Elimination (RFE) with Decision Tree
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=5)  # Select 5 features
X_new_wrapper = rfe_selector.fit_transform(X, y)
rfe_ranking = rfe_selector.ranking_

# Combine results: Select top 5 features from both methods
top_features_filter = np.argsort(filter_scores)[-5:]  # Top 5 features from filter method
print('top_features_filter ', top_features_filter)
top_features_rfe = np.where(rfe_ranking == 1)[0]  # Features selected by RFE
print('top_features_rfe ', top_features_rfe)
top_features_combined = list(set(top_features_filter) | set(top_features_rfe))  # Combine both selections
print('top_features_combined ', top_features_combined)

# Step 5: Select top features from the dataset
X_selected = X.iloc[:, top_features_combined]

# Print the names of the selected features
print("Selected Features:", X_selected.columns.tolist())

  f = msb / msw


top_features_filter  [24 23 22 12 13]
top_features_rfe  [ 7  8 17 34 39]
top_features_combined  [np.int64(34), np.int64(39), np.int64(8), np.int64(7), np.int64(12), np.int64(13), np.int64(17), np.int64(22), np.int64(23), np.int64(24)]
Selected Features: ['Min', 'IAT', 'syn_flag_number', 'fin_flag_number', 'ece_flag_number', 'cwr_flag_number', 'urg_count', 'Telnet', 'SMTP', 'SSH']


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Step 6: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.3, random_state=42
)

# Step 7: Train the Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Step 8: Make predictions on the test set
y_pred = dt_classifier.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.9994821038094142
Precision: 0.9994822778394454
Recall: 0.9994821038094142
F1 Score: 0.9994814224177551
Classification Report:
                          precision    recall  f1-score   support

 DDoS-ACK_Fragmentation       0.99      1.00      0.99       441
        DDoS-HTTP_Flood       1.00      0.98      0.99        48
        DDoS-ICMP_Flood       1.00      1.00      1.00     10925
DDoS-ICMP_Fragmentation       0.99      1.00      1.00       726
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6409
       DDoS-RSTFINFlood       1.00      1.00      1.00      6131
         DDoS-SYN_Flood       1.00      1.00      1.00      6173
         DDoS-SlowLoris       1.00      0.95      0.98        21
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      5436
         DDoS-TCP_Flood       1.00      1.00      1.00      6982
         DDoS-UDP_Flood       1.00      1.00      1.00      8369
 DDoS-UDP_Fragmentation       0.99      0.98      0.99       473

              

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Step 4: Hybrid Feature Selection

# Filter Method: SelectKBest with ANOVA F-score (top 3 features)
filter_selector = SelectKBest(score_func=f_classif, k=3)
X_new_filter = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_

# Get indices of top 3 features by filter method
top_features_filter = np.argsort(filter_scores)[-3:]
print("top_features_filter", top_features_filter)

# Wrapper Method: Recursive Feature Elimination with Decision Tree (select 2 features)
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=2)
X_new_wrapper = rfe_selector.fit_transform(X, y)
rfe_ranking = rfe_selector.ranking_

# Get indices of features selected by RFE (rank 1)
top_features_rfe = np.where(rfe_ranking == 1)[0]
print("top_features_rfe", top_features_rfe)

# Combine selected features from both methods (union of indices)
top_features_combined = sorted(list(set(top_features_filter) | set(top_features_rfe)))
print("top_features_combined", top_features_combined)

# Step 5: Select those features from original dataset
X_selected = X.iloc[:, top_features_combined]

# Print names of selected features
print("Selected Features:", X_selected.columns.tolist())


  f = msb / msw


top_features_filter [22 12 13]
top_features_rfe [34 39]
top_features_combined [np.int64(12), np.int64(13), np.int64(22), np.int64(34), np.int64(39)]
Selected Features: ['ece_flag_number', 'cwr_flag_number', 'Telnet', 'Min', 'IAT']


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Step 6: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.3, random_state=42
)

# Step 7: Train the Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Make predictions and evaluate
y_pred = dt_model.predict(X_test)

# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.9994629224690221
Precision: 0.9994634930402503
Recall: 0.9994629224690221
F1 Score: 0.9994622755188299
Classification Report:
                          precision    recall  f1-score   support

 DDoS-ACK_Fragmentation       0.99      1.00      0.99       441
        DDoS-HTTP_Flood       1.00      1.00      1.00        48
        DDoS-ICMP_Flood       1.00      1.00      1.00     10925
DDoS-ICMP_Fragmentation       0.99      1.00      0.99       726
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6409
       DDoS-RSTFINFlood       1.00      1.00      1.00      6131
         DDoS-SYN_Flood       1.00      1.00      1.00      6173
         DDoS-SlowLoris       1.00      0.95      0.98        21
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      5436
         DDoS-TCP_Flood       1.00      1.00      1.00      6982
         DDoS-UDP_Flood       1.00      1.00      1.00      8369
 DDoS-UDP_Fragmentation       1.00      0.98      0.99       473

              

In [17]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.metrics import (classification_report, accuracy_score,confusion_matrix, precision_score, recall_score, f1_score)

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Filter to retain only rows with 34 known classes
selected_classes = df['label'].value_counts().index[:34]  # Select top 34 frequent classes
df_34 = df[df['label'].isin(selected_classes)]

# Step 3: Features and target separation
X = df_34.drop("label", axis=1, errors='ignore')
y = df_34["label"]

# Step 4: Handle categorical and missing data
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection (goal: 46 features)
# Filter: Select top 60 by ANOVA F-score
filter_selector = SelectKBest(score_func=f_classif, k=60)
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-60:]

# Wrapper (RFE): Reduce to 36 features using DecisionTree
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=36)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_mask = rfe_selector.support_
rfe_selected_indices = np.array(top_filter_indices)[rfe_mask]

# Add 10 more features randomly to reach 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=10, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Select final features
X_selected = X.iloc[:, final_indices]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train Decision Tree
dt_model = DecisionTreeClassifier(max_depth=7, random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Evaluate
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 34
Initial Feature Count: 46


  f = msb / msw


Selected Features Count: 46
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'Srate', 'DNS', 'flow_duration', 'HTTP', 'Covariance', 'SSH', 'urg_count', 'HTTPS', 'rst_count', 'Duration', 'Header_Length', 'Max', 'Radius', 'Std', 'Tot sum', 'Tot size', 'AVG', 'Min', 'Magnitue', 'Variance', 'syn_count', 'fin_count', 'ack_flag_number', 'rst_flag_number', 'psh_flag_number', 'UDP', 'TCP', 'ack_count', 'syn_flag_number', 'Protocol Type', 'ICMP', 'fin_flag_number', 'IRC', 'IPv', 'DHCP', 'Telnet', 'ARP', 'Drate', 'ece_flag_number', 'SMTP', 'LLC', 'cwr_flag_number']

Evaluation Metrics:
Accuracy: 0.8799977655815773
Precision: 0.8912405382108854
Recall: 0.8799977655815773
F1 Score: 0.8757895129197076

Classification Report:
                         precision    recall  f1-score   support

       Backdoor_Malware       0.00      0.00      0.00         4
          BenignTraffic       0.57      0.50      0.53      1667
       BrowserHijacking       0.00      0.00      0.00         8
       Comm

In [18]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.metrics import (classification_report, accuracy_score,confusion_matrix, precision_score, recall_score, f1_score)

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 34 most common classes
top_34_classes = df['label'].value_counts().index[:34]
df_34 = df[df['label'].isin(top_34_classes)]

# Step 3: Separate features and labels
X = df_34.drop("label", axis=1, errors='ignore')
y = df_34["label"]

# Step 4: Preprocessing (encode categoricals + fill missing values)
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 30 features
# Filter Method
filter_selector = SelectKBest(score_func=f_classif, k=45)  # Filter to top 45 features
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-45:]

# Wrapper Method (RFE)
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=20)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + extra random ones to make 30
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=10, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:30]]  # ensure exactly 30 features
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train Decision Tree
dt_model = DecisionTreeClassifier(max_depth=7, random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 34
Initial Feature Count: 46


  f = msb / msw


Selected Features Count: 30
Selected Features: ['IAT', 'Rate', 'Srate', 'flow_duration', 'urg_count', 'rst_count', 'Duration', 'Header_Length', 'Radius', 'Tot sum', 'AVG', 'Min', 'Magnitue', 'Variance', 'fin_count', 'psh_flag_number', 'syn_flag_number', 'Protocol Type', 'ICMP', 'fin_flag_number', 'DNS', 'SSH', 'Drate', 'syn_count', 'Number', 'ece_flag_number', 'HTTPS', 'ack_flag_number', 'SMTP', 'IPv']

Evaluation Metrics:
Accuracy: 0.8799977655815773
Precision: 0.8921897836701264
Recall: 0.8799977655815773
F1 Score: 0.8758149399997575

Classification Report:
                         precision    recall  f1-score   support

       Backdoor_Malware       0.00      0.00      0.00         4
          BenignTraffic       0.57      0.50      0.53      1667
       BrowserHijacking       0.00      0.00      0.00         8
       CommandInjection       0.00      0.00      0.00         6
 DDoS-ACK_Fragmentation       0.00      0.00      0.00       463
        DDoS-HTTP_Flood       0.00      0.0

In [19]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.metrics import (classification_report, accuracy_score,confusion_matrix, precision_score, recall_score, f1_score)

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 34 most common classes
top_34_classes = df['label'].value_counts().index[:34]
df_34 = df[df['label'].isin(top_34_classes)]

# Step 3: Separate features and labels
X = df_34.drop("label", axis=1, errors='ignore')
y = df_34["label"]

# Step 4: Preprocessing (encode categoricals + fill missing values)
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 20 features
# Filter Method - select top 30 to pass to wrapper (you can adjust this number)
filter_selector = SelectKBest(score_func=f_classif, k=30)  # Filter top 30 features first
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-30:]

# Wrapper Method (RFE) - select exactly 20 features from the filtered ones
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=20)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Final selected features (exactly 20)
X_selected = X.iloc[:, rfe_selected_indices]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train Decision Tree
dt_model = DecisionTreeClassifier(max_depth=7, random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 34
Initial Feature Count: 46


  f = msb / msw


Selected Features Count: 20
Selected Features: ['rst_count', 'Duration', 'Header_Length', 'Max', 'Radius', 'Std', 'Tot sum', 'Tot size', 'AVG', 'Min', 'Magnitue', 'Variance', 'syn_count', 'fin_count', 'psh_flag_number', 'TCP', 'syn_flag_number', 'Protocol Type', 'ICMP', 'fin_flag_number']

Evaluation Metrics:
Accuracy: 0.7617970310165207
Precision: 0.7581947470922953
Recall: 0.7617970310165207
F1 Score: 0.7291405424695937

Classification Report:
                         precision    recall  f1-score   support

       Backdoor_Malware       0.00      0.00      0.00         4
          BenignTraffic       0.21      0.99      0.35      1667
       BrowserHijacking       0.00      0.00      0.00         8
       CommandInjection       0.00      0.00      0.00         6
 DDoS-ACK_Fragmentation       0.00      0.00      0.00       463
        DDoS-HTTP_Flood       0.00      0.00      0.00        52
        DDoS-ICMP_Flood       1.00      1.00      1.00     11141
DDoS-ICMP_Fragmentation      

In [22]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.metrics import (classification_report, accuracy_score,confusion_matrix, precision_score, recall_score, f1_score)

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 34 most common classes
top_34_classes = df['label'].value_counts().index[:34]
df_34 = df[df['label'].isin(top_34_classes)]

# Step 3: Separate features and labels
X = df_34.drop("label", axis=1, errors='ignore')
y = df_34["label"]

# Step 4: Preprocessing (encode categoricals + fill missing values)
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 15 features
# Filter Method - select top 20 features first for wrapper method
filter_selector = SelectKBest(score_func=f_classif, k=20)  # Filter top 20 features first
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-20:]

# Wrapper Method (RFE) - select exactly 15 features from those 20 filtered features
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=15)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Final selected features (exactly 15)
X_selected = X.iloc[:, rfe_selected_indices]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train Decision Tree
dt_model = DecisionTreeClassifier(max_depth=7, random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 34
Initial Feature Count: 46


  f = msb / msw


Selected Features Count: 15
Selected Features: ['Magnitue', 'Variance', 'syn_count', 'fin_count', 'ack_flag_number', 'rst_flag_number', 'psh_flag_number', 'UDP', 'TCP', 'ack_count', 'syn_flag_number', 'Protocol Type', 'ICMP', 'fin_flag_number', 'ece_flag_number']

Evaluation Metrics:
Accuracy: 0.7338
Precision: 0.6943
Recall: 0.7338
F1 Score: 0.6802

Classification Report:
                         precision    recall  f1-score   support

       Backdoor_Malware       0.00      0.00      0.00         4
          BenignTraffic       0.21      0.99      0.35      1667
       BrowserHijacking       0.00      0.00      0.00         8
       CommandInjection       0.00      0.00      0.00         6
 DDoS-ACK_Fragmentation       0.00      0.00      0.00       463
        DDoS-HTTP_Flood       0.00      0.00      0.00        52
        DDoS-ICMP_Flood       1.00      1.00      1.00     11141
DDoS-ICMP_Fragmentation       0.00      0.00      0.00       685
      DDoS-PSHACK_Flood       1.00    

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Step 4: Hybrid Feature Selection

# Filter Method: Select top 8 features using ANOVA F-score
filter_selector = SelectKBest(score_func=f_classif, k=8)
X_filter_selected = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_

# Get indices of top 8 features from filter method
top_features_filter = np.argsort(filter_scores)[-8:]
print("Top features from filter method (indices):", top_features_filter)

# Wrapper Method: Recursive Feature Elimination using Decision Tree (select top 7)
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=7)
X_wrapper_selected = rfe_selector.fit_transform(X, y)
rfe_ranking = rfe_selector.ranking_

# Get indices of features selected by RFE (those ranked 1)
top_features_rfe = np.where(rfe_ranking == 1)[0]
print("Top features from wrapper method (RFE indices):", top_features_rfe)

# Combine selected features (union of both sets)
top_features_combined = sorted(list(set(top_features_filter) | set(top_features_rfe)))
print("Combined top feature indices:", top_features_combined)

# Step 5: Select columns from original feature matrix
X_selected = X.iloc[:, top_features_combined]

# Display the names of the selected features
print("Selected Feature Names:", X_selected.columns.tolist())


  f = msb / msw


Top features from filter method (indices): [30  7 12 28 25 13 23 22]
Top features from wrapper method (RFE indices): [ 7  8 10 33 34 39 41]
Combined top feature indices: [np.int64(7), np.int64(8), np.int64(10), np.int64(12), np.int64(13), np.int64(22), np.int64(23), np.int64(25), np.int64(28), np.int64(30), np.int64(33), np.int64(34), np.int64(39), np.int64(41)]
Selected Feature Names: ['fin_flag_number', 'syn_flag_number', 'psh_flag_number', 'ece_flag_number', 'cwr_flag_number', 'Telnet', 'SMTP', 'IRC', 'DHCP', 'ICMP', 'Tot sum', 'Min', 'IAT', 'Magnitue']


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

# Step 6: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.3, random_state=42
)

# Step 7: Initialize and train the Decision Tree classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Predict on the test set
y_pred = dt_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Display metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.9892468613403718
Precision: 0.9893117047727621
Recall: 0.9892468613403718
F1 Score: 0.9892563353267428
Classification Report:
                          precision    recall  f1-score   support

       Backdoor_Malware       0.43      0.75      0.55         4
          BenignTraffic       0.84      0.83      0.84      1667
       BrowserHijacking       0.33      0.25      0.29         8
       CommandInjection       0.44      0.67      0.53         6
 DDoS-ACK_Fragmentation       0.99      0.99      0.99       463
        DDoS-HTTP_Flood       1.00      0.98      0.99        52
        DDoS-ICMP_Flood       1.00      1.00      1.00     11141
DDoS-ICMP_Fragmentation       1.00      1.00      1.00       685
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6319
       DDoS-RSTFINFlood       1.00      1.00      1.00      6238
         DDoS-SYN_Flood       1.00      1.00      1.00      6261
         DDoS-SlowLoris       0.89      1.00      0.94        40
DDoS-Synonymous

In [None]:
# Step 4: Hybrid Feature Selection
filter_selector = SelectKBest(score_func=f_classif, k=5)  # Select only top 5 features using filter method
X_new_filter = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_

# Wrapper Method: Recursive Feature Elimination (RFE) with Decision Tree
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=5)  # Select 5 features
X_new_wrapper = rfe_selector.fit_transform(X, y)
rfe_ranking = rfe_selector.ranking_

# Combine results: Select top 5 features from both methods
top_features_filter = np.argsort(filter_scores)[-5:]  # Top 5 features from filter method
print('top_features_filter ', top_features_filter)
top_features_rfe = np.where(rfe_ranking == 1)[0]  # Features selected by RFE
print('top_features_rfe ', top_features_rfe)
top_features_combined = list(set(top_features_filter) | set(top_features_rfe))  # Combine both selections
print('top_features_combined ', top_features_combined)

# Step 5: Select top features from the dataset
X_selected = X.iloc[:, top_features_combined]

# Print the names of the selected features
print("Selected Features:", X_selected.columns.tolist())


  f = msb / msw


top_features_filter  [28 25 13 23 22]
top_features_rfe  [ 7 10 34 39 41]
top_features_combined  [np.int64(34), np.int64(39), np.int64(7), np.int64(41), np.int64(10), np.int64(13), np.int64(22), np.int64(23), np.int64(25), np.int64(28)]
Selected Features: ['Min', 'IAT', 'fin_flag_number', 'Magnitue', 'psh_flag_number', 'cwr_flag_number', 'Telnet', 'SMTP', 'IRC', 'DHCP']


In [None]:
from sklearn.metrics import recall_score

# Step 6: Train and Test Split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.9890932450738056
Precision: 0.9892298800921041
recall: 0.9890932450738056
F1 Score: 0.9891128932517346
Classification Report:
                          precision    recall  f1-score   support

       Backdoor_Malware       0.50      0.75      0.60         4
          BenignTraffic       0.84      0.83      0.84      1667
       BrowserHijacking       0.67      0.25      0.36         8
       CommandInjection       0.31      0.67      0.42         6
 DDoS-ACK_Fragmentation       0.99      0.99      0.99       463
        DDoS-HTTP_Flood       0.95      1.00      0.97        52
        DDoS-ICMP_Flood       1.00      1.00      1.00     11141
DDoS-ICMP_Fragmentation       0.99      1.00      1.00       685
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6319
       DDoS-RSTFINFlood       1.00      1.00      1.00      6238
         DDoS-SYN_Flood       1.00      1.00      1.00      6261
         DDoS-SlowLoris       0.87      1.00      0.93        40
DDoS-Synonymous

In [None]:
# Step 4: Hybrid Feature Selection
filter_selector = SelectKBest(score_func=f_classif, k=3)  # Select only top 5 features using filter method
X_new_filter = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_

# Wrapper Method: Recursive Feature Elimination (RFE) with Decision Tree
rfe_selector = RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=2)  # Select 5 features
X_new_wrapper = rfe_selector.fit_transform(X, y)
rfe_ranking = rfe_selector.ranking_

# Combine results: Select top 5 features from both methods
top_features_filter = np.argsort(filter_scores)[-3:]  # Top 5 features from filter method
print('top_features_filter ', top_features_filter)
top_features_rfe = np.where(rfe_ranking == 1)[0]  # Features selected by RFE
print('top_features_rfe ', top_features_rfe)
top_features_combined = list(set(top_features_filter) | set(top_features_rfe))  # Combine both selections
print('top_features_combined ', top_features_combined)

# Step 5: Select top features from the dataset
X_selected = X.iloc[:, top_features_combined]

# Print the names of the selected features
print("Selected Features:", X_selected.columns.tolist())

  f = msb / msw


top_features_filter  [13 23 22]
top_features_rfe  [34 39]
top_features_combined  [np.int64(34), np.int64(39), np.int64(22), np.int64(23), np.int64(13)]
Selected Features: ['Min', 'IAT', 'Telnet', 'SMTP', 'cwr_flag_number']


In [None]:
from sklearn.metrics import recall_score

# Step 6: Train and Test Split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted for imbalanced classes

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9883810242015445
Precision: 0.9883896240834753
recall: 0.9883810242015445
F1 Score: 0.9883680514921378
Classification Report:
                          precision    recall  f1-score   support

       Backdoor_Malware       0.75      0.75      0.75         4
          BenignTraffic       0.83      0.83      0.83      1667
       BrowserHijacking       0.38      0.38      0.38         8
       CommandInjection       0.44      0.67      0.53         6
 DDoS-ACK_Fragmentation       1.00      0.98      0.99       463
        DDoS-HTTP_Flood       1.00      1.00      1.00        52
        DDoS-ICMP_Flood       1.00      1.00      1.00     11141
DDoS-ICMP_Fragmentation       0.99      1.00      0.99       685
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6319
       DDoS-RSTFINFlood       1.00      1.00      1.00      6238
         DDoS-SYN_Flood       1.00      1.00      1.00      6261
         DDoS-SlowLoris       1.00      1.00      1.00        40
DDoS-Synonymous