In [None]:
pip install xgboost




In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, RFE, SelectKBest
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Encode labels and preprocess features
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Encode string labels into integers

X = pd.get_dummies(X, drop_first=True)  # Encode categorical features
X.fillna(X.mean(), inplace=True)  # Fill missing values
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # Scale features

print("Number of Classes:", np.unique(y_encoded).size)
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
k_filter = min(60, X.shape[1])  # Ensure k does not exceed number of features
filter_selector = SelectKBest(score_func=chi2, k=k_filter)
X_filtered = filter_selector.fit_transform(X, y_encoded)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-k_filter:]

# Wrapper Method - RFE using XGBoost
rfe_selector = RFE(estimator=XGBClassifier(n_estimators=100, use_label_encoder=False,
                                            eval_metric='mlogloss', random_state=42),
                   n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y_encoded)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:46]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_encoded, test_size=0.3, random_state=42)

# Step 7: Train XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, max_depth=7, use_label_encoder=False,
                          eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0, target_names=label_encoder.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Selected Features Count: 46
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'DNS', 'Duration', 'flow_duration', 'Tot size', 'Max', 'Tot sum', 'urg_count', 'HTTP', 'fin_count', 'AVG', 'Min', 'Header_Length', 'Magnitue', 'rst_count', 'syn_count', 'Protocol Type', 'HTTPS', 'Variance', 'TCP', 'ack_count', 'syn_flag_number', 'UDP', 'ICMP', 'rst_flag_number', 'psh_flag_number', 'fin_flag_number', 'ece_flag_number', 'Srate', 'Radius', 'IRC', 'Telnet', 'DHCP', 'LLC', 'cwr_flag_number', 'SSH', 'IPv', 'ack_flag_number', 'Std', 'SMTP', 'Covariance', 'Drate', 'ARP']


Parameters: { "use_label_encoder" } are not used.




Evaluation Metrics:
Accuracy: 0.9995446611520072
Precision: 0.9995449904548045
Recall: 0.9995446611520072
F1 Score: 0.9995446738907007

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       1.00      1.00      1.00      1748
        DDoS-ICMP_Flood       1.00      1.00      1.00     11003
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6237
       DDoS-RSTFINFlood       1.00      1.00      1.00      6318
         DDoS-SYN_Flood       1.00      1.00      1.00      6195
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      5453
         DDoS-TCP_Flood       1.00      1.00      1.00      6875
         DDoS-UDP_Flood       1.00      1.00      1.00      8430
          DoS-SYN_Flood       1.00      1.00      1.00      3047
          DoS-TCP_Flood       1.00      1.00      1.00      4012
          DoS-UDP_Flood       1.00      1.00      1.00      5059
     Mirai-greeth_flood       1.00      1.00      1.00     

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, RFE, SelectKBest
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Encode labels and preprocess features
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Encode string labels into integers

X = pd.get_dummies(X, drop_first=True)  # Encode categorical features
X.fillna(X.mean(), inplace=True)  # Fill missing values
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # Scale features

print("Number of Classes:", np.unique(y_encoded).size)
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
k_filter = min(60, X.shape[1])  # Ensure k does not exceed number of features
filter_selector = SelectKBest(score_func=chi2, k=k_filter)
X_filtered = filter_selector.fit_transform(X, y_encoded)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-k_filter:]

# Wrapper Method - RFE using XGBoost
rfe_selector = RFE(estimator=XGBClassifier(n_estimators=100, use_label_encoder=False,
                                            eval_metric='mlogloss', random_state=42),
                   n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y_encoded)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:30]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_encoded, test_size=0.3, random_state=42)

# Step 7: Train XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, max_depth=7, use_label_encoder=False,
                          eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0, target_names=label_encoder.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Selected Features Count: 30
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'DNS', 'Duration', 'flow_duration', 'Tot size', 'Max', 'Tot sum', 'urg_count', 'HTTP', 'fin_count', 'AVG', 'Min', 'Header_Length', 'Magnitue', 'rst_count', 'syn_count', 'Protocol Type', 'HTTPS', 'Variance', 'TCP', 'ack_count', 'syn_flag_number', 'UDP', 'ICMP', 'rst_flag_number', 'psh_flag_number', 'fin_flag_number']


Parameters: { "use_label_encoder" } are not used.




Evaluation Metrics:
Accuracy: 0.999559839113607
Precision: 0.9995602684102799
Recall: 0.999559839113607
F1 Score: 0.999559879983084

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       1.00      1.00      1.00      1748
        DDoS-ICMP_Flood       1.00      1.00      1.00     11003
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6237
       DDoS-RSTFINFlood       1.00      1.00      1.00      6318
         DDoS-SYN_Flood       1.00      1.00      1.00      6195
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      5453
         DDoS-TCP_Flood       1.00      1.00      1.00      6875
         DDoS-UDP_Flood       1.00      1.00      1.00      8430
          DoS-SYN_Flood       1.00      1.00      1.00      3047
          DoS-TCP_Flood       1.00      1.00      1.00      4012
          DoS-UDP_Flood       1.00      1.00      1.00      5059
     Mirai-greeth_flood       1.00      1.00      1.00      15

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, RFE, SelectKBest
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Encode labels and preprocess features
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Encode string labels into integers

X = pd.get_dummies(X, drop_first=True)  # Encode categorical features
X.fillna(X.mean(), inplace=True)  # Fill missing values
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # Scale features

print("Number of Classes:", np.unique(y_encoded).size)
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
k_filter = min(60, X.shape[1])  # Ensure k does not exceed number of features
filter_selector = SelectKBest(score_func=chi2, k=k_filter)
X_filtered = filter_selector.fit_transform(X, y_encoded)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-k_filter:]

# Wrapper Method - RFE using XGBoost
rfe_selector = RFE(estimator=XGBClassifier(n_estimators=100, use_label_encoder=False,
                                            eval_metric='mlogloss', random_state=42),
                   n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y_encoded)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:20]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_encoded, test_size=0.3, random_state=42)

# Step 7: Train XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, max_depth=7, use_label_encoder=False,
                          eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0, target_names=label_encoder.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Selected Features Count: 20
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'DNS', 'Duration', 'flow_duration', 'Tot size', 'Max', 'Tot sum', 'urg_count', 'HTTP', 'fin_count', 'AVG', 'Min', 'Header_Length', 'Magnitue', 'rst_count', 'syn_count', 'Protocol Type']


Parameters: { "use_label_encoder" } are not used.




Evaluation Metrics:
Accuracy: 0.9995750170752068
Precision: 0.999575237861138
Recall: 0.9995750170752068
F1 Score: 0.9995750036951843

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       1.00      1.00      1.00      1748
        DDoS-ICMP_Flood       1.00      1.00      1.00     11003
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6237
       DDoS-RSTFINFlood       1.00      1.00      1.00      6318
         DDoS-SYN_Flood       1.00      1.00      1.00      6195
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      5453
         DDoS-TCP_Flood       1.00      1.00      1.00      6875
         DDoS-UDP_Flood       1.00      1.00      1.00      8430
          DoS-SYN_Flood       1.00      1.00      1.00      3047
          DoS-TCP_Flood       1.00      1.00      1.00      4012
          DoS-UDP_Flood       1.00      1.00      1.00      5059
     Mirai-greeth_flood       1.00      1.00      1.00      

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, RFE, SelectKBest
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Encode labels and preprocess features
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Encode string labels into integers

X = pd.get_dummies(X, drop_first=True)  # Encode categorical features
X.fillna(X.mean(), inplace=True)  # Fill missing values
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # Scale features

print("Number of Classes:", np.unique(y_encoded).size)
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
k_filter = min(60, X.shape[1])  # Ensure k does not exceed number of features
filter_selector = SelectKBest(score_func=chi2, k=k_filter)
X_filtered = filter_selector.fit_transform(X, y_encoded)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-k_filter:]

# Wrapper Method - RFE using XGBoost
rfe_selector = RFE(estimator=XGBClassifier(n_estimators=100, use_label_encoder=False,
                                            eval_metric='mlogloss', random_state=42),
                   n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y_encoded)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:15]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_encoded, test_size=0.3, random_state=42)

# Step 7: Train XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, max_depth=7, use_label_encoder=False,
                          eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0, target_names=label_encoder.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Selected Features Count: 15
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'Duration', 'flow_duration', 'Covariance', 'Tot sum', 'urg_count', 'HTTP', 'fin_count', 'Max', 'AVG', 'Tot size', 'Header_Length']


Parameters: { "use_label_encoder" } are not used.




Evaluation Metrics:
Accuracy: 0.9971671388101983
Precision: 0.9971669994819308
Recall: 0.9971671388101983
F1 Score: 0.9971663223530516

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       1.00      1.00      1.00       370
        DDoS-ICMP_Flood       1.00      1.00      1.00      2351
      DDoS-PSHACK_Flood       1.00      1.00      1.00      1312
       DDoS-RSTFINFlood       1.00      1.00      1.00      1288
         DDoS-SYN_Flood       0.99      0.99      0.99      1293
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      1212
         DDoS-TCP_Flood       0.99      0.99      0.99      1407
         DDoS-UDP_Flood       1.00      1.00      1.00      1669
          DoS-SYN_Flood       0.99      0.99      0.99       641
          DoS-TCP_Flood       1.00      1.00      1.00       842
          DoS-UDP_Flood       1.00      1.00      1.00      1040
     Mirai-greeth_flood       1.00      0.99      1.00     

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, RFE, SelectKBest
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Encode labels and preprocess features
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Encode string labels into integers

X = pd.get_dummies(X, drop_first=True)  # Encode categorical features
X.fillna(X.mean(), inplace=True)  # Fill missing values
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # Scale features

print("Number of Classes:", np.unique(y_encoded).size)
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
k_filter = min(60, X.shape[1])  # Ensure k does not exceed number of features
filter_selector = SelectKBest(score_func=chi2, k=k_filter)
X_filtered = filter_selector.fit_transform(X, y_encoded)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-k_filter:]

# Wrapper Method - RFE using XGBoost
rfe_selector = RFE(estimator=XGBClassifier(n_estimators=100, use_label_encoder=False,
                                            eval_metric='mlogloss', random_state=42),
                   n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y_encoded)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:10]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_encoded, test_size=0.3, random_state=42)

# Step 7: Train XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, max_depth=7, use_label_encoder=False,
                          eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0, target_names=label_encoder.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Selected Features Count: 10
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'DNS', 'Duration', 'flow_duration', 'Tot size', 'Max', 'Tot sum']


Parameters: { "use_label_encoder" } are not used.




Evaluation Metrics:
Accuracy: 0.9958108825984671
Precision: 0.995813708000134
Recall: 0.9958108825984671
F1 Score: 0.9958108972913977

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       1.00      1.00      1.00      1748
        DDoS-ICMP_Flood       1.00      1.00      1.00     11003
      DDoS-PSHACK_Flood       1.00      0.99      0.99      6237
       DDoS-RSTFINFlood       0.99      0.99      0.99      6318
         DDoS-SYN_Flood       0.99      0.99      0.99      6195
DDoS-SynonymousIP_Flood       1.00      1.00      1.00      5453
         DDoS-TCP_Flood       0.99      0.99      0.99      6875
         DDoS-UDP_Flood       1.00      1.00      1.00      8430
          DoS-SYN_Flood       0.99      0.99      0.99      3047
          DoS-TCP_Flood       0.99      1.00      0.99      4012
          DoS-UDP_Flood       1.00      1.00      1.00      5059
     Mirai-greeth_flood       1.00      1.00      1.00      

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, RFE, SelectKBest
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Encode labels and preprocess features
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Encode string labels into integers

X = pd.get_dummies(X, drop_first=True)  # Encode categorical features
X.fillna(X.mean(), inplace=True)  # Fill missing values
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # Scale features

print("Number of Classes:", np.unique(y_encoded).size)
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
k_filter = min(60, X.shape[1])  # Ensure k does not exceed number of features
filter_selector = SelectKBest(score_func=chi2, k=k_filter)
X_filtered = filter_selector.fit_transform(X, y_encoded)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-k_filter:]

# Wrapper Method - RFE using XGBoost
rfe_selector = RFE(estimator=XGBClassifier(n_estimators=100, use_label_encoder=False,
                                            eval_metric='mlogloss', random_state=42),
                   n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y_encoded)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:5]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_encoded, test_size=0.3, random_state=42)

# Step 7: Train XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, max_depth=7, use_label_encoder=False,
                          eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0, target_names=label_encoder.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Selected Features Count: 5
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'DNS']


Parameters: { "use_label_encoder" } are not used.




Evaluation Metrics:
Accuracy: 0.9892540031873719
Precision: 0.9892625283319739
Recall: 0.9892540031873719
F1 Score: 0.9892555448718685

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       1.00      1.00      1.00      1748
        DDoS-ICMP_Flood       0.99      0.99      0.99     11003
      DDoS-PSHACK_Flood       0.99      0.98      0.99      6237
       DDoS-RSTFINFlood       0.99      0.99      0.99      6318
         DDoS-SYN_Flood       0.98      0.99      0.99      6195
DDoS-SynonymousIP_Flood       0.99      0.99      0.99      5453
         DDoS-TCP_Flood       1.00      0.99      0.99      6875
         DDoS-UDP_Flood       1.00      1.00      1.00      8430
          DoS-SYN_Flood       0.97      0.97      0.97      3047
          DoS-TCP_Flood       0.99      0.99      0.99      4012
          DoS-UDP_Flood       0.99      0.99      0.99      5059
     Mirai-greeth_flood       0.96      0.96      0.96     