In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Preprocessing (encode categoricals + fill missing values + scale for chi2)
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # chi2 requires non-negative values

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
filter_selector = SelectKBest(score_func=chi2, k=60)  # First filter to top 60 features
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-60:]

# Wrapper Method - RFE (use a valid estimator like KNN without invalid params)
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:46]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train K-Nearest Neighbors model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46




Selected Features Count: 46
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'Srate', 'Covariance', 'Duration', 'flow_duration', 'Tot size', 'Max', 'Tot sum', 'urg_count', 'fin_count', 'AVG', 'Min', 'Header_Length', 'Magnitue', 'rst_count', 'syn_count', 'Protocol Type', 'Variance', 'TCP', 'ack_count', 'syn_flag_number', 'UDP', 'ack_flag_number', 'ICMP', 'rst_flag_number', 'psh_flag_number', 'fin_flag_number', 'Radius', 'DNS', 'Drate', 'HTTP', 'Telnet', 'HTTPS', 'Std', 'IRC', 'ARP', 'SSH', 'DHCP', 'SMTP', 'cwr_flag_number', 'ece_flag_number', 'IPv', 'LLC']

Evaluation Metrics:
Accuracy: 0.9502162859527965
Precision: 0.9502137242954187
Recall: 0.9502162859527965
F1 Score: 0.9498787553933904

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       1.00      1.00      1.00      1748
        DDoS-ICMP_Flood       1.00      1.00      1.00     11003
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6237
       DDoS

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Preprocessing (encode categoricals + fill missing values + scale for chi2)
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # chi2 requires non-negative values

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
filter_selector = SelectKBest(score_func=chi2, k=60)  # First filter to top 60 features
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-60:]

# Wrapper Method - RFE (use a valid estimator like KNN without invalid params)
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:40]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train K-Nearest Neighbors model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46




Selected Features Count: 40
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'Srate', 'Covariance', 'Duration', 'flow_duration', 'Tot size', 'Max', 'Tot sum', 'urg_count', 'fin_count', 'AVG', 'Min', 'Header_Length', 'Magnitue', 'rst_count', 'syn_count', 'Protocol Type', 'Variance', 'TCP', 'ack_count', 'syn_flag_number', 'UDP', 'ack_flag_number', 'ICMP', 'rst_flag_number', 'psh_flag_number', 'fin_flag_number', 'Std', 'DNS', 'cwr_flag_number', 'Telnet', 'SSH', 'Drate', 'Radius', 'LLC', 'HTTPS', 'IRC']

Evaluation Metrics:
Accuracy: 0.9515367686119754
Precision: 0.9515135224655324
Recall: 0.9515367686119754
F1 Score: 0.9512176390956015

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       1.00      1.00      1.00      1748
        DDoS-ICMP_Flood       1.00      1.00      1.00     11003
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6237
       DDoS-RSTFINFlood       1.00      1.00      1.00      6318
   

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Preprocessing (encode categoricals + fill missing values + scale for chi2)
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # chi2 requires non-negative values

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
filter_selector = SelectKBest(score_func=chi2, k=60)  # First filter to top 60 features
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-60:]

# Wrapper Method - RFE (use a valid estimator like KNN without invalid params)
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:30]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train K-Nearest Neighbors model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46




Selected Features Count: 30
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'Srate', 'Covariance', 'Duration', 'flow_duration', 'Tot size', 'Max', 'Tot sum', 'urg_count', 'fin_count', 'AVG', 'Min', 'Header_Length', 'Magnitue', 'rst_count', 'syn_count', 'Protocol Type', 'Variance', 'TCP', 'ack_count', 'syn_flag_number', 'UDP', 'ack_flag_number', 'ICMP', 'rst_flag_number', 'psh_flag_number', 'fin_flag_number']

Evaluation Metrics:
Accuracy: 0.9525233361159596
Precision: 0.9525307390839108
Recall: 0.9525233361159596
F1 Score: 0.9522409868248994

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       1.00      1.00      1.00      1748
        DDoS-ICMP_Flood       1.00      1.00      1.00     11003
      DDoS-PSHACK_Flood       1.00      1.00      1.00      6237
       DDoS-RSTFINFlood       1.00      1.00      1.00      6318
         DDoS-SYN_Flood       0.90      0.96      0.93      6195
DDoS-SynonymousIP_Flood       

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Preprocessing (encode categoricals + fill missing values + scale for chi2)
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # chi2 requires non-negative values

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
filter_selector = SelectKBest(score_func=chi2, k=60)  # First filter to top 60 features
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-60:]

# Wrapper Method - RFE (use a valid estimator like KNN without invalid params)
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:20]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train K-Nearest Neighbors model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46




Selected Features Count: 20
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'Srate', 'Covariance', 'Duration', 'flow_duration', 'Tot size', 'Max', 'Tot sum', 'urg_count', 'fin_count', 'AVG', 'Min', 'Header_Length', 'Magnitue', 'rst_count', 'syn_count', 'Protocol Type']

Evaluation Metrics:
Accuracy: 0.9732260757380283
Precision: 0.9731636998517462
Recall: 0.9732260757380283
F1 Score: 0.9731294536243194

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       1.00      1.00      1.00      1748
        DDoS-ICMP_Flood       1.00      1.00      1.00     11003
      DDoS-PSHACK_Flood       0.99      0.99      0.99      6237
       DDoS-RSTFINFlood       1.00      1.00      1.00      6318
         DDoS-SYN_Flood       0.96      0.97      0.96      6195
DDoS-SynonymousIP_Flood       0.98      0.99      0.98      5453
         DDoS-TCP_Flood       0.97      0.98      0.98      6875
         DDoS-UDP_Flood       0.94      0.

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Preprocessing (encode categoricals + fill missing values + scale for chi2)
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # chi2 requires non-negative values

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
filter_selector = SelectKBest(score_func=chi2, k=60)  # First filter to top 60 features
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-60:]

# Wrapper Method - RFE (use a valid estimator like KNN without invalid params)
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:15]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train K-Nearest Neighbors model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46




Selected Features Count: 15
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'Srate', 'Covariance', 'Duration', 'flow_duration', 'Tot size', 'Max', 'Tot sum', 'urg_count', 'fin_count', 'AVG', 'Min']

Evaluation Metrics:
Accuracy: 0.9750018972451999
Precision: 0.9749685974128957
Recall: 0.9750018972451999
F1 Score: 0.9749773604410966

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       1.00      1.00      1.00      1748
        DDoS-ICMP_Flood       0.99      0.99      0.99     11003
      DDoS-PSHACK_Flood       0.98      0.99      0.98      6237
       DDoS-RSTFINFlood       1.00      1.00      1.00      6318
         DDoS-SYN_Flood       0.96      0.95      0.95      6195
DDoS-SynonymousIP_Flood       0.99      0.99      0.99      5453
         DDoS-TCP_Flood       0.96      0.97      0.96      6875
         DDoS-UDP_Flood       0.98      0.98      0.98      8430
          DoS-SYN_Flood       0.92      0.92     

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Preprocessing (encode categoricals + fill missing values + scale for chi2)
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # chi2 requires non-negative values

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
filter_selector = SelectKBest(score_func=chi2, k=60)  # First filter to top 60 features
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-60:]

# Wrapper Method - RFE (use a valid estimator like KNN without invalid params)
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:10]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train K-Nearest Neighbors model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46




Selected Features Count: 10
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'Srate', 'Covariance', 'Duration', 'flow_duration', 'Tot size', 'Max']

Evaluation Metrics:
Accuracy: 0.9717689914244517
Precision: 0.9718026642703845
Recall: 0.9717689914244517
F1 Score: 0.9717735817330868

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       1.00      1.00      1.00      1748
        DDoS-ICMP_Flood       0.99      0.99      0.99     11003
      DDoS-PSHACK_Flood       0.97      0.97      0.97      6237
       DDoS-RSTFINFlood       0.97      0.97      0.97      6318
         DDoS-SYN_Flood       0.96      0.96      0.96      6195
DDoS-SynonymousIP_Flood       0.99      0.98      0.98      5453
         DDoS-TCP_Flood       0.97      0.97      0.97      6875
         DDoS-UDP_Flood       0.97      0.98      0.97      8430
          DoS-SYN_Flood       0.93      0.94      0.94      3047
          DoS-TCP_Flood       0.96 

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

# Step 1: Load dataset
df = pd.read_csv('data.csv')

# Step 2: Keep only the 12 most common classes
top_12_classes = df['label'].value_counts().index[:12]
df_12 = df[df['label'].isin(top_12_classes)]

# Step 3: Separate features and labels
X = df_12.drop("label", axis=1, errors='ignore')
y = df_12["label"]

# Step 4: Preprocessing (encode categoricals + fill missing values + scale for chi2)
X = pd.get_dummies(X, drop_first=True)
X.fillna(X.mean(), inplace=True)
X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)  # chi2 requires non-negative values

print("Number of Classes:", y.nunique())
print("Initial Feature Count:", X.shape[1])

# Step 5: Hybrid Feature Selection to get 46 features
# Filter Method - Chi-Square
filter_selector = SelectKBest(score_func=chi2, k=60)  # First filter to top 60 features
X_filtered = filter_selector.fit_transform(X, y)
filter_scores = filter_selector.scores_
top_filter_indices = np.argsort(filter_scores)[-60:]

# Wrapper Method - RFE (use a valid estimator like KNN without invalid params)
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=30)
rfe_selector.fit(X.iloc[:, top_filter_indices], y)
rfe_selected_indices = np.array(top_filter_indices)[rfe_selector.support_]

# Combine: RFE + random features to make 46
remaining_indices = list(set(range(X.shape[1])) - set(rfe_selected_indices))
extra_indices = np.random.choice(remaining_indices, size=16, replace=False)
final_indices = list(rfe_selected_indices) + list(extra_indices)

# Final selected features
X_selected = X.iloc[:, final_indices[:5]]
print("Selected Features Count:", X_selected.shape[1])
print("Selected Features:", X_selected.columns.tolist())

# Step 6: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Step 7: Train K-Nearest Neighbors model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print("\nEvaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Number of Classes: 12
Initial Feature Count: 46




Selected Features Count: 5
Selected Features: ['Number', 'Weight', 'IAT', 'Rate', 'Srate']

Evaluation Metrics:
Accuracy: 0.9926418786692759
Precision: 0.9926783448896348
Recall: 0.9926418786692759
F1 Score: 0.9926506749548951

Classification Report:
                         precision    recall  f1-score   support

          BenignTraffic       1.00      1.00      1.00       984
        DDoS-ICMP_Flood       0.99      0.99      0.99      6430
      DDoS-PSHACK_Flood       1.00      0.99      0.99      3672
       DDoS-RSTFINFlood       0.99      1.00      0.99      3618
         DDoS-SYN_Flood       1.00      0.99      0.99      3613
DDoS-SynonymousIP_Flood       1.00      0.99      1.00      3192
         DDoS-TCP_Flood       1.00      0.99      0.99      4024
         DDoS-UDP_Flood       0.98      0.99      0.98      4833
          DoS-SYN_Flood       1.00      0.99      1.00      1763
          DoS-TCP_Flood       1.00      1.00      1.00      2354
          DoS-UDP_Flood       0.9