In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.metrics import recall_score, precision_score, classification_report
from tqdm import tqdm
import time

# Load data
path = "C:/Users/adamr/Downloads/ACI (1).csv"
data = pd.read_csv(path)

# Drop unnecessary columns including specified idle columns
columns_to_drop = ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp', 'Connection Type', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min']
data.drop(columns=columns_to_drop, inplace=True)

# Convert columns to numeric, handle NaNs and infinities
numeric_cols = data.select_dtypes(include=['number']).columns
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors='coerce')
data[numeric_cols] = data[numeric_cols].replace([np.inf, -np.inf], np.nan)
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

# Encode categorical labels
label_encoder = LabelEncoder()
data['Label'] = label_encoder.fit_transform(data['Label'])
label_names = label_encoder.classes_  # Save label names for later use

# Handle class imbalance using SMOTE
features = data.loc[:, data.columns != 'Label']
labels = data['Label']
smote = SMOTE(random_state=42, k_neighbors=min(5, labels.value_counts().min() - 1))
features_resampled, labels_resampled = smote.fit_resample(features, labels)

# Print the final distribution of each label in the training dataset after SMOTE
label_distribution = pd.Series(labels_resampled).value_counts().sort_index()
label_distribution.index = [label_names[i] for i in label_distribution.index]
print("Final distribution of labels in the training dataset after SMOTE:")
print(label_distribution)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features_resampled, labels_resampled, test_size=0.2, random_state=42)

# Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define and compile the TensorFlow Keras model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(len(np.unique(y_train)), activation='softmax')
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=[])

# Train the model
epochs = 1
model.fit(X_train_scaled, y_train, epochs=epochs, verbose=1)

# Evaluate the model using recall and precision
y_pred = model.predict(X_test_scaled)
y_pred_classes = np.argmax(y_pred, axis=1)

# Calculate recall and precision
recall = recall_score(y_test, y_pred_classes, average='weighted')
precision = precision_score(y_test, y_pred_classes, average='weighted')
class_report = classification_report(y_test, y_pred_classes, target_names=label_names)

# Print evaluation results ensuring full classification report is displayed
results = f"Recall: {recall}\nPrecision: {precision}\n\nClassification Report:\n{class_report}"
print(results)

# Save the results to an external file
with open('C:/Users/adamr/Downloads/model_evaluation_results.txt', 'w') as file:
    file.write(results)

Final distribution of labels in the training dataset after SMOTE:
Benign                441282
DNS Flood             441282
Dictionary Attack     441282
ICMP Flood            441282
OS Scan               441282
Ping Sweep            441282
Port Scan             441282
SYN Flood             441282
Slowloris             441282
UDP Flood             441282
Vulnerability Scan    441282
Name: count, dtype: int64
Recall: 0.964389933880705
Precision: 0.9663504067892551

Classification Report:
                    precision    recall  f1-score   support

            Benign       0.95      0.74      0.83     88411
         DNS Flood       0.99      0.98      0.99     88000
 Dictionary Attack       0.96      0.99      0.97     87860
        ICMP Flood       0.99      1.00      0.99     88400
           OS Scan       0.99      0.99      0.99     88916
        Ping Sweep       0.96      1.00      0.98     88272
         Port Scan       0.99      0.97      0.98     87808
         SYN Flood       1.0