In [None]:
import gc
import glob
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# -- User Directory --
prepared_dir = '/content/drive/MyDrive/GothamDataset2025/prepared/'
# Set the target label for the plots (e.g., 'label', 'label_category')
target_label = 'label_category'
# Define the numeric features to plot (adjust as needed)
numeric_features_to_plot = [
    'frame.len',
    'ip.ttl',
    'tcp.window_size_value',
    'tcp.window_size_scalefactor',
    'tcp.pdu.size'
]
# -- End User Directory --


def subplot_rows(n_plots: int, num_cols: int) -> int:
    """Return how many grid rows are needed to fit n_plots in num_cols columns."""
    # Ceiling division in integer arithmetic.
    return (n_plots + num_cols - 1) // num_cols


def select_plottable_features(df: pd.DataFrame, features: list) -> tuple:
    """Split the requested features into plottable and skipped ones.

    A feature is plottable when it is a column of ``df`` with a numeric dtype.
    Anything else (column missing, or loaded with a non-numeric dtype) is
    reported as skipped so one bad column does not abort the whole run.

    Args:
        df: The loaded dataset.
        features: Column names requested for plotting.

    Returns:
        A ``(plottable, skipped)`` pair of lists, each preserving the order
        of ``features``.
    """
    plottable = []
    skipped = []
    for feature in features:
        if feature in df.columns and pd.api.types.is_numeric_dtype(df[feature]):
            plottable.append(feature)
        else:
            skipped.append(feature)
    return plottable, skipped


def plot_feature_boxplots(df: pd.DataFrame, features: list, label: str,
                          dataset_name: str, num_cols: int = 3) -> None:
    """Draw one boxplot per feature, grouped by ``label``, on a single figure.

    Args:
        df: The loaded dataset; must contain ``label`` and every feature.
        features: Numeric column names to plot (must be non-empty).
        label: Column used for the x axis grouping.
        dataset_name: Name shown in the figure title.
        num_cols: Number of subplot columns in the grid.
    """
    num_rows = subplot_rows(len(features), num_cols)

    fig = plt.figure(figsize=(num_cols * 6, num_rows * 4))
    try:
        for position, feature in enumerate(features, start=1):
            plt.subplot(num_rows, num_cols, position)
            sns.boxplot(x=label, y=feature, data=df)
            plt.title(f'{feature} vs. {label}')
            plt.xlabel('Attack Category')
            plt.ylabel(feature)
            plt.xticks(rotation=45, ha='right')

        plt.tight_layout()
        plt.suptitle(f'Numeric Feature Distributions for {dataset_name} (Boxplots)', y=1.02, fontsize=16)
        plt.show()
    finally:
        # pyplot keeps every figure alive until it is explicitly closed;
        # release it so figures do not pile up across datasets.
        plt.close(fig)


# Find prepared CSV files (sorted so the processing order is reproducible)
prepared_files = sorted(glob.glob(os.path.join(prepared_dir, "*.csv")))

if not prepared_files:
    print(f"No prepared CSV files found in '{prepared_dir}'. Please ensure the dataset preparation step has been completed successfully.")
else:
    for file_path in prepared_files:
        # splitext strips only the extension, even if '.csv' also appears
        # elsewhere in the file name.
        dataset_name = os.path.splitext(os.path.basename(file_path))[0]
        print(f"Loading and plotting for dataset: '{dataset_name}'")

        try:
            df_prepared = pd.read_csv(file_path, low_memory=False)
        except Exception as e:
            # Deliberately broad: report the unreadable file and move on to the next one.
            print(f"Error loading {file_path}: {e}")
            continue

        if target_label not in df_prepared.columns:
            print(f"Error: Target label '{target_label}' not found in the dataset '{dataset_name}'. Skipping plotting for this dataset.")
        else:
            features_to_plot, skipped_features = select_plottable_features(
                df_prepared, numeric_features_to_plot
            )
            if skipped_features:
                print(f"Skipping features that are missing or non-numeric in '{dataset_name}': {skipped_features}")

            if not features_to_plot:
                print(f"No suitable numeric features found for plotting in dataset '{dataset_name}'.")
            else:
                print(f"Generating boxplots for {len(features_to_plot)} numeric features against '{target_label}' for '{dataset_name}'.")
                plot_feature_boxplots(df_prepared, features_to_plot, target_label, dataset_name)

        # Clear the DataFrame to free up memory before loading the next one
        del df_prepared
        gc.collect()