In [None]:
import gc
import glob
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# --- User Configuration ---
prepared_dir = '/content/drive/MyDrive/GothamDataset2025/prepared/'
target_label = 'label_category'
# --- End User Configuration ---

# Numeric columns that are skipped when plotting: flags, checksums and options
# are treated as categorical/binary after conversion and are not informative
# as continuous distributions (e.g., ip.checksum was converted to int).
features_to_exclude = [
    'ip.flag.rb', 'ip.flag.df', 'ip.flag.mf',
    'tcp.flag.fin', 'tcp.flag.syn', 'tcp.flag.rst', 'tcp.flag.psh',
    'tcp.flag.ack', 'tcp.flag.urg', 'tcp.flag.ece', 'tcp.flag.cwr', 'tcp.flag.ns',
    'ip.checksum', 'tcp.checksum', 'tcp.options',
]

# Number of subplot columns in each per-dataset figure.
num_cols = 3

# Find prepared CSV files. glob returns paths in arbitrary order, so sort them
# to make the processing (and output) order reproducible between runs.
prepared_files = sorted(glob.glob(os.path.join(prepared_dir, "*.csv")))

if not prepared_files:
    print(f"No prepared CSV files found in '{prepared_dir}'. Please ensure the dataset preparation step has been completed successfully.")
else:
    for file_path in prepared_files:
        # Strip only the final extension; str.replace would also remove any
        # '.csv' that appears in the middle of the file name.
        dataset_name = os.path.splitext(os.path.basename(file_path))[0]
        print(f"\n--- Processing dataset: '{dataset_name}' ---")

        # Best effort: one unreadable file must not abort the remaining datasets.
        try:
            df_prepared = pd.read_csv(file_path, low_memory=False)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue

        # The finally clause releases the DataFrame on every exit path
        # (skip, success or error) before the next file is loaded.
        try:
            if target_label not in df_prepared.columns:
                print(f"Error: Target label '{target_label}' not found in '{dataset_name}'. Skipping.")
                continue

            # Numeric features minus the excluded columns. The target itself is
            # only used for colouring, so it is never plotted as a feature.
            numeric_features = df_prepared.select_dtypes(include=['number']).columns.tolist()
            plot_features = [
                f for f in numeric_features
                if f not in features_to_exclude and f != target_label
            ]

            if not plot_features:
                print(f"No suitable numeric features found for histogram plotting in '{dataset_name}' after exclusions. Skipping.")
                continue

            print(f"Generating histograms for {len(plot_features)} numeric features, colored by '{target_label}'.")

            # Ceiling division: rows needed to fit every feature in num_cols columns.
            num_rows = (len(plot_features) + num_cols - 1) // num_cols

            fig = plt.figure(figsize=(num_cols * 6, num_rows * 4))
            fig.suptitle(f'Numeric Feature Distributions for {dataset_name} (Histograms)', y=1.02, fontsize=16)

            for i, feature in enumerate(plot_features):
                ax = fig.add_subplot(num_rows, num_cols, i + 1)
                sns.histplot(data=df_prepared, x=feature, hue=target_label,
                             palette='viridis', multiple='stack', ax=ax)
                ax.set_title(f'{feature} Distribution')
                ax.set_xlabel(feature)
                ax.set_ylabel('Count')
                # histplot draws its own hue legend; calling plt.legend() here
                # would replace it with an empty one because the bars carry no
                # labels. Reposition the existing legend instead.
                if ax.get_legend() is not None:
                    sns.move_legend(ax, 'upper left', bbox_to_anchor=(1.05, 1), title=target_label)

            fig.tight_layout(rect=[0, 0.03, 1, 0.98])  # Leave room for the suptitle
            plt.show()
            # Release the figure so figures do not pile up across datasets.
            plt.close(fig)
        finally:
            del df_prepared
            gc.collect()
