In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import glob

# Notebook cell: for every prepared CSV in `prepared_dir`, draw one figure of
# stacked bar charts showing, for each categorical (object/category dtype)
# column, the per-category proportion of each `target_label` class.
import gc  # imported once here instead of repeatedly inside the loop

# --- User Configuration ---
prepared_dir = '/content/drive/MyDrive/GothamDataset2025/prepared/'
target_label = 'label_category'
# --- End User Configuration ---

# Find prepared CSV files. glob returns them in arbitrary order, so sort to
# make the processing (and output) order reproducible between runs.
prepared_files = sorted(glob.glob(os.path.join(prepared_dir, "*.csv")))

if not prepared_files:
    print(f"No prepared CSV files found in '{prepared_dir}'. Please ensure the dataset preparation step has been completed successfully.")
else:
    for file_path in prepared_files:
        # splitext strips only the trailing extension; str.replace would also
        # remove any '.csv' that happens to appear in the middle of the name.
        dataset_name = os.path.splitext(os.path.basename(file_path))[0]
        print(f"\n--- Processing dataset: '{dataset_name}' ---")

        try:
            df_prepared = pd.read_csv(file_path, low_memory=False)
        except Exception as e:
            # Best-effort batch run: report the failure and move on to the next file.
            print(f"Error loading {file_path}: {e}")
            continue

        # The finally clause releases the DataFrame on every exit path
        # (skip via `continue`, normal completion, or a plotting error), so
        # memory is freed before the next dataset is loaded.
        try:
            if target_label not in df_prepared.columns:
                print(f"Error: Target label '{target_label}' not found in '{dataset_name}'. Skipping.")
                continue

            # Identify categorical features (excluding the target label itself and numeric columns)
            categorical_features = [
                column
                for column in df_prepared.select_dtypes(include=['object', 'category']).columns
                if column != target_label
            ]

            if not categorical_features:
                print(f"No suitable categorical features found for plotting in '{dataset_name}'. Skipping.")
                continue

            print(f"Generating stacked bar charts for {len(categorical_features)} categorical features against '{target_label}'.")

            # Determine grid size for subplots
            num_cols = 2  # Number of columns for subplots
            num_rows = (len(categorical_features) + num_cols - 1) // num_cols  # Ceiling division

            # Explicit figure/axes handles are used instead of pyplot's implicit
            # "current axes" state so each call targets the intended subplot.
            fig = plt.figure(figsize=(num_cols * 8, num_rows * 6))
            fig.suptitle(f'Categorical Feature Distributions for {dataset_name} (Stacked Bar Charts)', y=1.02, fontsize=16)

            for i, feature in enumerate(categorical_features):
                ax = fig.add_subplot(num_rows, num_cols, i + 1)
                ax.set_title(f'{feature} vs. {target_label}')

                # Create a cross-tabulation (contingency table)
                cross_tab = pd.crosstab(df_prepared[feature], df_prepared[target_label])

                if cross_tab.empty:
                    # No row has both the feature and the target present;
                    # DataFrame.plot would raise on an empty table, so mark
                    # the panel instead of aborting the whole run.
                    ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
                    continue

                # Normalize each row so every bar sums to 1 (proportions per category)
                cross_tab_norm = cross_tab.div(cross_tab.sum(axis=1), axis=0)

                # Plot as a stacked bar chart
                cross_tab_norm.plot(kind='bar', stacked=True, ax=ax, cmap='viridis')

                ax.set_xlabel(feature)
                ax.set_ylabel('Proportion')
                plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
                # Legend is anchored outside the axes so it does not cover the bars.
                ax.legend(title=target_label, bbox_to_anchor=(1.05, 1), loc='upper left')

            fig.tight_layout(rect=[0, 0.03, 1, 0.98])  # Adjust layout to prevent title overlap
            plt.show()
            # Release the figure explicitly; otherwise figures accumulate
            # across datasets on backends that do not close them on show().
            plt.close(fig)
        finally:
            # Clear the DataFrame to free up memory before loading the next one
            del df_prepared
            gc.collect()
