In [None]:
def map_categorical_values(df, mappings):
    """
    Map coded categorical values to their labels, modifying ``df`` in place.

    Only columns that have an entry in ``mappings`` are touched. Every cell is
    converted to text and stripped of surrounding whitespace first; the
    markers "nan", "None" and "NaN" become NaN.

    - General columns: the text is looked up in the column's mapping. Codes
      that came from a float column (e.g. "1.0") are retried without the
      trailing ".0" so they match keys such as "1". Text with no mapping
      entry is kept as-is.
    - Expanded columns (name ends with "_" followed by digits): "1"/"1.0"
      become the first label of the column's mapping, "0"/"0.0" become NaN,
      and any other text is kept as-is.

    Args:
        df (pd.DataFrame): DataFrame to process (mutated and returned).
        mappings (dict): Column name -> {code string: label} dictionary.

    Returns:
        pd.DataFrame: The same DataFrame object with mapped values.

    Raises:
        IndexError: If an expanded column's mapping is empty.
    """
    missing_tokens = {"nan", "None", "NaN"}
    expanded_name = re.compile(r"_\d+$")

    def clean(value):
        # Stripped text of a cell, or None when the cell denotes "missing".
        text = str(value).strip()
        return None if text in missing_tokens else text

    def map_general(value, mapping):
        text = clean(value)
        if text is None:
            return np.nan
        if text in mapping:
            return mapping[text]
        # Codes read from a float column arrive as "1.0"; retry as "1".
        if text.endswith(".0") and text[:-2] in mapping:
            return mapping[text[:-2]]
        return text

    def map_flag(value, label):
        text = clean(value)
        # "0.0" is how an unselected option looks once a float column is
        # converted to text, so it must be treated like "0".
        if text is None or text in ("0", "0.0"):
            return np.nan
        if text in ("1", "1.0"):
            return label
        return text

    for col in df.columns:
        if col not in mappings:
            continue
        mapping = mappings[col]
        as_text = df[col].astype(str)

        if isinstance(col, str) and expanded_name.search(col):
            label = list(mapping.values())[0]  # single label for this option
            df[col] = as_text.map(lambda value, label=label: map_flag(value, label))
        else:
            df[col] = as_text.map(
                lambda value, mapping=mapping: map_general(value, mapping)
            )

    return df


In [None]:
# "primary_usage": {
#     "1": "DRINKING", "2": "COOKING", "3": "BATHING", "4": "CLEANING",
#     "5": "GARDEN", "6": "FARMING", "7":"OTHER"},
#  "daily_water_usage_source": {
#     "PIPE CONNECTION": "1", "WELL": "2", "POND": "3", "RIVER/BACKWATER": "4",
#     "BORE WELL": "5", "OTHERS": "6"},

# Code -> label entries for the expanded multi-response columns: each
# "<prefix>_<n>" key holds a single {code: label} pair.
# NOTE(review): these are bare dict entries with no enclosing braces in this
# cell — presumably they belong inside a mappings dictionary; confirm where.
"primary_usage_1": {"1": "DRINKING"}, 
"primary_usage_2": {"2": "COOKING"}, 
"primary_usage_3": {"3": "BATHING"}, 
"primary_usage_4": {"4": "CLEANING"},
"primary_usage_5": {"5": "GARDEN"}, 
"primary_usage_6": {"6": "FARMING"}, 
"primary_usage_7": {"7": "OTHER"},

# NOTE(review): "BORE WEL" (here and in agricultural_water_source_5) looks
# like a typo for "BORE WELL" — confirm before changing, since the label is
# emitted at runtime.
"daily_water_usage_source_1" : {"1": "PIPE CONNECTION"},
"daily_water_usage_source_2" : {"2": "WELL"},
"daily_water_usage_source_3" : {"3": "POND"},
"daily_water_usage_source_4" : {"4": "RIVER/BACKWATER"},
"daily_water_usage_source_5" : {"5": "BORE WEL"},
"daily_water_usage_source_6" : {"6": "RAINWATER"},
"daily_water_usage_source_7" : {"7": "RO PLANT"},

"agricultural_water_source_1" : {"1": "PIPE CONNECTION"},
"agricultural_water_source_2" : {"2": "WELL"},
"agricultural_water_source_3" : {"3": "POND"},
"agricultural_water_source_4" : {"4": "RIVER/BACKWATER"},
"agricultural_water_source_5" : {"5": "BORE WEL"},
"agricultural_water_source_6" : {"6": "OTHERS"},

In [None]:


# Names of every df_expanded column that begins with "primary_usage".
multi_response_columns = [
    name
    for name in df_expanded.columns
    if name.startswith("primary_usage")
]

def plot_multiresponse_pie_chart(df, multi_response_columns):
    """
    Show a pie chart of the column totals of the primary-usage columns.

    Each column in ``multi_response_columns`` is summed; columns whose total
    is not greater than zero are left out of the chart.

    Args:
        df (pd.DataFrame): DataFrame containing the multi-response columns.
            Assumes the columns hold numeric indicators so that a column sum
            counts responses — TODO confirm against the caller.
        multi_response_columns (list): Column names to include. The
            "primary_usage_" prefix is removed to build each wedge label.

    Returns:
        None (displays the chart).
    """
    plt.figure(figsize=(8, 8))

    # Count responses (sum across rows)
    usage_counts = df[multi_response_columns].sum()

    # Remove categories with zero responses
    usage_counts = usage_counts[usage_counts > 0]

    # Build the labels from the columns that survived the filter. Deriving
    # them from the full column list gives more labels than wedges whenever a
    # category is dropped, which plt.pie rejects.
    usage_labels = [
        col.replace("primary_usage_", "").title() for col in usage_counts.index
    ]

    # Plot pie chart
    plt.pie(usage_counts, labels=usage_labels, autopct='%1.1f%%', startangle=140, colors=sns.color_palette("pastel"))

    plt.title("Distribution of Primary Water Usage", fontsize=14)

    # Saving is currently disabled:
    # plt.savefig(f'./plots/primary_water_usage_pie.png', bbox_inches='tight', dpi=300)

    plt.show()


def plot_categorical_pie_charts(df, categorical_columns):
    """
    Draw, save and show one pie chart per categorical column.

    For every name in ``categorical_columns`` the value frequencies of that
    column are plotted as a pie chart, written to
    ``./plots/<cleaned title>_pie.png`` and then displayed.

    Args:
        df (pd.DataFrame): DataFrame holding the columns to plot.
        categorical_columns (iterable): Column names to chart.

    Returns:
        None
    """
    for column_name in categorical_columns:
        plt.figure(figsize=(8, 8))

        # "some_column" -> "Some Column"
        readable_title = column_name.replace("_", " ").title()

        # Frequency of each distinct value in the column
        frequencies = df[column_name].value_counts()

        palette = sns.color_palette("pastel")
        plt.pie(
            frequencies,
            labels=frequencies.index,
            autopct='%1.1f%%',
            startangle=140,
            colors=palette,
        )
        plt.title(f"Distribution of {readable_title}", fontsize=14)

        # File name is derived from the title via clean_filename
        output_path = f'./plots/{clean_filename(readable_title)}_pie.png'
        plt.savefig(output_path, bbox_inches='tight', dpi=300)

        plt.show()


# Draw and save one pie chart for every column named in coded_mappings.
plot_categorical_pie_charts(df_expanded, coded_mappings.keys())


In [None]:
# # Check if any columns in df_expanded are missing from coded_mappings
# missing_columns = [col for col in df_expanded.columns if col not in coded_mappings]

# # If there are missing columns, add them with a placeholder mapping
# if missing_columns:
#     print(f"Missing columns detected: {missing_columns}")
#     for col in missing_columns:
#         coded_mappings[col] = {"MISSING": "MISSING_MAPPING"}  # Placeholder for later updates
#     print("Updated coded_mappings with missing columns.")
# else:
#     print("No missing columns. All columns are already in coded_mappings.")

# # Optional: Print the updated coded_mappings dictionary to verify changes
# print(coded_mappings)


In [None]:
# # Function to visualize categorical variables across Panchayats with corrected X-tick labels
# def plot_categorical_distributions(df, categorical_columns):
#     for col in categorical_columns:
#         plt.figure(figsize=(10, 6))

#         # Convert column name to a readable title
#         plot_title = col.replace("_", " ").title()

#         # Group data
#         category_summary = df.groupby(["panchayat_code", col]).size().unstack(fill_value=0)
        

#         # Plot the data
#         category_summary.plot(kind="bar", stacked=True, colormap="coolwarm", figsize=(10, 6))

#         plt.title(f"{plot_title} Across Panchayats", fontsize=14)
#         plt.xlabel("Panchayat", fontsize=12)
#         plt.ylabel("Count", fontsize=12)
#         # plt.yticks(np.arange(1, category_summary.values.max() + 1, 1))
        
#         panchayats = ["Thakazhy", "Edathua", "Thalavady"]
#         # panchayat_codes = [1.0, 2.0, 3.0]

#         # # **Ensure X-Ticks Show Panchayat Names**
#         # plt.xticks(ticks=range(len(category_summary.index)), labels=category_summary.index, rotation=0)
#         # Set custom X-axis labels
#         plt.xticks(ticks=range(len(panchayats)), labels=panchayats)

#         # Rename legend labels
#         plt.legend(title=plot_title, bbox_to_anchor=(1.05, 1), loc='upper left')

#         plt.grid(axis='y', linestyle='--', alpha=0.7)

#         # Save plot
#         plt.savefig(f'./plots/{clean_filename(plot_title)}.png', bbox_inches='tight', dpi=300)

#         plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import re

def plot_expanded_columns(df, expanded_mappings):
    """
    Plot one stacked bar chart per expanded column group, split by panchayat.

    Expanded columns are those whose name ends with "_" followed by digits;
    they are grouped by the part of the name before that suffix (e.g.
    "primary_usage"). For each group the non-missing cell values are counted
    per panchayat and stacked in the order of the group's labels taken from
    ``expanded_mappings``. Each chart is saved to
    ``./plots/<prefix>_expanded_columns.png`` and then shown.

    Groups with no entry in ``expanded_mappings`` or with no non-missing
    values are skipped with a printed notice instead of failing in the plot
    call.

    NOTE: ``df["panchayat_code"]`` is converted to strings in place, so the
    caller's DataFrame is modified.

    Args:
        df (pd.DataFrame): The processed DataFrame with categorical mappings applied.
        expanded_mappings (dict): Column name -> {code: label} for expanded columns.

    Returns:
        None (Displays and saves stacked bar charts for each expanded group).
    """
    panchayat_mapping = {"1": "Thakazhy", "2": "Edathua", "3": "Thalavady"}
    suffix_pattern = re.compile(r"_(\d+)$")

    # Group expanded columns by prefix, e.g. "primary_usage_3" -> "primary_usage"
    grouped_columns = {}
    for col in df.columns:
        if isinstance(col, str) and suffix_pattern.search(col):
            grouped_columns.setdefault(col.rsplit("_", 1)[0], []).append(col)

    # String codes are needed as the id column of the melt/group-by below.
    df["panchayat_code"] = df["panchayat_code"].astype(str)

    for prefix, columns in grouped_columns.items():
        # Order the columns by their numeric suffix (so _10 follows _9)
        columns.sort(key=lambda name: int(suffix_pattern.search(name).group(1)))

        # Legend order follows the column order; the label is the first
        # value of each column's mapping.
        ordered_labels = [
            list(expanded_mappings[col].values())[0]
            for col in columns
            if col in expanded_mappings
        ]
        if not ordered_labels:
            print(f"Skipping '{prefix}': no entries in expanded_mappings.")
            continue

        # Long format: one row per (panchayat, column, value), missing values removed
        df_melted = df.melt(
            id_vars=["panchayat_code"],
            value_vars=columns,
            var_name="Category",
            value_name="Value",
        ).dropna()
        if df_melted.empty:
            print(f"Skipping '{prefix}': no non-missing values to plot.")
            continue
        df_melted["Value"] = df_melted["Value"].astype(str)

        # Count occurrences of each label per panchayat
        category_summary = (
            df_melted.groupby(["panchayat_code", "Value"])
            .size()
            .unstack(fill_value=0)
            .reindex(columns=ordered_labels, fill_value=0)
        )

        # Draw on an explicit Axes: calling DataFrame.plot without ``ax``
        # opens its own figure and leaves the one created here blank.
        fig, ax = plt.subplots(figsize=(12, 6))
        category_summary.plot(kind="bar", stacked=True, colormap="coolwarm", ax=ax)

        ax.set_title(f"{prefix.replace('_', ' ').title()} Across Panchayats", fontsize=14)
        ax.set_xlabel("Panchayat", fontsize=12)
        ax.set_ylabel("Count", fontsize=12)

        # Float codes stringify as "1.0"; drop the trailing ".0" so they
        # still match the "1"/"2"/"3" keys. Unknown codes are shown as-is.
        panchayat_names = [
            panchayat_mapping.get(re.sub(r"\.0$", "", str(code)), str(code))
            for code in category_summary.index
        ]
        ax.set_xticks(range(len(panchayat_names)))
        ax.set_xticklabels(panchayat_names, rotation=0)

        # Legend without a title, placed outside the plot area
        ax.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left')

        ax.grid(axis='y', linestyle='--', alpha=0.7)

        fig.savefig(f'./plots/{prefix}_expanded_columns.png', bbox_inches='tight', dpi=300)

        plt.show()

# Plot a stacked bar chart for each expanded-column group in df_expanded,
# using expanded_mappings to supply the series labels.
plot_expanded_columns(df_expanded, expanded_mappings)


In [None]:
# def plot_expanded_columns(df):
#     """
#     Plots a single stacked bar chart for all expanded columns (ending with "_#"),
#     grouping them based on their prefixes and ensuring correct numerical order.

#     Args:
#         df (pd.DataFrame): DataFrame containing expanded columns.

#     Returns:
#         None (Displays and saves stacked bar charts for expanded categorical columns).
#     """

#     # Identify expanded column groups based on their prefixes
#     expanded_columns = [col for col in df.columns if re.search(r"_(\d+)$", col)]
    
#     # Group columns by prefix (e.g., "primary_usage_")
#     grouped_columns = {}
#     for col in expanded_columns:
#         prefix = "_".join(col.split("_")[:-1])  # Extracts prefix (e.g., "primary_usage")
#         if prefix not in grouped_columns:
#             grouped_columns[prefix] = []
#         grouped_columns[prefix].append(col)

#     # Ensure panchayat_code is mapped correctly
#     panchayat_mapping = {"1.0": "Thakazhy", "2.0": "Edathua", "3.0": "Thalavady"}
#     df["panchayat_code"] = df["panchayat_code"].astype(str).map(panchayat_mapping).fillna(df["panchayat_code"])

#     for prefix, columns in grouped_columns.items():
#         plt.figure(figsize=(12, 6))

#         # **Sort column names numerically**
#         columns.sort(key=lambda x: int(re.search(r"_(\d+)$", x).group(1)))  # Extract and sort by number

#         # Convert column names into readable categories for plotting
#         category_labels = [col.replace(prefix + "_", "").title() for col in columns]

#         # Aggregate counts for each category
#         category_summary = df.groupby("panchayat_code")[columns].sum()

#         # Rename columns for better visualization
#         category_summary.columns = category_labels

#         # Plot stacked bar chart
#         category_summary.plot(kind="bar", stacked=True, colormap="coolwarm", figsize=(12, 6))

#         plt.title(f"{prefix.replace('_', ' ').title()} Across Panchayats", fontsize=14)
#         plt.xlabel("Panchayat", fontsize=12)
#         plt.ylabel("Count", fontsize=12)
#         plt.xticks(rotation=0)

#         # Ensure legend has proper labels
#         plt.legend(title='', bbox_to_anchor=(1.05, 1), loc='upper left') #title=prefix.replace("_", " ").title()

#         plt.grid(axis='y', linestyle='--', alpha=0.7)

#         # Save plot
#         plt.savefig(f'./plots/{clean_filename(prefix)}_expanded_columns.png', bbox_inches='tight', dpi=300)

#         plt.show()


# # Run the function for expanded columns
# plot_expanded_columns(df_expanded)



In [None]:
# Display label for each survey column, keyed by column name.
# NOTE(review): only daily_water_usage_source_1..6 are listed and _6 is
# labelled "Others"; confirm this matches the current expanded columns.
plot_labels = {
    # --- Household / supply basics ---
    "panchayat_code": "Panchayat Code",
    "gender_of_informant": "Gender of Informant",
    "kwa_connection": "KWA Connection",
    "primary_water_source": "Primary Water Source",
    "daily_water_consumption_litres": "Daily Water Consumption (Litres)",

    # --- Primary usage (expanded columns) ---
    "primary_usage_1": "Primary Usage Drinking",
    "primary_usage_2": "Primary Usage Cooking",
    "primary_usage_3": "Primary Usage Bathing",
    "primary_usage_4": "Primary Usage Cleaning",
    "primary_usage_5": "Primary Usage Garden",
    "primary_usage_6": "Primary Usage Farming",
    "primary_usage_7": "Primary Usage Others",

    # --- Daily water usage source (expanded columns) ---
    "daily_water_usage_source_1": "Daily Water Usage Source Pipe Connection",
    "daily_water_usage_source_2": "Daily Water Usage Source Well",
    "daily_water_usage_source_3": "Daily Water Usage Source Pond",
    "daily_water_usage_source_4": "Daily Water Usage Source River/Backwater",
    "daily_water_usage_source_5": "Daily Water Usage Source Borewell",
    "daily_water_usage_source_6": "Daily Water Usage Source Others",

    # --- Supply issues and checks ---
    "agricultural_water_source": "Agricultural Water Source",
    "issues_with_water_supply": "Issues with Water Supply",
    "frequency_of_supply_issues": "Frequency of Supply Issues",
    "check_by_authority": "Check by Authority",

    # --- pH readings ---
    "ph_value_well": "pH Value (Well)",
    "ph_value_pond": "pH Value (Pond)",
    "ph_value_river_backwater": "pH Value (River/Backwater)",
    "ph_value_borewell": "pH Value (Borewell)",
    "ph_value_pipe_water": "pH Value (Pipe Water)",

    # --- Perceived quality, hygiene and health ---
    "satisfaction_with_water_taste": "Satisfaction with Water Taste",
    "satisfaction_with_water_smell": "Satisfaction with Water Smell",
    "noticeable_contamination": "Noticeable Contamination",
    "septic_tank_or_wastewater_presence_nearby_primary_water_source": (
        "Septic Tank/Wastewater Presence Near Water Source"
    ),
    "pipeline_water_chlorinated_belief": "Belief that Pipeline Water is Chlorinated",
    "frequency_of_cleaning_water_tank": "Frequency of Cleaning Water Tank",
    "water_purifying_techniques": "Water Purifying Techniques",
    "types_of_water_purifications": "Types of Water Purification Methods",
    "ever_noticed_broken_pipelines_in_house_or_public_places": (
        "Noticed Broken Pipelines (House/Public Places)"
    ),
    "waterborne_diseases_last_year": "Waterborne Diseases (Last Year)",
    "confidence_in_safety": "Confidence in Water Safety",
    "noticed_any_changes_in_health_after_using_current_drinking_water": (
        "Noticed Health Changes After Using Water"
    ),
    "willingness_to_pay_for_better_drinking_water_services": (
        "Willingness to Pay for Better Water Services"
    ),

    # --- Household composition ---
    "household_members_below_1year_old": "Household Members (<1 Year Old)",
    "household_members_below_1year_to_3years_old": "Household Members (1-3 Years Old)",
    "household_members_below_4years_to_8years_old": "Household Members (4-8 Years Old)",
    "household_members_below_9years_to_12years_old": "Household Members (9-12 Years Old)",
    "household_members_below_13years_and_above": "Household Members (13+ Years Old)",

    # --- Education and income ---
    "educational_qualification": "Educational Qualification",
    "primary_income_earner_qualification": "Primary Income Earner Qualification",
    "sec": "Socio-Economic Class (SEC)",
    "monthly_household_income": "Monthly Household Income",
}
