# National Practical Significance Analysis – Chronic Conditions (CCHS)

This notebook evaluates **practical significance** for multiple chronic conditions using stratified 
prevalence estimates from the Canadian Community Health Survey (CCHS).  

For each condition and stratifier (Province, Age, Sex, and Income), the analysis:  
- Identifies groups with the highest and lowest prevalence.  
- Calculates the absolute difference (in percentage points) and the relative risk (highest ÷ lowest prevalence).  
- Reports confidence intervals and overall association strength (Cramér’s V).  
- Provides human-readable interpretations to support health equity and public health insights.

---

**Data Disclaimer:**  
This notebook uses **bootstrap-derived prevalence estimates** generated from CCHS data.  
Due to licensing restrictions, raw microdata is **not shared** in this repository.  
Users must supply their own CCHS bootstrap estimate file to replicate these results.


In [1]:
# ===========================================================
# Practical Significance Testing (Step 1)
# Condition: Cardiovascular Condition
# Stratifiers: Province, Age, Sex, Income
# Author: Arun Acharya
# ===========================================================

import pandas as pd
import numpy as np

# -----------------------------------------------------------
# 1. FILE PATH (updated to placeholder for GitHub)
# -----------------------------------------------------------
# Replace this with your local path to the national bootstrap estimates file.
bootstrap_file = "PATH_TO_YOUR_BOOTSTRAP_FILE.xlsx"

# -----------------------------------------------------------
# 2. DETECT SHEET NAMES DYNAMICALLY
# -----------------------------------------------------------
# Excel limits sheet names to 31 characters, so "<condition>_by_<stratifier>"
# names can arrive truncated (e.g. "..._by_Pro"). Each stratifier is therefore
# recognised by the shortest "_by_xx" marker instead of its full name.

# Condition under focus
conditions = ["Cardiovascular Condition"]

# Read the sheet names, then release the workbook handle straight away
# (a bare pd.ExcelFile(...) would otherwise stay open).
with pd.ExcelFile(bootstrap_file) as workbook:
    all_sheets = workbook.sheet_names

# One slot per stratifier; a slot stays None when no matching sheet is found
sheet_map = {cond: {"age": None, "sex": None, "income": None, "province": None} for cond in conditions}

# Detect sheets for the condition of interest
for sheet in all_sheets:
    for cond in conditions:
        if sheet.startswith(cond[:30]):  # handles Excel's 31-character limit
            lower = sheet.lower()
            if "_by_ag" in lower:  # also matches the untruncated "_by_age"
                sheet_map[cond]["age"] = sheet
            elif "_by_se" in lower:
                sheet_map[cond]["sex"] = sheet
            elif "_by_in" in lower:
                sheet_map[cond]["income"] = sheet
            elif "_by_pr" in lower:
                sheet_map[cond]["province"] = sheet

# Display mapping results for transparency
print("=== Auto-generated Sheet Mapping ===")
for cond, mapping in sheet_map.items():
    print(f"{cond}: {mapping}")

# -----------------------------------------------------------
# 3. READ SHEETS FOR CARDIOVASCULAR CONDITION
# -----------------------------------------------------------
# These sheets contain already-calculated prevalence and confidence intervals (from bootstrap).
# A stratifier whose sheet was not detected is still None in sheet_map, and
# pd.read_excel(sheet_name=None) would return a dict of *every* sheet rather
# than a DataFrame. Fail early with a clear message instead.
cardio_sheets = sheet_map["Cardiovascular Condition"]
missing_stratifiers = [key for key, sheet in cardio_sheets.items() if sheet is None]
if missing_stratifiers:
    raise ValueError(
        "No sheet detected for Cardiovascular Condition stratifier(s): "
        + ", ".join(missing_stratifiers)
    )

# Open the workbook once and reuse the handle for all four reads
with pd.ExcelFile(bootstrap_file) as workbook:
    df_province = pd.read_excel(workbook, sheet_name=cardio_sheets["province"])
    df_age      = pd.read_excel(workbook, sheet_name=cardio_sheets["age"])
    df_sex      = pd.read_excel(workbook, sheet_name=cardio_sheets["sex"])
    df_income   = pd.read_excel(workbook, sheet_name=cardio_sheets["income"])

# -----------------------------------------------------------
# 4. HELPER FUNCTION: Calculate Practical Significance
# -----------------------------------------------------------
def practical_significance(df, stratifier_name, use_cramers_v=True):
    """
    Summarise the gap between the highest- and lowest-prevalence groups.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per group, with the columns 'Group', 'Estimated Prevalence (%)',
        '95% CI Lower' and '95% CI Upper'.
    stratifier_name : str
        Label stored in the result and used in the interpretation sentence.
    use_cramers_v : bool, default True
        When True, the "Cramer's V" field holds a placeholder string;
        when False it is None.

    Returns
    -------
    dict
        Keys: 'Stratifier', 'Highest Group', 'Lowest Group',
        'Absolute Difference (%)' (a difference of two percentages, i.e.
        percentage points), 'Relative Risk' (highest / lowest prevalence,
        NaN when the lowest prevalence is 0), 'Highest CI', 'Lowest CI',
        "Cramer's V" and 'Interpretation'.

    Raises
    ------
    ValueError
        If no row has a non-missing prevalence estimate.
    """
    prevalence_col = 'Estimated Prevalence (%)'

    # 1. Rows without an estimate cannot be ranked; refuse to continue if none remain
    ranked = df.dropna(subset=[prevalence_col])
    if ranked.empty:
        raise ValueError(f"No prevalence estimates available for {stratifier_name}.")

    highest = ranked.loc[ranked[prevalence_col].idxmax()]
    lowest = ranked.loc[ranked[prevalence_col].idxmin()]

    # Plain floats keep the arithmetic and the stored tuples independent of
    # whichever scalar type pandas hands back for a row lookup.
    high_prev = float(highest[prevalence_col])
    low_prev = float(lowest[prevalence_col])

    # 2. Absolute difference (percentage points)
    absolute_difference = high_prev - low_prev

    # 3. Relative risk (ratio of percentages); undefined when the denominator is 0
    if low_prev == 0:
        relative_risk = float("nan")
        relative_risk_text = "the relative risk is undefined because the lowest prevalence is 0."
    else:
        relative_risk = high_prev / low_prev
        relative_risk_text = f"the relative risk is {relative_risk:.2f}."

    # 4. Confidence intervals
    highest_ci = (float(highest['95% CI Lower']), float(highest['95% CI Upper']))
    lowest_ci = (float(lowest['95% CI Lower']), float(lowest['95% CI Upper']))

    # 5. Generate interpretation text
    interpretation = (
        f"For {stratifier_name}, the highest prevalence group is '{highest['Group']}' "
        f"({high_prev:.2f}%, CI: {highest_ci[0]:.2f}-{highest_ci[1]:.2f}) "
        f"and the lowest is '{lowest['Group']}' "
        f"({low_prev:.2f}%, CI: {lowest_ci[0]:.2f}-{lowest_ci[1]:.2f}). "
        f"The absolute difference is {absolute_difference:.2f}% "
        f"and {relative_risk_text}"
    )

    # 6. Optional placeholder for Cramér's V (from Chi-square test)
    cramers_v = "To be linked from previous Chi-square results" if use_cramers_v else None

    # 7. Return dictionary of results
    return {
        "Stratifier": stratifier_name,
        "Highest Group": highest['Group'],
        "Lowest Group": lowest['Group'],
        "Absolute Difference (%)": round(absolute_difference, 2),
        "Relative Risk": round(relative_risk, 2),
        "Highest CI": highest_ci,
        "Lowest CI": lowest_ci,
        "Cramer's V": cramers_v,
        "Interpretation": interpretation
    }

# -----------------------------------------------------------
# 5. APPLY FUNCTION TO ALL STRATIFIERS
# -----------------------------------------------------------
# Each entry pairs a prevalence table with its output label and whether the
# Cramér's V placeholder should be emitted for that stratifier.
results = [
    practical_significance(table, label, use_cramers_v=wants_cramers_v)
    for table, label, wants_cramers_v in (
        (df_province, "Province", True),
        (df_age, "Age Group", True),
        (df_sex, "Sex (Male vs Female)", False),
        (df_income, "Income Group", True),
    )
]

# -----------------------------------------------------------
# 6. CONVERT RESULTS TO DATAFRAME
# -----------------------------------------------------------
# One row per stratifier, one column per key of the result dictionaries
results_df = pd.DataFrame(results)

# Last expression of the cell, so the notebook renders the summary table
results_df


=== Auto-generated Sheet Mapping ===
Cardiovascular Condition: {'age': 'Cardiovascular Condition_by_Age', 'sex': 'Cardiovascular Condition_by_Sex', 'income': 'Cardiovascular Condition_by_Inc', 'province': 'Cardiovascular Condition_by_Pro'}


Unnamed: 0,Stratifier,Highest Group,Lowest Group,Absolute Difference (%),Relative Risk,Highest CI,Lowest CI,Cramer's V,Interpretation
0,Province,Quebec,YUKON/NORTHWEST/NUNAVUT TERRITORIES,5.57,4.34,"(6.649317040185867, 7.790120154253399)","(0.9488372339747543, 2.473353281194409)",To be linked from previous Chi-square results,"For Province, the highest prevalence group is ..."
1,Age Group,65+,12–17,19.57,54.31,"(19.23615470176384, 20.60546629441292)","(0.1523719842965021, 0.6010705860034657)",To be linked from previous Chi-square results,"For Age Group, the highest prevalence group is..."
2,Sex (Male vs Female),Male,Female,2.68,1.62,"(6.660673755238558, 7.369990026969471)","(4.073246583339655, 4.596700684511899)",,"For Sex (Male vs Female), the highest prevalen..."
3,Income Group,"$20,000–39,999","$80,000 or more",6.9,2.74,"(10.07053384872905, 11.66024985402078)","(3.697974249580352, 4.215901179507243)",To be linked from previous Chi-square results,"For Income Group, the highest prevalence group..."


In [1]:
# ===========================================================
# National Practical Significance Analysis – Chronic Conditions (CCHS)
# Author: Arun Acharya
# ===========================================================

import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# -----------------------------------------------------------
# 1. FILE PATHS (use placeholder paths for GitHub sharing)
# -----------------------------------------------------------
# Replace these placeholders with your own local file paths when running.
bootstrap_file = "PATH_TO_YOUR_BOOTSTRAP_FILE.xlsx"
output_file = "National_Practical_Significance_Final.xlsx"

# -----------------------------------------------------------
# 2. CONDITIONS (list of chronic conditions)
# -----------------------------------------------------------
conditions = [
    "Sleep Apnea",
    "High Blood Pressure",
    "High Blood Cholesterol",
    "Diabetes",
    "Chronic Fatigue Syndrome",
    "Mood Disorder",
    "Anxiety Disorder",
    "Respiratory Condition",
    "Musculoskeletal Condition",
    "Cardiovascular Condition"
]

# -----------------------------------------------------------
# 3. SHEET DETECTION (handle Excel's 31-character name limit)
# -----------------------------------------------------------
# CCHS bootstrap sheets sometimes have truncated names due to Excel's 31-character limit
# (e.g. "..._by_Provi"), so each stratifier is recognised by a short "_by_xx" marker.
# We map detected sheets to each stratifier (Province, Age, Sex, Income).

# Read the sheet names, then release the workbook handle straight away
# (a bare pd.ExcelFile(...) would otherwise stay open).
with pd.ExcelFile(bootstrap_file) as workbook:
    all_sheets = workbook.sheet_names

# One slot per stratifier; a slot stays None when no matching sheet is found
sheet_map = {cond: {"age": None, "sex": None, "income": None, "province": None} for cond in conditions}

for sheet in all_sheets:
    for cond in conditions:
        if sheet.startswith(cond[:30]):  # safely match prefix
            lower = sheet.lower()
            if "_by_ag" in lower:  # also matches the untruncated "_by_age"
                sheet_map[cond]["age"] = sheet
            elif "_by_se" in lower:
                sheet_map[cond]["sex"] = sheet
            elif "_by_in" in lower:
                sheet_map[cond]["income"] = sheet
            elif "_by_pr" in lower:
                sheet_map[cond]["province"] = sheet

# Show mapping for transparency
print("=== Auto-generated Sheet Mapping ===")
for cond, mapping in sheet_map.items():
    print(f"{cond}: {mapping}")

# -----------------------------------------------------------
# 4. FUNCTION: CALCULATE CRAMÉR'S V (association strength)
# -----------------------------------------------------------
def calculate_cramers_v(df):
    """
    Calculates Cramér's V for the association between group membership and
    the prevalence outcome.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per group, with the columns 'Estimated Prevalence (%)' and
        'Weighted N'. The frame is only read; no columns are added to it.

    Returns
    -------
    float
        sqrt(chi2 / n) for the (groups x 2) table of weighted positive and
        negative counts, or 0.0 when there are fewer than two groups.
    """
    # With fewer than two groups there is nothing to associate; skip the test entirely
    if len(df) < 2:
        return 0.0

    # Convert prevalence to weighted counts using local Series, so the
    # caller's DataFrame is left untouched.
    weighted_total = df['Weighted N']
    weighted_positive = (df['Estimated Prevalence (%)'] / 100) * weighted_total
    weighted_negative = weighted_total - weighted_positive

    # Build the (groups x 2) contingency table and run Chi-square.
    # Yates' continuity correction (SciPy's default for 2x2 tables) is disabled
    # because Cramér's V is defined on the uncorrected Pearson statistic.
    contingency = np.column_stack(
        [weighted_positive.to_numpy(), weighted_negative.to_numpy()]
    )
    chi2, _, _, _ = chi2_contingency(contingency, correction=False)

    # The table always has two columns, so min(rows - 1, cols - 1) is 1 and
    # the usual denominator n * min(rows - 1, cols - 1) reduces to n.
    n = weighted_total.sum()
    return float(np.sqrt(chi2 / n))

# -----------------------------------------------------------
# 5. FUNCTION: PRACTICAL SIGNIFICANCE + INTERPRETATION
# -----------------------------------------------------------
def practical_significance(df, condition_name, stratifier_name):
    """
    Summarise the gap between the highest- and lowest-prevalence groups of one
    condition/stratifier table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per group, with the columns 'Group', 'Estimated Prevalence (%)',
        '95% CI Lower' and '95% CI Upper'. 'Weighted N' is also needed whenever
        Cramér's V is computed (see below).
    condition_name : str
        Condition label stored in the result.
    stratifier_name : str
        Stratifier label stored in the result and used in the interpretation.
        Cramér's V is computed via calculate_cramers_v(df) for every label
        except exactly "Sex (Male vs Female)".

    Returns
    -------
    dict
        Keys: 'Condition', 'Stratifier', 'Highest Group',
        'Highest Prevalence (%)', 'Highest CI', 'Lowest Group',
        'Lowest Prevalence (%)', 'Lowest CI', 'Absolute Difference (%)'
        (a difference of two percentages, i.e. percentage points),
        'Relative Risk' (highest / lowest prevalence, NaN when the lowest
        prevalence is 0), "Cramer's V" (None when not computed) and
        'Interpretation'.

    Raises
    ------
    ValueError
        If no row has a non-missing prevalence estimate.
    """
    prevalence_col = 'Estimated Prevalence (%)'

    # Rows without an estimate cannot be ranked; refuse to continue if none remain
    ranked = df.dropna(subset=[prevalence_col])
    if ranked.empty:
        raise ValueError(
            f"No prevalence estimates available for {condition_name} by {stratifier_name}."
        )

    # Identify highest and lowest groups
    highest = ranked.loc[ranked[prevalence_col].idxmax()]
    lowest = ranked.loc[ranked[prevalence_col].idxmin()]

    # Plain floats keep the arithmetic and the stored tuples independent of
    # whichever scalar type pandas hands back for a row lookup.
    high_prev = float(highest[prevalence_col])
    low_prev = float(lowest[prevalence_col])

    # Calculate differences and ratios; the ratio is undefined when the denominator is 0
    absolute_difference = high_prev - low_prev
    if low_prev == 0:
        relative_risk = float("nan")
        relative_risk_text = "the relative risk is undefined because the lowest prevalence is 0."
    else:
        relative_risk = high_prev / low_prev
        relative_risk_text = f"the relative risk is {relative_risk:.2f}."

    highest_ci = (float(highest['95% CI Lower']), float(highest['95% CI Upper']))
    lowest_ci = (float(lowest['95% CI Lower']), float(lowest['95% CI Upper']))

    # Calculate Cramér's V for multi-category stratifiers
    cramers_v = None
    if stratifier_name != "Sex (Male vs Female)":  # Sex already has two fixed categories
        cramers_v = calculate_cramers_v(df)

    # Human-readable interpretation
    interpretation = (
        f"For {stratifier_name}, the highest prevalence group is '{highest['Group']}' "
        f"({high_prev:.2f}%, CI: {highest_ci[0]:.2f}-{highest_ci[1]:.2f}) "
        f"and the lowest is '{lowest['Group']}' "
        f"({low_prev:.2f}%, CI: {lowest_ci[0]:.2f}-{lowest_ci[1]:.2f}). "
        f"The absolute difference is {absolute_difference:.2f}% and {relative_risk_text}"
    )
    if cramers_v is not None:
        interpretation += f" Overall association strength (Cramér's V) is {cramers_v:.3f}."

    # Return results as dictionary
    return {
        "Condition": condition_name,
        "Stratifier": stratifier_name,
        "Highest Group": highest['Group'],
        "Highest Prevalence (%)": round(high_prev, 2),
        "Highest CI": highest_ci,
        "Lowest Group": lowest['Group'],
        "Lowest Prevalence (%)": round(low_prev, 2),
        "Lowest CI": lowest_ci,
        "Absolute Difference (%)": round(absolute_difference, 2),
        "Relative Risk": round(relative_risk, 2),
        "Cramer's V": round(cramers_v, 3) if cramers_v is not None else None,
        "Interpretation": interpretation
    }

# -----------------------------------------------------------
# 6. RUN ANALYSIS FOR ALL CONDITIONS AND STRATIFIERS
# -----------------------------------------------------------
# Loop through all conditions and stratifiers, apply the function, and store results
# (key in sheet_map, label passed to practical_significance), in reporting order
stratifier_labels = [
    ("province", "Province"),
    ("age", "Age Group"),
    ("sex", "Sex (Male vs Female)"),
    ("income", "Income Group"),
]

all_results = []
# Open the workbook once and reuse the handle, instead of re-opening the file
# for every condition/stratifier sheet.
with pd.ExcelFile(bootstrap_file) as workbook:
    for cond in conditions:
        mapping = sheet_map[cond]
        for key, label in stratifier_labels:
            sheet_name = mapping[key]
            if not sheet_name:
                # No sheet was detected for this combination; say so rather
                # than dropping it from the results without a trace.
                print(f"Skipping {cond} / {label}: no matching sheet found")
                continue
            stratum_df = pd.read_excel(workbook, sheet_name=sheet_name)
            all_results.append(practical_significance(stratum_df, cond, label))

# -----------------------------------------------------------
# 7. SAVE RESULTS AND DISPLAY
# -----------------------------------------------------------
# One row per condition/stratifier pair that had a sheet
results_df = pd.DataFrame(all_results)
results_df.to_excel(output_file, index=False)

print("\n=== NATIONAL PRACTICAL SIGNIFICANCE RESULTS (with Prevalence Columns) ===")
print(results_df)
print(f"\nResults saved to: {output_file}")

# -----------------------------------------------------------


=== Auto-generated Sheet Mapping ===
Sleep Apnea: {'age': 'Sleep Apnea_by_Age Group', 'sex': 'Sleep Apnea_by_Sex', 'income': 'Sleep Apnea_by_Income Group', 'province': 'Sleep Apnea_by_Province'}
High Blood Pressure: {'age': 'High Blood Pressure_by_Age Grou', 'sex': 'High Blood Pressure_by_Sex', 'income': 'High Blood Pressure_by_Income G', 'province': 'High Blood Pressure_by_Province'}
High Blood Cholesterol: {'age': 'High Blood Cholesterol_by_Age G', 'sex': 'High Blood Cholesterol_by_Sex', 'income': 'High Blood Cholesterol_by_Incom', 'province': 'High Blood Cholesterol_by_Provi'}
Diabetes: {'age': 'Diabetes_by_Age Group', 'sex': 'Diabetes_by_Sex', 'income': 'Diabetes_by_Income Group', 'province': 'Diabetes_by_Province'}
Chronic Fatigue Syndrome: {'age': 'Chronic Fatigue Syndrome_by_Age', 'sex': 'Chronic Fatigue Syndrome_by_Sex', 'income': 'Chronic Fatigue Syndrome_by_Inc', 'province': 'Chronic Fatigue Syndrome_by_Pro'}
Mood Disorder: {'age': 'Mood Disorder_by_Age Group', 'sex': 'Mood D