# 05 — Survey Design Checks

Run this cell first:
```python
%run 00_bootstrap.ipynb
```

<h2>🧮 Generate Weighted Demographic Summary Table</h2>

In [None]:
# NOTE: the original author marked the code in this cell as not working and
# observed that Python has no out-of-the-box support for a full survey design
# (strata + PSU).
# === Load data ===
data_path = "/Users/dengshuyue/Desktop/SDOH/analysis/data/"

# Read the analysis pickle and, in the same expression, discard every row that
# is missing any of the survey design variables used below.
df = (
    pd.read_pickle(os.path.join(data_path, "SODH_diet_mort.pkl"))
    .dropna(subset=['wt10', 'sdmvstra', 'sdmvpsu'])
)

# === Weighted proportions for categorical variables ===
def weighted_props(df, var, weight):
    """Summarise a categorical column by weighted total and weighted percent.

    Rows where either ``var`` or ``weight`` is missing are ignored.

    Args:
        df: DataFrame holding both columns.
        var: Name of the categorical column to group on.
        weight: Name of the weight column to sum.

    Returns:
        A ``(counts, percents)`` pair of Series indexed by the levels of
        ``var``: the summed weight per level rounded to an integer, and each
        level's share of the total weight as a percentage with one decimal.
    """
    complete = df.loc[:, [var, weight]].dropna()
    weight_by_level = complete.groupby(var)[weight].sum()
    fractions = weight_by_level.div(complete[weight].sum()).round(3)
    percents = fractions.mul(100).round(1)
    return weight_by_level.round(0).astype(int), percents

# === Use Taylor Linearization for SEs of means / proportions ===
# The previous version imported `linearmodels.survey` and called `sm.survey`,
# neither of which exists, so it could never run. The estimator is implemented
# directly below and needs nothing beyond the DataFrame that is passed in.

def survey_mean_se(df, var, weight='wt10', strata='sdmvstra', cluster='sdmvpsu'):
    """Return the weighted mean of ``var`` and its design-based standard error.

    The standard error uses first-order Taylor linearization of the ratio
    estimator ``sum(w * y) / sum(w)`` with the usual with-replacement
    ("ultimate cluster") variance formula:

        Var = sum_h  n_h / (n_h - 1) * sum_c (z_hc - zbar_h) ** 2

    where ``z_hc`` is the PSU total of the linearized values
    ``w_i * (y_i - mean) / sum(w)`` and ``n_h`` is the number of PSUs in
    stratum ``h``.

    Args:
        df: DataFrame containing the analysis, weight, stratum and PSU columns.
        var: Name of the column to average (a 0/1 column yields a proportion).
        weight: Name of the sampling-weight column.
        strata: Name of the stratum column.
        cluster: Name of the PSU column; PSUs are treated as nested in strata.

    Returns:
        ``(mean, se)`` as Python floats. Both are ``nan`` when no complete rows
        remain or the weights sum to zero.

    Warns:
        UserWarning: if a stratum contains a single PSU. Such a stratum cannot
            contribute a between-PSU variance and is counted as zero, which
            may understate the standard error.
    """
    import math
    import warnings

    # Remove rows missing the outcome or any design variable.
    d = df[[var, weight, strata, cluster]].dropna()
    total_weight = d[weight].sum()
    if d.empty or total_weight == 0:
        return float('nan'), float('nan')

    mean = (d[var] * d[weight]).sum() / total_weight

    # Linearized value of the ratio estimator for every respondent.
    scores = d[weight] * (d[var] - mean) / total_weight

    # PSU ids may repeat across strata, so a PSU is keyed by (stratum, PSU).
    psu_totals = scores.groupby([d[strata], d[cluster]]).sum()
    by_stratum = psu_totals.groupby(level=0)
    n_psu = by_stratum.size()

    # n_h * s_h^2 (ddof=1) equals n_h / (n_h - 1) * sum of squared deviations.
    contributions = n_psu * by_stratum.var(ddof=1)

    lonely = n_psu < 2
    if lonely.any():
        warnings.warn(
            f"{int(lonely.sum())} stratum/strata with a single PSU contribute "
            "zero variance to the standard error of "
            f"{var!r}.",
            UserWarning,
            stacklevel=2,
        )
        contributions = contributions.where(~lonely, 0.0)

    variance = contributions.sum()
    return float(mean), float(math.sqrt(variance))

# === Variable dictionaries ===
# cat_vars maps each categorical column to its {code: display label} lookup.
# Insertion order matters: it fixes the row order of the summary table.
cat_vars = {
    'SEX': {
        1: 'Male',
        2: 'Female',
    },
    'RACE': {
        1: 'Non-Hispanic White',
        2: 'Non-Hispanic Black',
        3: 'Hispanic',
        4: 'Other',
    },
    'EDU': {
        1: 'Less than high school',
        2: 'High school or equivalent',
        3: 'Some college',
        4: 'College or above',
    },
    'pir': {
        1: '<1.3',
        2: '1.3~2.99',
        3: '>=3',
    },
    'SNAP': {
        0: 'Not participant',
        1: 'Participant',
        2: 'Income eligible non-participant',
    },
    'SMK': {
        1: 'Nonsmokers',
        2: 'Former smokers',
        3: '<15 cigarettes/day',
        4: '15-24.9 cigarettes/day',
        5: '≥ 25 cigarettes/day',
    },
    'ALCG2': {
        1: 'Nondrinkers',
        2: 'Moderate drinker',
        3: 'Heavy drinker',
        4: 'Missing',
    },
    # NOTE(review): '18-24.9' overlaps 'BMI <18.5'; presumably it should read
    # '18.5-24.9' — confirm against how `bmic` was coded before changing.
    'bmic': {
        1: 'BMI <18.5',
        2: '18-24.9',
        3: '25-29.9',
        4: 'BMI ≥30',
    },
}

# Columns reported as a weighted percentage.
binary_vars = [
    'DIABETES',
    'CVD',
    'dm_rx',
    'chol_rx',
    'angina',
    'cancer',
    'lung_disease',
    'MORTSTAT',
]

# Columns reported as mean (SE).
cont_vars = [
    'RIDAGEYR',
    'met_hr',
    'bmi',
    'hba1c',
    'sbp',
    'dbp',
    'hdl',
    'ldl',
    'tg',
    'HEI2015_TOTAL_SCORE',
]

# === Generate summary ===
# One dict per output row, appended in this order: categorical levels,
# binary variables, continuous variables.
rows = []

# Categorical: weighted count and weighted percent for every mapped level
# that actually occurs in the data.
for var, mapping in cat_vars.items():
    counts, props = weighted_props(df, var, 'wt10')
    for code, label in mapping.items():
        if code not in counts.index:
            continue
        rows.append(dict(
            Variable=var,
            Category=label,
            Overall=f"{counts[code]:,.0f} ({props[code]}%)",
        ))

# Binary (weighted %): mean and SE are both scaled to percentages.
for var in binary_vars:
    val, se = survey_mean_se(df, var)
    rows.append(dict(
        Variable=var,
        Category="1",
        Overall=f"{val * 100:.1f}% ({se * 100:.1f})",
    ))

# Continuous (mean ± SE)
for var in cont_vars:
    mean, se = survey_mean_se(df, var)
    rows.append(dict(
        Variable=var,
        Category="",
        Overall=f"{mean:.2f} ({se:.2f})",
    ))

# === Create summary table ===
demo_table = pd.DataFrame(rows)

# === Save and preview ===
# The CSV is written next to the input data.
output_path = os.path.join(data_path, "demo_summary.csv")
demo_table.to_csv(output_path, index=False)

# Left as the final expression so the notebook renders the first 50 rows.
demo_table.head(50)


In [None]:

# === Load data ===
# Reload the analysis pickle and drop rows missing a survey design variable.
# `data_path` is expected to exist already from an earlier cell.
# NOTE(review): the previous comment here mentioned the pooled weight WTDRD1,
# but the filter uses 'wt10' — confirm which weight is intended.
df = (
    pd.read_pickle(os.path.join(data_path, "SODH_diet_mort.pkl"))
    .dropna(subset=['wt10', 'sdmvstra', 'sdmvpsu'])
)

# === Helper functions ===
def weighted_mean(x, w):
    """Return the weighted average of ``x`` using weights ``w``.

    Value/weight pairs in which either entry is missing are dropped before
    averaging.

    Args:
        x: Values to average.
        w: Weights paired with ``x``.

    Returns:
        ``sum(x * w) / sum(w)`` over the complete pairs.
    """
    pairs = pd.DataFrame({'x': x, 'w': w}).dropna()
    weighted_total = (pairs['x'] * pairs['w']).sum()
    return weighted_total / pairs['w'].sum()

# not standard 
# def weighted_se(x, w):
#    d = pd.DataFrame({'x': x, 'w': w}).dropna()
#    mean = weighted_mean(d.x, d.w)
#    return np.sqrt(np.sum(d.w * (d.x - mean)**2) / ((len(d) - 1) * np.sum(d.w) / len(d)))

# Only acceptable if weights are uniform
# def weighted_se(x, w):
#    d = pd.DataFrame({'x': x, 'w': w}).dropna()
#    mean = np.sum(d.x * d.w) / np.sum(d.w)
#    var = np.sum(d.w * (d.x - mean)**2) / np.sum(d.w)
#    se = np.sqrt(var / len(d))  # Approximate
#    return se

# 🔥 Check best practice for estimating SE with survey weights

def weighted_se(x, w):
    """Approximate the standard error of a weighted mean.

    The weighted variance of ``x`` is divided by the effective sample size
    ``sum(w) ** 2 / sum(w ** 2)``. Only the weights enter the calculation;
    strata and PSU are not used.

    Args:
        x: Values whose weighted mean is being estimated.
        w: Weights paired with ``x``.

    Returns:
        The approximate standard error, computed over pairs where neither the
        value nor the weight is missing.
    """
    frame = pd.DataFrame({'x': x, 'w': w}).dropna()
    values, weights = frame['x'], frame['w']

    total_w = weights.sum()
    centre = (values * weights).sum() / total_w
    effective_n = total_w ** 2 / (weights ** 2).sum()
    spread = (weights * (values - centre) ** 2).sum() / total_w
    return np.sqrt(spread / effective_n)

    
def weighted_props(df, var, weight):
    """Return weighted totals and weighted percentages per level of ``var``.

    Args:
        df: DataFrame holding both columns.
        var: Name of the categorical column to group on.
        weight: Name of the weight column to sum.

    Returns:
        ``(counts, percents)``: two Series indexed by the levels of ``var``.
        ``counts`` is the summed weight rounded to an integer; ``percents`` is
        the level's share of the total weight, in percent with one decimal.
        Rows missing either column are excluded from both.
    """
    subset = df[[var, weight]].dropna()
    level_weight = subset.groupby(var)[weight].sum()
    share = (level_weight / subset[weight].sum()).round(3)
    pct = (share * 100).round(1)
    rounded_counts = level_weight.round(0).astype(int)
    return rounded_counts, pct

# === Categorical variables ===
# Each entry maps a categorical column to its {code: display label} lookup.
# Insertion order determines the row order of the summary table.
cat_vars = {
    'SEX': {
        1: 'Male',
        2: 'Female',
    },
    'RACE': {
        1: 'Non-Hispanic White',
        2: 'Non-Hispanic Black',
        3: 'Hispanic',
        4: 'Other',
    },
    'EDU': {
        1: 'Less than high school',
        2: 'High school or equivalent',
        3: 'Some college',
        4: 'College or above',
    },
    'pir': {
        1: '<1.3',
        2: '1.3~2.99',
        3: '>=3',
    },
    'SNAP': {
        0: 'Not participant',
        1: 'Participant',
        2: 'Income eligible non-participant',
    },
    'SMK': {
        1: 'Nonsmokers',
        2: 'Former smokers',
        3: '<15 cigarettes/day',
        4: '15-24.9 cigarettes/day',
        5: '≥ 25 cigarettes/day',
    },
    'ALCG2': {
        1: 'Nondrinkers',
        2: 'Moderate drinker',
        3: 'Heavy drinker',
        4: 'Missing',
    },
    # NOTE(review): '18-24.9' overlaps 'BMI <18.5'; presumably it should read
    # '18.5-24.9' — confirm against how `bmic` was coded before changing.
    'bmic': {
        1: 'BMI <18.5',
        2: '18-24.9',
        3: '25-29.9',
        4: 'BMI ≥30',
    },
}

# One dict per output row, appended in this order: categorical levels,
# binary variables, continuous variables.
rows = []

# Categorical: weighted count and weighted percent for every mapped level
# that actually occurs in the data.
for var, mapping in cat_vars.items():
    counts, props = weighted_props(df, var, 'wt10')
    for code, label in mapping.items():
        if code not in counts.index:
            continue
        rows.append(dict(
            Variable=var,
            Category=label,
            Overall=f"{counts[code]:,.0f} ({props[code]}%)",
        ))

# === Binary variables ===
# NOTE(review): fillna(0) counts a missing value as 0 in the weighted mean,
# so missing respondents stay in the denominator — confirm this is intended.
binary_vars = [
    'DIABETES', 'CVD', 'dm_rx', 'chol_rx',
    'angina', 'cancer', 'lung_disease', 'MORTSTAT',
]
for var in binary_vars:
    val = weighted_mean(df[var].fillna(0), df['wt10']) * 100
    rows.append(dict(Variable=var, Category="1", Overall=f"{val:.1f}%"))

# === Continuous variables ===
# Reported as weighted mean with the weight-only SE approximation.
cont_vars = [
    'RIDAGEYR', 'met_hr', 'bmi', 'hba1c', 'sbp',
    'dbp', 'hdl', 'ldl', 'tg', 'HEI2015_TOTAL_SCORE',
]
for var in cont_vars:
    mean = weighted_mean(df[var], df['wt10'])
    se = weighted_se(df[var], df['wt10'])
    rows.append(dict(Variable=var, Category="", Overall=f"{mean:.2f} ({se:.2f})"))

# === Create summary table ===
demo_table = pd.DataFrame(rows)

# === Save and preview ===
# Build the destination in this cell instead of relying on an `output_path`
# left behind by an earlier cell: if that cell did not run to completion the
# name is undefined and the save fails with NameError. `data_path` is already
# required by this cell's load step, so no new dependency is introduced.
output_path = os.path.join(data_path, "demo_summary.csv")
demo_table.to_csv(output_path, index=False)

# Left as the final expression so the notebook renders the first 50 rows.
demo_table.head(50)

In [None]:
# Weighted share of respondents with MORTSTAT == 1.
# NOTE(review): this cell weights by 'WTDRD1' while the summary tables use
# 'wt10' — confirm which weight is intended.

# Keep only rows where both mortality status and the weight are present.
mort_df = df.loc[:, ['MORTSTAT', 'WTDRD1']].dropna()
is_deceased = mort_df['MORTSTAT'] == 1

# Denominator: all retained weight. Numerator: weight of the deceased rows.
total_weight = mort_df['WTDRD1'].sum()
dead_weight = mort_df.loc[is_deceased, 'WTDRD1'].sum()
weighted_pct_dead = (dead_weight / total_weight) * 100

print(f"Weighted percentage deceased: {weighted_pct_dead:.1f}%")


In [None]:
# Unweighted counts and percentages for one column
def unweighted_props(df, var):
    """Tabulate raw frequencies of ``df[var]`` and their percentage shares.

    Missing values are counted as their own level rather than dropped.

    Args:
        df: DataFrame containing the column.
        var: Name of the column to tabulate.

    Returns:
        ``(counts, percents)``: two Series sorted by level; ``percents`` is
        each level's share of all rows, rounded to one decimal.
    """
    tally = df[var].value_counts(dropna=False).sort_index()
    share = tally.div(tally.sum()).mul(100)
    return tally, share.round(1)

# Code -> display label; codes without an entry are printed as-is.
sex_labels = {1: 'Male', 2: 'Female'}

# Tabulate the 'sex' column.
# NOTE(review): other cells refer to this variable as 'SEX' (upper case) —
# confirm that a lower-case 'sex' column exists in df.
counts, props = unweighted_props(df, 'sex')

# One line per level: "<label>: <count> (<percent>%)".
for val in counts.index:
    label = sex_labels.get(val, val)
    print(f"{label}: {counts[val]} ({props[val]}%)")

# Scratch arithmetic kept from the original cell; as the last expression its
# value is what the notebook displays. Presumably the two printed counts
# added together — confirm or remove.
18530+19890
