# 02 — Build Covariates

Run this cell first:
```python
%run 00_bootstrap.ipynb
```

In [None]:
# Step 1: Dietary scores
# Load the score workbook and swap the dotted column names for underscore
# names. `scores` keeps the lowercase "i_optup" spelling; only the output
# table `scores2` uses "i_Optup".
scores_path = os.path.join(folder_path, "i.scores.xlsx")
scores = pd.read_excel(scores_path, engine="openpyxl").rename(
    columns={
        "seqn": "SEQN",
        "i.FCS": "i_FCS",
        "i.optup": "i_optup",  # capitalised only in scores2 below
        "i.HSR": "i_HSR",
        "i.nutri": "i_nutri",
    }
)

# Output table: SEQN plus the four score columns, ordered by SEQN.
score_columns = ["SEQN", "i_FCS", "i_optup", "i_HSR", "i_nutri"]
scores2 = (
    scores[score_columns]
    .rename(columns={"i_optup": "i_Optup"})
    .sort_values("SEQN")
)


# Step 2: Covariates from the Lu paper
covar_path = os.path.join(folder_path, "covar.sas7bdat")
covar = pd.read_sas(covar_path, format="sas7bdat")
# Upper-case every column name so the lookups below match SAS-style names.
covar = covar.rename(columns=lambda column_name: column_name.upper())
# Variables to carry forward; any that the file does not contain are
# skipped silently rather than raising.
covar_vars = ["SEQN", "RIDAGEYR", "SEX", "RACE", "EDU", "INDFMPIR", "SMK_AVG", "SMK_PAST",
              "SMK", "ALCG2", "HEI2015_TOTAL_SCORE", "DIABE"]
covar_present = [name for name in covar_vars if name in covar.columns]
covar = covar[covar_present].sort_values("SEQN")

# Step 3: Covariates from the Meghan paper
# The raw CSV is kept in `covariates1_raw`; `covariates1` is the trimmed copy.
covariates_path = os.path.join(folder_path, "covariates.csv")
covariates1_raw = pd.read_csv(covariates_path)
covariates_vars = ["SEQN", "sdmvpsu", "sdmvstra", "met_hr", "perE_alco", "dm_self",
                   "tchol", "hdl", "ldl", "tg", "bmi", "CVD", "dm_rx", "chol_rx",
                   "angina_rx", "lung_disease", "angina", "hba1c", "sbp", "dbp", "cancer"]
# Only the ID column is renamed; every other name is used exactly as it
# appears in the CSV, and requested names that are absent are skipped silently.
covariates1 = covariates1_raw.rename(columns={"seqn": "SEQN"})
covariates_present = [name for name in covariates_vars if name in covariates1.columns]
covariates1 = covariates1[covariates_present].sort_values("SEQN")

# Step 4: Read dietary weight data (keep only rows with DAYS == 1)
dietwt = pd.read_sas(os.path.join(folder_path, "gg.sas7bdat"), format="sas7bdat")
# Upper-case the column names, as is done for `covar`, so the checks below do
# not depend on the case used when the SAS file was written.
dietwt = dietwt.rename(columns=str.upper)

# Fail fast with a readable message. Previously a warning was printed and the
# selection below then failed anyway with a less informative KeyError.
required_cols = ["SEQN", "DAYS", "WTDRD1", "WTDR2D", "DR12DRST"]
missing = [col for col in required_cols if col not in dietwt.columns]
if missing:
    raise KeyError(f"Missing columns from gg.sas7bdat: {missing}")

# Filter rows and select the ID / weight / status columns in one .loc call.
dietwt = dietwt.loc[dietwt["DAYS"] == 1, ["SEQN", "WTDRD1", "WTDR2D", "DR12DRST"]].copy()
dietwt = dietwt.sort_values("SEQN")


# Step 5: Mortality data, ordered by SEQN
# NOTE(review): no `encoding` is passed to read_sas, so pandas leaves any SAS
# character columns as raw bytes — confirm that later string comparisons on
# this table account for that.
mort_path = os.path.join(folder_path, "mortality9918.sas7bdat")
mort = pd.read_sas(mort_path, format="sas7bdat").sort_values("SEQN")

def summarize_df(name, df):
    """Print a short size report for one table.

    Parameters
    ----------
    name : str
        Label printed as the report heading.
    df : pandas.DataFrame
        Table to describe; it must have a ``SEQN`` column.

    Prints the number of rows, the number of distinct ``SEQN`` values
    (missing values are not counted by ``nunique``), and a 40-dash rule.
    Returns ``None``.
    """
    print("{}:".format(name))
    print("  Rows: {}".format(len(df)))
    # A unique count below the row count means SEQN is repeated in this table.
    print("  Unique SEQN: {}".format(df["SEQN"].nunique()))
    print(40 * "-")

# Print the row / unique-SEQN report for each table loaded above, in load order.
for table_label, table in (
    ("scores2", scores2),
    ("covar", covar),
    ("covariates1", covariates1),
    ("dietwt", dietwt),
    ("mort", mort),
):
    summarize_df(table_label, table)

In [None]:
# TODO (work in progress): resume here.
# Extend covariates.csv to cover 1999–2018; it currently covers only 2003–2018.



In [None]:
# Step 2: Variable transformations
# Each derived column is a source column divided by a fixed constant. The
# i_nutri score is negated first, so a higher i_nutri_sd means a lower i_nutri.
# NOTE(review): the divisors look like pooled-cycle counts (weights) and
# per-score standard deviations ("_sd") — confirm where they come from.
rescale_specs = [
    # (new column, source column, divisor, negate the source first?)
    ("wt10", "WTDRD1", 10, False),
    ("wt", "WTDR2D", 8, False),
    ("i_FCS_sd", "i_FCS", 10.89, False),
    ("i_Optup_sd", "i_Optup", 8.17, False),
    ("i_nutri_sd", "i_nutri", 3.17, True),
    ("i_HSR_sd", "i_HSR", 1.01, False),
    ("hei2015_sd", "HEI2015_TOTAL_SCORE", 13, False),
]
for new_col, source_col, divisor, negate in rescale_specs:
    source_values = score_mort[source_col]
    if negate:
        source_values = -source_values
    score_mort[new_col] = source_values / divisor

# Step 3: Recode death indicators
def _ucod_to_code(value):
    """Return a UCOD_LEADING value as a zero-padded 3-character string.

    Accepts str, bytes (what pandas.read_sas yields for character columns
    when no encoding is given) and numbers, so that "001", b"001", 1 and 1.0
    all become "001". Returns None for missing or blank values.
    """
    if isinstance(value, bytes):
        value = value.decode("latin1")
    if isinstance(value, str):
        text = value.strip()
        if not text:
            return None
        return text.zfill(3) if text.isdigit() else text
    if pd.isna(value):
        return None
    try:
        return str(int(value)).zfill(3)
    except (TypeError, ValueError):
        # Unexpected type: fall back to its text form so the comparison below
        # simply does not match instead of raising.
        return str(value)


# Normalise once, then compare. A direct `== "001"` on the raw column is always
# False when the codes are stored as bytes or as numbers, which would silently
# set every cause indicator to 0.
ucod_code = score_mort["UCOD_LEADING"].map(_ucod_to_code)
for cause, code in {
    "death_heart": "001", "death_cancer": "002", "death_resp": "003", "Death_inj": "004",
    "death_cerev": "005", "Death_alz": "006", "death_diabe": "007",
    "Death_infl": "008", "Death_kid": "009", "death_other1": "010"
}.items():
    score_mort[cause] = (ucod_code == code).astype(int)

# Step 4: Composite categories
# Each composite is 1 when any of its component indicators is 1, otherwise 0.
composite_causes = {
    "Death_other": ["death_resp", "Death_inj", "Death_alz", "Death_infl"],
    "Death_oth2": ["death_resp", "Death_inj", "Death_alz", "Death_infl", "death_other1"],
    "death_cvd": ["death_heart", "death_cerev"],
    "death_cmd": ["death_heart", "death_cerev", "death_diabe"],
    "death_cmdk": ["death_heart", "death_cerev", "death_diabe", "Death_kid"],
}
for composite, parts in composite_causes.items():
    score_mort[composite] = score_mort[parts].sum(axis=1).clip(upper=1)

# death_cmdkh starts as death_cmdk and is also set to 1 where DIABETES == 1 or
# HYPERTEN == 1 (presumably the multiple-cause flags from the mortality file —
# confirm against the data dictionary).
score_mort["death_cmdkh"] = score_mort["death_cmdk"]
score_mort.loc[score_mort["DIABETES"] == 1, "death_cmdkh"] = 1
score_mort.loc[score_mort["HYPERTEN"] == 1, "death_cmdkh"] = 1
# Defensive only: sum(axis=1) skips NaN, so death_cmd has no missing values here.
score_mort["death_cmd"] = score_mort["death_cmd"].fillna(0)
# A death counted as cardiometabolic must not also count as "other".
score_mort.loc[score_mort["death_cmd"] == 1, ["Death_other", "Death_oth2"]] = 0

# Step 5: Multiple cause mortality
# Starts from MORTSTAT and is then overwritten with 1 / 2 / 3 for
# death_cmd / death_cancer / Death_oth2; later assignments win.
score_mort["death_multi"] = score_mort["MORTSTAT"]
score_mort.loc[score_mort["death_cmd"] == 1, "death_multi"] = 1
score_mort.loc[score_mort["death_cancer"] == 1, "death_multi"] = 2
score_mort.loc[score_mort["Death_oth2"] == 1, "death_multi"] = 3

# Step 6: Age & time vars
# py is PERMTH_EXM converted from months to years; ageend is the starting age
# plus that many years.
baseline_age = score_mort["RIDAGEYR"]
followup_years = score_mort["PERMTH_EXM"].div(12)
score_mort["agesq"] = baseline_age.pow(2)
score_mort["py"] = followup_years
score_mort["agestart"] = baseline_age
score_mort["ageend"] = baseline_age + followup_years

# Step 7: Poverty
# pir: 1 = INDFMPIR < 1.3, 2 = 1.3 to < 3, 3 = >= 3, 5 = INDFMPIR missing
# (5 is the default and no rule below matches a missing value).
income_ratio = score_mort["INDFMPIR"]
score_mort["pir"] = 5
pir_rules = (
    (1, income_ratio.notna() & (income_ratio < 1.3)),
    (2, income_ratio >= 1.3),
    (3, income_ratio >= 3),  # applied last, so it overrides code 2
)
for pir_code, matches in pir_rules:
    score_mort.loc[matches, "pir"] = pir_code

# Step 8: Recode SNAP
# Rows with 0 <= INDFMPIR <= 1.3 whose SNAP value is not 1 are set to SNAP = 2.
# NOTE(review): between() includes both ends, so INDFMPIR == 1.3 is recoded
# here even though Step 7 puts it in pir category 2 — confirm this is intended.
# NOTE(review): a missing SNAP value also satisfies "not equal to 1" and is
# therefore recoded to 2 — confirm.
low_income = score_mort["INDFMPIR"].between(0, 1.3)
snap_not_one = score_mort["SNAP"].ne(1)
score_mort.loc[low_income & snap_not_one, "SNAP"] = 2

# Step 9: BMI categories
# bmic: 0 = bmi in (0, 18.5), 1 = [18.5, 25), 2 = [25, 30), 3 = >= 30.
# Rows with bmi missing or <= 0 keep the initial pd.NA.
bmi_values = score_mort["bmi"]
score_mort["bmic"] = pd.NA
bmi_rules = (
    (0, (bmi_values > 0) & (bmi_values < 18.5)),
    (1, (bmi_values >= 18.5) & (bmi_values < 25)),
    (2, bmi_values >= 25),
    (3, bmi_values >= 30),  # applied last, so it overrides code 2
)
for bmic_code, in_band in bmi_rules:
    score_mort.loc[in_band, "bmic"] = bmic_code

In [None]:
# Load the reference SAS program into `sas_code` and print its beginning.
# NOTE(review): this is an absolute, user-specific path; the cell fails on any
# other machine — consider deriving it from a shared project directory.
sas_ref_path = '/Users/dengshuyue/Desktop/SDOH/analysis/code/Ref/2.1_Prepare data_covariates.sas'
preview_chars = 20000  # number of leading characters to print

with open(sas_ref_path, 'r', encoding='latin1') as sas_file:
    sas_code = sas_file.read()

print(sas_code[:preview_chars])

In [None]:
# Diabetes Identification Note:
# The final diabetes indicator variable used in the analysis is 'diabe2'.
# This composite variable identifies diabetes based on the following criteria:
# - Self-reported physician diagnosis (DIQ010), current insulin use (DIQ050), or oral medication use (DIQ070)
# - Prescription drug data indicating diabetes treatment (dm_rx2)
# - Fasting glucose ≥ 126 mg/dL (glu_dm)
# - OGTT ≥ 200 mg/dL (ogtt_dm)
# - HbA1c ≥ 6.5% (hb_dm)
# Any one of these criteria being met will set 'diabe2' = 1, otherwise 0.
