# 03 — Link Mortality (Follow-up)


# Link CORE (1999–2023) to Mortality (through 2018)
- Reads final CORE from 02
- Reads NCHS Linked Mortality (SAS 1999–2018)
- Writes:
 - output/cov_mort_1999_2018.parquet
 - output/cov_core_mort_1999_2023.parquet  (CORE + mortality columns)


## Setup & paths

In [1]:
from __future__ import annotations
from pathlib import Path
import pandas as pd, numpy as np

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)

# Project roots
BASE = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
OUT  = BASE / "output"
OUT.mkdir(parents=True, exist_ok=True)

# Inputs
CORE_PATH = OUT / "cov_core_1999_2023.parquet"  # produced by 02
MORT_PATH = Path("/Users/dengshuyue/Desktop/SDOH/analysis/data/mortality9918.sas7bdat")

assert CORE_PATH.exists(), f"Missing CORE file: {CORE_PATH}"
assert MORT_PATH.exists(), f"Missing mortality SAS file: {MORT_PATH}"


## Small helpers

In [3]:
def upper_df(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()
    d.columns = [c.upper() for c in d.columns]
    return d

def load_core(p: Path) -> pd.DataFrame:
    core = upper_df(pd.read_parquet(p))
    core["SEQN"] = pd.to_numeric(core["SEQN"], errors="coerce").astype("Int64")
    return core


## Load & standardize mortality (SAS 1999–2018)

In [4]:
def load_mortality_sas(p: Path) -> pd.DataFrame:
    """Tolerant reader for public-use mortality (1999–2018)."""
    mort = pd.read_sas(p, format="sas7bdat")
    mort = upper_df(mort)
    if "SEQN" not in mort.columns:
        raise KeyError("Mortality file lacks SEQN.")

    # Canonical columns and their likely variants
    candidates = {
        "ELIGSTAT": ["ELIGSTAT"],
        "MORTSTAT": ["MORTSTAT"],
        "PERMTH_EXM": ["PERMTH_EXM"],
        "PERMTH_INT": ["PERMTH_INT"],
        "UCOD_LEADING": ["UCOD_LEADING", "UCODLEADING", "UCOD_LEAD"],
        "UCOD_113": ["UCOD_113","UCOD113"],
        "DOD_YR": ["DOD_YR","DODYR","DOD_YY"],
        "DOD_QTR": ["DODQTR","DOD_QTR"],
    }

    def pick(opts):
        for c in opts:
            if c in mort.columns:
                return c
        return None

    keep = {"SEQN": "SEQN"}
    for k, opts in candidates.items():
        got = pick(opts)
        if got is not None:
            keep[k] = got

    m = mort[list(keep.values())].copy()
    m.columns = list(keep.keys())

    # Types
    m["SEQN"] = pd.to_numeric(m["SEQN"], errors="coerce").astype("Int64")
    for col in ["ELIGSTAT","MORTSTAT"]:
        if col in m.columns:
            m[col] = pd.to_numeric(m[col], errors="coerce").astype("Int8")
    for col in ["PERMTH_EXM","PERMTH_INT"]:
        if col in m.columns:
            m[col] = pd.to_numeric(m[col], errors="coerce")

    if "UCOD_LEADING" in m.columns:
        m["UCOD_LEADING"] = m["UCOD_LEADING"].astype("string")
    elif "UCOD_113" in m.columns:
        m["UCOD_LEADING"] = (
            pd.to_numeric(m["UCOD_113"], errors="coerce").astype("Int64").astype("string")
        )

    if "DOD_YR" in m.columns:
        m["DOD_YR"] = pd.to_numeric(m["DOD_YR"], errors="coerce").astype("Int16")
    if "DOD_QTR" in m.columns:
        m["DOD_QTR"] = pd.to_numeric(m["DOD_QTR"], errors="coerce").astype("Int8")

    m = m.drop_duplicates("SEQN")

    # Save a clean mortality covariate artifact
    mort_out = OUT / "cov_mort_1999_2018.parquet"
    cols_out = ["SEQN","ELIGSTAT","MORTSTAT","PERMTH_EXM","PERMTH_INT","UCOD_LEADING","DOD_YR","DOD_QTR"]
    m[[c for c in cols_out if c in m.columns]].to_parquet(mort_out, index=False)
    print(f"✓ MORT → {mort_out}  (rows={len(m):,})")
    return m


## Merge mortality into CORE

In [21]:
core = load_core(CORE_PATH)
mort = load_mortality_sas(MORT_PATH)

merge_cols = ["ELIGSTAT","MORTSTAT","PERMTH_EXM","PERMTH_INT","UCOD_LEADING","DOD_YR","DOD_QTR"]
merge_cols = [c for c in merge_cols if c in mort.columns]

core_mort = core.merge(mort[["SEQN"] + merge_cols], on="SEQN", how="left")

# Optional: force NAs for post-2018 cycles (SDDSRVYR > 10)
if "SDDSRVYR" in core_mort.columns and merge_cols:
    post_mask = pd.to_numeric(core_mort["SDDSRVYR"], errors="coerce") > 10
    for c in merge_cols:
        core_mort.loc[post_mask, c] = np.nan


✓ MORT → /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_mort_1999_2018.parquet  (rows=59,064)


## Write merged outputs

In [22]:
core_mort_path = OUT / "cov_core_mort_1999_2023.parquet"
core_mort.to_parquet(core_mort_path, index=False)
print(f"✓ CORE+MORT → {core_mort_path}  (rows={len(core_mort):,})")


✓ CORE+MORT → /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_core_mort_1999_2023.parquet  (rows=128,809)


## Quick QC: missingness & counts

In [13]:
mm = [c for c in ["ELIGSTAT","MORTSTAT","PERMTH_EXM","PERMTH_INT","UCOD_LEADING"] if c in core_mort.columns]
print("Overall missingness:")
print(core_mort[mm].isna().mean().round(3))

if "SDDSRVYR" in core_mort.columns and "MORTSTAT" in core_mort.columns:
    pre = core_mort[pd.to_numeric(core_mort["SDDSRVYR"], errors="coerce") <= 10]
    print("\n≤2018 cycles (SDDSRVYR ≤ 10):")
    print(pre[mm].isna().mean().round(3))
    print("\nMORTSTAT value counts (≤2018):")
    print(pre["MORTSTAT"].value_counts(dropna=False))


Overall missingness:
ELIGSTAT        0.541
MORTSTAT        0.541
PERMTH_EXM      0.563
PERMTH_INT      0.541
UCOD_LEADING    0.928
dtype: float64

≤2018 cycles (SDDSRVYR ≤ 10):
ELIGSTAT        0.417
MORTSTAT        0.417
PERMTH_EXM      0.445
PERMTH_INT      0.417
UCOD_LEADING    0.909
dtype: float64

MORTSTAT value counts (≤2018):
MORTSTAT
0       49815
<NA>    42252
1        9249
Name: count, dtype: Int64


## quick check why na 

In [16]:
# 1) How many rows are post-2018?
post2018 = core_mort["SDDSRVYR"].ge(11) | core_mort["SDDSRVYR"].eq(66)  # 11~2019-20 doesn't exist; 66=2021–23
print("Post-2018 rows:", int(post2018.sum()))

# 2) Among ≤2018, how many are <18?
pre2018 = core_mort.loc[~post2018]
under18 = pre2018["AGE_YR"] < 18
print("≤2018 & <18 yrs:", int(under18.sum()))

# 3) Mortality missing reasons (rough triage labels)
reason = pd.Series("other/unknown", index=core_mort.index, dtype="string")
reason[post2018] = "no LMF beyond 2018"
reason.loc[~post2018 & (core_mort["AGE_YR"] < 18)] = "under 18 at baseline"
reason.loc[~post2018 & (core_mort["AGE_YR"] >= 18) & core_mort["MORTSTAT"].isna()] = "adult but not on PUF"
print(reason.value_counts())

# 4) Confirm that UCOD_LEADING shows mostly for deaths
tmp = pre2018.copy()
print("UCOD among decedents (%) =",
      (tmp.loc[tmp["MORTSTAT"]==1, "UCOD_LEADING"].notna().mean()*100).round(1))


Post-2018 rows: 27493
≤2018 & <18 yrs: 42112
other/unknown           59064
under 18 at baseline    42112
no LMF beyond 2018      27493
adult but not on PUF      140
Name: count, dtype: Int64
UCOD among decedents (%) = 100.0


## save flag and filtered merged file 

In [23]:
import numpy as np
import pandas as pd

cm = core_mort.copy()

# Coverage flags
cm["IS_POST2018"] = (pd.to_numeric(cm["SDDSRVYR"], errors="coerce") > 10) | (cm["SDDSRVYR"] == 66)
cm["IS_ADULT"]    = pd.to_numeric(cm.get("AGE_YR"), errors="coerce").ge(18)

# "Covered by public-use mortality" = adult AND not post-2018
cm["MORTALITY_COVERED"] = cm["IS_ADULT"] & (~cm["IS_POST2018"])

# Event indicator & survival time (exam-based)
cm["EVENT"]   = (cm["MORTSTAT"] == 1).astype("Int8")
cm["CENSORED"] = (cm["MORTSTAT"] == 0).astype("Int8")
cm["FU_YRS_EXM"] = pd.to_numeric(cm["PERMTH_EXM"], errors="coerce") / 12.0
cm["FU_YRS_INT"] = pd.to_numeric(cm["PERMTH_INT"], errors="coerce") / 12.0

# Optional: human-readable cause labels for quick tables
ucod_map = {
    "001":"Heart disease","002":"Cancer","003":"Chronic lower resp",
    "004":"Unintentional injuries","005":"Stroke","006":"Alzheimer disease",
    "007":"Diabetes","008":"Influenza & pneumonia","009":"Kidney disease","010":"Suicide"
}
cm["UCOD_LABEL"] = cm.get("UCOD_LEADING").astype("string").map(ucod_map)

# Save a convenient analysis subset: adults ≤2018 with known MORTSTAT
analysis = cm.loc[cm["MORTALITY_COVERED"] & cm["MORTSTAT"].notna()].copy()

from pathlib import Path
OUT = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")
cm.to_parquet(OUT / "cov_core_mort_1999_2023_flags.parquet", index=False)
analysis.to_parquet(OUT / "cov_core_mort_covered_adults.parquet", index=False)

print("Saved:",
      OUT / "cov_core_mort_1999_2023_flags.parquet",
      OUT / "cov_core_mort_covered_adults.parquet")
print("Covered adults:", len(analysis), 
      "Deaths:", int(analysis["EVENT"].sum()),
      "Crude death %:", round(100*analysis["EVENT"].mean(), 1))


Saved: /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_core_mort_1999_2023_flags.parquet /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_core_mort_covered_adults.parquet
Covered adults: 59064 Deaths: 9249 Crude death %: 15.7


## peak result

In [24]:
display(core_mort.head(20))

Unnamed: 0,SEQN,SDDSRVYR,SDMVPSU,SDMVSTRA,WTMEC2YR,AGE_YR,RIAGENDR,SEX,FEMALE,SMK_STATUS,CIGS_PER_DAY,PACK_YEARS,FORMER_SMOKER,DRINKS_PER_DAY,ALCOHOL_CAT,LTPA,METSCORE,IMP,BMXWT,BMXHT,BMI,BMI_CLAS,DIABETES,HTN,HIGH_CHOL,CVD,CANCER,SBP,DBP,TCHOL,HDL,LDL,TG,DMDHHSIZ,ELIGSTAT,MORTSTAT,PERMTH_EXM,PERMTH_INT,UCOD_LEADING
0,1,1.0,1.0,5.0,10982.898896,2.0,2,F,1,,,,,,,,,,12.5,91.6,14.897695,UNDER,0,0,0,0,0,91.333333,56.0,131.0,59.0,54.0,99.0,3.0,,,,,
1,2,1.0,3.0,1.0,28325.384898,77.0,1,M,0,NEVER,,,0.0,0.789041,MODERATE,0.0,60.0,1.0,75.4,174.0,24.904215,NORMAL,0,0,0,0,1,100.666667,56.666667,215.0,54.0,136.0,128.0,1.0,1.0,1.0,177.0,177.0,6.0
2,3,1.0,2.0,7.0,46192.256945,10.0,2,F,1,,,,,,,,,,32.9,136.6,17.631713,UNDER,0,0,0,0,0,108.666667,62.0,129.0,30.0,58.0,202.0,4.0,,,,,
3,4,1.0,1.0,2.0,10251.26002,1.0,1,M,0,,,,,,,,,,13.3,,,,0,0,1,0,0,95.333333,61.333333,211.0,43.0,161.0,37.0,7.0,,,,,
4,5,1.0,2.0,8.0,99445.065735,49.0,1,M,0,FORMER,,,1.0,12.0,HEAVY,41.066667,1920.0,1.0,92.5,178.3,29.096386,OVER,0,1,1,0,0,122.0,82.666667,279.0,42.0,168.0,347.0,3.0,1.0,0.0,244.0,244.0,
5,6,1.0,2.0,2.0,39656.600444,19.0,2,F,1,,,,,,,,,,59.2,162.0,22.557537,NORMAL,0,0,0,0,0,114.666667,68.0,153.0,61.0,59.0,181.0,2.0,1.0,0.0,245.0,246.0,
6,7,1.0,2.0,4.0,25525.423409,59.0,2,F,1,FORMER,,8030.0,1.0,,NONE,3.033333,0.0,1.0,78.0,162.9,29.393577,OVER,0,0,1,0,0,125.333333,80.0,245.0,105.0,127.0,62.0,1.0,1.0,0.0,236.0,237.0,
7,8,1.0,1.0,6.0,31510.587866,13.0,1,M,0,,,,,,,,,,40.7,162.0,15.508307,UNDER,0,0,0,0,0,100.666667,49.333333,162.0,67.0,88.0,33.0,7.0,,,,,
8,9,1.0,2.0,9.0,7575.870247,11.0,2,F,1,,,,,,,,,,45.5,156.9,18.482704,UNDER,0,0,0,0,0,109.333333,53.333333,148.0,58.0,79.0,56.0,4.0,,,,,
9,10,1.0,1.0,7.0,22445.808572,43.0,1,M,0,CURRENT,1.0,,0.0,0.19726,MODERATE,0.0,8160.0,1.0,111.8,190.1,30.936955,OBESE,0,1,0,0,0,145.333333,96.0,140.0,51.0,80.0,45.0,1.0,1.0,1.0,231.0,231.0,1.0


In [25]:
cols_show = ["SEQN","SDDSRVYR", "AGE_YR", "MORTSTAT","PERMTH_EXM","PERMTH_INT","UCOD_LEADING"]
cols_show = [c for c in cols_show if c in core_mort.columns]
display(core_mort.head(20)[cols_show])


Unnamed: 0,SEQN,SDDSRVYR,AGE_YR,MORTSTAT,PERMTH_EXM,PERMTH_INT,UCOD_LEADING
0,1,1.0,2.0,,,,
1,2,1.0,77.0,1.0,177.0,177.0,6.0
2,3,1.0,10.0,,,,
3,4,1.0,1.0,,,,
4,5,1.0,49.0,0.0,244.0,244.0,
5,6,1.0,19.0,0.0,245.0,246.0,
6,7,1.0,59.0,0.0,236.0,237.0,
7,8,1.0,13.0,,,,
8,9,1.0,11.0,,,,
9,10,1.0,43.0,1.0,231.0,231.0,1.0


#### peak flaged file 

In [26]:
from pathlib import Path
import pandas as pd

OUT = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")
cm = pd.read_parquet(OUT / "cov_core_mort_1999_2023_flags.parquet")
cm.head(10)

Unnamed: 0,SEQN,SDDSRVYR,SDMVPSU,SDMVSTRA,WTMEC2YR,AGE_YR,RIAGENDR,SEX,FEMALE,SMK_STATUS,CIGS_PER_DAY,PACK_YEARS,FORMER_SMOKER,DRINKS_PER_DAY,ALCOHOL_CAT,LTPA,METSCORE,IMP,BMXWT,BMXHT,BMI,BMI_CLAS,DIABETES,HTN,HIGH_CHOL,CVD,CANCER,SBP,DBP,TCHOL,HDL,LDL,TG,DMDHHSIZ,ELIGSTAT,MORTSTAT,PERMTH_EXM,PERMTH_INT,UCOD_LEADING,IS_POST2018,IS_ADULT,MORTALITY_COVERED,EVENT,CENSORED,FU_YRS_EXM,FU_YRS_INT,UCOD_LABEL
0,1,1.0,1.0,5.0,10982.898896,2.0,2,F,1,,,,,,,,,,12.5,91.6,14.897695,UNDER,0,0,0,0,0,91.333333,56.0,131.0,59.0,54.0,99.0,3.0,,,,,,False,False,False,,,,,
1,2,1.0,3.0,1.0,28325.384898,77.0,1,M,0,NEVER,,,0.0,0.789041,MODERATE,0.0,60.0,1.0,75.4,174.0,24.904215,NORMAL,0,0,0,0,1,100.666667,56.666667,215.0,54.0,136.0,128.0,1.0,1.0,1.0,177.0,177.0,6.0,False,True,True,1.0,0.0,14.75,14.75,Alzheimer disease
2,3,1.0,2.0,7.0,46192.256945,10.0,2,F,1,,,,,,,,,,32.9,136.6,17.631713,UNDER,0,0,0,0,0,108.666667,62.0,129.0,30.0,58.0,202.0,4.0,,,,,,False,False,False,,,,,
3,4,1.0,1.0,2.0,10251.26002,1.0,1,M,0,,,,,,,,,,13.3,,,,0,0,1,0,0,95.333333,61.333333,211.0,43.0,161.0,37.0,7.0,,,,,,False,False,False,,,,,
4,5,1.0,2.0,8.0,99445.065735,49.0,1,M,0,FORMER,,,1.0,12.0,HEAVY,41.066667,1920.0,1.0,92.5,178.3,29.096386,OVER,0,1,1,0,0,122.0,82.666667,279.0,42.0,168.0,347.0,3.0,1.0,0.0,244.0,244.0,,False,True,True,0.0,1.0,20.333333,20.333333,
5,6,1.0,2.0,2.0,39656.600444,19.0,2,F,1,,,,,,,,,,59.2,162.0,22.557537,NORMAL,0,0,0,0,0,114.666667,68.0,153.0,61.0,59.0,181.0,2.0,1.0,0.0,245.0,246.0,,False,True,True,0.0,1.0,20.416667,20.5,
6,7,1.0,2.0,4.0,25525.423409,59.0,2,F,1,FORMER,,8030.0,1.0,,NONE,3.033333,0.0,1.0,78.0,162.9,29.393577,OVER,0,0,1,0,0,125.333333,80.0,245.0,105.0,127.0,62.0,1.0,1.0,0.0,236.0,237.0,,False,True,True,0.0,1.0,19.666667,19.75,
7,8,1.0,1.0,6.0,31510.587866,13.0,1,M,0,,,,,,,,,,40.7,162.0,15.508307,UNDER,0,0,0,0,0,100.666667,49.333333,162.0,67.0,88.0,33.0,7.0,,,,,,False,False,False,,,,,
8,9,1.0,2.0,9.0,7575.870247,11.0,2,F,1,,,,,,,,,,45.5,156.9,18.482704,UNDER,0,0,0,0,0,109.333333,53.333333,148.0,58.0,79.0,56.0,4.0,,,,,,False,False,False,,,,,
9,10,1.0,1.0,7.0,22445.808572,43.0,1,M,0,CURRENT,1.0,,0.0,0.19726,MODERATE,0.0,8160.0,1.0,111.8,190.1,30.936955,OBESE,0,1,0,0,0,145.333333,96.0,140.0,51.0,80.0,45.0,1.0,1.0,1.0,231.0,231.0,1.0,False,True,True,1.0,0.0,19.25,19.25,Heart disease


In [27]:
# Basic QA
print("Rows:", len(cm), "Unique SEQN:", cm["SEQN"].nunique())

print("\nRIAGENDR counts (1=Male, 2=Female):")
print(cm["RIAGENDR"].value_counts(dropna=False).sort_index())

print("\nSEX vs RIAGENDR cross-check:")
print(pd.crosstab(cm["RIAGENDR"], cm["SEX"], dropna=False))

print("\nFEMALE flag check (should be 1 only when RIAGENDR==2):")
print(pd.crosstab(cm["RIAGENDR"], cm["FEMALE"], dropna=False))

# Types tidy (optional)
cat_cols = ["SEX","SMK_STATUS","ALCOHOL_CAT","BMI_CLAS","UCOD_LEADING","UCOD_LABEL"]
for c in cat_cols:
    if c in cm.columns:
        cm[c] = cm[c].astype("category")


Rows: 128809 Unique SEQN: 128809

RIAGENDR counts (1=Male, 2=Female):
RIAGENDR
1    63189
2    65620
Name: count, dtype: Int64

SEX vs RIAGENDR cross-check:
SEX           F      M
RIAGENDR              
1             0  63189
2         65620      0

FEMALE flag check (should be 1 only when RIAGENDR==2):
FEMALE        0      1
RIAGENDR              
1         63189      0
2             0  65620
