# NHANES 2017–2018: Data exploration

Fresh notebook after Jupyter setup: loads ALB/BPX/DEMO CSVs, merges on SEQN, and runs initial sanity checks on sample size and missingness.

In [None]:
import pandas as pd
from pathlib import Path

# Paths relative to notebooks/ -> project root -> data/raw/2017_2018
BASE_DIR = Path("..").resolve()
DATA_DIR = BASE_DIR / "data" / "raw" / "2017_2018"

ALB_PATH = DATA_DIR / "ALB_CR_J.csv"
BPX_PATH = DATA_DIR / "BPX_J.csv"
DEMO_PATH = DATA_DIR / "DEMO_J.csv"

print("Base directory:", BASE_DIR)
print("Data directory:", DATA_DIR)
print("Files exist:", ALB_PATH.exists(), BPX_PATH.exists(), DEMO_PATH.exists())

alb = pd.read_csv(ALB_PATH)
bpx = pd.read_csv(BPX_PATH)
demo = pd.read_csv(DEMO_PATH)

print("ALB shape:", alb.shape)
print("BPX shape:", bpx.shape)
print("DEMO shape:", demo.shape)

common_key = "SEQN"
merged = alb.merge(bpx, on=common_key, how="inner").merge(demo, on=common_key, how="inner")

print("Merged shape:", merged.shape)
merged.head()


Base directory: /workspaces/AAHRC-Calculator
Data directory: /workspaces/AAHRC-Calculator/data/raw/2017_2018
Files exist: True True True
ALB shape: (7936, 8)
BPX shape: (8704, 21)
DEMO shape: (9254, 46)
Merged shape: (7936, 73)


Unnamed: 0,SEQN,URXUMA,URXUMS,URDUMALC,URXUCR,URXCRS,URDUCRLC,URDACT,PEASCCT1,BPXCHR,...,DMDHREDZ,DMDHRMAZ,DMDHSEDZ,WTINT2YR,WTMEC2YR,SDMVPSU,SDMVSTRA,INDHHIN2,INDFMIN2,INDFMPIR
0,93705.0,3.2,3.2,5.397605e-79,24.0,2121.6,5.397605e-79,13.33,,,...,1.0,2.0,,8614.571172,8338.419786,2.0,145.0,3.0,3.0,0.82
1,93706.0,39.6,39.6,5.397605e-79,69.0,6099.6,5.397605e-79,57.39,,,...,3.0,1.0,2.0,8548.632619,8723.439814,2.0,134.0,,,
2,93707.0,41.6,41.6,5.397605e-79,209.0,18475.6,5.397605e-79,19.9,,,...,2.0,1.0,3.0,6769.344567,7064.60973,1.0,138.0,10.0,10.0,1.88
3,93708.0,9.1,9.1,5.397605e-79,47.0,4154.8,5.397605e-79,19.36,,,...,1.0,1.0,1.0,13329.450589,14372.488765,2.0,138.0,6.0,6.0,1.63
4,93709.0,26.4,26.4,5.397605e-79,215.0,19006.0,5.397605e-79,12.28,,,...,2.0,2.0,,12043.388271,12277.556662,1.0,136.0,2.0,2.0,0.41


In [None]:
print("Merged rows:", merged.shape[0])
print("Columns:", merged.shape[1])

key_cols = [
    "SEQN",
    "URDACT",  # uACR
    "BPXSY1", "BPXSY2", "BPXSY3",
    "BPXDI1", "BPXDI2", "BPXDI3",
]

existing_key_cols = [c for c in key_cols if c in merged.columns]
print("Key columns present:", existing_key_cols)

if existing_key_cols:
    print(merged[existing_key_cols].isna().mean().sort_values(ascending=False).head(10))
    merged[existing_key_cols].describe(include="all")


Merged rows: 7936
Columns: 73
Key columns present: ['SEQN', 'URDACT', 'BPXSY1', 'BPXSY2', 'BPXSY3', 'BPXDI1', 'BPXDI2', 'BPXDI3']
BPXSY1    0.205897
BPXDI1    0.205897
BPXSY3    0.176159
BPXDI3    0.176159
BPXSY2    0.173009
BPXDI2    0.173009
URDACT    0.038306
SEQN      0.000000
dtype: float64


In [5]:
# Compute average SBP and DBP from available readings
sbp_cols = ["BPXSY1", "BPXSY2", "BPXSY3"]
dbp_cols = ["BPXDI1", "BPXDI2", "BPXDI3"]

# Only keep columns that actually exist in the merged dataset
sbp_cols = [c for c in sbp_cols if c in merged.columns]
dbp_cols = [c for c in dbp_cols if c in merged.columns]

merged["SBP_mean"] = merged[sbp_cols].mean(axis=1, skipna=True)
merged["DBP_mean"] = merged[dbp_cols].mean(axis=1, skipna=True)

print("SBP/DBP mean columns added.")
merged[["SBP_mean", "DBP_mean"]].describe()


SBP/DBP mean columns added.


Unnamed: 0,SBP_mean,DBP_mean
count,6714.0,6714.0
mean,121.693178,68.33408
std,20.359703,15.78902
min,72.666667,5.397605e-79
25%,106.666667,60.66667
50%,118.0,69.33333
75%,132.666667,77.33333
max,238.0,135.3333


In [6]:
from pathlib import Path

# Define key columns needed for analysis
core_cols = ["SEQN", "SBP_mean", "DBP_mean"]
if "URDACT" in merged.columns:
    core_cols.append("URDACT")

# Drop rows with missing core variables
analysis_df = merged.dropna(subset=core_cols).copy()

print("Original merged rows:", merged.shape[0])
print("Analysis rows (non-missing core vars):", analysis_df.shape[0])

# Ensure processed directory exists
BASE_DIR = Path("..").resolve()
PROCESSED_DIR = BASE_DIR / "data" / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

out_path = PROCESSED_DIR / "nhanes_2017_2018_analysis.csv"
analysis_df.to_csv(out_path, index=False)

print("Saved processed analysis dataset to:", out_path)


Original merged rows: 7936
Analysis rows (non-missing core vars): 6564
Saved processed analysis dataset to: /workspaces/AAHRC-Calculator/data/processed/nhanes_2017_2018_analysis.csv


## Processed dataset

- Computed SBP_mean and DBP_mean from available BP readings.
- Dropped rows with missing core variables (SEQN, SBP_mean, DBP_mean, and URDACT if available).
- Saved the resulting analysis dataset to `data/processed/nhanes_2017_2018_analysis.csv`.
