In [3]:
# Reload XPT files by auto-discovering available files and sanitize filenames if needed
from pathlib import Path
DATA_DIR = Path("2433_p3_data/healthcare.gov")
# Sanitize filenames: strip leading/trailing whitespace from the names of the
# files in DATA_DIR (a common mistake when files are copied or renamed by hand).
# sorted() materialises the directory listing first, so renaming entries while
# looping does not disturb the iteration.
for p in sorted(DATA_DIR.iterdir()):
    # Only regular files are renamed; directories are left untouched.
    if not p.is_file():
        continue
    name = p.name
    stripped = name.strip()
    if name == stripped:
        continue
    if not stripped:
        # A whitespace-only name has nothing left after stripping, and
        # Path.with_name('') raises ValueError -- report it instead of crashing.
        print(f"Warning: cannot sanitize '{name}': name is empty after stripping whitespace.")
        continue
    newp = p.with_name(stripped)
    if newp.exists():
        # Never overwrite an existing file; leave both files in place.
        print(f"Warning: cannot sanitize '{name}' -> '{stripped}' because target exists.")
        continue
    try:
        p.rename(newp)
    except OSError as e:
        # Best effort: a failed rename is reported and the loop carries on.
        print(f"Failed to rename '{name}': {e}")
    else:
        print(f"Renamed file: '{name}' -> '{stripped}'")

# Collect the regular files in DATA_DIR whose suffix is '.xpt' in any letter
# case, in sorted order, and report what was found.
xpt_files = sorted(
    entry
    for entry in DATA_DIR.iterdir()
    if entry.is_file() and entry.suffix.lower() == '.xpt'
)
filenames = [entry.name for entry in xpt_files]
if filenames:
    print(f"Discovered XPT files: {filenames}")
else:
    print(f"No XPT files found under {DATA_DIR}. Check path and filenames.")

# Reader selection: use pyreadstat when it can be imported, otherwise fall back
# to pandas.read_sas. Either way `read_xpt(p)` returns a (DataFrame, meta) pair;
# the pandas fallback has no metadata object and returns None in its place.
try:
    import pyreadstat
except Exception:
    import pandas as pd

    def read_xpt(p):
        """Read the XPORT file at `p` with pandas; returns (DataFrame, None)."""
        frame = pd.read_sas(str(p), format='xport', encoding='utf-8')
        return frame, None

    reader_name = "pandas.read_sas (fallback)"
else:
    def read_xpt(p):
        """Read the XPORT file at `p` with pyreadstat; returns (DataFrame, meta)."""
        frame, metadata = pyreadstat.read_xport(str(p))
        return frame, metadata

    reader_name = "pyreadstat"

print(f"Using reader: {reader_name}")

# Load every discovered XPT file into `data_reloaded`, keyed by the file name
# without its extension (e.g. 'LLCP2018.XPT' -> 'LLCP2018').
data_reloaded = {}
for fname in filenames:
    path = DATA_DIR / fname
    # The file may have disappeared since discovery; skip it rather than fail.
    if not path.exists():
        print(f"Missing: {path} (skipping)")
        continue
    print(f"Reading {fname}...")
    try:
        df_meta = read_xpt(path)
        # reader may return (df, meta) or df only
        if isinstance(df_meta, tuple):
            df, meta = df_meta
        else:
            df, meta = df_meta, None
    except Exception as e:
        # Best effort: one unreadable file must not abort the remaining loads.
        print(f"Failed to read {fname}: {e}")
        continue
    # Discovery accepts the suffix in any letter case, so the key is the path
    # stem; removing the literal '.XPT' would leave '.xpt' in the key and could
    # also remove a match from the middle of the name.
    data_reloaded[path.stem] = df

print(f"Loaded {len(data_reloaded)} datasets into data_reloaded: {list(data_reloaded.keys())}")

# Preview: for every loaded dataset print its shape and render its first three rows
from IPython.display import display
for name in data_reloaded:
    df = data_reloaded[name]
    print(f"\n{name}: shape={df.shape}")
    display(df.head(3))

Renamed file: 'LLCP2018.XPT ' -> 'LLCP2018.XPT'
Renamed file: 'LLCP2019.XPT ' -> 'LLCP2019.XPT'
Discovered XPT files: ['LLCP2018.XPT', 'LLCP2019.XPT', 'LLCP2020.XPT', 'LLCP2021.XPT', 'LLCP2022.XPT', 'LLCP2023.XPT', 'LLCP2024.XPT']
Using reader: pandas.read_sas (fallback)
Reading LLCP2018.XPT...
Reading LLCP2019.XPT...
Reading LLCP2020.XPT...
Reading LLCP2021.XPT...
Reading LLCP2022.XPT...
Reading LLCP2023.XPT...
Reading LLCP2024.XPT...
Loaded 7 datasets into data_reloaded: ['LLCP2018', 'LLCP2019', 'LLCP2020', 'LLCP2021', 'LLCP2022', 'LLCP2023', 'LLCP2024']

LLCP2018: shape=(437436, 275)


Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_MAM5022,_RFPAP34,_RFPSA22,_RFBLDS3,_COL10YR,_HFOB3YR,_FS5YR,_FOBTFS,_CRCREC,_AIDTST3
0,1.0,1.0,1052018,1,5,2018,1100.0,2018000001,2018000000.0,1.0,...,,,,,,,,,,2.0
1,1.0,1.0,1122018,1,12,2018,1100.0,2018000002,2018000000.0,1.0,...,,1.0,,,,,,,,2.0
2,1.0,1.0,1082018,1,8,2018,1100.0,2018000003,2018000000.0,1.0,...,,,,,,,,,,2.0



LLCP2019: shape=(418268, 342)


Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_VEGESU1,_FRTLT1A,_VEGLT1A,_FRT16A,_VEG23A,_FRUITE1,_VEGETE1,_FLSHOT7,_PNEUMO3,_AIDTST4
0,1.0,1.0,1182019,1,18,2019,1100.0,2019000001,2019000000.0,1.0,...,114.0,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79,2.0,1.0,2.0
1,1.0,1.0,1132019,1,13,2019,1100.0,2019000002,2019000000.0,1.0,...,121.0,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79,1.0,1.0,2.0
2,1.0,1.0,1182019,1,18,2019,1100.0,2019000003,2019000000.0,1.0,...,164.0,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79,1.0,2.0,2.0



LLCP2020: shape=(401958, 279)


Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_RFPSA23,_CLNSCPY,_SGMSCPY,_SGMS10Y,_RFBLDS4,_STOLDNA,_VIRCOLN,_SBONTIM,_CRCREC1,_AIDTST4
0,1.0,1.0,1042020,1,4,2020,1100.0,2020000001,2020000000.0,1.0,...,,1.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0,1.0
1,1.0,1.0,2072020,2,7,2020,1200.0,2020000002,2020000000.0,1.0,...,,,,,,,,2.0,,
2,1.0,1.0,1232020,1,23,2020,1100.0,2020000003,2020000000.0,1.0,...,,1.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0,2.0



LLCP2021: shape=(438693, 303)


Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_FRTRES1,_VEGRES1,_FRUTSU1,_VEGESU1,_FRTLT1A,_VEGLT1A,_FRT16A,_VEG23A,_FRUITE1,_VEGETE1
0,1.0,1.0,1192021,1,19,2021,1100.0,2021000001,2021000000.0,1.0,...,1.0,1.0,100.0,214.0,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79
1,1.0,1.0,1212021,1,21,2021,1100.0,2021000002,2021000000.0,1.0,...,1.0,1.0,100.0,128.0,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79
2,1.0,1.0,1212021,1,21,2021,1100.0,2021000003,2021000000.0,1.0,...,1.0,1.0,100.0,71.0,1.0,2.0,1.0,1.0,5.397605e-79,5.397605e-79



LLCP2022: shape=(445132, 328)


Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_SMOKGRP,_LCSREC,DRNKANY6,DROCDY4_,_RFBING6,_DRNKWK2,_RFDRHV8,_FLSHOT7,_PNEUMO3,_AIDTST4
0,1.0,1.0,2032022,2,3,2022,1100.0,2022000001,2022000000.0,1.0,...,4.0,,2.0,5.397605e-79,1.0,5.397605e-79,1.0,1.0,2.0,2.0
1,1.0,1.0,2042022,2,4,2022,1100.0,2022000002,2022000000.0,1.0,...,4.0,,2.0,5.397605e-79,1.0,5.397605e-79,1.0,2.0,2.0,2.0
2,1.0,1.0,2022022,2,2,2022,1100.0,2022000003,2022000000.0,1.0,...,4.0,,2.0,5.397605e-79,1.0,5.397605e-79,1.0,,,2.0



LLCP2023: shape=(433323, 350)


Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,DROCDY4_,_RFBING6,_DRNKWK2,_RFDRHV8,_FLSHOT7,_PNEUMO3,_AIDTST4,_RFSEAT2,_RFSEAT3,_DRNKDRV
0,1.0,1.0,3012023,3,1,2023,1100.0,2023000001,2023000000.0,1.0,...,5.397605e-79,1.0,5.397605e-79,1.0,2.0,2.0,2.0,1.0,1.0,9.0
1,1.0,1.0,1062023,1,6,2023,1100.0,2023000002,2023000000.0,1.0,...,5.397605e-79,1.0,5.397605e-79,1.0,1.0,1.0,2.0,1.0,1.0,9.0
2,1.0,1.0,3082023,3,8,2023,1100.0,2023000003,2023000000.0,1.0,...,5.397605e-79,1.0,5.397605e-79,1.0,1.0,1.0,2.0,1.0,1.0,9.0



LLCP2024: shape=(457670, 301)


Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_LCSCTSN,_LCSPSTF,DRNKANY6,DROCDY4_,_RFBING6,_DRNKWK3,_RFDRHV9,_FLSHOT7,_PNEUMO3,_AIDTST4
0,1.0,2.0,2282024,2,28,2024,1100.0,2024000001,2024000000.0,1.0,...,,9.0,2.0,5.397605e-79,1.0,5.397605e-79,1.0,1.0,2.0,2.0
1,1.0,2.0,2212024,2,21,2024,1100.0,2024000002,2024000000.0,1.0,...,4.0,9.0,2.0,5.397605e-79,1.0,5.397605e-79,1.0,1.0,1.0,2.0
2,1.0,2.0,2212024,2,21,2024,1100.0,2024000003,2024000000.0,1.0,...,4.0,2.0,1.0,100.0,2.0,1400.0,1.0,,,2.0


In [None]:
# Quick-check checklist for XPT datasets (run-able cell)
# Purpose: inspect presence and quality of key BRFSS variables to guide downstream cleaning & aggregation.
import numpy as np
import pandas as pd
from pathlib import Path
from IPython.display import display

DATA_DIR = Path("2433_p3_data/healthcare.gov")
print("Quick-check: looking for loaded datasets or attempting to load LLCP2018-2024")

# Reuse the frames loaded by an earlier cell when there are any. An empty dict is
# treated like a missing one, so the cell still attempts a load instead of
# silently inspecting nothing.
datasets = globals().get('data_reloaded')
if datasets:
    print(f"Found in-memory datasets: {list(datasets.keys())}")
else:
    datasets = {}
    # Prefer a reader defined by an earlier cell; otherwise build one here,
    # trying pyreadstat first and falling back to pandas.read_sas.
    reader = globals().get('read_xpt')
    if reader is None:
        try:
            import pyreadstat
        except Exception:
            def _read(path):
                """Read an XPORT file with pandas; returns (DataFrame, None)."""
                df = pd.read_sas(str(path), format='xport', encoding='utf-8')
                return df, None
            print("Using pandas.read_sas to load files")
        else:
            def _read(path):
                """Read an XPORT file with pyreadstat; returns (DataFrame, meta)."""
                df, meta = pyreadstat.read_xport(str(path))
                return df, meta
            print("Using pyreadstat to load files")
        reader = _read

    # Attempt to load LLCP2018 through LLCP2024 (inclusive)
    filenames = [f"LLCP{y}.XPT" for y in range(2018, 2025)]
    for fn in filenames:
        p = DATA_DIR / fn
        if not p.exists():
            print(f"Missing file: {p}")
            continue
        try:
            df_meta = reader(p)
            # reader may return (df, meta) or df only
            if isinstance(df_meta, tuple):
                df, meta = df_meta
            else:
                df, meta = df_meta, None
            # Key by the file name without its extension, e.g. 'LLCP2018'.
            datasets[p.stem] = df
            print(f"Loaded {fn}: shape={df.shape}")
        except Exception as e:
            # Best effort: report the failure and continue with the next year.
            print(f"Failed to load {fn}: {e}")

# Key columns whose presence and missing rate are reported for every dataset
key_cols = ['IYEAR','_STATE','_LLCPWT','_BMI5','_BMI5CAT','_RFBMI5','SMOKE100','SMOKDAY2','DIABETE4','_PSU','_STSTR','HHADULT','CHILDREN','_CHLDCNT']

# Print a diagnostic report per dataset. Every section is best effort: a failure
# in one section is printed and the report continues with the next section.
for name, df in datasets.items():
    print('\n' + '='*80)
    print(f"Dataset: {name}  shape={getattr(df,'shape',None)}")
    cols = list(df.columns)
    print(f"Total columns: {len(cols)}")

    # check key cols presence
    presence = {c: (c in cols) for c in key_cols}
    print("Key columns present:")
    for c, v in presence.items():
        print(f"  {c}: {v}")

    # show head (fall back to plain printing if rich display fails)
    print("\nFirst 3 rows:")
    try:
        display(df.head(3))
    except Exception:
        print(df.head(3))

    # missing rates for present key cols
    present = [c for c, found in presence.items() if found]
    if present:
        print("\nMissing rates for key cols:")
        for c in present:
            try:
                rate = df[c].isna().mean()
                print(f"  {c}: {rate:.3%}")
            except Exception as e:
                print(f"  {c}: error computing missing rate: {e}")

    # BMI handling: 9999 is replaced by NaN, then the value is divided by 100
    if '_BMI5' in df.columns:
        try:
            s = df['_BMI5'].replace({9999: np.nan}).astype(float)
            print('\n_BMI5 raw summary:')
            print(s.describe())
            bmi = s.dropna() / 100.0
            print('\nComputed BMI (after /100) summary:')
            print(bmi.describe())
        except Exception as e:
            print(f"Error processing _BMI5: {e}")
    if '_BMI5CAT' in df.columns:
        print('\n_BMI5CAT value counts (top 10):')
        print(df['_BMI5CAT'].value_counts(dropna=False).head(10))

    # Smoking
    for col in ('SMOKE100','SMOKDAY2'):
        if col in df.columns:
            print(f"\n{col} value counts (top 10):")
            try:
                print(df[col].value_counts(dropna=False).head(10))
            except Exception as e:
                print(f"  error: {e}")

    # Diabetes
    if 'DIABETE4' in df.columns:
        print("\nDIABETE4 value counts (top 10):")
        print(df['DIABETE4'].value_counts(dropna=False).head(10))

    # weight
    if '_LLCPWT' in df.columns:
        # The label previously misspelled the column name as '_LLCPTW'.
        print('\n_LLCPWT summary:')
        try:
            print(df['_LLCPWT'].describe())
            print('Total weight sum:', df['_LLCPWT'].sum())
        except Exception as e:
            print(f"Error inspecting _LLCPWT: {e}")

    # year and state distributions
    if 'IYEAR' in df.columns:
        print('\nIYEAR distribution:')
        try:
            print(df['IYEAR'].value_counts().sort_index())
        except Exception as e:
            print(f"Error with IYEAR: {e}")
    if '_STATE' in df.columns:
        print('\n_STATE top 10:')
        print(df['_STATE'].value_counts().head(10))

    # age-like columns: any column whose name contains 'AGE'
    age_cols = [c for c in df.columns if 'AGE' in c.upper()]
    if age_cols:
        print('\nAge-like columns found:', age_cols)
        for ac in age_cols:
            print(f"\n{ac} value counts (top 10):")
            try:
                print(df[ac].value_counts(dropna=False).head(10))
            except Exception as e:
                print(f"  error: {e}")

    # Household composition variables
    if 'HHADULT' in df.columns:
        print('\nHHADULT (Number of adults in household) value counts (top 15):')
        try:
            print(df['HHADULT'].value_counts(dropna=False).head(15))
            print('\nHHADULT summary stats:')
            # Keep valid values (1-76); everything else becomes NaN
            adults = df['HHADULT']
            valid_adults = adults.where(adults.between(1, 76))
            print(valid_adults.describe())
        except Exception as e:
            print(f"  error: {e}")

    if 'CHILDREN' in df.columns:
        print('\nCHILDREN (Number of children in household) value counts (top 15):')
        try:
            print(df['CHILDREN'].value_counts(dropna=False).head(15))
            print('\nCHILDREN summary stats:')
            # 88 = None (0 children), 1-87 = count, anything else -> NaN
            children = df['CHILDREN']
            clean_children = children.where(children.between(1, 87)).mask(children == 88, 0)
            print(clean_children.describe())
            # Count only rows with an actual child count. Comparing the raw
            # column with `!= 88` also counted missing values and any other
            # non-count code as households with children.
            n_with_children = int((clean_children > 0).sum())
            n_answered = int(clean_children.notna().sum())
            share = n_with_children / n_answered if n_answered else float('nan')
            print(f'\nHouseholds with children (CHILDREN in 1-87): {n_with_children:,} ({share:.2%} of valid responses)')
        except Exception as e:
            print(f"  error: {e}")

    if '_CHLDCNT' in df.columns:
        print('\n_CHLDCNT (Computed children count) value counts (top 10):')
        try:
            print(df['_CHLDCNT'].value_counts(dropna=False).head(10))
        except Exception as e:
            print(f"  error: {e}")

print('\nQuick-check finished.')

Quick-check: looking for loaded datasets or attempting to load LLCP2020-2024
Found in-memory datasets: ['LLCP2020', 'LLCP2021', 'LLCP2022', 'LLCP2023', 'LLCP2024']

Dataset: LLCP2020  shape=(401958, 279)
Total columns: 279
Key columns present:
  IYEAR: True
  _STATE: True
  _LLCPWT: True
  _BMI5: True
  _BMI5CAT: True
  _RFBMI5: True
  SMOKE100: True
  SMOKDAY2: True
  DIABETE4: True
  _PSU: True
  _STSTR: True
  HHADULT: True
  CHILDREN: True
  _CHLDCNT: True

First 3 rows:


Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_RFPSA23,_CLNSCPY,_SGMSCPY,_SGMS10Y,_RFBLDS4,_STOLDNA,_VIRCOLN,_SBONTIM,_CRCREC1,_AIDTST4
0,1.0,1.0,1042020,1,4,2020,1100.0,2020000001,2020000000.0,1.0,...,,1.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0,1.0
1,1.0,1.0,2072020,2,7,2020,1200.0,2020000002,2020000000.0,1.0,...,,,,,,,,2.0,,
2,1.0,1.0,1232020,1,23,2020,1100.0,2020000003,2020000000.0,1.0,...,,1.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0,2.0



Missing rates for key cols:
  IYEAR: 0.000%
  _STATE: 0.000%
  _LLCPWT: 0.000%
  _BMI5: 10.289%
  _BMI5CAT: 10.289%
  _RFBMI5: 0.000%
  SMOKE100: 4.443%
  SMOKDAY2: 61.032%
  DIABETE4: 0.001%
  _PSU: 0.000%
  _STSTR: 0.000%
  HHADULT: 31.036%
  CHILDREN: 1.186%
  _CHLDCNT: 0.000%

_BMI5 raw summary:
count    360601.000000
mean       2830.631271
std         638.164868
min        1202.000000
25%        2399.000000
50%        2732.000000
75%        3138.000000
max        9843.000000
Name: _BMI5, dtype: float64

Computed BMI (after /100) summary:
count    360601.000000
mean         28.306313
std           6.381649
min          12.020000
25%          23.990000
50%          27.320000
75%          31.380000
max          98.430000
Name: _BMI5, dtype: float64

_BMI5CAT value counts (top 10):
_BMI5CAT
3.0    128946
4.0    115541
2.0    110121
NaN     41357
1.0      5993
Name: count, dtype: int64

SMOKE100 value counts (top 10):
SMOKE100
2.0    224535
1.0    156750
NaN     17860
7.0      2178
9.

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_FRTRES1,_VEGRES1,_FRUTSU1,_VEGESU1,_FRTLT1A,_VEGLT1A,_FRT16A,_VEG23A,_FRUITE1,_VEGETE1
0,1.0,1.0,1192021,1,19,2021,1100.0,2021000001,2021000000.0,1.0,...,1.0,1.0,100.0,214.0,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79
1,1.0,1.0,1212021,1,21,2021,1100.0,2021000002,2021000000.0,1.0,...,1.0,1.0,100.0,128.0,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79
2,1.0,1.0,1212021,1,21,2021,1100.0,2021000003,2021000000.0,1.0,...,1.0,1.0,100.0,71.0,1.0,2.0,1.0,1.0,5.397605e-79,5.397605e-79



Missing rates for key cols:
  IYEAR: 0.000%
  _STATE: 0.000%
  _LLCPWT: 0.000%
  _BMI5: 10.680%
  _BMI5CAT: 10.680%
  _RFBMI5: 0.000%
  SMOKE100: 4.840%
  SMOKDAY2: 61.829%
  DIABETE4: 0.001%
  _PSU: 0.000%
  _STSTR: 0.000%
  HHADULT: 26.850%
  CHILDREN: 1.398%
  _CHLDCNT: 0.000%

_BMI5 raw summary:
count    391841.000000
mean       2855.226495
std         655.194977
min        1200.000000
25%        2414.000000
50%        2744.000000
75%        3174.000000
max        9933.000000
Name: _BMI5, dtype: float64

Computed BMI (after /100) summary:
count    391841.000000
mean         28.552265
std           6.551950
min          12.000000
25%          24.140000
50%          27.440000
75%          31.740000
max          99.330000
Name: _BMI5, dtype: float64

_BMI5CAT value counts (top 10):
_BMI5CAT
3.0    138760
4.0    131305
2.0    115489
NaN     46852
1.0      6287
Name: count, dtype: int64

SMOKE100 value counts (top 10):
SMOKE100
2.0    246644
1.0    167588
NaN     21232
7.0      2298
9.

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_SMOKGRP,_LCSREC,DRNKANY6,DROCDY4_,_RFBING6,_DRNKWK2,_RFDRHV8,_FLSHOT7,_PNEUMO3,_AIDTST4
0,1.0,1.0,2032022,2,3,2022,1100.0,2022000001,2022000000.0,1.0,...,4.0,,2.0,5.397605e-79,1.0,5.397605e-79,1.0,1.0,2.0,2.0
1,1.0,1.0,2042022,2,4,2022,1100.0,2022000002,2022000000.0,1.0,...,4.0,,2.0,5.397605e-79,1.0,5.397605e-79,1.0,2.0,2.0,2.0
2,1.0,1.0,2022022,2,2,2022,1100.0,2022000003,2022000000.0,1.0,...,4.0,,2.0,5.397605e-79,1.0,5.397605e-79,1.0,,,2.0



Missing rates for key cols:
  IYEAR: 0.000%
  _STATE: 0.000%
  _LLCPWT: 0.000%
  _BMI5: 10.964%
  _BMI5CAT: 10.964%
  _RFBMI5: 0.000%
  SMOKE100: 7.139%
  SMOKDAY2: 63.145%
  DIABETE4: 0.001%
  _PSU: 0.000%
  _STSTR: 0.000%
  HHADULT: 21.580%
  CHILDREN: 2.092%
  _CHLDCNT: 0.000%

_BMI5 raw summary:
count    396326.000000
mean       2852.984182
std         655.488867
min        1202.000000
25%        2413.000000
50%        2744.000000
75%        3175.000000
max        9964.000000
Name: _BMI5, dtype: float64

Computed BMI (after /100) summary:
count    396326.000000
mean         28.529842
std           6.554889
min          12.020000
25%          24.130000
50%          27.440000
75%          31.750000
max          99.640000
Name: _BMI5, dtype: float64

_BMI5CAT value counts (top 10):
_BMI5CAT
3.0    139995
4.0    132577
2.0    116976
NaN     48806
1.0      6778
Name: count, dtype: int64

SMOKE100 value counts (top 10):
SMOKE100
2.0    245955
1.0    164217
NaN     31777
7.0      2297
9.

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,DROCDY4_,_RFBING6,_DRNKWK2,_RFDRHV8,_FLSHOT7,_PNEUMO3,_AIDTST4,_RFSEAT2,_RFSEAT3,_DRNKDRV
0,1.0,1.0,3012023,3,1,2023,1100.0,2023000001,2023000000.0,1.0,...,5.397605e-79,1.0,5.397605e-79,1.0,2.0,2.0,2.0,1.0,1.0,9.0
1,1.0,1.0,1062023,1,6,2023,1100.0,2023000002,2023000000.0,1.0,...,5.397605e-79,1.0,5.397605e-79,1.0,1.0,1.0,2.0,1.0,1.0,9.0
2,1.0,1.0,3082023,3,8,2023,1100.0,2023000003,2023000000.0,1.0,...,5.397605e-79,1.0,5.397605e-79,1.0,1.0,1.0,2.0,1.0,1.0,9.0



Missing rates for key cols:
  IYEAR: 0.000%
  _STATE: 0.000%
  _LLCPWT: 0.000%
  _BMI5: 9.354%
  _BMI5CAT: 9.354%
  _RFBMI5: 0.000%
  SMOKE100: 4.540%
  SMOKDAY2: 63.390%
  DIABETE4: 0.001%
  _PSU: 0.000%
  _STSTR: 0.000%
  HHADULT: 20.389%
  CHILDREN: 1.175%
  _CHLDCNT: 0.000%

_BMI5 raw summary:
count    392788.000000
mean       2847.928783
std         654.197490
min        1202.000000
25%        2410.000000
50%        2740.000000
75%        3162.000000
max        9984.000000
Name: _BMI5, dtype: float64

Computed BMI (after /100) summary:
count    392788.000000
mean         28.479288
std           6.541975
min          12.020000
25%          24.100000
50%          27.400000
75%          31.620000
max          99.840000
Name: _BMI5, dtype: float64

_BMI5CAT value counts (top 10):
_BMI5CAT
3.0    139615
4.0    129906
2.0    116500
NaN     40535
1.0      6767
Name: count, dtype: int64

SMOKE100 value counts (top 10):
SMOKE100
2.0    251981
1.0    158774
NaN     19674
7.0      2251
9.0 

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_LCSCTSN,_LCSPSTF,DRNKANY6,DROCDY4_,_RFBING6,_DRNKWK3,_RFDRHV9,_FLSHOT7,_PNEUMO3,_AIDTST4
0,1.0,2.0,2282024,2,28,2024,1100.0,2024000001,2024000000.0,1.0,...,,9.0,2.0,5.397605e-79,1.0,5.397605e-79,1.0,1.0,2.0,2.0
1,1.0,2.0,2212024,2,21,2024,1100.0,2024000002,2024000000.0,1.0,...,4.0,9.0,2.0,5.397605e-79,1.0,5.397605e-79,1.0,1.0,1.0,2.0
2,1.0,2.0,2212024,2,21,2024,1100.0,2024000003,2024000000.0,1.0,...,4.0,2.0,1.0,100.0,2.0,1400.0,1.0,,,2.0



Missing rates for key cols:
  IYEAR: 0.000%
  _STATE: 0.000%
  _LLCPWT: 0.000%
  _BMI5: 9.404%
  _BMI5CAT: 9.404%
  _RFBMI5: 0.000%
  SMOKE100: 6.306%
  SMOKDAY2: 63.494%
  DIABETE4: 0.001%
  _PSU: 0.000%
  _STSTR: 0.000%
  HHADULT: 17.908%
  CHILDREN: 1.225%
  _CHLDCNT: 0.000%

_BMI5 raw summary:
count    414633.000000
mean       2855.680093
std         658.616131
min        1200.000000
25%        2414.000000
50%        2744.000000
75%        3175.000000
max        9984.000000
Name: _BMI5, dtype: float64

Computed BMI (after /100) summary:
count    414633.000000
mean         28.556801
std           6.586161
min          12.000000
25%          24.140000
50%          27.440000
75%          31.750000
max          99.840000
Name: _BMI5, dtype: float64

_BMI5CAT value counts (top 10):
_BMI5CAT
3.0    146563
4.0    139640
2.0    121053
NaN     43037
1.0      7377
Name: count, dtype: int64

SMOKE100 value counts (top 10):
SMOKE100
2.0    258956
1.0    167242
NaN     28860
7.0      2062
9.0 

In [16]:
# Field mapping & missing-value replacements (run this cell)
# Creates `data_cleaned` in globals(): cleaned DataFrames with derived fields:
#  - BMI (from _BMI5, handle 9999 -> NaN, divide by 100)
#  - DIABETES (binary), EVER_SMOKER, CURRENT_SMOKER
#  - ensure numeric _LLCPWT
#  - create Age_Group from common age columns (fallback to 'UNK')
import numpy as np
import pandas as pd
from IPython.display import display

# Age columns that already hold grouped codes are copied through unchanged;
# columns holding raw ages are binned. Both tuples are in priority order.
_GROUPED_AGE_COLS = ('_AGEG5YR', 'AGEG5YR', '_AGEG5', 'RIDAGE_G')
_RAW_AGE_COLS = ('RIDAGE', 'AGE')

# (derived column, source column) pairs recoded as 1 -> 1, 2 -> 0, else NaN.
# NOTE(review): source names may differ between survey years; a missing source
# simply yields an all-NaN derived column -- verify against each year's codebook.
_YES_NO_FIELDS = (
    ('DIABETES', 'DIABETE4'),
    ('HEART_ATTACK', 'CVDINFR4'),
    ('CHD', 'CVDCRHD4'),
    ('STROKE', 'CVDSTRK3'),
    ('ASTHMA', 'ASTHMA3'),
    ('ASTHMA_NOW', 'ASTHNOW'),
    ('COPD', 'CHCCOPD3'),
    ('SKIN_CANCER', 'CHCSCNC1'),
    ('ANY_CANCER', 'CHCOCNC1'),
    ('KIDNEY_DISEASE', 'CHCKDNY2'),
    ('ARTHRITIS', 'HAVARTH4'),
    ('EVER_SMOKER', 'SMOKE100'),
)
_YES_NO_CODES = {1: 1.0, 2: 0.0}
# SMOKDAY2: codes 1 and 2 count as current smoker, 3 as not; anything else -> NaN.
_CURRENT_SMOKER_CODES = {1: 1.0, 2: 1.0, 3: 0.0}


def _recode(frame, source, mapping):
    """Return `frame[source]` recoded through `mapping`, vectorised.

    Values are coerced to numeric first; codes absent from `mapping` (and
    missing values) become NaN. When `source` is not a column of `frame`,
    scalar NaN is returned so that assigning the result creates an all-NaN
    column.
    """
    if source not in frame.columns:
        return np.nan
    return pd.to_numeric(frame[source], errors='coerce').map(mapping)


def _age_group(frame):
    """Return the `Age_Group` values for `frame`.

    A grouped-age column is returned as-is (object dtype); otherwise a raw-age
    column is binned into labelled ranges; with no age column at all the
    constant 'UNK' is returned.
    """
    for cand in _GROUPED_AGE_COLS:
        if cand in frame.columns:
            return frame[cand].astype(object)
    # The previous test ('G' in the column name) was true for 'AGE' and
    # 'RIDAGE' as well, so raw ages were never binned.
    for cand in _RAW_AGE_COLS:
        if cand in frame.columns:
            ages = pd.to_numeric(frame[cand], errors='coerce')
            return pd.cut(ages, bins=[0,17,34,49,64,74,120], labels=['0-17','18-34','35-49','50-64','65-74','75+'])
    return 'UNK'


def _clean_frame(name, df):
    """Return a copy of `df` with the derived analysis columns added.

    All original columns are kept. `name` is the dataset key (e.g. 'LLCP2020');
    it is only used to derive IYEAR when that column is missing.
    """
    dfc = df.copy()

    # _BMI5 -> BMI (9999 -> NaN, then divide by 100)
    if '_BMI5' in dfc.columns:
        dfc['_BMI5'] = pd.to_numeric(dfc['_BMI5'], errors='coerce').replace({9999: np.nan})
        dfc['BMI'] = dfc['_BMI5'] / 100.0
    else:
        dfc['BMI'] = np.nan

    # Diabetes, chronic conditions and EVER_SMOKER share the yes/no coding
    for derived, source in _YES_NO_FIELDS:
        dfc[derived] = _recode(dfc, source, _YES_NO_CODES)

    # SMOKDAY2 -> CURRENT_SMOKER (1/2 -> 1, 3 -> 0)
    dfc['CURRENT_SMOKER'] = _recode(dfc, 'SMOKDAY2', _CURRENT_SMOKER_CODES)

    # Household composition variables
    # HHADULT: number of adults in household (1-76 kept, everything else NaN)
    # NOTE(review): the column listing also shows NUMADULT; confirm whether it
    # should fill NUM_ADULTS where HHADULT is missing.
    if 'HHADULT' in dfc.columns:
        hh_adult = pd.to_numeric(dfc['HHADULT'], errors='coerce')
        dfc['NUM_ADULTS'] = hh_adult.where(hh_adult.between(1, 76))
    else:
        dfc['NUM_ADULTS'] = np.nan

    # CHILDREN: 88 = None (0 children), 1-87 = actual count, anything else NaN
    if 'CHILDREN' in dfc.columns:
        children = pd.to_numeric(dfc['CHILDREN'], errors='coerce')
        dfc['NUM_CHILDREN'] = children.where(children.between(1, 87)).mask(children == 88, 0)
    else:
        dfc['NUM_CHILDREN'] = np.nan

    # Household size (adults + children); NaN whenever either part is NaN
    dfc['HOUSEHOLD_SIZE'] = dfc['NUM_ADULTS'] + dfc['NUM_CHILDREN']

    # Binary indicator: household with children (NaN where the count is unknown)
    num_children = dfc['NUM_CHILDREN']
    dfc['HAS_CHILDREN'] = (num_children > 0).astype(float).where(num_children.notna())

    # ensure weight numeric
    if '_LLCPWT' in dfc.columns:
        dfc['_LLCPWT'] = pd.to_numeric(dfc['_LLCPWT'], errors='coerce')
    else:
        dfc['_LLCPWT'] = np.nan

    # ensure IYEAR exists: fall back to the year embedded in the dataset name
    if 'IYEAR' not in dfc.columns:
        try:
            dfc['IYEAR'] = int(name.replace('LLCP',''))
        except Exception:
            dfc['IYEAR'] = np.nan

    dfc['Age_Group'] = _age_group(dfc)
    return dfc


datasets = globals().get('data_reloaded', {})
if not datasets:
    raise RuntimeError('No `data_reloaded` found in notebook. Run the reload cell (LLCP2020-2024) first.')

# Clean every loaded dataset, keeping the original columns for deeper work
cleaned = {}
for name, df in datasets.items():
    cleaned[name] = _clean_frame(name, df)

# expose to globals
globals()['data_cleaned'] = cleaned

# Per-dataset summary: shape, the first 40 column names, and a five-row look at
# whichever identifier/derived columns the frame actually has.
summary_cols = ['IYEAR','_STATE','BMI','DIABETES','EVER_SMOKER','CURRENT_SMOKER','Age_Group','NUM_ADULTS','NUM_CHILDREN','HOUSEHOLD_SIZE','HAS_CHILDREN','_LLCPWT']
for k, v in cleaned.items():
    print(f"{k}: shape={v.shape}")
    print('Columns sample:', list(v.columns)[:40])
    available = [c for c in summary_cols if c in v.columns]
    display(v[available].head(5))

print('\nField mapping & missing-value replacement complete. `data_cleaned` available in globals().')

LLCP2020: shape=(401958, 298)
Columns sample: ['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE', 'SEQNO', '_PSU', 'CTELENM1', 'PVTRESD1', 'COLGHOUS', 'STATERE1', 'CELPHONE', 'LADULT1', 'COLGSEX', 'NUMADULT', 'LANDSEX', 'NUMMEN', 'NUMWOMEN', 'RESPSLCT', 'SAFETIME', 'CTELNUM1', 'CELLFON5', 'CADULT1', 'CELLSEX', 'PVTRESD3', 'CCLGHOUS', 'CSTATE1', 'LANDLINE', 'HHADULT', 'SEXVAR', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH', 'HLTHPLN1', 'PERSDOC2', 'MEDCOST', 'CHECKUP1']


Unnamed: 0,IYEAR,_STATE,BMI,DIABETES,EVER_SMOKER,CURRENT_SMOKER,Age_Group,NUM_ADULTS,NUM_CHILDREN,HOUSEHOLD_SIZE,HAS_CHILDREN,_LLCPWT
0,2020,1.0,16.6,1.0,1.0,1.0,8.0,,0.0,,0.0,284.335672
1,2020,1.0,29.18,,,,10.0,,0.0,,0.0,171.283329
2,2020,1.0,,,0.0,,10.0,,0.0,,0.0,1334.368863
3,2020,1.0,,,0.0,,13.0,,0.0,,0.0,1297.486618
4,2020,1.0,20.34,,0.0,,13.0,,0.0,,0.0,454.815127


LLCP2021: shape=(438693, 322)
Columns sample: ['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE', 'SEQNO', '_PSU', 'CTELENM1', 'PVTRESD1', 'COLGHOUS', 'STATERE1', 'CELPHON1', 'LADULT1', 'COLGSEX', 'NUMADULT', 'LANDSEX', 'NUMMEN', 'NUMWOMEN', 'RESPSLCT', 'SAFETIME', 'CTELNUM1', 'CELLFON5', 'CADULT1', 'CELLSEX', 'PVTRESD3', 'CCLGHOUS', 'CSTATE1', 'LANDLINE', 'HHADULT', 'SEXVAR', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH', 'PRIMINSR', 'PERSDOC3', 'MEDCOST1', 'CHECKUP1']


Unnamed: 0,IYEAR,_STATE,BMI,DIABETES,EVER_SMOKER,CURRENT_SMOKER,Age_Group,NUM_ADULTS,NUM_CHILDREN,HOUSEHOLD_SIZE,HAS_CHILDREN,_LLCPWT
0,2021,1.0,14.54,,1.0,0.0,11.0,,0.0,,0.0,744.745531
1,2021,1.0,,1.0,0.0,,10.0,,0.0,,0.0,299.137394
2,2021,1.0,28.29,1.0,0.0,,11.0,,0.0,,0.0,587.862986
3,2021,1.0,33.47,1.0,0.0,,9.0,,0.0,,0.0,1099.621573
4,2021,1.0,28.73,1.0,0.0,,12.0,,0.0,,0.0,1711.825866


LLCP2022: shape=(445132, 347)
Columns sample: ['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE', 'SEQNO', '_PSU', 'CTELENM1', 'PVTRESD1', 'COLGHOUS', 'STATERE1', 'CELPHON1', 'LADULT1', 'COLGSEX1', 'NUMADULT', 'LANDSEX1', 'NUMMEN', 'NUMWOMEN', 'RESPSLCT', 'SAFETIME', 'CTELNUM1', 'CELLFON5', 'CADULT1', 'CELLSEX1', 'PVTRESD3', 'CCLGHOUS', 'CSTATE1', 'LANDLINE', 'HHADULT', 'SEXVAR', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH', 'PRIMINSR', 'PERSDOC3', 'MEDCOST1', 'CHECKUP1']


Unnamed: 0,IYEAR,_STATE,BMI,DIABETES,EVER_SMOKER,CURRENT_SMOKER,Age_Group,NUM_ADULTS,NUM_CHILDREN,HOUSEHOLD_SIZE,HAS_CHILDREN,_LLCPWT
0,2022,1.0,,1.0,0.0,,13.0,,0.0,,0.0,487.612985
1,2022,1.0,26.57,,0.0,,13.0,,0.0,,0.0,432.100273
2,2022,1.0,25.61,,0.0,,8.0,,0.0,,0.0,366.743194
3,2022,1.0,23.3,,1.0,1.0,14.0,,0.0,,0.0,1681.791487
4,2022,1.0,21.77,,0.0,,5.0,,0.0,,0.0,2111.206286


LLCP2023: shape=(433323, 369)
Columns sample: ['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE', 'SEQNO', '_PSU', 'CTELENM1', 'PVTRESD1', 'COLGHOUS', 'STATERE1', 'CELPHON1', 'LADULT1', 'NUMADULT', 'RESPSLC1', 'LANDSEX2', 'LNDSXBRT', 'SAFETIME', 'CTELNUM1', 'CELLFON5', 'CADULT1', 'CELLSEX2', 'CELSXBRT', 'PVTRESD3', 'CCLGHOUS', 'CSTATE1', 'LANDLINE', 'HHADULT', 'SEXVAR', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH', 'PRIMINS1', 'PERSDOC3', 'MEDCOST1', 'CHECKUP1', 'EXERANY2']


Unnamed: 0,IYEAR,_STATE,BMI,DIABETES,EVER_SMOKER,CURRENT_SMOKER,Age_Group,NUM_ADULTS,NUM_CHILDREN,HOUSEHOLD_SIZE,HAS_CHILDREN,_LLCPWT
0,2023,1.0,30.47,1.0,0.0,,13.0,,0.0,,0.0,605.427887
1,2023,1.0,28.56,,0.0,,13.0,,0.0,,0.0,1121.992705
2,2023,1.0,22.31,,1.0,0.0,13.0,,0.0,,0.0,600.963308
3,2023,1.0,27.44,,0.0,,12.0,,0.0,,0.0,605.427887
4,2023,1.0,25.85,1.0,0.0,,12.0,,0.0,,0.0,281.711042


LLCP2024: shape=(457670, 320)
Columns sample: ['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE', 'SEQNO', '_PSU', 'CTELENM1', 'PVTRESD1', 'COLGHOUS', 'STATERE1', 'CELPHON1', 'LADULT1', 'NUMADULT', 'RESPSLC1', 'LANDSEX3', 'SAFETIME', 'CTELNUM1', 'CELLFON5', 'CADULT1', 'CELLSEX3', 'PVTRESD3', 'CCLGHOUS', 'CSTATE1', 'LANDLINE', 'HHADULT', 'SEXVAR', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH', 'PRIMINS2', 'PERSDOC3', 'MEDCOST1', 'CHECKUP1', 'EXERANY2', 'LASTDEN4', 'RMVTETH4']


Unnamed: 0,IYEAR,_STATE,BMI,DIABETES,EVER_SMOKER,CURRENT_SMOKER,Age_Group,NUM_ADULTS,NUM_CHILDREN,HOUSEHOLD_SIZE,HAS_CHILDREN,_LLCPWT
0,2024,1.0,22.49,,0.0,,12.0,,0.0,,0.0,261.525511
1,2024,1.0,25.83,,1.0,0.0,13.0,,0.0,,0.0,307.169688
2,2024,1.0,22.53,,1.0,1.0,8.0,,0.0,,0.0,2939.862806
3,2024,1.0,25.09,,0.0,,13.0,,0.0,,0.0,153.584844
4,2024,1.0,19.77,,0.0,,6.0,,0.0,,0.0,1229.623036



Field mapping & missing-value replacement complete. `data_cleaned` available in globals().


In [None]:
# Weighted aggregation by Year x State x Age_Group (run this cell)
# Computes weighted means/counts for key indicators and saves per-year and combined CSV (parquet optional if available).
import numpy as np
import pandas as pd
from pathlib import Path
from IPython.display import display

# Output location for the aggregated tables; created up front so the
# later writes cannot fail on a missing directory.
DATA_DIR = Path("2433_p3_data/healthcare.gov")
exports = DATA_DIR.joinpath("exports", "aggregated")
exports.mkdir(parents=True, exist_ok=True)

# The cleaned frames are looked up in the kernel namespace under the name
# `data_cleaned`; an absent or empty value aborts the cell.
cleaned = globals().get('data_cleaned', {})
if not cleaned:
    raise RuntimeError(
        '`data_cleaned` not found in globals. Run the field-mapping cell first.'
    )

# One aggregated frame per source is appended here by the loop below.
agg_list = []

def weighted_mean(s, w):
    """Return the weighted average of ``s`` using weights ``w``.

    Both inputs are coerced to numeric (unparseable entries become NaN) and
    only positions where the value and the weight are both present take part
    in the average.

    Returns ``np.nan`` when no usable pair remains or when the usable weights
    sum to zero.
    """
    values = pd.to_numeric(s, errors='coerce')
    weights = pd.to_numeric(w, errors='coerce')

    # keep only observations that have both a value and a weight
    usable = values.notna() & weights.notna()
    if usable.sum() == 0:
        return np.nan

    kept_weights = weights[usable]
    # a zero weight total would make the average undefined
    if kept_weights.sum() == 0:
        return np.nan

    return np.average(values[usable], weights=kept_weights)

# Indicator columns aggregated for every source. Defined once, outside the
# loop, so the "make sure the column exists" step and the aggregation step
# always work from the same list.
metrics = ['BMI','EVER_SMOKER','CURRENT_SMOKER','DIABETES',
           'HEART_ATTACK','CHD','STROKE','ASTHMA','ASTHMA_NOW','COPD',
           'SKIN_CANCER','ANY_CANCER','KIDNEY_DISEASE','ARTHRITIS',
           'NUM_ADULTS','NUM_CHILDREN','HOUSEHOLD_SIZE','HAS_CHILDREN']

# Output column for each metric's weighted ratio. Insertion order is the
# column order of the exported table: mean BMI, then the prevalences
# (lowercase names for consistency with previous outputs), then the
# household-composition means.
ratio_columns = {'BMI': 'mean_BMI_w'}
ratio_columns.update({
    col: f"{col.lower()}_prev_w"
    for col in ['EVER_SMOKER','CURRENT_SMOKER','DIABETES','HEART_ATTACK','CHD',
                'STROKE','ASTHMA','ASTHMA_NOW','COPD','SKIN_CANCER','ANY_CANCER',
                'KIDNEY_DISEASE','ARTHRITIS','HAS_CHILDREN']
})
ratio_columns.update({
    col: f"mean_{col.lower()}_w"
    for col in ['NUM_ADULTS','NUM_CHILDREN','HOUSEHOLD_SIZE']
})


def safe_div(num, den):
    """Divide ``num`` by ``den`` element-wise, replacing +/-inf with NaN.

    A group whose weighted denominator is zero therefore ends up as NaN
    instead of inf in the exported table.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        out = num / den
    return out.replace([np.inf, -np.inf], np.nan)


# For every cleaned frame: compute weighted means/prevalences per
# (IYEAR, _STATE, Age_Group) group, write them to disk, and collect the result.
for name, df in cleaned.items():
    dfc = df.copy()

    # ensure weight: coerce to numeric the same way the metrics are coerced,
    # and fall back to unit weights when the column is absent or unusable
    if '_LLCPWT' in dfc.columns:
        dfc['_LLCPWT'] = pd.to_numeric(dfc['_LLCPWT'], errors='coerce')
    if '_LLCPWT' not in dfc.columns or dfc['_LLCPWT'].isna().all():
        dfc['_LLCPWT'] = 1.0

    # ensure grouping cols
    if 'Age_Group' not in dfc.columns:
        dfc['Age_Group'] = 'UNK'

    group_cols = [c for c in ['IYEAR','_STATE','Age_Group'] if c in dfc.columns]
    # Defensive only: Age_Group is always present after the step above.
    if not group_cols:
        print(f"Skipping {name}: no grouping columns present: {list(dfc.columns)[:20]}")
        continue

    # Vectorized weighted aggregation (avoid groupby.apply): build per-row
    # weighted numerator/denominator columns, then sum them per group.
    for col in metrics:
        # indicators missing from this source become all-NaN columns
        if col not in dfc.columns:
            dfc[col] = np.nan
        dfc[col] = pd.to_numeric(dfc[col], errors='coerce')
        # weighted numerator (treat NaN as 0 in numerator)
        dfc[f"{col}_wnum"] = dfc[col].fillna(0.0) * dfc['_LLCPWT']
        # weighted denominator: weight only where metric is present
        dfc[f"{col}_wden"] = dfc['_LLCPWT'] * dfc[col].notna().astype(float)

    # named aggregations: row count, weight total, and the per-metric sums
    agg_dict = {
        'n': ('_LLCPWT', 'size'),
        'n_weighted': ('_LLCPWT', 'sum'),
    }
    for col in metrics:
        agg_dict[f"{col}_num"] = (f"{col}_wnum", 'sum')
        agg_dict[f"{col}_den"] = (f"{col}_wden", 'sum')

    # NOTE(review): groupby uses pandas' default dropna=True, so rows with a
    # NaN in any grouping column are left out of every group - confirm that
    # this is intended.
    g = dfc.groupby(group_cols)
    agg = g.agg(**agg_dict).reset_index()

    # final weighted means / prevalences from the aggregated sums; every
    # metric has both a *_num and a *_den column by construction of agg_dict
    for col, out_name in ratio_columns.items():
        agg[out_name] = safe_div(agg[f"{col}_num"], agg[f"{col}_den"])

    # drop intermediate numerator/denominator columns
    drop_cols = [c for c in agg.columns if c.endswith(('_num', '_den'))]
    agg = agg.drop(columns=drop_cols)

    agg['source'] = name
    # save per-source: always write CSV; parquet is best-effort because it
    # needs an optional engine that may not be installed
    out_parquet = exports / f"aggregated_{name}.parquet"
    out_csv = exports / f"aggregated_{name}.csv"
    try:
        agg.to_csv(out_csv, index=False)
    except Exception as e:
        print(f"Warning: failed to save aggregated for {name}: {e}")
    else:
        try:
            agg.to_parquet(out_parquet, index=False)
            print(f"Saved aggregated for {name}: {out_parquet} rows={len(agg)} (parquet+csv)")
        except Exception:
            print(f"Saved aggregated for {name}: {out_csv} rows={len(agg)} (parquet skipped)")

    agg_list.append(agg)

# Stack the per-source aggregates into one table, persist it, and publish it
# to the kernel namespace as `aggregated`.
if not agg_list:
    print('No aggregated output produced (no groups found).')
else:
    aggregated_all = pd.concat(agg_list, ignore_index=True, sort=False)
    all_csv = exports / "aggregated_all_years.csv"
    all_parquet = exports / "aggregated_all_years.parquet"
    try:
        aggregated_all.to_csv(all_csv, index=False)
    except Exception as e:
        print(f"Warning: failed to save combined aggregated: {e}")
    else:
        # CSV is on disk; parquet is attempted on top and skipped on any error
        try:
            aggregated_all.to_parquet(all_parquet, index=False)
            print(f"Saved combined aggregated to {all_parquet} rows={len(aggregated_all)} (parquet+csv)")
        except Exception:
            print(f"Saved combined aggregated to {all_csv} rows={len(aggregated_all)} (parquet skipped)")
    # the preview and the global are produced even when saving failed
    display(aggregated_all.head(10))
    globals()['aggregated'] = aggregated_all


Saved aggregated for LLCP2020: 2433_p3_data/healthcare.gov/exports/aggregated/aggregated_LLCP2020.csv rows=1328 (parquet skipped)
Saved aggregated for LLCP2021: 2433_p3_data/healthcare.gov/exports/aggregated/aggregated_LLCP2021.csv rows=1422 (parquet skipped)
Saved aggregated for LLCP2021: 2433_p3_data/healthcare.gov/exports/aggregated/aggregated_LLCP2021.csv rows=1422 (parquet skipped)
Saved aggregated for LLCP2022: 2433_p3_data/healthcare.gov/exports/aggregated/aggregated_LLCP2022.csv rows=1480 (parquet skipped)
Saved aggregated for LLCP2022: 2433_p3_data/healthcare.gov/exports/aggregated/aggregated_LLCP2022.csv rows=1480 (parquet skipped)
Saved aggregated for LLCP2023: 2433_p3_data/healthcare.gov/exports/aggregated/aggregated_LLCP2023.csv rows=1384 (parquet skipped)
Saved aggregated for LLCP2023: 2433_p3_data/healthcare.gov/exports/aggregated/aggregated_LLCP2023.csv rows=1384 (parquet skipped)
Saved aggregated for LLCP2024: 2433_p3_data/healthcare.gov/exports/aggregated/aggregated_L

Unnamed: 0,IYEAR,_STATE,Age_Group,n,n_weighted,mean_BMI_w,ever_smoker_prev_w,current_smoker_prev_w,diabetes_prev_w,heart_attack_prev_w,chd_prev_w,stroke_prev_w,asthma_prev_w,asthma_now_prev_w,copd_prev_w,skin_cancer_prev_w,any_cancer_prev_w,kidney_disease_prev_w,arthritis_prev_w,source
0,2020,1.0,1.0,307,446509.12852,27.767332,0.219945,0.524461,0.806859,0.005437,0.006528,0.0,0.179758,0.587927,,,,0.0,0.064581,LLCP2020
1,2020,1.0,2.0,244,279574.605557,28.576676,0.363987,0.535601,0.847979,0.0,0.011115,0.017697,0.19398,0.445996,,,,0.011232,0.083737,LLCP2020
2,2020,1.0,3.0,320,342219.56981,30.085542,0.438455,0.508013,0.794171,0.0,0.009701,0.005929,0.177344,0.581398,,,,0.014755,0.113673,LLCP2020
3,2020,1.0,4.0,313,270409.535248,30.302965,0.522425,0.519121,0.746045,0.016937,0.016963,0.021908,0.110263,0.567614,,,,0.017117,0.164671,LLCP2020
4,2020,1.0,5.0,332,308964.846236,30.359079,0.413647,0.695448,0.811031,0.01825,0.01675,0.009967,0.163075,0.682882,,,,0.025992,0.238604,LLCP2020
5,2020,1.0,6.0,346,245548.269815,30.817545,0.481296,0.556599,0.931223,0.027647,0.033338,0.020781,0.134755,0.602591,,,,0.029449,0.301246,LLCP2020
6,2020,1.0,7.0,439,317080.285988,30.698504,0.452268,0.428541,0.940291,0.057123,0.044794,0.06651,0.113998,0.642616,,,,0.066419,0.348664,LLCP2020
7,2020,1.0,8.0,452,272048.619095,29.741573,0.507288,0.485755,0.983735,0.0696,0.043501,0.045325,0.126766,0.7968,,,,0.045784,0.506383,LLCP2020
8,2020,1.0,9.0,547,359959.055572,29.961831,0.48386,0.394119,0.974725,0.08329,0.090233,0.083022,0.177178,0.705363,,,,0.077004,0.554145,LLCP2020
9,2020,1.0,10.0,615,277014.562571,30.019158,0.489529,0.329248,1.0,0.097934,0.093272,0.067625,0.118746,0.721661,,,,0.039688,0.525308,LLCP2020


In [12]:
# Inspect exported aggregated files and show first 5 rows for each; also display aggregated_all_years.csv head(20)
from pathlib import Path
import pandas as pd
from IPython.display import display

exports = Path("2433_p3_data/healthcare.gov/exports/aggregated")
print(f"Looking in: {exports}")
if exports.exists():
    files = sorted(exports.glob("*"))
    if files:
        # Walk every entry: print name and size, then preview the first rows
        # of anything that is a CSV or parquet file.
        for f in files:
            print('\n' + '='*80)
            try:
                size = f.stat().st_size
            except Exception:
                size = 'unknown'
            print(f"File: {f.name}  size: {size}")

            suffix = f.suffix.lower()
            if suffix not in ('.csv', '.parquet', '.pq'):
                print("  Skipping unsupported file type")
                continue

            try:
                if suffix == '.csv':
                    df = pd.read_csv(f)
                else:
                    # parquet read failures get their own message
                    try:
                        df = pd.read_parquet(f)
                    except Exception as e:
                        print(f"  Could not read parquet {f.name}: {e}")
                        continue
                display(df.head(5))
            except Exception as e:
                print(f"  Error reading {f.name}: {e}")
    else:
        print("No files found in exports directory.")
else:
    print("Exports directory not found.")

# Now explicitly read aggregated_all_years.csv and show first 20 rows (if present)
all_file = exports / 'aggregated_all_years.csv'
print('\n' + '='*80)
print(f"Inspecting combined file: {all_file}")
if not all_file.exists():
    print('Combined aggregated CSV not found.')
else:
    try:
        df_all = pd.read_csv(all_file, low_memory=False)
        # summary: dimensions first, then the dtype of every column
        print(f"Shape: {df_all.shape}")
        print('\nColumn dtypes:')
        print(df_all.dtypes.to_string())
        print('\nFirst 20 rows:')
        first_rows = df_all.head(20)
        display(first_rows)
        # keep the same 20-row preview available in the kernel namespace
        globals()['preview_aggregated_all_years'] = first_rows
    except Exception as e:
        print(f"Failed to read {all_file}: {e}")


Looking in: 2433_p3_data/healthcare.gov/exports/aggregated

File: aggregated_LLCP2020.csv  size: 289466


Unnamed: 0,IYEAR,_STATE,Age_Group,n,n_weighted,mean_BMI_w,ever_smoker_prev_w,current_smoker_prev_w,diabetes_prev_w,heart_attack_prev_w,chd_prev_w,stroke_prev_w,asthma_prev_w,asthma_now_prev_w,copd_prev_w,skin_cancer_prev_w,any_cancer_prev_w,kidney_disease_prev_w,arthritis_prev_w,source
0,2020,1.0,1.0,307,446509.12852,27.767332,0.219945,0.524461,0.806859,0.005437,0.006528,0.0,0.179758,0.587927,,,,0.0,0.064581,LLCP2020
1,2020,1.0,2.0,244,279574.605557,28.576676,0.363987,0.535601,0.847979,0.0,0.011115,0.017697,0.19398,0.445996,,,,0.011232,0.083737,LLCP2020
2,2020,1.0,3.0,320,342219.56981,30.085542,0.438455,0.508013,0.794171,0.0,0.009701,0.005929,0.177344,0.581398,,,,0.014755,0.113673,LLCP2020
3,2020,1.0,4.0,313,270409.535248,30.302965,0.522425,0.519121,0.746045,0.016937,0.016963,0.021908,0.110263,0.567614,,,,0.017117,0.164671,LLCP2020
4,2020,1.0,5.0,332,308964.846236,30.359079,0.413647,0.695448,0.811031,0.01825,0.01675,0.009967,0.163075,0.682882,,,,0.025992,0.238604,LLCP2020



File: aggregated_LLCP2021.csv  size: 319661


Unnamed: 0,IYEAR,_STATE,Age_Group,n,n_weighted,mean_BMI_w,ever_smoker_prev_w,current_smoker_prev_w,diabetes_prev_w,heart_attack_prev_w,chd_prev_w,stroke_prev_w,asthma_prev_w,asthma_now_prev_w,copd_prev_w,skin_cancer_prev_w,any_cancer_prev_w,kidney_disease_prev_w,arthritis_prev_w,source
0,2021,1.0,1.0,246,455225.47175,26.835264,0.125732,0.406041,1.0,0.008653,0.002386,0.013856,0.247029,0.467694,0.006174,,,0.019893,,LLCP2021
1,2021,1.0,2.0,215,275459.940396,28.936857,0.381659,0.455662,0.810238,0.012384,0.006573,0.011867,0.140502,0.623512,0.041525,,,0.0,,LLCP2021
2,2021,1.0,3.0,243,352086.578917,31.213439,0.436647,0.521387,0.860341,0.003943,0.007928,0.012094,0.177566,0.587017,0.044419,,,0.013787,,LLCP2021
3,2021,1.0,4.0,263,285745.476339,29.41172,0.551144,0.474051,0.727418,0.004779,0.004723,0.004556,0.16704,0.604679,0.055182,,,0.011219,,LLCP2021
4,2021,1.0,5.0,272,303784.728397,30.980626,0.48755,0.505322,0.770837,0.01524,0.013639,0.028203,0.137905,0.599995,0.081714,,,0.030828,,LLCP2021



File: aggregated_LLCP2022.csv  size: 408291


Unnamed: 0,IYEAR,_STATE,Age_Group,n,n_weighted,mean_BMI_w,ever_smoker_prev_w,current_smoker_prev_w,diabetes_prev_w,heart_attack_prev_w,chd_prev_w,stroke_prev_w,asthma_prev_w,asthma_now_prev_w,copd_prev_w,skin_cancer_prev_w,any_cancer_prev_w,kidney_disease_prev_w,arthritis_prev_w,source
0,2022,1.0,1.0,183,454121.15733,25.889748,0.177619,0.56797,0.73686,0.00346,0.013614,0.0,0.251174,0.366975,0.026103,0.0,0.005682,0.006893,0.05396,LLCP2022
1,2022,1.0,2.0,188,281651.686825,28.63,0.350211,0.565021,0.652435,0.0,0.0,0.006467,0.147793,0.270781,0.013507,0.005634,0.043476,0.004392,0.046219,LLCP2022
2,2022,1.0,3.0,210,328113.350772,29.7493,0.378565,0.508624,0.744588,0.013431,0.020196,0.004323,0.177712,0.635593,0.055865,0.011369,0.004254,0.00655,0.119443,LLCP2022
3,2022,1.0,4.0,244,259496.142017,30.101444,0.480568,0.430893,0.718615,0.009828,0.01148,0.039332,0.154961,0.678373,0.04767,0.0382,0.021531,0.017869,0.155199,LLCP2022
4,2022,1.0,5.0,239,318626.332734,30.716991,0.403497,0.423352,0.798838,0.019669,0.027593,0.037286,0.189628,0.599878,0.047076,0.009415,0.032501,0.022721,0.237833,LLCP2022



File: aggregated_LLCP2023.csv  size: 372404


Unnamed: 0,IYEAR,_STATE,Age_Group,n,n_weighted,mean_BMI_w,ever_smoker_prev_w,current_smoker_prev_w,diabetes_prev_w,heart_attack_prev_w,chd_prev_w,stroke_prev_w,asthma_prev_w,asthma_now_prev_w,copd_prev_w,skin_cancer_prev_w,any_cancer_prev_w,kidney_disease_prev_w,arthritis_prev_w,source
0,2023,1.0,1.0,209,484020.001912,26.891099,0.172098,0.314988,1.0,0.003473,0.005302,0.005875,0.208682,0.31317,0.017636,0.0,0.004795,0.008516,0.036032,LLCP2023
1,2023,1.0,2.0,184,250583.144045,29.522418,0.327021,0.330862,0.927231,0.012414,0.004466,0.005842,0.160245,0.349066,0.05163,0.004466,0.003143,0.005935,0.062615,LLCP2023
2,2023,1.0,3.0,225,332833.966697,29.496791,0.381597,0.501178,0.791206,0.003985,0.010915,0.019973,0.177149,0.614281,0.025973,0.011339,0.021079,0.022813,0.084018,LLCP2023
3,2023,1.0,4.0,237,278739.639803,30.207312,0.406061,0.362777,0.700231,0.002737,0.0,0.021764,0.154289,0.639411,0.028147,0.024313,0.022782,0.004436,0.111745,LLCP2023
4,2023,1.0,5.0,242,302276.268894,30.479497,0.42269,0.42105,0.930196,0.024374,0.020963,0.033907,0.186516,0.616666,0.04918,0.023979,0.031354,0.014898,0.212405,LLCP2023



File: aggregated_LLCP2024.csv  size: 365955


Unnamed: 0,IYEAR,_STATE,Age_Group,n,n_weighted,mean_BMI_w,ever_smoker_prev_w,current_smoker_prev_w,diabetes_prev_w,heart_attack_prev_w,chd_prev_w,stroke_prev_w,asthma_prev_w,asthma_now_prev_w,copd_prev_w,skin_cancer_prev_w,any_cancer_prev_w,kidney_disease_prev_w,arthritis_prev_w,source
0,2024,1.0,1.0,255,444478.657665,27.448627,0.11851,0.499615,0.744914,0.0,0.006192,0.015196,0.219736,0.57117,0.017695,0.005708,0.0,0.020168,0.051736,LLCP2024
1,2024,1.0,2.0,218,269454.604133,29.788517,0.274015,0.328217,0.786544,0.0,0.010605,0.005479,0.245704,0.681906,0.037683,0.00633,0.018915,0.005517,0.088235,LLCP2024
2,2024,1.0,3.0,233,322273.979829,29.855462,0.476065,0.341489,0.706446,0.005006,0.003896,0.008389,0.198206,0.610096,0.045365,0.01534,0.013734,0.0,0.129534,LLCP2024
3,2024,1.0,4.0,281,260516.572064,30.185772,0.458301,0.460292,0.702095,0.015017,0.013894,0.011217,0.159296,0.624612,0.054405,0.025471,0.017518,0.025292,0.162239,LLCP2024
4,2024,1.0,5.0,296,299162.062568,29.923819,0.45237,0.378697,0.863163,0.042803,0.012438,0.022835,0.144471,0.58393,0.043331,0.018897,0.023558,0.028511,0.243044,LLCP2024



File: aggregated_all_years.csv  size: 1754673


Unnamed: 0,IYEAR,_STATE,Age_Group,n,n_weighted,mean_BMI_w,ever_smoker_prev_w,current_smoker_prev_w,diabetes_prev_w,heart_attack_prev_w,chd_prev_w,stroke_prev_w,asthma_prev_w,asthma_now_prev_w,copd_prev_w,skin_cancer_prev_w,any_cancer_prev_w,kidney_disease_prev_w,arthritis_prev_w,source
0,2020,1.0,1.0,307,446509.12852,27.767332,0.219945,0.524461,0.806859,0.005437,0.006528,0.0,0.179758,0.587927,,,,0.0,0.064581,LLCP2020
1,2020,1.0,2.0,244,279574.605557,28.576676,0.363987,0.535601,0.847979,0.0,0.011115,0.017697,0.19398,0.445996,,,,0.011232,0.083737,LLCP2020
2,2020,1.0,3.0,320,342219.56981,30.085542,0.438455,0.508013,0.794171,0.0,0.009701,0.005929,0.177344,0.581398,,,,0.014755,0.113673,LLCP2020
3,2020,1.0,4.0,313,270409.535248,30.302965,0.522425,0.519121,0.746045,0.016937,0.016963,0.021908,0.110263,0.567614,,,,0.017117,0.164671,LLCP2020
4,2020,1.0,5.0,332,308964.846236,30.359079,0.413647,0.695448,0.811031,0.01825,0.01675,0.009967,0.163075,0.682882,,,,0.025992,0.238604,LLCP2020



Inspecting combined file: 2433_p3_data/healthcare.gov/exports/aggregated/aggregated_all_years.csv
Shape: (6971, 20)

Column dtypes:
IYEAR                      int64
_STATE                   float64
Age_Group                float64
n                          int64
n_weighted               float64
mean_BMI_w               float64
ever_smoker_prev_w       float64
current_smoker_prev_w    float64
diabetes_prev_w          float64
heart_attack_prev_w      float64
chd_prev_w               float64
stroke_prev_w            float64
asthma_prev_w            float64
asthma_now_prev_w        float64
copd_prev_w              float64
skin_cancer_prev_w       float64
any_cancer_prev_w        float64
kidney_disease_prev_w    float64
arthritis_prev_w         float64
source                    object

First 20 rows:


Unnamed: 0,IYEAR,_STATE,Age_Group,n,n_weighted,mean_BMI_w,ever_smoker_prev_w,current_smoker_prev_w,diabetes_prev_w,heart_attack_prev_w,chd_prev_w,stroke_prev_w,asthma_prev_w,asthma_now_prev_w,copd_prev_w,skin_cancer_prev_w,any_cancer_prev_w,kidney_disease_prev_w,arthritis_prev_w,source
0,2020,1.0,1.0,307,446509.12852,27.767332,0.219945,0.524461,0.806859,0.005437,0.006528,0.0,0.179758,0.587927,,,,0.0,0.064581,LLCP2020
1,2020,1.0,2.0,244,279574.605557,28.576676,0.363987,0.535601,0.847979,0.0,0.011115,0.017697,0.19398,0.445996,,,,0.011232,0.083737,LLCP2020
2,2020,1.0,3.0,320,342219.56981,30.085542,0.438455,0.508013,0.794171,0.0,0.009701,0.005929,0.177344,0.581398,,,,0.014755,0.113673,LLCP2020
3,2020,1.0,4.0,313,270409.535248,30.302965,0.522425,0.519121,0.746045,0.016937,0.016963,0.021908,0.110263,0.567614,,,,0.017117,0.164671,LLCP2020
4,2020,1.0,5.0,332,308964.846236,30.359079,0.413647,0.695448,0.811031,0.01825,0.01675,0.009967,0.163075,0.682882,,,,0.025992,0.238604,LLCP2020
5,2020,1.0,6.0,346,245548.269815,30.817545,0.481296,0.556599,0.931223,0.027647,0.033338,0.020781,0.134755,0.602591,,,,0.029449,0.301246,LLCP2020
6,2020,1.0,7.0,439,317080.285988,30.698504,0.452268,0.428541,0.940291,0.057123,0.044794,0.06651,0.113998,0.642616,,,,0.066419,0.348664,LLCP2020
7,2020,1.0,8.0,452,272048.619095,29.741573,0.507288,0.485755,0.983735,0.0696,0.043501,0.045325,0.126766,0.7968,,,,0.045784,0.506383,LLCP2020
8,2020,1.0,9.0,547,359959.055572,29.961831,0.48386,0.394119,0.974725,0.08329,0.090233,0.083022,0.177178,0.705363,,,,0.077004,0.554145,LLCP2020
9,2020,1.0,10.0,615,277014.562571,30.019158,0.489529,0.329248,1.0,0.097934,0.093272,0.067625,0.118746,0.721661,,,,0.039688,0.525308,LLCP2020


In [22]:
# Export ACA-allowed variables from combined aggregated table
# Only keep groups that contain household-derived aggregated values.
from pathlib import Path
import pandas as pd
from IPython.display import display

exports_base = Path("2433_p3_data/healthcare.gov/exports")
aca_dir = exports_base.joinpath("aca_allowed")
aca_dir.mkdir(parents=True, exist_ok=True)

agg_file = exports_base.joinpath("aggregated", "aggregated_all_years.csv")
if agg_file.exists():
    # low_memory=False so dtypes are inferred from the whole file at once
    df = pd.read_csv(agg_file, low_memory=False)
    print(f"Loaded aggregated_all_years.csv shape={df.shape}")

    # columns that make up the ACA-allowed export
    allowed_cols = [
        'IYEAR',                  # interview year
        '_STATE',                 # state FIPS (geography)
        'Age_Group',              # age group (if available)
        'n',                      # sample count
        'n_weighted',             # weighted sample count
        'ever_smoker_prev_w',
        'current_smoker_prev_w',
        'source',                 # original source/year tag
    ]

    present = [c for c in allowed_cols if c in df.columns]
    missing = [c for c in allowed_cols if c not in df.columns]
    print(f"Present columns: {present}")
    if missing:
        print(f"Warning - some allowed columns missing from aggregated data: {missing}")

    # household-derived aggregate columns, limited to those actually in the file
    household_candidates = ['mean_num_adults_w','mean_num_children_w','mean_household_size_w','has_children_prev_w']
    household_present = [c for c in household_candidates if c in df.columns]
    if household_present:
        print(f"Found household aggregate columns: {household_present}")

        # ACA columns followed by the household columns
        cols_to_keep = [*present, *household_present]
        df_hw = df[cols_to_keep].copy()

        # keep a group only if at least one household value is non-null
        mask = df_hw[household_present].notna().any(axis=1)
        df_hw = df_hw[mask].reset_index(drop=True)
        print(f"Groups with household data: {len(df_hw)} (out of {len(df)})")

        if not df_hw.empty:
            # Age_Group is exported as text; the string 'nan' becomes 'UNK'
            if 'Age_Group' in df_hw.columns:
                age_labels = df_hw['Age_Group'].astype(str)
                df_hw['Age_Group'] = age_labels.replace({'nan':'UNK'})

            # write the household-filtered export
            out_csv = aca_dir / 'aggregated_aca_allowed.csv'
            try:
                df_hw.to_csv(out_csv, index=False)
            except Exception as e:
                print(f"Failed to write household-filtered ACA export: {e}")
            else:
                print(f"Wrote household-filtered ACA export to: {out_csv}  shape={df_hw.shape}")

            # 50-row preview for interactive use, 10 rows shown inline
            globals()['preview_aca_allowed'] = df_hw.head(50)
            print('\nPreview (ACA - first 10 rows):')
            display(df_hw.head(10))
        else:
            print('After filtering, no groups contain household data. No file written.')
    else:
        print('No household aggregate columns found in combined aggregated file - no household-filtered ACA export will be written.')
else:
    print(f"Aggregated file not found: {agg_file}")


Loaded aggregated_all_years.csv shape=(6971, 24)
Present columns: ['IYEAR', '_STATE', 'Age_Group', 'n', 'n_weighted', 'ever_smoker_prev_w', 'current_smoker_prev_w', 'source']
Found household aggregate columns: ['mean_num_adults_w', 'mean_num_children_w', 'mean_household_size_w', 'has_children_prev_w']
Groups with household data: 6969 (out of 6971)
Wrote household-filtered ACA export to: 2433_p3_data/healthcare.gov/exports/aca_allowed/aggregated_aca_allowed.csv  shape=(6969, 12)

Preview (ACA - first 10 rows):


Unnamed: 0,IYEAR,_STATE,Age_Group,n,n_weighted,ever_smoker_prev_w,current_smoker_prev_w,source,mean_num_adults_w,mean_num_children_w,mean_household_size_w,has_children_prev_w
0,2020,1.0,1.0,307,446509.12852,0.219945,0.524461,LLCP2020,2.563891,0.554231,3.134054,0.403329
1,2020,1.0,2.0,244,279574.605557,0.363987,0.535601,LLCP2020,2.016937,1.013901,3.045903,0.53815
2,2020,1.0,3.0,320,342219.56981,0.438455,0.508013,LLCP2020,1.992775,1.478668,3.499861,0.67189
3,2020,1.0,4.0,313,270409.535248,0.522425,0.519121,LLCP2020,2.031705,1.622951,3.654077,0.735609
4,2020,1.0,5.0,332,308964.846236,0.413647,0.695448,LLCP2020,2.153344,1.412142,3.515449,0.700762
5,2020,1.0,6.0,346,245548.269815,0.481296,0.556599,LLCP2020,2.194885,0.836456,3.022588,0.488738
6,2020,1.0,7.0,439,317080.285988,0.452268,0.428541,LLCP2020,2.314699,0.472963,2.810449,0.297377
7,2020,1.0,8.0,452,272048.619095,0.507288,0.485755,LLCP2020,2.169307,0.279215,2.451525,0.171557
8,2020,1.0,9.0,547,359959.055572,0.48386,0.394119,LLCP2020,2.111464,0.222556,2.354999,0.146164
9,2020,1.0,10.0,615,277014.562571,0.489529,0.329248,LLCP2020,1.950278,0.166994,2.105452,0.089273
