# Structured Dataset Builder (Notebook)

This notebook contains the episode-level dataset builder used to assemble a cohort for synthetic discharge summary generation. It mirrors `scripts/build_structured_dataset.py` and adds an example run cell at the end.

- Grain: one row per `hadm_id` (admission)
- Filters: discharge to `HOME` or `HOME HEALTH CARE`; ICD-10 only; ≥3 dx codes
- Aggregations: diagnoses (+descriptions), procedures, discharge-like medications, curated labs last 48h (optional)
- Outputs: train/val/test parquet files + schema + cohort stats



In [2]:
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Set, Tuple

import numpy as np
import pandas as pd


In [3]:
@dataclass(frozen=True)
class Paths:
    project_root: Path

    @property
    def data_dir(self) -> Path:
        return self.project_root / "data"

    @property
    def processed_dir(self) -> Path:
        return self.data_dir / "processed"

    @property
    def admissions(self) -> Path:
        return self.data_dir / "admissions.csv"

    @property
    def patients(self) -> Path:
        return self.data_dir / "patients.csv"

    @property
    def diagnoses_icd(self) -> Path:
        return self.data_dir / "diagnoses_icd.csv"

    @property
    def d_icd(self) -> Path:
        return self.data_dir / "d_icd_diagnoses.csv"

    @property
    def procedures_icd(self) -> Path:
        return self.data_dir / "procedures_icd.csv"

    @property
    def prescriptions(self) -> Path:
        return self.data_dir / "prescriptions.csv"

    @property
    def labevents(self) -> Path:
        return self.data_dir / "labevents.csv"

    @property
    def chartevents(self) -> Path:
        return self.data_dir / "chartevents.csv"

    @property
    def d_items(self) -> Path:
        return self.data_dir / "d_items.csv"

    @property
    def out_train(self) -> Path:
        return self.processed_dir / "structured_dataset.train.parquet"

    @property
    def out_val(self) -> Path:
        return self.processed_dir / "structured_dataset.val.parquet"

    @property
    def out_test(self) -> Path:
        return self.processed_dir / "structured_dataset.test.parquet"

    @property
    def out_schema(self) -> Path:
        return self.processed_dir / "structured_dataset.schema.json"

    @property
    def out_stats(self) -> Path:
        return self.project_root / "results" / "evaluation" / "patient_cohort_stats.json"


In [4]:
def ensure_dirs(paths: Paths) -> None:
    paths.processed_dir.mkdir(parents=True, exist_ok=True)
    (paths.project_root / "results" / "evaluation").mkdir(parents=True, exist_ok=True)


def filter_home_health_dispositions(adm: pd.DataFrame) -> pd.DataFrame:
    # HOME / HOME HEALTH CARE only (home health focus)
    mask = adm["discharge_location"].isin(["HOME", "HOME HEALTH CARE"])
    return adm.loc[mask].copy()


def load_admissions(paths: Paths) -> pd.DataFrame:
    usecols = [
        "subject_id",
        "hadm_id",
        "admittime",
        "dischtime",
        "admission_type",
        "discharge_location",
    ]
    df = pd.read_csv(paths.admissions, usecols=usecols, parse_dates=["admittime", "dischtime"], low_memory=False)
    df = filter_home_health_dispositions(df)
    df = df.dropna(subset=["subject_id", "hadm_id", "admittime", "dischtime"]).copy()
    return df


def load_patients(paths: Paths) -> pd.DataFrame:
    usecols = ["subject_id", "gender", "anchor_age", "dod"]
    df = pd.read_csv(paths.patients, usecols=usecols, parse_dates=["dod"], low_memory=False)
    return df


def attach_demographics(adm: pd.DataFrame, patients: pd.DataFrame) -> pd.DataFrame:
    merged = adm.merge(patients, on="subject_id", how="left")
    if "anchor_age" in merged.columns:
        merged["age_at_admit"] = merged["anchor_age"].astype("Int64")
    else:
        merged["age_at_admit"] = pd.Series([pd.NA] * len(merged), dtype="Int64")
    merged = merged.drop(columns=[c for c in ["anchor_age"] if c in merged.columns])
    return merged


In [5]:
def load_diagnoses(paths: Paths) -> Tuple[pd.DataFrame, pd.DataFrame]:
    dx = pd.read_csv(paths.diagnoses_icd, usecols=["subject_id", "hadm_id", "seq_num", "icd_code", "icd_version"], low_memory=False)
    dict_icd = pd.read_csv(paths.d_icd, usecols=["icd_code", "icd_version", "long_title"], low_memory=False)
    return dx, dict_icd


def filter_icd10(dx: pd.DataFrame) -> pd.DataFrame:
    return dx.loc[dx["icd_version"] == 10].copy()


def derive_primary_dx(dx10: pd.DataFrame) -> pd.DataFrame:
    dx10["_seq_rank"] = dx10["seq_num"].fillna(1e9)
    idx = dx10.sort_values(["hadm_id", "_seq_rank"]).groupby("hadm_id", as_index=False).head(1)
    primary = idx[["hadm_id", "icd_code"]].rename(columns={"icd_code": "primary_icd10"})
    primary = primary.drop_duplicates(subset=["hadm_id"])  # guard
    return primary


def aggregate_dx(dx10: pd.DataFrame, dict_icd: pd.DataFrame) -> pd.DataFrame:
    dx10 = dx10.merge(dict_icd, on=["icd_code", "icd_version"], how="left")
    agg = (
        dx10.groupby("hadm_id")
        .agg(
            icd10_codes=("icd_code", lambda s: list(pd.unique(s.dropna()))),
            icd10_descriptions=("long_title", lambda s: list(pd.unique(s.dropna()))),
            num_icd10_codes=("icd_code", "nunique"),
        )
        .reset_index()
    )
    return agg


In [6]:
def simple_pdgm_bucket(primary_icd10: str) -> str:
    if not isinstance(primary_icd10, str) or primary_icd10 == "":
        return "Unknown"
    prefix = primary_icd10[:3].upper()
    if prefix.startswith("E11"):
        return "Endocrine"
    if prefix.startswith("I50"):
        return "Cardiac & Circulatory"
    if prefix.startswith("J44"):
        return "Respiratory"
    if prefix.startswith("L89"):
        return "Wounds"
    if prefix.startswith("I69"):
        return "Neuro/Rehab"
    if prefix.startswith("Z"):
        return "Surgical Aftercare"
    if prefix.startswith("I10"):
        return "Cardiac & Circulatory"
    return "Other"


In [7]:
def load_procedures(paths: Paths) -> pd.DataFrame:
    usecols = ["subject_id", "hadm_id", "icd_code", "icd_version"]
    proc = pd.read_csv(paths.procedures_icd, usecols=usecols, low_memory=False)
    return proc


def aggregate_procedures(proc: pd.DataFrame) -> pd.DataFrame:
    agg = (
        proc.groupby("hadm_id")
        .agg(procedures_icd10=("icd_code", lambda s: list(pd.unique(s.dropna()))), num_procedures_total=("icd_code", "nunique"))
        .reset_index()
    )
    return agg


def load_prescriptions(paths: Paths) -> pd.DataFrame:
    usecols = ["subject_id", "hadm_id", "starttime", "stoptime", "drug"]
    prs = pd.read_csv(paths.prescriptions, usecols=usecols, parse_dates=["starttime", "stoptime"], low_memory=False)
    return prs


def compute_discharge_like_meds(
    prs: pd.DataFrame, admissions_subset: pd.DataFrame
) -> pd.DataFrame:
    merged = prs.merge(admissions_subset[["hadm_id", "admittime", "dischtime"]], on="hadm_id", how="inner")
    in_window = (merged["starttime"] <= merged["dischtime"]) & (merged["stoptime"].fillna(merged["dischtime"]) >= merged["admittime"])
    merged = merged.loc[in_window].copy()
    near_discharge = (
        (merged["stoptime"].notna() & ((merged["dischtime"] - merged["stoptime"]).dt.total_seconds().abs() <= 48 * 3600))
        | merged["stoptime"].isna()
    )
    merged = merged.loc[near_discharge].copy()
    merged.sort_values(["hadm_id", "drug", "starttime"], inplace=True)
    last_per_drug = merged.groupby(["hadm_id", "drug"], as_index=False).tail(1)
    agg = (
        last_per_drug.groupby("hadm_id")
        .agg(meds_discharge_like=("drug", lambda s: list(pd.unique(s.dropna()))), medication_count=("drug", "nunique"))
        .reset_index()
    )
    return agg


In [9]:
def load_d_items(paths: Paths) -> Optional[pd.DataFrame]:
    try:
        return pd.read_csv(paths.d_items, usecols=["itemid", "label", "linksto", "category", "unitname"], low_memory=False)
    except FileNotFoundError:
        return None


def select_lab_itemids(d_items: Optional[pd.DataFrame]) -> Dict[str, Set[int]]:
    targets = {
        "Glucose": {"GLUCOSE"},
        "Creatinine": {"CREATININE"},
        "Sodium": {"SODIUM"},
        "Potassium": {"POTASSIUM"},
        "WBC": {"WBC", "WHITE BLOOD"},
        "Hemoglobin": {"HEMOGLOBIN", "HGB"},
    }
    if d_items is None:
        return {}
    itemid_map: Dict[str, Set[int]] = {k: set() for k in targets}
    upper_labels = d_items[["itemid", "label", "linksto"]].copy()
    upper_labels["LABEL_UP"] = upper_labels["label"].astype(str).str.upper()
    for analyte, tokens in targets.items():
        hits = upper_labels.loc[
            upper_labels["LABEL_UP"].apply(lambda x: any(token in x for token in tokens))
        ]
        itemid_map[analyte] = set(hits["itemid"].astype(int).tolist())
    return itemid_map


def iter_labevents_for_hadm(paths: Paths, hadm_ids: Set[int], use_itemids: Optional[Set[int]] = None, chunksize: int = 500_000) -> Iterable[pd.DataFrame]:
    cols = ["subject_id", "hadm_id", "itemid", "charttime", "valuenum", "valueuom"]
    for chunk in pd.read_csv(paths.labevents, usecols=cols, parse_dates=["charttime"], chunksize=chunksize, low_memory=False):
        chunk = chunk.loc[chunk["hadm_id"].isin(hadm_ids)]
        if use_itemids:
            chunk = chunk.loc[chunk["itemid"].isin(list(use_itemids))]
        if not chunk.empty:
            yield chunk


def aggregate_labs_last48h(paths: Paths, admissions_subset: pd.DataFrame, d_items: Optional[pd.DataFrame]) -> pd.DataFrame:
    hadm_ids: Set[int] = set(admissions_subset["hadm_id"].astype(int).tolist())
    analyte_to_itemids = select_lab_itemids(d_items)
    target_itemids: Set[int] = set()
    for s in analyte_to_itemids.values():
        target_itemids.update(s)

    frames: List[pd.DataFrame] = []
    for chunk in iter_labevents_for_hadm(paths, hadm_ids, use_itemids=target_itemids if target_itemids else None):
        merged = chunk.merge(admissions_subset[["hadm_id", "dischtime"]], on="hadm_id", how="inner")
        within_48h = (merged["dischtime"] - merged["charttime"]).dt.total_seconds().between(0, 48 * 3600)
        merged = merged.loc[within_48h].copy()
        if not merged.empty:
            frames.append(merged)
    if not frames:
        return pd.DataFrame({"hadm_id": [], "labs_last48h": []})

    labs = pd.concat(frames, ignore_index=True)
    itemid_to_label: Dict[int, str] = {}
    for analyte, itemids in analyte_to_itemids.items():
        for iid in itemids:
            itemid_to_label[int(iid)] = analyte
    labs["analyte"] = labs["itemid"].map(itemid_to_label).fillna(labs["itemid"].astype(str))

    labs.sort_values(["hadm_id", "analyte", "charttime"], inplace=True)
    last_vals = labs.groupby(["hadm_id", "analyte"], as_index=False).tail(1)

    def to_struct(group: pd.DataFrame) -> Dict[str, Dict[str, Optional[float]]]:
        result: Dict[str, Dict[str, Optional[float]]] = {}
        for _, row in group.iterrows():
            result[str(row["analyte"]) ] = {"value": row["valuenum"] if pd.notna(row["valuenum"]) else None, "unit": row["valueuom"]}
        return result

    agg = (
        last_vals.groupby("hadm_id").apply(to_struct).reset_index(name="labs_last48h")
    )
    return agg


In [10]:
def build_dataset(
    paths: Paths,
    sample_target: int = 1500,
    seed: int = 17,
    skip_labs: bool = False,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, Dict]:
    # Load base tables
    admissions = load_admissions(paths)
    patients = load_patients(paths)
    admissions = attach_demographics(admissions, patients)
    admissions["length_of_stay_days"] = (admissions["dischtime"] - admissions["admittime"]).dt.total_seconds() / 86400.0

    # Diagnoses
    dx, dict_icd = load_diagnoses(paths)
    dx10 = filter_icd10(dx)
    # Keep only hadm_ids present in admissions subset to reduce memory
    dx10 = dx10.loc[dx10["hadm_id"].isin(set(admissions["hadm_id"].tolist()))].copy()
    # Code richness filter: >= 3 codes per hadm
    code_counts = dx10.groupby("hadm_id")["icd_code"].nunique().rename("num_icd10_codes")
    eligible_hadm = code_counts.loc[code_counts >= 3].index
    admissions = admissions.loc[admissions["hadm_id"].isin(set(map(int, eligible_hadm)))]

    # Recompute dx tables on filtered cohort
    dx10 = dx10.loc[dx10["hadm_id"].isin(set(admissions["hadm_id"].tolist()))].copy()
    primary = derive_primary_dx(dx10)
    agg_dx = aggregate_dx(dx10, dict_icd)
    epi = admissions.merge(agg_dx, on="hadm_id", how="inner")
    epi = epi.merge(primary, on="hadm_id", how="left")

    # Primary desc
    epi = epi.merge(
        dict_icd.rename(columns={"long_title": "primary_icd10_desc"})[["icd_code", "icd_version", "primary_icd10_desc"]]
        .rename(columns={"icd_code": "primary_icd10"}),
        on=["primary_icd10"],
        how="left",
    )

    # PDGM bucket
    epi["primary_pdgm_bucket_simple"] = epi["primary_icd10"].apply(simple_pdgm_bucket)

    # Procedures
    proc = load_procedures(paths)
    proc = proc.loc[proc["hadm_id"].isin(set(epi["hadm_id"].tolist()))].copy()
    agg_proc = aggregate_procedures(proc)
    epi = epi.merge(agg_proc, on="hadm_id", how="left")

    # Medications
    prs = load_prescriptions(paths)
    prs = prs.loc[prs["hadm_id"].isin(set(epi["hadm_id"].tolist()))].copy()
    meds = compute_discharge_like_meds(prs, epi[["hadm_id", "admittime", "dischtime"]])
    epi = epi.merge(meds, on="hadm_id", how="left")

    # Data sufficiency: ensure at least one of procedures, prescriptions, labs present
    has_proc = epi["num_procedures_total"].fillna(0) > 0
    has_meds = epi["medication_count"].fillna(0) > 0
    if skip_labs:
        has_labs = pd.Series(False, index=epi.index)
    else:
        # Labs (curated, last48h)
        d_items = load_d_items(paths)
        labs = aggregate_labs_last48h(paths, epi[["hadm_id", "dischtime"]], d_items)
        epi = epi.merge(labs, on="hadm_id", how="left")
        has_labs = epi["labs_last48h"].apply(lambda x: isinstance(x, dict) and len(x) > 0)

    epi = epi.loc[(has_proc | has_meds | has_labs)].copy()

    # Sample to target size with stratification by PDGM bucket
    rng = np.random.default_rng(seed)
    if len(epi) > sample_target:
        frames: List[pd.DataFrame] = []
        for bucket, dfb in epi.groupby("primary_pdgm_bucket_simple"):
            n = int(np.round(sample_target * len(dfb) / len(epi)))
            if n == 0:
                n = min(1, len(dfb))
            idx = rng.choice(dfb.index.to_numpy(), size=min(n, len(dfb)), replace=False)
            frames.append(dfb.loc[idx])
        sampled = pd.concat(frames, ignore_index=True)
        if len(sampled) < sample_target:
            remain = epi.drop(sampled.index, errors="ignore")
            need = sample_target - len(sampled)
            if need > 0 and len(remain) > 0:
                idx2 = rng.choice(remain.index.to_numpy(), size=min(need, len(remain)), replace=False)
                sampled = pd.concat([sampled, remain.loc[idx2]], ignore_index=True)
        epi = sampled

    # Train/val/test split (70/15/15) stratified by PDGM bucket
    def stratified_split(df: pd.DataFrame, seed: int) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        train_parts: List[pd.DataFrame] = []
        val_parts: List[pd.DataFrame] = []
        test_parts: List[pd.DataFrame] = []
        rng_local = np.random.default_rng(seed)
        for _, g in df.groupby("primary_pdgm_bucket_simple"):
            idx = g.index.to_numpy()
            rng_local.shuffle(idx)
            n = len(idx)
            n_train = int(np.floor(0.70 * n))
            n_val = int(np.floor(0.15 * n))
            train_idx = idx[:n_train]
            val_idx = idx[n_train:n_train + n_val]
            test_idx = idx[n_train + n_val:]
            train_parts.append(g.loc[train_idx])
            val_parts.append(g.loc[val_idx])
            test_parts.append(g.loc[test_idx])
        return (
            pd.concat(train_parts, ignore_index=True),
            pd.concat(val_parts, ignore_index=True),
            pd.concat(test_parts, ignore_index=True),
        )

    train_df, val_df, test_df = stratified_split(epi, seed)

    stats = {
        "num_rows": int(len(epi)),
        "by_pdgm_bucket": epi["primary_pdgm_bucket_simple"].value_counts().to_dict(),
        "demographics": {
            "gender_counts": epi["gender"].value_counts(dropna=False).to_dict(),
            "age_summary": epi["age_at_admit"].describe().to_dict(),
        },
        "modalities_presence": {
            "has_procedures": int(has_proc.sum()),
            "has_meds": int(has_meds.sum()),
            "has_labs": int(has_labs.sum()) if "has_labs" in locals() else 0,
        },
        "diagnoses": {
            "num_icd10_codes_summary": epi["num_icd10_codes"].describe().to_dict(),
            "top_primary_icd10": epi["primary_icd10"].value_counts().head(20).to_dict(),
        },
    }

    return train_df, val_df, test_df, stats


In [11]:
def write_outputs(train_df: pd.DataFrame, val_df: pd.DataFrame, test_df: pd.DataFrame, stats: Dict, paths: Paths) -> None:
    paths.processed_dir.mkdir(parents=True, exist_ok=True)
    train_df.to_parquet(paths.out_train, index=False)
    val_df.to_parquet(paths.out_val, index=False)
    test_df.to_parquet(paths.out_test, index=False)
    # lightweight schema from small sample
    sample_df = pd.concat([train_df, val_df, test_df], ignore_index=True).iloc[:100]
    schema = {
        "columns": [{"name": c, "dtype": str(sample_df[c].dtype)} for c in sample_df.columns],
        "n_rows": int(len(sample_df)),
        "generated_at": pd.Timestamp.utcnow().isoformat(),
    }
    paths.out_schema.write_text(json.dumps(schema, indent=2))
    (paths.project_root / "results" / "evaluation").mkdir(parents=True, exist_ok=True)
    with open(paths.out_stats, "w") as f:
        json.dump(stats, f, indent=2)


# Example run cell (skip labs for speed on first pass)
PROJECT_ROOT = Path("/Users/benjamindykstra/development/icd-10-coding")
paths = Paths(project_root=PROJECT_ROOT)
ensure_dirs(paths)

train_df, val_df, test_df, stats = build_dataset(paths, sample_target=1500, seed=17, skip_labs=True)
# write_outputs(train_df, val_df, test_df, stats, paths)

train_df.head(3)


Unnamed: 0,subject_id,hadm_id,admittime,dischtime,admission_type,discharge_location,gender,dod,age_at_admit,length_of_stay_days,...,icd10_descriptions,num_icd10_codes,primary_icd10,icd_version,primary_icd10_desc,primary_pdgm_bucket_simple,procedures_icd10,num_procedures_total,meds_discharge_like,medication_count
0,17504528,20171885,2137-06-17 20:40:00,2137-06-20 16:41:00,DIRECT EMER.,HOME,F,2142-07-24,56,2.834028,...,"[Chronic diastolic (congestive) heart failure,...",22,I5032,10,Chronic diastolic (congestive) heart failure,Cardiac & Circulatory,[0W9B30Z],1.0,"[5% Dextrose, Albuterol 0.083% Neb Soln, Aspir...",24.0
1,14273001,20371042,2187-10-19 19:00:00,2187-10-21 18:13:00,OBSERVATION ADMIT,HOME,F,2188-03-27,73,1.967361,...,"[Acute diastolic (congestive) heart failure, C...",13,I5031,10,Acute diastolic (congestive) heart failure,Cardiac & Circulatory,,,"[Acetaminophen, Aspirin, Atorvastatin, Bisacod...",19.0
2,11357031,27612249,2139-01-17 21:04:00,2139-01-22 18:00:00,OBSERVATION ADMIT,HOME HEALTH CARE,M,2144-10-28,58,4.872222,...,[Acute on chronic diastolic (congestive) heart...,25,I5033,10,Acute on chronic diastolic (congestive) heart ...,Cardiac & Circulatory,,,"[0.9% Sodium Chloride (Mini Bag Plus), Amoxici...",30.0


In [14]:
train_df['dod'].isna().sum()

np.int64(812)

In [24]:
stats

{'num_rows': 1500,
 'by_pdgm_bucket': {'Other': 1410,
  'Surgical Aftercare': 41,
  'Endocrine': 22,
  'Cardiac & Circulatory': 15,
  'Respiratory': 10,
  'Neuro/Rehab': 1,
  'Wounds': 1},
 'demographics': {'gender_counts': {'F': 772, 'M': 728},
  'age_summary': {'count': 1500.0,
   'mean': 56.28333333333333,
   'std': 17.857350835854835,
   'min': 18.0,
   '25%': 43.0,
   '50%': 58.0,
   '75%': 70.0,
   'max': 91.0}},
 'modalities_presence': {'has_procedures': 87805,
  'has_meds': 133657,
  'has_labs': 0},
 'diagnoses': {'num_icd10_codes_summary': {'count': 1500.0,
   'mean': 14.027333333333333,
   'std': 7.363971184306134,
   'min': 3.0,
   '25%': 8.0,
   '50%': 13.0,
   '75%': 18.0,
   'max': 39.0},
  'top_primary_icd10': {'Z5111': 27,
   'I130': 27,
   'A419': 26,
   'I214': 26,
   'N179': 18,
   'I110': 15,
   'J189': 14,
   'N390': 13,
   'I2699': 12,
   'I350': 12,
   'I2510': 10,
   'I671': 9,
   'J690': 9,
   'U071': 9,
   'I4891': 9,
   'O34211': 9,
   'I480': 8,
   'E6601': 

In [25]:
train_df[train_df['primary_pdgm_bucket_simple'] == 'Other']

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,admission_type,discharge_location,gender,dod,age_at_admit,length_of_stay_days,...,icd10_descriptions,num_icd10_codes,primary_icd10,icd_version,primary_icd10_desc,primary_pdgm_bucket_simple,procedures_icd10,num_procedures_total,meds_discharge_like,medication_count
25,16965272,27277017,2191-07-24 05:24:00,2191-07-25 17:00:00,EW EMER.,HOME HEALTH CARE,M,NaT,54,1.483333,...,"[Sepsis, unspecified organism, Body mass index...",20,A419,10,"Sepsis, unspecified organism",Other,[0J990ZZ],1.0,"[Apixaban, CefTRIAXone, LevoFLOXacin, MetroNID...",9.0
26,19006306,21580155,2131-11-22 18:23:00,2131-11-25 18:35:00,OBSERVATION ADMIT,HOME,F,NaT,81,3.008333,...,[Biliary acute pancreatitis without necrosis o...,16,K8510,10,Biliary acute pancreatitis without necrosis or...,Other,,,"[Acetaminophen, Bag, Cilostazol, Clopidogrel, ...",20.0
27,18573535,28169559,2161-01-16 19:58:00,2161-01-24 18:24:00,OBSERVATION ADMIT,HOME HEALTH CARE,F,NaT,48,7.934722,...,"[Osteomyelitis of vertebra, thoracic region, D...",15,M4624,10,"Osteomyelitis of vertebra, thoracic region",Other,"[0RP90JZ, 02HV33Z]",2.0,"[0.9% Sodium Chloride (Mini Bag Plus), Acetami...",25.0
28,15020369,29462888,2169-02-04 13:18:00,2169-03-10 07:30:00,EW EMER.,HOME HEALTH CARE,F,NaT,72,33.758333,...,[Hypertensive heart and chronic kidney disease...,25,I130,10,Hypertensive heart and chronic kidney disease ...,Other,"[02RF38Z, B2111ZZ, 0T2BX0Z]",3.0,"[Acetaminophen, Albuterol 0.083% Neb Soln, Art...",23.0
29,13898255,24599795,2121-07-25 12:05:00,2121-07-28 16:20:00,URGENT,HOME,F,NaT,34,3.177083,...,[Maternal care for low transverse scar from pr...,5,O34211,10,Maternal care for low transverse scar from pre...,Other,[10D00Z1],1.0,"[0.9% Sodium Chloride, Acetaminophen, Calcium ...",16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1006,15115846,24252247,2132-01-16 21:21:00,2132-01-18 15:57:00,EW EMER.,HOME,M,NaT,54,1.775000,...,"[Paroxysmal atrial fibrillation, Hypertensive ...",12,I480,10,Paroxysmal atrial fibrillation,Other,,,"[Apixaban, Atorvastatin, Calcium Carbonate, De...",14.0
1007,18601151,27761968,2124-10-13 23:35:00,2124-10-17 10:29:00,EW EMER.,HOME,F,NaT,51,3.454167,...,"[Skin graft (allograft) (autograft) infection,...",7,T86822,10,Skin graft (allograft) (autograft) infection,Other,,,"[0.9% Sodium Chloride, Acetaminophen, Ciproflo...",15.0
1008,16156145,20382312,2181-12-20 04:18:00,2181-12-24 15:25:00,OBSERVATION ADMIT,HOME,F,NaT,62,4.463194,...,[Non-ST elevation (NSTEMI) myocardial infarcti...,14,I214,10,Non-ST elevation (NSTEMI) myocardial infarction,Other,"[027034Z, 0JH606Z, 02H63JZ, B211YZZ, 02HK3JZ]",5.0,"[5% Dextrose, Aspirin, Atorvastatin, Carvedilo...",16.0
1009,18574712,27380582,2176-03-25 18:32:00,2176-03-26 17:30:00,DIRECT EMER.,HOME,F,NaT,75,0.956944,...,"[Dyspnea, unspecified, Obstruction of bile duc...",16,R0600,10,"Dyspnea, unspecified",Other,"[0F798DZ, 0F7D8DZ]",2.0,"[Atenolol, Dextrose 50%, Gabapentin, Glucagon,...",15.0


In [28]:
write_outputs(train_df, val_df, test_df, stats, paths)

In [26]:
train_df.columns

Index(['subject_id', 'hadm_id', 'admittime', 'dischtime', 'admission_type',
       'discharge_location', 'gender', 'dod', 'age_at_admit',
       'length_of_stay_days', 'icd10_codes', 'icd10_descriptions',
       'num_icd10_codes', 'primary_icd10', 'icd_version', 'primary_icd10_desc',
       'primary_pdgm_bucket_simple', 'procedures_icd10',
       'num_procedures_total', 'meds_discharge_like', 'medication_count'],
      dtype='object')

In [27]:
train_df['icd10_codes'].head()

0    [I5032, E118, I10, E785, I2510, I071, D649, I2...
1    [I5031, I82501, C7931, C3431, E039, Z7901, I10...
2    [I5033, J9692, J9691, E870, E872, E662, J441, ...
3    [I5033, C9110, J918, N184, N179, I129, I2510, ...
4    [I5032, I8510, N179, I272, K7460, N390, I4891,...
Name: icd10_codes, dtype: object