In [2]:
import pandas as pd
import wfdb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
from scipy.signal import butter, filtfilt, iirnotch
import copy

In [3]:
BASE = Path("mimic-iv-ecg-demo-data")
DIAG_PATH = "/Users/shruti14/Desktop/ClinicalDigitalTwin/hosp_diagnosis.csv"
MEAS_PATH = "/Users/shruti14/Desktop/ClinicalDigitalTwin/machine_measurements.csv"

In [4]:
df_diag = pd.read_csv(DIAG_PATH)
df_ecg = pd.read_csv(BASE / "record_list.csv")
temp = pd.read_csv(MEAS_PATH, nrows=5)
report_cols = [col for col in temp.columns if col.startswith("report_") or col in ["filtering"]]
dtype_map = {col: "string" for col in report_cols}
df_meas = pd.read_csv(MEAS_PATH, dtype=dtype_map, low_memory=False)

In [5]:
diag_text = (
    df_diag.groupby("subject_id")["long_title"]
           .apply(lambda x: " ; ".join(x.unique()))
           .reset_index()
)

In [6]:
def flag_dx(df):
    # Example high-yield conditions
    afib = {"42731","I48","I480","I481","I482","I4891"}
    htn  = {"4010","4011","4019","I10"}
    cad  = {"4140","I25","I2510","I25119"}
    hf   = {"4280","I50","I500","I5030"}
    dm   = {"25000","E10","E11"}

    df["afib"] = df["icd_code"].astype(str).isin(afib).astype(int)
    df["htn"]  = df["icd_code"].astype(str).isin(htn).astype(int)
    df["cad"]  = df["icd_code"].astype(str).isin(cad).astype(int)
    df["hf"]   = df["icd_code"].astype(str).isin(hf).astype(int)
    df["dm"]   = df["icd_code"].astype(str).isin(dm).astype(int)

    return df

df_diag = flag_dx(df_diag)


In [7]:
diag_flags = (
    df_diag.groupby("subject_id")[["afib","htn","cad","hf","dm"]]
           .max()
           .reset_index()
)

In [8]:
ecg_grouped = (
    df_ecg.groupby("subject_id")["path"]
           .apply(list)
           .reset_index()
)


In [9]:
df_meas.columns.tolist()

['subject_id',
 'study_id',
 'cart_id',
 'ecg_time',
 'report_0',
 'report_1',
 'report_2',
 'report_3',
 'report_4',
 'report_5',
 'report_6',
 'report_7',
 'report_8',
 'report_9',
 'report_10',
 'report_11',
 'report_12',
 'report_13',
 'report_14',
 'report_15',
 'report_16',
 'report_17',
 'bandwidth',
 'filtering',
 'rr_interval',
 'p_onset',
 'p_end',
 'qrs_onset',
 'qrs_end',
 't_end',
 'p_axis',
 'qrs_axis',
 't_axis']

In [10]:
df_meas

Unnamed: 0,subject_id,study_id,cart_id,ecg_time,report_0,report_1,report_2,report_3,report_4,report_5,...,filtering,rr_interval,p_onset,p_end,qrs_onset,qrs_end,t_end,p_axis,qrs_axis,t_axis
0,10000032,40689238,6848296,2180-07-23 08:44:00,Sinus rhythm,Possible right atrial abnormality,,Borderline ECG,,,...,60 Hz notch Baseline filter,659,40,128,170,258,518,81,77,79
1,10000032,44458630,6848296,2180-07-23 09:54:00,Sinus rhythm,Possible right atrial abnormality,,Borderline ECG,,,...,60 Hz notch Baseline filter,722,40,124,162,246,504,77,75,70
2,10000032,49036311,6376932,2180-08-06 09:07:00,Sinus tachycardia,,Normal ECG except for rate,,,,...,60 Hz notch Baseline filter,600,40,130,162,244,474,79,72,77
3,10000117,45090959,6214760,2181-03-04 17:14:00,Sinus rhythm,,Normal ECG,,,,...,60 Hz notch Baseline filter,659,40,146,180,254,538,79,66,69
4,10000117,48446569,6632385,2183-09-18 13:52:00,Sinus rhythm,,,,,,...,<not specified>,659,368,29999,504,590,868,84,80,77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800030,19999840,48683947,6283529,2164-09-12 12:28:00,Sinus rhythm.,Left axis deviation,,Borderline ECG,,,...,60 Hz notch Baseline filter,659,40,154,182,266,560,38,-39,35
800031,19999840,41842293,6947865,2164-09-17 11:31:00,Accelerated idioventricular rhythm.,Left axis deviation,IV conduction defect,Inferior infarct - age undetermined,Ant/septal and lateral ST-T changes suggest my...,Low QRS voltages in limb leads,...,60 Hz notch Baseline filter,967,29999,29999,200,350,688,29999,-80,174
800032,19999987,41190887,6848296,2145-11-02 19:54:00,Sinus tachycardia with PVC(s).,Possible right atrial abnormality,Inferior and lateral ST elevation - possible e...,,Borderline ECG,,...,60 Hz notch Baseline filter,526,40,148,204,286,528,58,50,51
800033,19999987,45828463,6315641,2145-11-03 03:00:00,Sinus rhythm.,,Normal ECG,,,,...,60 Hz notch Baseline filter,631,40,156,202,282,570,59,54,56


In [14]:
# Identify report columns
report_cols = [c for c in df_meas.columns if c.startswith("report_")]

df_meas[report_cols] = df_meas[report_cols].astype(str)

# Create one combined text per ECG
df_meas["full_report"] = df_meas[report_cols].apply(
    lambda row: " ".join([x for x in row if x not in ["", "nan", "NaN"]]),
    axis=1
)

# Combine all ECG reports per subject
meas_reports = (
    df_meas.groupby("subject_id", as_index=False)
    .agg({"full_report": lambda x: " ".join(x)})
)


In [15]:
numeric_cols = [
    "rr_interval", "p_onset", "p_end",
    "qrs_onset", "qrs_end", "t_end",
    "p_axis", "qrs_axis", "t_axis"
]

numeric_cols = [c for c in numeric_cols if c in df_meas.columns]


In [16]:
meas_numeric = (
    df_meas.groupby("subject_id", as_index=False)
    .agg({
        **{col: ["mean", "std", "min", "max"] for col in numeric_cols},
        "study_id": "nunique"   # number of ECGs
    })
)

# flatten multi-index columns
meas_numeric.columns = [
    "_".join(col).rstrip("_") 
    for col in meas_numeric.columns
]

In [17]:
# Start with diagnosis text + flags
merged = diag_text.merge(diag_flags, on="subject_id", how="outer")

In [18]:
# Add numeric ECG features
merged = merged.merge(meas_numeric, on="subject_id", how="outer")

In [19]:
# Add text ECG reports
merged = merged.merge(meas_reports, on="subject_id", how="outer")

In [20]:
# Add raw ECG paths
merged = merged.merge(ecg_grouped, on="subject_id", how="left")

In [29]:
merged = merged[merged["path"].notna()]
merged

Unnamed: 0,subject_id,long_title,afib,htn,cad,hf,dm,rr_interval_mean,rr_interval_std,rr_interval_min,...,qrs_axis_std,qrs_axis_min,qrs_axis_max,t_axis_mean,t_axis_std,t_axis_min,t_axis_max,study_id_nunique,full_report,path
0,10000032,Portal hypertension ; Unspecified viral hepati...,0.0,0.0,0.0,0.0,0.0,660.333333,61.010928,600.0,...,2.516611,72.0,77.0,75.333333,4.725816,70.0,79.0,3.0,Sinus rhythm Possible right atrial abnormality...,"[files/p10000032/s107143276/107143276, files/p..."
26,10001217,Intracranial abscess ; Compression of brain ; ...,0.0,1.0,0.0,0.0,0.0,870.000000,35.355339,845.0,...,4.949747,8.0,15.0,39.500000,12.020815,31.0,48.0,2.0,Sinus rhythm <NA> Normal ECG <NA> <NA> <NA> <N...,"[files/p10001217/s105362569/105362569, files/p..."
35,10001725,Other specified retention of urine ; Other ana...,0.0,0.0,0.0,0.0,0.0,1016.000000,,1016.0,...,,45.0,45.0,45.000000,,45.0,45.0,1.0,Sinus bradycardia <NA> Normal ECG except for r...,[files/p10001725/s102147240/102147240]
55,10002428,Other drugs and medicinal substances causing a...,0.0,1.0,0.0,1.0,0.0,669.400000,111.007593,508.0,...,27.354289,-48.0,42.0,43.933333,12.819999,21.0,64.0,15.0,Sinus rhythm - supraventricular extrasystoles ...,"[files/p10002428/s104430624/104430624, files/p..."
58,10002495,Non-ST elevation (NSTEMI) myocardial infarctio...,1.0,1.0,1.0,0.0,0.0,551.333333,155.532419,394.0,...,20.599353,10.0,51.0,-44.666667,18.502252,-66.0,-33.0,3.0,Sinus tachycardia. Extensive ST-T changes sugg...,"[files/p10002495/s107316808/107316808, files/p..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
981,10038999,"Pneumonia, unspecified organism ; Displaced co...",0.0,0.0,0.0,0.0,0.0,609.000000,159.987500,483.0,...,6.082763,29.0,40.0,20.666667,16.165808,2.0,30.0,3.0,Sinus tachycardia. <NA> Normal ECG except for ...,"[files/p10038999/s109127970/109127970, files/p..."
1000,10039708,"Fall (on) (from) other stairs and steps, initi...",0.0,1.0,0.0,0.0,0.0,841.750000,189.197780,612.0,...,21.998485,6.0,90.0,18.562500,74.585940,-90.0,168.0,16.0,Sinus rhythm Septal T wave changes are nonspec...,"[files/p10039708/s103786521/103786521, files/p..."
1004,10039831,Secondary and unspecified malignant neoplasm o...,0.0,0.0,0.0,0.0,0.0,833.000000,,833.0,...,,21.0,21.0,37.000000,,37.0,37.0,1.0,Sinus rhythm <NA> Normal ECG <NA> <NA> <NA> <N...,[files/p10039831/s106035722/106035722]
1012,10039997,"Cerebral aneurysm, nonruptured ; Manic episode...",1.0,1.0,0.0,0.0,0.0,975.500000,10.606602,968.0,...,16.970563,-37.0,-13.0,96.000000,29.698485,75.0,117.0,2.0,Sinus rhythm with PVCs with borderline 1st deg...,"[files/p10039997/s102298909/102298909, files/p..."


Step 1: Split dataset → train / val / test (subject-level) \
Step 2: Preprocess numeric, text, waveform \
Step 3: Encode features: \
          - Tabular: StandardScaler \
          - Text: ClinicalBERT embedding \
          - Waveform: 1D CNN input \
Step 4: Build model: \
          - Option A: Tabular + Text embeddings → MLP \
          - Option B: Waveform → CNN + concatenate with tabular/text → final MLP \
Step 5: Train model \
Step 6: Evaluate: AUROC, accuracy, F1, etc. \
Step 7: Save model & patient embeddings → digital twin representation \
