In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import ast
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Config

In [2]:
# =========================
# CONFIG
# =========================
EMB_DIM = 2424
PARAM_NAMES = [
   "Heart_Rate","Respiratory_Rate","MAP","Temperature","SpO2","FiO2","GCS",
   "WBC","Platelet","Bilirubin","Creatinine","PaO2","Lactate","BUN",
   "AST","ALT","Albumin","Hemoglobin","INR","pH","HCO3","Glucose",
   "Sodium","Potassium","Chloride","Calcium","Magnesium"
]
P = len(PARAM_NAMES)
STRUCTURED_DIR = "data/structured"
NOTE_FILE = "data/Q&A_based_Note_embeddings.xlsx"
ADMISSION_FILE = "data/admissions.xlsx"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NOTE_DIM_RAW = 2424
NOTE_DIM = 256
STRUCT_DIM = 64
N_CTX = 6
BATCH_SIZE = 4
EPOCHS = 10
LR = 1e-3

# Load Dataset

In [3]:

# =========================
# STEP 1: LOAD ADMISSIONS
# =========================
df_adm = pd.read_excel(ADMISSION_FILE)
los_dict = {int(r.HADM_ID): int(r["LOS(Days)"]) for _, r in df_adm.iterrows()}
# âœ… CANONICAL ORDER (CRITICAL FIX)
HADM_ID_ORDER = sorted(los_dict.keys())

In [4]:
HADM_ID_ORDER

[100229,
 100262,
 100328,
 100350,
 100357,
 100375,
 100380,
 100539,
 100598,
 100763,
 100765,
 100849,
 100863,
 100921,
 100981,
 101117,
 101153,
 101184,
 101194,
 101198,
 101295,
 101341,
 101387,
 101406,
 101424,
 101546,
 101713,
 101732,
 101747,
 101794,
 101803,
 101813,
 101872,
 101886,
 101901,
 101912,
 101934,
 102011,
 102038,
 102092,
 102204,
 102247,
 102298,
 102323,
 102396,
 102458,
 102566,
 102622,
 102842,
 102847,
 102869,
 102891,
 103045,
 103145,
 103195,
 103268,
 103287,
 103297,
 103298,
 103306,
 103405,
 103704,
 103734,
 103778,
 103844,
 103873,
 103875,
 103889,
 103904,
 103957,
 103980,
 104043,
 104141,
 104215,
 104252,
 104254,
 104329,
 104564,
 104566,
 104594,
 104642,
 104643,
 104715,
 104774,
 104788,
 104958,
 105013,
 105017,
 105027,
 105098,
 105207,
 105361,
 105386,
 105405,
 105452,
 105518,
 105557,
 105633,
 105742,
 105883,
 105918,
 105931,
 106011,
 106034,
 106068,
 106115,
 106120,
 106173,
 106194,
 106252,
 106296,
 

In [5]:

# =========================
# STEP 2: LOAD NOTE EMBEDDINGS
# =========================
df_note = pd.read_excel(NOTE_FILE)
df_note["HADM_ID"] = df_note["Note_id"].apply(lambda x: int(x.split("_")[0]))
df_note["DAY"] = df_note["Note_id"].apply(lambda x: int(x.split("_")[2]))
if isinstance(df_note.iloc[0]["Embedding"], str):
   df_note["embedding"] = df_note["Embedding"].apply(ast.literal_eval)
note_dict = {
   (r.HADM_ID, r.DAY): np.array(r.embedding, dtype=np.float32)
   for _, r in df_note.iterrows()
}

In [7]:
df_note.head(5)

Unnamed: 0,Note_id,Embedding,HADM_ID,DAY,embedding
0,176176_Day_2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1,...",176176,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1,..."
1,176176_Day_3,"[-1, 0, 0, 0, 0, 0, 1, 0, -1, 0, 0, 0, -1, 0, ...",176176,3,"[-1, 0, 0, 0, 0, 0, 1, 0, -1, 0, 0, 0, -1, 0, ..."
2,185910_Day_1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1,...",185910,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1,..."
3,185910_Day_2,"[0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 1, -1, -1, 0, ...",185910,2,"[0, -1, 0, 0, 0, 0, 1, 0, 0, 0, 1, -1, -1, 0, ..."
4,185910_Day_3,"[0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, -1, 0, 0...",185910,3,"[0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, -1, 0, 0..."


In [8]:
# =========================
# STEP 3: LOAD STRUCTURED PARAMETER TABLES
# =========================
param_tables = {}
for p in PARAM_NAMES:
   print(p)
   param_tables[p] = pd.read_excel(f"{STRUCTURED_DIR}/daywise_{p}.xlsx")

Heart_Rate
Respiratory_Rate
MAP
Temperature
SpO2
FiO2
GCS
WBC
Platelet
Bilirubin
Creatinine
PaO2
Lactate
BUN
AST
ALT
Albumin
Hemoglobin
INR
pH
HCO3
Glucose
Sodium
Potassium
Chloride
Calcium
Magnesium


# Data Preprocessing

In [9]:
# =========================
# STEP 4: BUILD STRUCTURED TENSOR
# =========================
structured_tensor = defaultdict(dict)
for hadm_id in HADM_ID_ORDER:
   los = los_dict[hadm_id]
   last_seen = {p: None for p in PARAM_NAMES}
   for day in range(1, los + 1):
       mat = np.zeros((P, 3), dtype=np.float32)
       for i, p in enumerate(PARAM_NAMES):
           df_p = param_tables[p]
           row = df_p[df_p["HADM_ID"] == hadm_id]
           val = None
           if not row.empty and f"Day_{day}" in row.columns:
               cell = row.iloc[0][f"Day_{day}"]
               if pd.notna(cell) and str(cell).strip() not in ["", "$$"]:
                   try:
                       val = float(cell)
                   except:
                       pass
           if val is not None:
               mat[i] = [val, 1.0, 0.0]
               last_seen[p] = day
           else:
               mat[i] = [0.0, 0.0, los if last_seen[p] is None else day - last_seen[p]]
       structured_tensor[hadm_id][day] = mat

In [10]:
rows = []
for hadm_id, days in structured_tensor.items():
   for day, mat in days.items():
       for i, p in enumerate(PARAM_NAMES):
           rows.append({
               "HADM_ID": hadm_id,
               "DAY": day,
               "PARAM": p,
               "VALUE": mat[i, 0],
               "MASK": mat[i, 1],
               "DT": mat[i, 2]
           })
df_out = pd.DataFrame(rows)
df_out.to_csv("data/structured_input1.csv", index=False)

In [11]:
df_out.shape


(564246, 6)

In [12]:
note_dict

{(176176, 2): array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 (176176, 3): array([-1.,  0.,  0., ...,  0., -1.,  0.], dtype=float32),
 (185910, 1): array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 (185910, 2): array([ 0., -1.,  0., ...,  0.,  0.,  0.], dtype=float32),
 (185910, 3): array([ 0.,  0.,  0., ...,  0., -1.,  0.], dtype=float32),
 (185910, 4): array([ 0.,  0.,  0., ..., -1., -1.,  0.], dtype=float32),
 (185910, 5): array([ 0.,  0.,  1., ..., -1., -1.,  1.], dtype=float32),
 (185910, 6): array([ 1.,  1.,  1., ..., -1.,  0.,  1.], dtype=float32),
 (185910, 7): array([ 1.,  0., -1., ...,  1.,  1., -1.], dtype=float32),
 (185910, 8): array([ 1., -1.,  1., ...,  1., -1.,  1.], dtype=float32),
 (185910, 9): array([1., 0., 0., ..., 1., 0., 0.], dtype=float32),
 (185910, 10): array([ 1., -1.,  0., ...,  1.,  0., -1.], dtype=float32),
 (185910, 11): array([ 0., -1.,  0., ...,  0., -1.,  1.], dtype=float32),
 (185910, 12): array([ 0., -1.,  1., ...,  0., -1.,  1.], dtype=flo

In [13]:
# =========================
# STEP 5: NOTE TENSOR + MASK
# =========================
note_tensor = defaultdict(dict)
note_mask = defaultdict(dict)
for hadm_id in HADM_ID_ORDER:
   los = los_dict[hadm_id]
   for day in range(1, los + 1):
       if (hadm_id, day) in note_dict:
           note_tensor[hadm_id][day] = note_dict[(hadm_id, day)]
           note_mask[hadm_id][day] = 1.0
       else:
           note_tensor[hadm_id][day] = np.zeros(EMB_DIM, dtype=np.float32)
           note_mask[hadm_id][day] = 0.0

In [14]:
# =========================
# STEP 6: DAY MASK
# =========================
day_mask = defaultdict(dict)
for hadm_id in HADM_ID_ORDER:
   los = los_dict[hadm_id]
   for day in range(1, los + 1):
       has_struct = structured_tensor[hadm_id][day][:,1].sum() > 0
       has_note = note_mask[hadm_id][day] == 1.0
       day_mask[hadm_id][day] = 1.0 if (has_struct or has_note) else 0.0


In [15]:
# # =========================
# # STEP 7: BUILD SEQUENCES
# # =========================
# X_notes, X_struct, X_note_mask, X_day_mask, HADM_IDS = [], [], [], [], []
# for hadm_id in HADM_ID_ORDER:
#    los = los_dict[hadm_id]
#    X_notes.append(np.stack([note_tensor[hadm_id][d] for d in range(1, los+1)]))
#    X_struct.append(np.stack([structured_tensor[hadm_id][d] for d in range(1, los+1)]))
#    X_note_mask.append(np.array([note_mask[hadm_id][d] for d in range(1, los+1)]))
#    X_day_mask.append(np.array([day_mask[hadm_id][d] for d in range(1, los+1)]))
#    HADM_IDS.append(hadm_id)
    
# =========================
# STEP 7: BUILD MODEL-READY SEQUENCES
# =========================
X_notes = []
X_struct = []
X_note_mask = []
X_day_mask = []
HADM_IDS = []
for hadm_id in HADM_ID_ORDER:
   los = los_dict[hadm_id]
   notes_seq = []
   struct_seq = []
   note_mask_seq = []
   day_mask_seq = []
   for day in range(1, los + 1):
       notes_seq.append(note_tensor[hadm_id][day])
       struct_seq.append(structured_tensor[hadm_id][day])
       note_mask_seq.append(note_mask[hadm_id][day])
       day_mask_seq.append(day_mask[hadm_id][day])
   X_notes.append(np.stack(notes_seq))       # (T, 2400)
   X_struct.append(np.stack(struct_seq))     # (T, 27, 3)
   X_note_mask.append(np.array(note_mask_seq))
   X_day_mask.append(np.array(day_mask_seq))
   HADM_IDS.append(hadm_id)

In [16]:
# =========================
# FINAL CHECK
# =========================
print("Prepared admissions:", len(X_notes))
print("Example shapes:")
print("HDAM_ID:",HADM_IDS[3])
print("Notes:", X_notes[3].shape)
print("Structured:", X_struct[3].shape)
print("Note mask:", X_note_mask[3].shape)
print("Day mask:", X_day_mask[3].shape)

Prepared admissions: 1783
Example shapes:
HDAM_ID: 100350
Notes: (6, 2424)
Structured: (6, 27, 3)
Note mask: (6,)
Day mask: (6,)


# Split Train/Val/Test with data Normalization

In [17]:
# ----------------------------
# 1) SPLIT BY ADMISSION
# ----------------------------
N = len(X_notes)
idx = np.random.permutation(N)

tr = int(0.7 * N)
va = int(0.85 * N)


train_idx, val_idx, test_idx = idx[:tr], idx[tr:va], idx[va:]
print(len(train_idx), len(val_idx), len(test_idx))

1248 267 268


In [18]:
# ----------------------------
# 2) NORMALIZATION (TRAIN-ONLY)
# ----------------------------
# 2.1 Structured VALUE stats (mask==1 only)

P = 27

means = np.zeros(P)
stds  = np.zeros(P)

for p in range(P):
   vals = []
   for i in train_idx:
       v = X_struct[i][:, p, 0]
       m = X_struct[i][:, p, 1]
       vals.extend(v[m == 1])
   vals = np.array(vals)
   means[p] = vals.mean()
   stds[p]  = vals.std() + 1e-6

def normalize_struct_values(X):
   X = X.copy()
   for p in range(P):
       m = X[:, :, p, 1] == 1
       X[:, :, p, 0][m]  = (X[:, :, p, 0][m] - means[p]) / stds[p]
       X[:, :, p, 0][~m] = 0.0
   return X

# 2.2 Î”t log-normalization
MAX_LOS = max(x.shape[0] for x in X_struct)
print(MAX_LOS)

def normalize_dt(X):
   X = X.copy()
   X[:, :, :, 2] = np.log1p(X[:, :, :, 2]) / np.log1p(MAX_LOS)
   return X

# 2.3 Note LayerNorm (per sample)
def layer_norm_notes(X):
   mu = X.mean(axis=-1, keepdims=True)
   sd = X.std(axis=-1, keepdims=True) + 1e-6
   return (X - mu) / sd

# Apply normalization to ALL splits using TRAIN stats
for i in range(N):
   X_struct[i] = normalize_dt(normalize_struct_values(X_struct[i][None, ...]))[0]
   X_notes[i]  = layer_norm_notes(X_notes[i])

157


In [19]:
HADM_IDS[0], X_notes[0], X_notes[0].shape, X_struct[0].shape

(100229,
 array([[-0.01054451, -2.3341708 ,  2.3130817 , ..., -0.01054451,
         -0.01054451, -0.01054451],
        [-0.25198466, -1.9823267 ,  1.4783574 , ..., -0.25198466,
         -0.25198466, -0.25198466],
        [-0.27379748, -1.8101056 ,  1.2625105 , ...,  1.2625105 ,
         -0.27379748, -0.27379748],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]], dtype=float32),
 (14, 2424),
 (14, 27, 3))

In [20]:
# ----------------------------
# CONVERT MASK DICTS â†’ ARRAYS (REQUIRED)
# ----------------------------
note_mask_arr = []
day_mask_arr  = []
for hadm_id in HADM_ID_ORDER:
   los = los_dict[hadm_id]
   nm = []
   dm = []
   for day in range(1, los + 1):
       nm.append(note_mask[hadm_id][day])
       dm.append(day_mask[hadm_id][day])
   note_mask_arr.append(np.array(nm, dtype=np.float32))
   day_mask_arr.append(np.array(dm, dtype=np.float32))
note_mask = note_mask_arr
day_mask  = day_mask_arr

In [21]:
note_mask

[array([1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       dtype=float32),
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 array([1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0.], dtype=float32),
 array([1., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0.], dtype=float32),
 array([0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 0., 0.],
       dtype=float32),
 array([0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       dtype=float32),
 array([0., 0., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       dtype=float32),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 array([0., 1., 1., 0., 0., 0.], dtype=float32),
 array(

In [22]:
def collate_admissions(batch):
   """
   batch: list of tuples
     (notes(T,2400), struct(T,27,3), note_mask(T), day_mask(T))
   """
   B = len(batch)
   T_max = max(item[0].shape[0] for item in batch)
   notes_dim = batch[0][0].shape[1]
   P = batch[0][1].shape[1]
   notes_pad  = torch.zeros(B, T_max, notes_dim)
   struct_pad = torch.zeros(B, T_max, P, 3)
   note_m_pad = torch.zeros(B, T_max)
   day_m_pad  = torch.zeros(B, T_max)
   for i, (n, s, nm, dm) in enumerate(batch):
       T = n.shape[0]
       notes_pad[i, :T]  = n
       struct_pad[i, :T] = s
       note_m_pad[i, :T] = nm
       day_m_pad[i, :T]  = dm
   return notes_pad, struct_pad, note_m_pad, day_m_pad

In [23]:
# ----------------------------
# 3) DATASET
# ----------------------------
class AdmissionDataset(Dataset):
   def __init__(self, indices):
       self.indices = indices
   def __len__(self):
       return len(self.indices)
   def __getitem__(self, k):
       i = self.indices[k]
       return (
           torch.tensor(X_notes[i], dtype=torch.float32),
           torch.tensor(X_struct[i], dtype=torch.float32),
           torch.tensor(note_mask[i], dtype=torch.float32),
           torch.tensor(day_mask[i], dtype=torch.float32),
       )
# train_loader = DataLoader(AdmissionDataset(train_idx), batch_size=BATCH_SIZE, shuffle=True)
# val_loader   = DataLoader(AdmissionDataset(val_idx),   batch_size=BATCH_SIZE)
# all_loader   = DataLoader(AdmissionDataset(range(N)),  batch_size=BATCH_SIZE)

In [24]:
train_loader = DataLoader(
   AdmissionDataset(train_idx),
   batch_size=BATCH_SIZE,
   shuffle=True,
   collate_fn=collate_admissions
)
val_loader = DataLoader(
   AdmissionDataset(val_idx),
   batch_size=BATCH_SIZE,
   collate_fn=collate_admissions
)
all_loader = DataLoader(
   AdmissionDataset(range(N)),
   batch_size=BATCH_SIZE,
   collate_fn=collate_admissions
)

# Model Building

In [25]:
# ----------------------------
# 4) MODEL
# ----------------------------
class StructuredEncoder(nn.Module):
   def __init__(self, d_out=STRUCT_DIM):
       super().__init__()
       self.mlp = nn.Sequential(
           nn.Linear(3, 64),
           nn.ReLU(),
           nn.Linear(64, d_out)
       )
   def forward(self, x):  # (B,T,27,3)
       return self.mlp(x)  # (B,T,27,STRUCT_DIM)

class NoteEncoder(nn.Module):
   def __init__(self, d_note=NOTE_DIM, n_ctx=N_CTX):
       super().__init__()
       self.proj = nn.Linear(NOTE_DIM_RAW, d_note)
       self.ctx  = nn.Linear(d_note, n_ctx * d_note)
       self.n_ctx = n_ctx
       self.d_note = d_note
   def forward(self, x):  # (B,T,2400)
       z = self.proj(x)    # (B,T,NOTE_DIM)
       c = self.ctx(z).view(z.size(0), z.size(1), self.n_ctx, self.d_note)
       return z, c         # note_latent, note_contexts

class CrossAttention(nn.Module):
   def __init__(self, d):
       super().__init__()
       self.attn = nn.MultiheadAttention(d, num_heads=4, batch_first=True)
   def forward(self, q, kv):  # q:(B,T,27,d), kv:(B,T,M,d)
       B, T, P, D = q.shape
       qf  = q.view(B * T, P, D)
       kvf = kv.view(B * T, -1, D)
       out, _ = self.attn(qf, kvf, kvf)
       return out.view(B, T, P, D)

class CrossModalModel(nn.Module):
   def __init__(self):
       super().__init__()
       self.struct = StructuredEncoder()
       self.note   = NoteEncoder()
       self.xattn  = CrossAttention(STRUCT_DIM)
       self.pool = nn.Linear(STRUCT_DIM, 1)
       # ðŸ”¹ NEW: project structured summary to NOTE_DIM
       self.struct_proj = nn.Linear(STRUCT_DIM, NOTE_DIM)
       # fuse note + struct
       self.fuse = nn.Linear(NOTE_DIM + NOTE_DIM, NOTE_DIM)
   def forward(self, notes, struct):
       # notes  : (B,T,2400)
       # struct : (B,T,27,3)
       s = self.struct(struct)                    # (B,T,27,64)
       n_lat, n_ctx = self.note(notes)           # (B,T,256), (B,T,M,256)
       s_x = self.xattn(s, n_ctx)                # (B,T,27,64)
       w = torch.softmax(self.pool(s_x).squeeze(-1), dim=-1)  # (B,T,27)
       s_sum = (s_x * w.unsqueeze(-1)).sum(dim=2)             # (B,T,64)
       # ðŸ”¹ PROJECT structured embedding
       s_sum_proj = self.struct_proj(s_sum)      # (B,T,256)
       # ðŸ”¹ FUSED cross-modal embedding
       z = self.fuse(torch.cat([n_lat, s_sum_proj], dim=-1))  # (B,T,256)
       return z, n_lat, s_sum_proj

In [26]:
# ----------------------------
# 5) SELF-SUPERVISED CONSISTENCY LOSS
# ----------------------------
def consistency_loss(z, z_note, z_struct, note_m, struct_present):
   loss = 0.0
   if note_m.any():
       loss += F.cosine_embedding_loss(
           z[note_m==1], z_note[note_m==1],
           torch.ones(int(note_m.sum()), device=z.device)
       )
   if struct_present.any():
       loss += F.cosine_embedding_loss(
           z[struct_present], z_struct[struct_present],
           torch.ones(int(struct_present.sum()), device=z.device)
       )
   return loss

In [27]:

model = CrossModalModel().to(DEVICE)
opt = torch.optim.Adam(model.parameters(), lr=LR)

# Traning/Validation

In [28]:
# ----------------------------
# 6) TRAIN + VALIDATE
# ----------------------------
for ep in range(EPOCHS):
   print(f"Epoch [{ep+1}/{EPOCHS}] starts.................................")

   # ===== TRAIN =====
   model.train()
   train_loss = 0.0
   n_train = 0
   for notes, struct, note_m, _ in train_loader:
       notes  = notes.to(DEVICE)
       struct = struct.to(DEVICE)
       note_m = note_m.to(DEVICE)
       z, zn, zs = model(notes, struct)  # (B,T,NOTE_DIM)
       B, T, _ = z.shape
       # ---- FLATTEN CONSISTENTLY ----
       zf  = z.reshape(B * T, NOTE_DIM)
       znf = zn.reshape(B * T, NOTE_DIM)
       zsf = zs.reshape(B * T, NOTE_DIM)
       note_mf = note_m.reshape(B * T)
       # struct presence mask (B,T) â†’ (B*T)
       struct_present = (
           struct[:, :, :, 1].sum(dim=2) > 0
       ).reshape(B * T).to(DEVICE)
       loss = consistency_loss(
           zf, znf, zsf, note_mf, struct_present
       )
       opt.zero_grad()
       loss.backward()
       opt.step()
       train_loss += loss.item()
       n_train += 1
   train_loss /= n_train

   # ===== VALIDATION =====
   model.eval()
   val_loss = 0.0
   n_val = 0
   with torch.no_grad():
       for notes, struct, note_m, _ in val_loader:
           notes  = notes.to(DEVICE)
           struct = struct.to(DEVICE)
           note_m = note_m.to(DEVICE)
           z, zn, zs = model(notes, struct)
           B, T, _ = z.shape
           zf  = z.reshape(B * T, NOTE_DIM)
           znf = zn.reshape(B * T, NOTE_DIM)
           zsf = zs.reshape(B * T, NOTE_DIM)
           note_mf = note_m.reshape(B * T)
           struct_present = (
               struct[:, :, :, 1].sum(dim=2) > 0
           ).reshape(B * T).to(DEVICE)
           loss = consistency_loss(
               zf, znf, zsf, note_mf, struct_present
           )
           val_loss += loss.item()
           n_val += 1
   val_loss /= n_val

   # ===== PROGRESS =====
   print(
       f"Epoch [{ep+1}/{EPOCHS}] "
       f"Train Loss: {train_loss:.4f} | "
       f"Val Loss: {val_loss:.4f}"
   )

Epoch [1/10] starts.................................
Epoch [1/10] Train Loss: 0.0327 | Val Loss: 0.0033
Epoch [2/10] starts.................................
Epoch [2/10] Train Loss: 0.0021 | Val Loss: 0.0019
Epoch [3/10] starts.................................
Epoch [3/10] Train Loss: 0.0011 | Val Loss: 0.0014
Epoch [4/10] starts.................................
Epoch [4/10] Train Loss: 0.0008 | Val Loss: 0.0010
Epoch [5/10] starts.................................
Epoch [5/10] Train Loss: 0.0006 | Val Loss: 0.0008
Epoch [6/10] starts.................................
Epoch [6/10] Train Loss: 0.0005 | Val Loss: 0.0011
Epoch [7/10] starts.................................
Epoch [7/10] Train Loss: 0.0004 | Val Loss: 0.0005
Epoch [8/10] starts.................................
Epoch [8/10] Train Loss: 0.0003 | Val Loss: 0.0004
Epoch [9/10] starts.................................
Epoch [9/10] Train Loss: 0.0002 | Val Loss: 0.0004
Epoch [10/10] starts.................................
Epoch [10/

# Test

In [30]:
model.eval()
rows = []
global_idx = 0  # admission index tracker
with torch.no_grad():
   for notes, struct, note_m, day_m in all_loader:
       notes  = notes.to(DEVICE)
       struct = struct.to(DEVICE)
       z, _, _ = model(notes, struct)   # (B, T, NOTE_DIM)
       z = z.cpu().numpy()
       day_m = day_m.numpy()
       B, T, D = z.shape
       #print(B)
       for b in range(B):
           hadm_id = HADM_IDS[global_idx]
           los = X_notes[global_idx].shape[0]
           for t in range(los):
               if day_m[b, t] == 1:   # valid day only
                   rows.append({
                       "HADM_ID": hadm_id,
                       "DAY": t + 1,
                       "crossmodal_embedding": z[b, t].tolist()
                   })
           global_idx += 1
# Create DataFrame
df_out = pd.DataFrame(rows)
# Save
df_out.to_excel("crossmodal_daywise_embeddings.xlsx", index=False)
print("Saved:", df_out.shape)

Saved: (19584, 3)


# Checking

In [75]:
df_adm = pd.read_excel("final_admissions.xlsx")
df_emb = pd.read_excel("crossmodal_daywise_embeddings.xlsx")

In [76]:
df_adm.head(5)

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,MORTALITY_STATUS,LOS(Days),DIAGNOSIS,AGE,GENDER
0,33,176176,2116-12-23 22:30:00,2116-12-27 12:05:00,0,5,SEPSIS;TELEMETRY,82,M
1,38,185910,2166-08-10 00:28:00,2166-09-04 11:30:00,0,26,ACUTE MYOCARDIAL INFARCTION-SEPSIS,76,M
2,357,122609,2198-11-01 22:36:00,2198-11-14 14:20:00,0,14,SEPSIS,64,M
3,366,134462,2164-11-18 20:27:00,2164-11-22 15:18:00,0,5,SEPSIS,53,M
4,62,116009,2113-02-15 00:19:00,2113-02-19 15:30:00,0,5,"SEPSIS,URINARY TRACT INFECTION",69,M


In [77]:
df_emb.head(5)

Unnamed: 0,HADM_ID,DAY,crossmodal_embedding
0,100229,1,"[4.992328643798828, 46.610862731933594, -20.30..."
1,100229,2,"[9.649316787719727, 73.99225616455078, -32.835..."
2,100229,3,"[9.673537254333496, 75.6829605102539, -33.5422..."
3,100229,4,"[8.00228214263916, 55.57709503173828, -25.0850..."
4,100229,5,"[8.116687774658203, 63.3602294921875, -28.0993..."


In [78]:
df_adm["HADM_ID"] = df_adm["HADM_ID"].astype(int)
df_emb["HADM_ID"] = df_emb["HADM_ID"].astype(int)
df_emb["DAY"] = df_emb["DAY"].astype(int)

In [79]:
# Count expected days per admission
expected = (
   df_adm
   .assign(expected_days=lambda x: x["LOS(Days)"].astype(int))
   .set_index("HADM_ID")["expected_days"]
)

In [80]:
expected

HADM_ID
176176     5
185910    26
122609    14
134462     5
116009     5
          ..
150731    13
112686     5
181449     8
134977     5
153703    11
Name: expected_days, Length: 1777, dtype: int32

In [81]:
# Count actual days per admission
actual = (
   df_emb
   .groupby("HADM_ID")["DAY"]
   .nunique()
)

In [82]:
actual

HADM_ID
100229    14
100262    45
100328    13
100350     6
100357     2
          ..
199605    44
199760     6
199774     5
199855     6
199880     1
Name: DAY, Length: 1777, dtype: int64

In [83]:
# Combine
check_df = pd.concat([expected, actual], axis=1)
check_df.columns = ["EXPECTED_DAYS", "ACTUAL_DAYS"]

In [84]:
check_df

Unnamed: 0_level_0,EXPECTED_DAYS,ACTUAL_DAYS
HADM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
176176,5,5
185910,26,26
122609,14,14
134462,5,4
116009,5,5
...,...,...
150731,13,13
112686,5,5
181449,8,8
134977,5,5


In [86]:
# Missing admissions entirely
missing_admissions = check_df[check_df["ACTUAL_DAYS"].isna()]
# Admissions with missing days
partial_admissions = check_df[
   check_df["ACTUAL_DAYS"] < check_df["EXPECTED_DAYS"]
]
print("Total admissions:", len(check_df))
print("Admissions missing entirely:", len(missing_admissions))
print("Admissions with missing days:", len(partial_admissions))
# Optional: show problematic admissions
print("\nAdmissions missing entirely:")
print(missing_admissions)
print("\nAdmissions with missing days:")
print(partial_admissions)


Total admissions: 1777
Admissions missing entirely: 0
Admissions with missing days: 424

Admissions missing entirely:
Empty DataFrame
Columns: [EXPECTED_DAYS, ACTUAL_DAYS]
Index: []

Admissions with missing days:
         EXPECTED_DAYS  ACTUAL_DAYS
HADM_ID                            
134462               5            4
131488              20           19
150423              10            6
140561               6            5
106296              23           19
...                ...          ...
166418               5            4
178708               6            4
127737              10            6
181736              16           13
152550              26           12

[424 rows x 2 columns]


In [73]:
missing_admissions = [169179, 152571, 187308, 109963, 197907, 180378]
 
missing_admissions = set(map(int, missing_admissions))
# ----------------------------
# Drop admissions with zero signal
# ----------------------------
df_final = df_adm[~df_adm["HADM_ID"].isin(missing_admissions)].copy()
# ----------------------------
# Save cleaned admissions
# ----------------------------
df_final.to_excel("final_admissions.xlsx", index=False)
print("Original admissions:", len(df_adm))
print("Removed admissions:", len(missing_admissions))
print("Final admissions:", len(df_final))


Original admissions: 1783
Removed admissions: 6
Final admissions: 1777
