In [1]:
import sys
import os

# Resolve the parent of the current working directory; the notebook is expected
# to be launched from the notebooks folder, so this is the project root.
project_root = os.path.abspath(os.pardir)

# Register the root on the import path only once, so re-running the cell is harmless.
if project_root not in sys.path:
    sys.path.append(project_root)

print("Project root added:", project_root)

Project root added: /home/zervaki/Thesis_New


In [2]:

# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np

In [4]:
from data.processed_mimic.aggregate_mimic import (
    load_mimic_tables, 
    select_labs, 
    aggregate_labs, 
    merge_with_patients_admissions, 
    save_dataset
)

In [5]:
import os
import pandas as pd


# Folder containing the gzipped `hosp` CSV tables of the MIMIC-IV demo download.
raw_path = "../data/raw_mimic/mimic-iv-clinical-database-demo-2.2/hosp"

print("Loading raw MIMIC CSVs...")

# Read the four tables in a fixed order and unpack them into their own names.
patients, admissions, labevents, labitems = (
    pd.read_csv(f"{raw_path}/{table_name}.csv.gz")
    for table_name in ("patients", "admissions", "labevents", "d_labitems")
)

# Report (rows, columns) for each table as a quick sanity check.
print("Loaded:")
print("  patients:", patients.shape)
print("  admissions:", admissions.shape)
print("  labevents:", labevents.shape)
print("  labitems:", labitems.shape)

Loading raw MIMIC CSVs...
Loaded:
  patients: (100, 6)
  admissions: (275, 16)
  labevents: (107727, 16)
  labitems: (1622, 4)


In [6]:
# Column names of the patients table, as a plain Python list.
patients.columns.tolist()

['subject_id',
 'gender',
 'anchor_age',
 'anchor_year',
 'anchor_year_group',
 'dod']

In [7]:
# Column names of the admissions table, as a plain Python list.
admissions.columns.tolist()

['subject_id',
 'hadm_id',
 'admittime',
 'dischtime',
 'deathtime',
 'admission_type',
 'admit_provider_id',
 'admission_location',
 'discharge_location',
 'insurance',
 'language',
 'marital_status',
 'race',
 'edregtime',
 'edouttime',
 'hospital_expire_flag']

In [8]:
# Column names of the lab events table, as a plain Python list.
labevents.columns.tolist()

['labevent_id',
 'subject_id',
 'hadm_id',
 'specimen_id',
 'itemid',
 'order_provider_id',
 'charttime',
 'storetime',
 'value',
 'valuenum',
 'valueuom',
 'ref_range_lower',
 'ref_range_upper',
 'flag',
 'priority',
 'comments']

In [9]:
# Column names of the lab item dictionary table, as a plain Python list.
labitems.columns.tolist()

['itemid', 'label', 'fluid', 'category']

In [10]:
# Load the processed MIMIC table (per its file name: 24h labs plus demographics).
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# suggests the CSV was written together with its index; consider index_col=0
# here or index=False where the file is saved — confirm before changing.
df = pd.read_csv("../data/processed_mimic/processed_mimic_24h_labs_demographics.csv")


In [11]:
# Sanity check: print the (rows, columns) shape, then display the first rows.
print(df.shape) 
df.head()

(234, 25)


Unnamed: 0,subject_id,hadm_id,Sodium,Potassium,Chloride,Creatinine,Urea Nitrogen,Hematocrit,Hemoglobin,WBC,...,admission_type,admission_location,discharge_location,insurance,language,marital_status,race,hospital_expire_flag,length_of_stay_hours,ed_wait_time_hours
0,10000032,22595853.0,137.0,4.5,105.0,0.3,25.0,37.6,12.7,,...,URGENT,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,WIDOWED,WHITE,0,18.866667,4.216667
1,10000032,22841357.0,126.0,5.2,92.0,0.3,29.0,35.5,12.4,13.0,...,EW EMER.,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,0,24.366667,5.616667
2,10000032,25742920.0,126.0,6.6,94.5,0.6,37.0,34.6,12.1,,...,EW EMER.,EMERGENCY ROOM,HOSPICE,Medicaid,ENGLISH,WIDOWED,WHITE,0,42.1,4.766667
3,10000032,29079034.0,131.0,4.85,102.0,0.45,30.5,34.8,11.9,,...,EW EMER.,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,0,53.333333,8.1
4,10001217,27703517.0,142.0,4.1,104.0,0.5,11.0,37.4,12.5,,...,DIRECT EMER.,PHYSICIAN REFERRAL,HOME HEALTH CARE,Other,?,MARRIED,WHITE,0,141.95,


In [30]:
def gen_linear_gaussian_data(adj, n_samples, seed=42, noise_scale=1.0):
    """Sample data from a linear-Gaussian structural equation model on a DAG.

    Every variable equals a weighted sum of its parents plus independent
    Gaussian noise; variables without parents are pure noise. Edge weights
    are drawn once from ``Normal(0.8, 0.3)``.

    Variables are generated in a topological order derived from ``adj``
    (smallest index first among the ready nodes). For a matrix whose edges
    all point from a lower to a higher index this is exactly ``0..d-1``, so
    results for such inputs are unchanged. For any other node numbering the
    parents are now guaranteed to be sampled before their children, instead
    of being read while still all-zero.

    Parameters
    ----------
    adj : array-like of shape (d, d)
        Adjacency matrix; ``adj[i, j] == 1`` marks ``i`` as a parent of ``j``.
    n_samples : int
        Number of rows to generate.
    seed : int, default 42
        Seed for ``numpy.random.default_rng``.
    noise_scale : float, default 1.0
        Standard deviation of the additive noise of every variable.

    Returns
    -------
    numpy.ndarray of shape (n_samples, d)
        The sampled data, one column per variable.

    Raises
    ------
    ValueError
        If the edges marked in ``adj`` contain a cycle (including a
        self-loop), in which case no generation order exists.
    """
    import heapq  # local import keeps the cell self-contained

    adj = np.asarray(adj)
    rng = np.random.default_rng(seed)
    d = adj.shape[0]
    X = np.zeros((n_samples, d))

    # Random weights for edges (entries without an edge are zeroed by adj).
    W = adj * rng.normal(loc=0.8, scale=0.3, size=adj.shape)

    # Parent index arrays, plus the reverse child lists needed for the sort.
    parents_of = [np.flatnonzero(adj[:, j] == 1) for j in range(d)]
    children_of = [[] for _ in range(d)]
    for child, parent_ids in enumerate(parents_of):
        for parent in parent_ids:
            children_of[int(parent)].append(child)

    # Kahn's algorithm with a min-heap: always emit the smallest ready index,
    # which reproduces 0..d-1 whenever that order is already valid.
    pending = [len(parent_ids) for parent_ids in parents_of]
    ready = [j for j in range(d) if pending[j] == 0]
    heapq.heapify(ready)
    order = []
    while ready:
        node = heapq.heappop(ready)
        order.append(node)
        for child in children_of[node]:
            pending[child] -= 1
            if pending[child] == 0:
                heapq.heappush(ready, child)

    if len(order) != d:
        raise ValueError("adj must describe a directed acyclic graph (cycle detected)")

    for j in order:
        parents = parents_of[j]
        if len(parents) == 0:
            X[:, j] = rng.normal(0, noise_scale, size=n_samples)
        else:
            X[:, j] = X[:, parents] @ W[parents, j] + rng.normal(0, noise_scale, size=n_samples)

    return X




In [31]:
# Re-read the processed table so this cell does not rely on `df` from an earlier cell.
df = pd.read_csv("../data/processed_mimic/processed_mimic_24h_labs_demographics.csv")

# Columns passed on to MVPC: ten lab measurements, then age and length of stay.
mvpc_vars = [
    "Sodium",
    "Potassium",
    "Chloride",
    "Creatinine",
    "Urea Nitrogen",
    "Hematocrit",
    "Hemoglobin",
    "WBC",
    "Platelet Count",
    "Glucose",
    "anchor_age",
    "length_of_stay_hours",
]

# Extract the selected columns as a plain array.
X_real = df.loc[:, mvpc_vars].to_numpy()

# NOTE(review): StandardScaler is not imported in any visible cell; on a fresh
# kernel this line raises NameError unless the import lives elsewhere — confirm.
# NOTE(review): the preview of this table shows missing values (e.g. WBC), so
# despite its name X_complete presumably still contains NaNs — verify.
scaler = StandardScaler()
X_complete = scaler.fit_transform(X_real)

print("X_complete shape:", X_complete.shape)


X_complete shape: (234, 12)


In [32]:
# Number of variables and a name -> column-position lookup for addressing the matrix.
num_var = len(mvpc_vars)
idx = dict(zip(mvpc_vars, range(num_var)))

# Ground-truth adjacency matrix, created with no edges.
adj_true = np.zeros((num_var, num_var))

# ... your collider definitions ...
# TODO: fill in the edges here; until then adj_true stays all zeros.


In [None]:
# Find the colliders of the ground-truth graph and their parents.
# NOTE(review): detect_colliders / detect_collider_parents / create_mar_ind /
# create_mnar_ind are not imported in any visible cell — confirm where they come from.
colliders = detect_colliders(adj_true)
collider_parents = detect_collider_parents(adj_true, colliders)

# One seed reused by every stochastic call in the cells that follow.
seed = 42

# MAR configuration. Presumably `ms_*` are the indices of the variables that
# receive missing values and `prt_ms_*` the variables driving that missingness
# — TODO confirm against the helper's definition.
ms_mar, prt_ms_mar = create_mar_ind(
    colliders, collider_parents, num_var, num_extra_e=3, num_m=6, seed=seed
)

# MNAR configuration, drawn with the same arguments and seed as the MAR one.
ms_mnar, prt_ms_mnar = create_mnar_ind(
    colliders, collider_parents, num_var, num_extra_e=3, num_m=6, seed=seed
)


In [None]:
def build_prt_m_from_ms(ms, prt_ms):
    """Group the entries of ``prt_ms`` by the matching entry of ``ms``.

    ``ms`` and ``prt_ms`` are walked in parallel (stopping at the shorter one).
    Every value of ``prt_ms`` is collected, in encounter order, under the
    ``ms`` value found at the same position.

    Returns
    -------
    dict
        ``{"m": sorted distinct values of ms, "prt": {m_value: [prt values]}}``.
    """
    grouped = {}
    for indicator, parent in zip(ms, prt_ms):
        if indicator not in grouped:
            grouped[indicator] = []
        grouped[indicator].append(parent)

    return {"m": sorted(grouped), "prt": grouped}


In [None]:
# Build three incomplete versions of X_complete.
# NOTE(review): generate_missing_values / generate_mcar_reference are not
# imported in any visible cell; p_missing_h / p_missing_l are presumably the
# high / low missingness probabilities — TODO confirm.

# MAR dataset, driven by the MAR indicators and their parents.
X_mar = generate_missing_values(
    X_complete=X_complete,
    ms=ms_mar,
    prt_ms=prt_ms_mar,
    p_missing_h=0.9,
    p_missing_l=0.1,
    seed=seed,
)

# MCAR reference dataset, derived from the MAR dataset and the MAR indicators.
X_mcar = generate_mcar_reference(
    X_complete=X_complete,
    X_mar=X_mar,
    ms=ms_mar,
    seed=seed,
)

# MNAR dataset: same probabilities and seed as the MAR call, MNAR indicators/parents.
X_mnar = generate_missing_values(
    X_complete=X_complete,
    ms=ms_mnar,
    prt_ms=prt_ms_mnar,
    p_missing_h=0.9,
    p_missing_l=0.1,
    seed=seed,
)


In [None]:
# Convert each (ms, prt_ms) pair into the {"m": ..., "prt": ...} dict built by
# build_prt_m_from_ms, the form passed to the MVPC runs below.
prt_m_mar  = build_prt_m_from_ms(ms_mar,  prt_ms_mar)
prt_m_mnar = build_prt_m_from_ms(ms_mnar, prt_ms_mnar)
# prt_m_mcar is the very same dict object as prt_m_mar (an alias, not a copy),
# so an in-place change to one is visible through the other.
prt_m_mcar = prt_m_mar  # MCAR uses same structure, permuted mask


In [None]:
def run_mvpc_oracle_all_methods(X_m, adj_true, prt_m):
    """Run the oracle MVPC once per correction test and score each result.

    Every run uses ``gauss_ci_td`` as the independence test and varies only
    the correction test: ``gauss_ci_td`` ("TD"), ``gauss_ci_permc`` ("PERMC")
    and ``gauss_ci_drw`` ("DRW"). The ``"G_corrected"`` entry of each run is
    compared with ``adj_true`` through ``shd_directed`` and ``f1_directed``.

    Parameters
    ----------
    X_m : data passed to ``MVPC_Oracle.run``.
    adj_true : reference graph handed to the two scoring functions.
    prt_m : second argument of ``MVPC_Oracle.run``.

    Returns
    -------
    dict
        ``{"TD" | "PERMC" | "DRW": {"shd": ..., "f1": ...}}`` in that order.
    """
    correction_tests = (
        ("TD", gauss_ci_td),
        ("PERMC", gauss_ci_permc),
        ("DRW", gauss_ci_drw),
    )

    scores = {}
    for label, corr_test in correction_tests:
        oracle = MVPC_Oracle(indep_test=gauss_ci_td, corr_test=corr_test)
        graph = oracle.run(X_m, prt_m)["G_corrected"]
        scores[label] = {
            "shd": shd_directed(graph, adj_true),
            "f1": f1_directed(graph, adj_true),
        }

    return scores


In [None]:
# Evaluate all correction methods on each missingness mechanism, pairing every
# dataset with its own parent structure; key order is MAR, MNAR, MCAR.
results_oracle = {
    mode: run_mvpc_oracle_all_methods(X_m, adj_true, prt_m)
    for mode, X_m, prt_m in (
        ("MAR", X_mar, prt_m_mar),
        ("MNAR", X_mnar, prt_m_mnar),
        ("MCAR", X_mcar, prt_m_mcar),
    )
}

results_oracle


In [None]:
# Flatten the nested {mode: {method: metrics}} results into one record per
# (mode, method) pair, casting both metrics to plain floats.
rows = [
    {
        "mode": mode,
        "method": method,
        "shd": float(metrics["shd"]),
        "f1": float(metrics["f1"]),
    }
    for mode, methods in results_oracle.items()
    for method, metrics in methods.items()
]

# Build the table in one go from the collected records.
df_oracle_results = pd.DataFrame(rows)
df_oracle_results
