In [3]:
import sys
import os

# Path to the project root (one level above the notebooks folder)
project_root = os.path.abspath("..")

# Add to Python path if not already present
if project_root not in sys.path:
    sys.path.append(project_root)

print("Project root added:", project_root)

Project root added: /home/zervaki/Thesis_New


In [7]:
import os
os.getcwd()


'/home/zervaki/Thesis_New/notebooks'

In [4]:

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:

from data.processed_mimic.aggregate_mimic import (
    load_mimic_tables, 
    select_labs, 
    aggregate_labs, 
    merge_with_patients_admissions, 
    save_dataset
)

In [6]:
# MIMIC iv dataset
patients, admissions, labevents, labitems = load_mimic_tables()
selected_labs = ['Potassium', 'Sodium', 'Creatinine', 'Chloride', 'Urea Nitrogen', 'Hematocrit']
filtered = select_labs(labevents, labitems, selected_labs)
admission_labs = aggregate_labs(filtered)
full_data = merge_with_patients_admissions(admission_labs, patients, admissions)
save_dataset(full_data)


Loading raw MIMIC CSVs...


FileNotFoundError: [Errno 2] No such file or directory: 'data/raw_mimic/mimic-iv-clinical-database-demo-2.2/hosp/patients.csv.gz'

In [8]:
import os
import pandas as pd

# ---------------------------------------------------------
# 1. Define the correct raw path relative to the notebook
# ---------------------------------------------------------
raw_path = "../data/raw_mimic/mimic-iv-clinical-database-demo-2.2/hosp"

# ---------------------------------------------------------
# 2. Load raw MIMIC tables
# ---------------------------------------------------------
print("Loading raw MIMIC CSVs...")

patients = pd.read_csv(f"{raw_path}/patients.csv.gz")
admissions = pd.read_csv(f"{raw_path}/admissions.csv.gz")
labevents = pd.read_csv(f"{raw_path}/labevents.csv.gz")
labitems = pd.read_csv(f"{raw_path}/d_labitems.csv.gz")

print("Loaded:")
print("  patients:", patients.shape)
print("  admissions:", admissions.shape)
print("  labevents:", labevents.shape)
print("  labitems:", labitems.shape)

# ---------------------------------------------------------
# 3. Select labs
# ---------------------------------------------------------
selected_labs = [
    "Potassium", "Sodium", "Creatinine",
    "Chloride", "Urea Nitrogen", "Hematocrit"
]

itemids = labitems[labitems["label"].isin(selected_labs)]["itemid"].tolist()

filtered = (
    labevents[labevents["itemid"].isin(itemids)]
    .merge(labitems[["itemid", "label"]], on="itemid", how="left")
)

print("Filtered lab events:", filtered.shape)

# ---------------------------------------------------------
# 4. Aggregate per admission
# ---------------------------------------------------------
agg = (
    filtered.groupby(["subject_id", "hadm_id", "label"])["valuenum"]
    .mean()
    .reset_index()
)

pivot = (
    agg.pivot_table(
        index=["subject_id", "hadm_id"],
        columns="label",
        values="valuenum"
    )
    .reset_index()
)

print("Aggregated pivot:", pivot.shape)

# ---------------------------------------------------------
# 5. Merge with admissions + patients
# ---------------------------------------------------------
full_data = (
    pivot
    .merge(admissions, on=["subject_id", "hadm_id"], how="left")
    .merge(patients, on="subject_id", how="left")
)

print("Final merged dataset:", full_data.shape)

# ---------------------------------------------------------
# 6. Inspect missingness
# ---------------------------------------------------------
print("\nMissing values per column:")
print(full_data.isnull().sum())

# ---------------------------------------------------------
# 7. Save processed dataset
# ---------------------------------------------------------
output_path = "../data/processed_mimic/processed_admissions_selected_labs.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

full_data.to_csv(output_path, index=False)
print(f"\nSaved processed dataset to: {output_path}")


Loading raw MIMIC CSVs...
Loaded:
  patients: (100, 6)
  admissions: (275, 16)
  labevents: (107727, 16)
  labitems: (1622, 4)
Filtered lab events: (17900, 17)
Aggregated pivot: (251, 8)
Final merged dataset: (251, 27)

Missing values per column:
subject_id                0
hadm_id                   0
Chloride                  1
Creatinine                1
Hematocrit                3
Potassium                 1
Sodium                    1
Urea Nitrogen             1
admittime                 0
dischtime                 0
deathtime               236
admission_type            0
admit_provider_id         0
admission_location        0
discharge_location       21
insurance                 0
language                  0
marital_status           12
race                      0
edregtime                87
edouttime                87
hospital_expire_flag      0
gender                    0
anchor_age                0
anchor_year               0
anchor_year_group         0
dod                     1

In [9]:
list(full_data.columns)


['subject_id',
 'hadm_id',
 'Chloride',
 'Creatinine',
 'Hematocrit',
 'Potassium',
 'Sodium',
 'Urea Nitrogen',
 'admittime',
 'dischtime',
 'deathtime',
 'admission_type',
 'admit_provider_id',
 'admission_location',
 'discharge_location',
 'insurance',
 'language',
 'marital_status',
 'race',
 'edregtime',
 'edouttime',
 'hospital_expire_flag',
 'gender',
 'anchor_age',
 'anchor_year',
 'anchor_year_group',
 'dod']

In [10]:
import pandas as pd

# ---------------------------------------------------------
# 1. Select the core continuous physiology variables
# ---------------------------------------------------------
phys_vars = [
    "Chloride",
    "Creatinine",
    "Hematocrit",
    "Potassium",
    "Sodium",
    "Urea Nitrogen"
]

df_phys = full_data[phys_vars].copy()

# ---------------------------------------------------------
# 2. Drop rows with structural missingness
#    (only 1â€“3 rows per variable, so this is fine)
# ---------------------------------------------------------
df_phys = df_phys.dropna().reset_index(drop=True)

print("Shape after dropping structural NaNs:", df_phys.shape)

# ---------------------------------------------------------
# 3. Standardize the continuous variables
# ---------------------------------------------------------
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_phys_scaled = pd.DataFrame(
    scaler.fit_transform(df_phys),
    columns=phys_vars
)

df_phys_scaled.head()


Shape after dropping structural NaNs: (247, 6)


Unnamed: 0,Chloride,Creatinine,Hematocrit,Potassium,Sodium,Urea Nitrogen
0,0.692642,-0.754144,1.065523,0.741277,-0.129685,0.028875
1,-1.898565,-0.754144,0.672999,2.298458,-2.824834,0.230984
2,-1.549749,-0.640795,0.40197,4.02248,-3.641545,0.348881
3,-0.237535,-0.663465,0.28982,1.779398,-1.681437,0.382566
4,0.333859,-0.618125,0.616924,-0.548958,0.360342,-0.779559


In [11]:
import numpy as np
import pandas as pd

# ---------------------------------------------------------
# 1. Load your standardized physiology dataset
# ---------------------------------------------------------
X_complete = df_phys_scaled.values   # shape (247, 6)
num_var = X_complete.shape[1]

print("Complete data shape:", X_complete.shape)

# ---------------------------------------------------------
# 2. Generate a random DAG for the 6 variables
# ---------------------------------------------------------
from data.synthetic_data_generation.dag_and_data import (
    random_dag, detect_colliders, detect_collider_parents
)

G, adj = random_dag(n_nodes=num_var, seed=42)
colliders = detect_colliders(adj)
collider_parents = detect_collider_parents(adj, colliders)

print("Adjacency matrix:\n", adj)
print("Colliders:", colliders)
print("Collider parents:", collider_parents)

# ---------------------------------------------------------
# 3. Create MAR missingness indicators
# ---------------------------------------------------------
from data.synthetic_data_generation.missingness_synthetic import (
    create_mar_ind, generate_missing_values, generate_mcar_reference
)

ms_mar, prt_ms_mar = create_mar_ind(
    colliders=colliders,
    collider_parents=collider_parents,
    num_var=num_var,
    num_extra_e=3,
    num_m=6,
    seed=42
)

print("MAR missingness indicators:", ms_mar)
print("MAR parents:", prt_ms_mar)

# ---------------------------------------------------------
# 4. Apply MAR missingness
# ---------------------------------------------------------
X_mar = generate_missing_values(
    X_complete=X_complete,
    ms=ms_mar,
    prt_ms=prt_ms_mar,
    p_missing_h=0.9,
    p_missing_l=0.1,
    seed=42
)

print("MAR missingness summary:")
print(pd.DataFrame(X_mar).isnull().mean())

# ---------------------------------------------------------
# 5. Generate MCAR reference (optional)
# ---------------------------------------------------------
X_mcar = generate_mcar_reference(
    X_complete=X_complete,
    X_mar=X_mar,
    ms=ms_mar,
    seed=42
)

# ---------------------------------------------------------
# 6. Create MNAR missingness indicators
# ---------------------------------------------------------
from data.synthetic_data_generation.missingness_synthetic import create_mnar_ind

ms_mnar, prt_ms_mnar = create_mnar_ind(
    colliders=colliders,
    collider_parents=collider_parents,
    num_var=num_var,
    num_extra_e=3,
    num_m=6,
    seed=42
)

print("MNAR missingness indicators:", ms_mnar)
print("MNAR parents:", prt_ms_mnar)

# ---------------------------------------------------------
# 7. Apply MNAR missingness
# ---------------------------------------------------------
X_mnar = generate_missing_values(
    X_complete=X_complete,
    ms=ms_mnar,
    prt_ms=prt_ms_mnar,
    p_missing_h=0.9,
    p_missing_l=0.1,
    seed=42
)

print("MNAR missingness summary:")
print(pd.DataFrame(X_mnar).isnull().mean())

# ---------------------------------------------------------
# 8. Save datasets
# ---------------------------------------------------------
df_mar = pd.DataFrame(X_mar, columns=df_phys_scaled.columns)
df_mnar = pd.DataFrame(X_mnar, columns=df_phys_scaled.columns)
df_mcar = pd.DataFrame(X_mcar, columns=df_phys_scaled.columns)

df_mar.to_csv("../data/processed_mimic/mimic_mar.csv", index=False)
df_mnar.to_csv("../data/processed_mimic/mimic_mnar.csv", index=False)
df_mcar.to_csv("../data/processed_mimic/mimic_mcar.csv", index=False)

print("Saved MAR, MNAR, MCAR datasets.")


Complete data shape: (247, 6)
Adjacency matrix:
 [[0 0 0 0 0 0]
 [1 0 0 1 0 0]
 [0 1 0 0 1 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [1 0 0 1 1 0]]
Colliders: [0, 3, 4]
Collider parents: [[np.int64(1), np.int64(5)], [np.int64(1), np.int64(5)], [np.int64(2), np.int64(5)]]


IndexError: list index out of range