In [1]:
# autoreload
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from npj_utils import *
from sklearn.multioutput import MultiOutputClassifier
import xgboost as xgb

In [3]:
mimic_iv_path = "/cis/home/charr165/Documents/physionet.org/mimiciv/2.2"
mm_dir = "/cis/home/charr165/Documents/multimodal"

output_dir = os.path.join(mm_dir, "preprocessing")

In [5]:
import pickle

base_name = "pheno" # ihm, los, pheno

if "pheno" in base_name:
    base_name += "-all"
else:
    base_name += "-48"

base_name += "-cxr-notes-ecg"

f_path = os.path.join(output_dir, f"train_{base_name}_stays.pkl")

with open(f_path, "rb") as f:
    train_stays = pickle.load(f)

f_path = os.path.join(output_dir, f"val_{base_name}_stays.pkl")

with open(f_path, "rb") as f:
    val_stays = pickle.load(f)

f_path = os.path.join(output_dir, f"test_{base_name}_stays.pkl")

with open(f_path, "rb") as f:
    test_stays = pickle.load(f)

In [6]:
include_notes = True
include_cxr = False
include_ecg = False

In [7]:
X_train = calc_ts_embeddings(train_stays)

if include_notes:
    txt_df = calc_avg_text_embedding(train_stays)
    X_train = pd.concat([X_train, txt_df], axis=1)

if include_cxr:
    cxr_df = calc_avg_cxr_embedding(train_stays)
    X_train = pd.concat([X_train, cxr_df], axis=1)

if include_ecg:
    ecg_df = calc_avg_ecg_embedding(train_stays)
    X_train = pd.concat([X_train, ecg_df], axis=1)

y_train = extract_labels(train_stays)

col_names = X_train.columns

Calculating Time Series Embeddings:   0%|          | 0/6349 [00:00<?, ?it/s]

Calculating Time Series Embeddings: 100%|██████████| 6349/6349 [00:42<00:00, 149.37it/s]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(
  df = pd.concat([df, curr_df], axis=0, ignore_index=True)
Calculating Text Embeddings: 100%|██████████| 6349/6349 [00:08<00:00, 737.79it/s] 


In [8]:
X_test = calc_ts_embeddings(test_stays)

if include_notes:
    txt_df = calc_avg_text_embedding(test_stays)
    X_test = pd.concat([X_test, txt_df], axis=1)

if include_cxr:
    cxr_df = calc_avg_cxr_embedding(test_stays)
    X_test = pd.concat([X_test, cxr_df], axis=1)

if include_ecg:
    ecg_df = calc_avg_ecg_embedding(test_stays)
    X_test = pd.concat([X_test, ecg_df], axis=1)

y_test = extract_labels(test_stays)

Calculating Time Series Embeddings: 100%|██████████| 1361/1361 [00:08<00:00, 151.64it/s]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(
  df = pd.concat([df, curr_df], axis=0, ignore_index=True)
Calculating Text Embeddings: 100%|██████████| 1361/1361 [00:00<00:00, 3666.35it/s]


In [9]:
X_val = calc_ts_embeddings(val_stays)

if include_notes:
    txt_df = calc_avg_text_embedding(val_stays)
    X_val = pd.concat([X_val, txt_df], axis=1)

if include_cxr:
    cxr_df = calc_avg_cxr_embedding(val_stays)
    X_val = pd.concat([X_val, cxr_df], axis=1)

if include_ecg:
    ecg_df = calc_avg_ecg_embedding(val_stays)
    X_val = pd.concat([X_val, ecg_df], axis=1)

y_val = extract_labels(val_stays)

Calculating Time Series Embeddings: 100%|██████████| 1360/1360 [00:09<00:00, 150.35it/s]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(
  df = pd.concat([df, curr_df], axis=0, ignore_index=True)
Calculating Text Embeddings: 100%|██████████| 1360/1360 [00:00<00:00, 3646.15it/s]


In [10]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

In [None]:
from prediction_util import run_xgb, run_xgb_multilabel

seed = 1
if "pheno" in base_name:
    y_pred_test, y_pred_prob_test, y_pred_train, y_pred_prob_train, gs = run_xgb_multilabel(X_train, y_train, X_test, gpu=0, seed=seed, n_jobs=16)
else:
    y_pred_test, y_pred_prob_test, y_pred_train, y_pred_prob_train, gs = run_xgb(X_train, y_train, X_test, gpu=0, seed=seed, n_jobs=16)

In [13]:
est = MultiOutputClassifier(xgb.XGBClassifier(verbosity=2, seed=42,
                                                  tree_method='gpu_hist', gpu_id=1,
                                                  eval_metric='logloss', n_jobs=32))
est.fit(X_train, y_train)
y_pred_prob_test = est.predict_proba(X_test)
y_pred_test = est.predict(X_test)
# Evaluate
_ = evaluate_model(y_test, y_pred_test, y_pred_prob_test)


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_metho

AUC (micro): 0.6941886503724302
AUC (macro): 0.6310456193884256
AUC (weighted): 0.6573607075090783
F1 Score (macro): 0.4303015517528131


In [None]:
est = xgb.XGBClassifier(verbosity=2, scale_pos_weight=(len(y_train) - sum(y_train)) / sum(y_train), seed=0,
                             device="cuda", n_jobs=32)
est.fit(X_train, y_train)

y_pred_test = est.predict(X_test)
y_pred_prob_test = est.predict_proba(X_test)

# Evaluate
_ = evaluate_model(y_test, y_pred_test, y_pred_prob_test[:,1])

In [None]:
est.feature_importances_

# Get the top 10 most important features
indices = np.argsort(est.feature_importances_)[::-1]
top_indices = indices[:100]
print('Feature ranking:')
for i in range(50):
    print('%d. %s (%f)' % (i + 1, col_names[top_indices[i]], est.feature_importances_[top_indices[i]]))

In [None]:
# print("TRAIN")
# _ = evaluate_model(y_train, y_pred_train, y_pred_prob_train)

print(f"Task: {base_name}")
print(f"Seed: {seed}")

modals = "ts"

if include_notes:
    modals += "+text"

if include_cxr:
    modals += "+cxr"

if include_ecg:
    modals += "+ecg"

print(f"Modals: {modals}")

print("\n\nTEST")
_ = evaluate_model(y_test, y_pred_test, y_pred_prob_test)