## **Setup**

In [1]:
# Shell escape: enable the `widgetsnbextension` notebook extension for this Jupyter install.
# NOTE(review): presumably needed so the progress bars in the training cell render as widgets — confirm.
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


In [2]:
import os
import sys
import os.path as op
import numpy as np
from functools import partial
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, StochasticWeightAveraging
from pytorch_lightning.loggers import WandbLogger

import torch
from torch.utils.data import DataLoader
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

sys.path.append("..")
from mtecg import MultiTaskClinicalCNNDataset, MultiTaskClinicalCNNModel
from mtecg.utils import load_ecg_dataframe


# Use one seed for NumPy, Lightning and PyTorch (CPU and CUDA) so that repeated
# runs of this notebook start from the same random state.
SEED = 42
np.random.seed(SEED)
# Lightning helper; it printed "Global seed set to 42" in the saved output.
# NOTE(review): `workers=True` presumably also seeds dataloader workers — confirm in the Lightning docs.
seed_everything(SEED, workers=True)
torch.manual_seed(SEED)
# Request deterministic cuDNN behavior and switch off its benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.cuda.manual_seed(SEED)

c:\Anaconda3\envs\ecg\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Anaconda3\envs\ecg\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
Global seed set to 42


In [3]:
# Run-level settings that the cells below read by name.
# `lvef_threshold` is passed to MultiTaskClinicalCNNDataset and is embedded in the run name.
lvef_threshold = 50
# Target size handed to A.Resize in both the train and the valid transform.
image_size= (384, 384)
# Batch size of both DataLoaders (also logged to the wandb run config).
batch_size = 16
# Passed to Trainer(max_epochs=...).
num_epochs = 10

# Keyword arguments for MultiTaskClinicalCNNModel; the dict is unpacked with `**configs`.
# NOTE(review): the meaning of each key is defined by the model class in `mtecg` — confirm there.
configs = {
    # ECG image.
    "in_channels": 3,
    "learning_rate": 5e-4,
    "use_timm": True,
    "pretrained": True,
    "backbone": "resnet34d",
    "latent_dim": 512,
    "scar_class": 2,
    "lvef_class": 2,
    "scar_lvef_loss_ratio": [0.7, 0.3],
    "bias_head": True,
    # Clinical features.
    "embedding_size" : 5,
    "rnn_type": "lstm",
    "num_rnn_layers" : 1,
    "rnn_output_size" : 128,
    "num_categorical_features" : 5,
    "num_numerical_features" : 1,
    # Specify the device.
    "device": "cuda",
    }

In [12]:
# Root directory that collects every run of this model family.
parent_save_dir = "../trained_models/multi-task-clinical"
os.makedirs(parent_save_dir, exist_ok=True)

# Run name layout: <backbone>_<image side>_LVEF<threshold>_dim<rnn output size>.
run_suffix = "_".join(
    (
        f"{image_size[0]}",
        f"LVEF{lvef_threshold}",
        f"dim{configs['rnn_output_size']}",
    )
)
run_name = "_".join((configs["backbone"], run_suffix))

# Directory that receives the artifacts of this particular run.
os.makedirs(op.join(parent_save_dir, run_name), exist_ok=True)

## **Prepare the data**

In [5]:
# Locations of the ECG images and of the label / clinical-feature CSV, relative to this notebook.
image_dir = "../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_images_new/"
csv_path = "../../ECG_EF_Clin_train_dev_new.csv"

# Load the table through the project helper. The saved output shows the result carries
# a "path" column and a "split" column (old_train / old_valid / old_test / new_train / new_valid).
# NOTE(review): the exact effect of `drop_impute` and `do_split` is defined in mtecg.utils — confirm there.
df = load_ecg_dataframe(csv_path, image_dir, drop_impute=False, do_split=True)
print(f"Number of images: {len(df)}")
print(f"Unique splits: {df['split'].unique()}")
# Preview the first rows as the cell output.
df.head(5)

Number of images: 13343
Unique splits: ['old_train' 'old_valid' 'old_test' 'new_train' 'new_valid']


Unnamed: 0,run_num,train_80_percent,develop_10_percent,file_name,lvef,scar_cad,hcm,mri_date,month,year,...,dm,ht,mi,pci,cabg,ua,chest pain,dyspnea,path,split
0,1,1.0,,2009_420521391,0,0,0,2552-08-01 00:00:00,8,2009,...,0,1,0,0,0,0,1,0,../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_i...,old_train
1,2,1.0,,2009_472422791,0,0,0,2552-08-01 00:00:00,8,2009,...,0,1,0,0,0,0,1,0,../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_i...,old_train
2,3,1.0,,2009_451191451,0,0,0,2552-08-01 00:00:00,8,2009,...,0,1,0,0,0,0,1,1,../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_i...,old_train
3,4,1.0,,2009_512029431,1,1,0,2552-08-01 00:00:00,8,2009,...,1,0,1,1,0,0,0,1,../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_i...,old_train
4,5,1.0,,2009_461543281,1,1,0,2552-08-04 00:00:00,8,2009,...,0,1,0,0,0,0,1,1,../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_i...,old_train


In [9]:
# On rows flagged with impute == True, overwrite "dm", "ht", "smoke" and "dlp" with np.nan
# so the IterativeImputer (configured with missing_values=np.nan) fills them in later.
# This modifies `df` in place.
df.loc[df["impute"] == True, ["dm", "ht", "smoke", "dlp"]] = np.nan

In [10]:
# Combine old train and new train.
# `reset_index()` is called without drop=True, so the previous index is kept as an extra column.
train_df = df[df.split.isin(["old_train", "new_train"])].reset_index()
# Combine old valid and new valid.
valid_df = df[df.split.isin(["old_valid", "new_valid"])].reset_index()

# Show both shapes as the cell output; rows with split "old_test" are in neither frame.
train_df.shape, valid_df.shape

((9393, 28), (2905, 28))

### Impute Values

In [13]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import joblib

# Multivariate imputer for the clinical features; np.nan marks the values to fill.
# NOTE(review): with sample_posterior=True the filled values are presumably sampled rather than
# point estimates, which would make random_state matter — confirm in the scikit-learn docs.
imputer = IterativeImputer(
    missing_values=np.nan,
    max_iter=10,
    sample_posterior=True,
    random_state=42
    )

# Columns given to the imputer: "age" and "female_gender" plus the four columns
# that were set to np.nan on rows flagged for imputation.
clinical_feature_columns = ["age", "female_gender", "dm", "ht", "smoke", "dlp"]

# Fit the imputer on the train set.
imputer.fit(train_df[clinical_feature_columns])

# Save the imputer.
# It is stored in the run directory so the same fitted object can be reloaded later.
imputer_path = op.join(parent_save_dir, run_name, "imputer.joblib")
joblib.dump(imputer, imputer_path)

['../trained_models/multi-task-clinical\\resnet34d_384_LVEF50_dim128\\imputer.joblib']

In [14]:
# Both frames are transformed with the imputer that was fitted on the train set only.
# All six clinical feature columns are overwritten with the transform output, in place.

# Impute missing values in the train set.
train_df[clinical_feature_columns] = imputer.transform(train_df[clinical_feature_columns])

# Impute missing values in the valid set.
valid_df[clinical_feature_columns] = imputer.transform(valid_df[clinical_feature_columns])

In [15]:
from typing import List
import pandas as pd

def find_best_thresholds(
    dataframe: pd.DataFrame,
    imputed_column_names: List[str] = ["dm", "ht", "smoke", "dlp"],
    ):
    """Find, per column, the cut-off that best binarizes the imputed values.

    Rows with ``impute == False`` hold the observed values and rows with
    ``impute == True`` hold continuous imputed values. For every column, each
    candidate threshold in ``np.arange(0.1, 0.9, 0.01)`` is applied to the
    imputed values (``value > threshold`` becomes 1, otherwise 0). The
    threshold whose resulting share of ones is closest to the share of ones in
    the observed rows is kept. On ties the smallest threshold wins.

    Args:
        dataframe: Table with a boolean "impute" column and every column listed
            in ``imputed_column_names``. It is not modified.
        imputed_column_names: Columns to compute a threshold for.

    Returns:
        Dict mapping each column name to ``[threshold, diff]``. ``threshold``
        is rounded to two decimals and ``diff`` is the absolute difference
        between the observed and the thresholded-imputed prevalence. If either
        group of rows is empty, ``diff`` is NaN and the first candidate
        threshold is returned.
    """
    # Explicit `== False` / `== True` comparisons: a row whose flag is missing
    # belongs to neither group.
    original_df = dataframe.loc[dataframe["impute"] == False, imputed_column_names]
    impute_df = dataframe.loc[dataframe["impute"] == True, imputed_column_names]
    num_original = len(original_df)
    num_imputed = len(impute_df)

    candidate_thresholds = np.arange(0.1, 0.9, 0.01)

    best_threshold_dict = {}
    for imputed_column_name in imputed_column_names:
        original_prevalence = original_df[imputed_column_name].sum() / num_original

        # Threshold only the column under consideration, for all candidates at once:
        # the comparison matrix has one row per threshold and one column per imputed value.
        imputed_values = impute_df[imputed_column_name].to_numpy()
        positive_counts = (
            imputed_values[np.newaxis, :] > candidate_thresholds[:, np.newaxis]
        ).sum(axis=1)
        diffs = np.abs(original_prevalence - positive_counts / num_imputed)

        # argmin returns the first minimum, i.e. the smallest threshold among ties.
        best_index = int(np.argmin(diffs))
        best_threshold_dict[imputed_column_name] = [
            round(candidate_thresholds[best_index], 2),
            diffs[best_index],
        ]
    return best_threshold_dict


def apply_thresholds(
    dataframe: pd.DataFrame,
    best_threshold_dict: dict,
    imputed_column_names: List[str] = ["dm", "ht", "smoke", "dlp"],
    ):
    """Binarize the imputed values of each column with its stored threshold.

    Only rows with ``impute == True`` are touched. In those rows a value
    becomes 1 when it is strictly greater than the column's threshold and 0
    otherwise.

    Args:
        dataframe: Table with a boolean "impute" column and every column listed
            in ``imputed_column_names``. It is modified in place.
        best_threshold_dict: Mapping from column name to a sequence whose first
            item is the threshold, as returned by ``find_best_thresholds``.
        imputed_column_names: Columns to binarize.

    Returns:
        The same ``dataframe`` object that was passed in.

    Raises:
        KeyError: If a column has no entry in ``best_threshold_dict``.
    """
    # The row mask depends only on the "impute" flag, so compute it once.
    is_imputed = dataframe["impute"] == True
    for imputed_column_name in imputed_column_names:
        threshold = best_threshold_dict[imputed_column_name][0]
        # Each column is read and written independently of the others.
        imputed_values = dataframe.loc[is_imputed, imputed_column_name].to_numpy()
        dataframe.loc[is_imputed, imputed_column_name] = (imputed_values > threshold).astype(int)
    return dataframe


In [18]:
# Find the best thresholds for imputing missing values from the train set.
# Only train_df is used for the search, so the validation rows do not influence the cut-offs.
best_threshold_dict = find_best_thresholds(train_df)

# Store the thresholds next to the fitted imputer in the run directory.
joblib.dump(best_threshold_dict, op.join(parent_save_dir, run_name, "imputer_threshold_dict.joblib"))

# Apply the best thresholds to the train set and the valid set.
train_df = apply_thresholds(train_df, best_threshold_dict)
valid_df = apply_thresholds(valid_df, best_threshold_dict)

In [12]:
# Training pipeline: resize, then blur / brightness-contrast / motion-blur augmentations,
# then normalization and conversion to a tensor.
train_transform = A.Compose([
    A.Resize(*image_size),
    A.Blur(blur_limit=3, p=0.2),
    A.RandomBrightnessContrast(),
    A.MotionBlur(p=0.2),
    A.Normalize(),
    ToTensorV2(),
])

# Validation pipeline: the same resize and normalization, without any augmentation.
valid_transform = A.Compose([
    A.Resize(*image_size),
    A.Normalize(),
    ToTensorV2()
])


# Project dataset class; it receives the dataframe, the transform and the LVEF threshold.
# NOTE(review): which dataframe columns it reads is defined in `mtecg` — confirm there.
train_ds = MultiTaskClinicalCNNDataset(train_df, train_transform, lvef_threshold=lvef_threshold)
valid_ds = MultiTaskClinicalCNNDataset(valid_df, valid_transform, lvef_threshold=lvef_threshold)

# Only the training loader shuffles; both use the same batch size and pinned memory.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_ds, batch_size=batch_size, shuffle=False, pin_memory=True)

## **Train**

In [13]:
# Build the model; every entry of the `configs` dict is passed as a keyword argument.
model = MultiTaskClinicalCNNModel(**configs)

In [None]:
import wandb
project_name = "ecg-multi-task-with-clinical-features"

# Set the notebook name in the environment before the wandb run is created.
os.environ["WANDB_NOTEBOOK_NAME"] = "ecg-multi-task-with-clinical-features.ipynb"
run = wandb.init(project=project_name, save_code=True)
# Upload the Python sources and notebooks found under the current directory.
run.log_code(".", include_fn=lambda path: path.endswith((".py", ".ipynb")))
run.config.update({"batch_size": batch_size})

# Keep only the single checkpoint with the lowest monitored "val_loss".
checkpoint_callback = ModelCheckpoint(
    filename=configs["backbone"] + "{val_acc:.2f}",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min",
)

logger = WandbLogger(
    project=project_name,
    name=configs["backbone"],
    # log_model = "all", # set to True to log at the end
)

logger.watch(
    model,
    # log_freq=300, # uncomment to log gradients
    log_graph=True,
)

In [15]:
# Train on GPU with the wandb logger, the best-val_loss checkpoint callback and
# stochastic weight averaging (1e-3 is the callback's first positional argument).
# NOTE(review): the saved log below ends with `max_epochs=11` although num_epochs is 10 in the
# settings cell; the recorded output seems to come from a run with a different value — confirm.
trainer = Trainer(
    accelerator="gpu",
    logger = logger,
    max_epochs = num_epochs,
    callbacks = [checkpoint_callback, StochasticWeightAveraging(1e-3)],
)

# Run training, validating on valid_loader.
trainer.fit(
    model,
    train_dataloaders = train_loader,
    val_dataloaders = valid_loader,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type             | Params
--------------------------------------------------------
0 | accuracy           | Accuracy         | 0     
1 | scar_loss_fn       | CrossEntropyLoss | 0     
2 | lvef_loss_fn       | CrossEntropyLoss | 0     
3 | model              | ResNet           | 21.6 M
4 | scar_head          | Linear           | 1.3 K 
5 | lvef_head          | Linear           | 1.3 K 
6 | embedding_layer    | Embedding        | 10    
7 | clinical_rnn_layer | LSTM             | 69.6 K
--------------------------------------------------------
21.6 M    Trainable params
0         Non-trainable params
21.6 M    Total params
86.553    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 588: 'val_loss' reached 0.32442 (best 0.32442), saving model to '.\\ecg-multi-task-with-clinical-features\\1yg3zjgs\\checkpoints\\resnet34dval_acc=0.86.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 1, global step 1176: 'val_loss' reached 0.29559 (best 0.29559), saving model to '.\\ecg-multi-task-with-clinical-features\\1yg3zjgs\\checkpoints\\resnet34dval_acc=0.87.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 2, global step 1764: 'val_loss' reached 0.27795 (best 0.27795), saving model to '.\\ecg-multi-task-with-clinical-features\\1yg3zjgs\\checkpoints\\resnet34dval_acc=0.88.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 3, global step 2352: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 4, global step 2940: 'val_loss' reached 0.27077 (best 0.27077), saving model to '.\\ecg-multi-task-with-clinical-features\\1yg3zjgs\\checkpoints\\resnet34dval_acc=0.88.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 3528: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 4116: 'val_loss' was not in top 1
Swapping scheduler `ReduceLROnPlateau` for `SWALR`


Validation: 0it [00:00, ?it/s]

Epoch 7, global step 4704: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 8, global step 5292: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 9, global step 5880: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 10, global step 6468: 'val_loss' was not in top 1
`Trainer.fit` stopped: `max_epochs=11` reached.


In [16]:
# All artifacts of this run go into the run directory.
run_dir = op.join(parent_save_dir, run_name)

# Checkpoint of the trainer's current state, plus the model configuration.
trainer.save_checkpoint(op.join(run_dir, "model.ckpt"))
model.save_configs(run_dir)

# Serialize both albumentations pipelines; the validation one is stored as "transform.json".
A.save(train_transform, op.join(run_dir, "train_transform.json"))
A.save(valid_transform, op.join(run_dir, "transform.json"))