## **Setup**

In [1]:
# Shell escape: enable the ipywidgets notebook extension for this Jupyter install.
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


In [14]:
import os
import sys
import os.path as op
import numpy as np
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

# Add the parent directory to the import path before importing from `mtecg`
# (presumably where the local `mtecg` package lives — confirm against the repo layout).
sys.path.append("..")
from mtecg.utils import load_ecg_dataframe, categorize_lvef


# One seed for NumPy's global RNG; the same value is passed as `random_state`
# to both XGBoost classifiers further down.
SEED = 42
np.random.seed(SEED)

In [9]:
# Cut-off handed to `categorize_lvef` when turning raw LVEF values into labels.
lvef_threshold = 50

# Clinical columns used as model inputs, in the order they are fed to XGBoost.
feature_list = [
    "female_gender",
    "age",
    "smoke",
    "dlp",
    "dm",
    "ht",
]

## **Prepare the data**

In [6]:
# Locations of the ECG images and of the CSV holding labels and split assignments.
image_dir = "../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_images_new/"
csv_path = "../../ECG_EF_Clin_train_dev_new.csv"

df = load_ecg_dataframe(csv_path, image_dir)

# Quick sanity check: row count and the split labels present in the frame.
n_images = len(df)
split_names = df["split"].unique()
print(f"Number of images: {n_images}")
print(f"Unique splits: {split_names}")

# Preview the first five rows (the default for `head`).
df.head()

Number of images: 13343
Unique splits: ['old_train' 'old_valid' 'old_test' 'new_train' 'new_valid']


Unnamed: 0,run_num,train_80_percent,develop_10_percent,file_name,lvef,scar_cad,hcm,mri_date,month,year,...,dm,ht,mi,pci,cabg,ua,chest pain,dyspnea,path,split
0,1,1.0,,2009_420521391,59.9,0,0,2552-08-01 00:00:00,8,2009,...,0,1,0,0,0,0,1,0,../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_i...,old_train
1,2,1.0,,2009_472422791,81.7,0,0,2552-08-01 00:00:00,8,2009,...,0,1,0,0,0,0,1,0,../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_i...,old_train
2,3,1.0,,2009_451191451,64.7,0,0,2552-08-01 00:00:00,8,2009,...,0,1,0,0,0,0,1,1,../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_i...,old_train
3,4,1.0,,2009_512029431,10.7,1,0,2552-08-01 00:00:00,8,2009,...,1,0,1,1,0,0,0,1,../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_i...,old_train
4,5,1.0,,2009_461543281,19.3,1,0,2552-08-04 00:00:00,8,2009,...,0,1,0,0,0,0,1,1,../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_i...,old_train


In [7]:
# Pool the old and new cohorts for each role; rows from any other split
# (e.g. "old_test") end up in neither frame.
train_splits = ["old_train", "new_train"]
valid_splits = ["old_valid", "new_valid"]

# NOTE: `reset_index()` without `drop=True` keeps the previous row labels as an
# extra column, which is counted in the shapes displayed below.
train_df = df.loc[df["split"].isin(train_splits)].reset_index()
valid_df = df.loc[df["split"].isin(valid_splits)].reset_index()

train_df.shape, valid_df.shape

((9393, 28), (2905, 28))

In [10]:
# Model inputs: the clinical feature columns only.
x_train, x_valid = train_df[feature_list], valid_df[feature_list]

# Scar target is used as stored in the `scar_cad` column.
y_train_scar, y_valid_scar = train_df["scar_cad"], valid_df["scar_cad"]


def _lvef_to_label(lvef):
    """Map one raw LVEF value to its class label using `lvef_threshold`."""
    return categorize_lvef(lvef, lvef_threshold)


# LVEF target is derived element-wise from the continuous `lvef` column.
y_train_lvef = train_df["lvef"].apply(_lvef_to_label)
y_valid_lvef = valid_df["lvef"].apply(_lvef_to_label)

## **Train**

In [11]:
# Both tasks use the same DART-boosted configuration, so define it once and
# build an independent estimator per target.
# NOTE(review): the XGBoost docs describe `sampling_method="gradient_based"` as
# GPU-only and tied to `subsample`; confirm it has the intended effect here.
dart_params = dict(
    booster="dart",
    tree_method="hist",
    grow_policy="lossguide",
    sample_type="weighted",
    sampling_method="gradient_based",
    normalize_type="forest",
    rate_drop=0.3,
    random_state=SEED,
)

scar_model = XGBClassifier(**dart_params)
lvef_model = XGBClassifier(**dart_params)

In [12]:
# Fit each classifier on the pooled training rows, then report its mean
# accuracy on the pooled validation rows.
scar_model.fit(x_train, y_train_scar)
scar_accuracy = scar_model.score(x_valid, y_valid_scar)
print("Scar accuracy:", scar_accuracy)

lvef_model.fit(x_train, y_train_lvef)
lvef_accuracy = lvef_model.score(x_valid, y_valid_lvef)
print("LVEF accuracy:", lvef_accuracy)

Scar accuracy: 0.7411359724612737
LVEF accuracy: 0.832013769363167


In [15]:
# ROC AUC is computed from the second column of `predict_proba`
# (presumably the positive-class probability — confirm label encoding).
scar_scores = scar_model.predict_proba(x_valid)[:, 1]
print("Scar AUC:", roc_auc_score(y_valid_scar, scar_scores))

lvef_scores = lvef_model.predict_proba(x_valid)[:, 1]
print("LVEF AUC:", roc_auc_score(y_valid_lvef, lvef_scores))

Scar AUC: 0.6706836384439359
LVEF AUC: 0.6200136445509825
