## **Setup**

In [1]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


In [2]:
import os
import sys
import os.path as op
import numpy as np
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

sys.path.append("..")
from mtecg.utils import load_ecg_dataframe, categorize_lvef, find_best_thresholds, apply_thresholds


SEED = 42
np.random.seed(SEED)

c:\Anaconda3\envs\ecg\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Anaconda3\envs\ecg\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [4]:
lvef_threshold = 50
clinical_feature_columns = ["female_gender", "age", "smoke", "dlp", "dm", "ht"]

save_dir = "../scripts/mtecg/xgb"
os.makedirs(save_dir, exist_ok=True)

## **Prepare the data**

In [5]:
image_dir = "../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_images_new/"
csv_path = "../datasets/all_ECG_cleared_duplicate_may23_final.csv"

df = load_ecg_dataframe(csv_path, image_dir, drop_impute=False, do_split=True)
print(f"Number of images: {len(df)}")
print(f"Unique splits: {df['split'].unique()}")
df.head(5)

Number of images: 12788
Unique splits: ['old_train' 'old_valid' 'old_test' 'new_train' 'new_valid']


Unnamed: 0,run_num,file_name,lvef,scar_cad,hcm,mri_date,month,year,cut,edit_filename,...,ua,chest pain,dyspnea,subs,trans,boths,split,conflict,new_cut,path
0,1,2009_420521391,0,0,0,2552-08-01 00:00:00,8,2009,,,...,0,1,0,,,,old_train,,0,../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_i...
1,2,2009_472422791,0,0,0,2552-08-01 00:00:00,8,2009,,,...,0,1,0,,,,old_train,,0,../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_i...
2,3,2009_451191451,0,0,0,2552-08-01 00:00:00,8,2009,,,...,0,1,1,,,,old_train,,0,../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_i...
3,4,2009_512029431,1,1,0,2552-08-01 00:00:00,8,2009,,,...,0,0,1,,1.0,1.0,old_train,,0,../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_i...
4,5,2009_461543281,1,1,0,2552-08-04 00:00:00,8,2009,,,...,0,1,1,,1.0,1.0,old_train,,0,../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_i...


In [13]:
# Set ["dm", "ht", "smoke", "dlp"] to np.nan if "impute" is True.
df.loc[df["impute"] == True, ["dm", "ht", "smoke", "dlp"]] = np.nan

In [14]:
# Combine old train and new train.
train_df = df[df.split.isin(["old_train", "new_train"])].reset_index()
# Combine old valid and new valid.
valid_df = df[df.split.isin(["old_valid", "new_valid"])].reset_index()

train_df.shape, valid_df.shape

((9393, 31), (2500, 31))

In [15]:
non_impute_train_df = train_df[train_df["impute"] == False]

non_impute_train_df[["dm", "ht", "smoke", "dlp"]].sum() / len(non_impute_train_df)

dm       0.374580
ht       0.747920
smoke    0.164985
dlp      0.706143
dtype: float64

## Impute Values

In [16]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import joblib
from sklearn.linear_model import LinearRegression

imputer = IterativeImputer(
    missing_values=np.nan,
    max_iter=10,
    sample_posterior=True,
    random_state=42
    )

clinical_feature_columns = ["age", "female_gender", "dm", "ht", "smoke", "dlp"]

# Fit the imputer on the train set.
imputer.fit(train_df[clinical_feature_columns])

# Save the imputer.
imputer_path = op.join(save_dir, "imputer.joblib")
joblib.dump(imputer, imputer_path)

['../scripts/mtecg/xgb\\imputer.joblib']

In [17]:
# Impute missing values in the train set.
train_df[clinical_feature_columns] = imputer.transform(train_df[clinical_feature_columns])

# Impute missing values in the valid set.
valid_df[clinical_feature_columns] = imputer.transform(valid_df[clinical_feature_columns])

In [18]:
# Find the best thresholds for imputing missing values from the train set.
best_threshold_dict = find_best_thresholds(train_df)

joblib.dump(best_threshold_dict, op.join(save_dir, "imputer_threshold_dict.joblib"))

# Apply the best thresholds to the train set and the valid set.
train_df = apply_thresholds(train_df, best_threshold_dict)
valid_df = apply_thresholds(valid_df, best_threshold_dict)

best_threshold_dict

{'dm': [0.54, 0.0033188878449101344],
 'ht': [0.42, 0.001278732110516545],
 'smoke': [0.52, 0.003125978730067952],
 'dlp': [0.44, 0.0027248412471461148]}

In [19]:
train_df[["dm", "ht", "smoke", "dlp"]].sum() / len(train_df)

dm       0.373257
ht       0.748430
smoke    0.163739
dlp      0.707229
dtype: float64

In [20]:
x_train = train_df[clinical_feature_columns]
x_valid = valid_df[clinical_feature_columns]

y_train_scar = train_df["scar_cad"]
y_valid_scar = valid_df["scar_cad"]
y_train_lvef = train_df["lvef"]
y_valid_lvef = valid_df["lvef"]
# y_train_lvef = train_df["lvef"].apply(lambda lvef: categorize_lvef(lvef, lvef_threshold))
# y_valid_lvef = valid_df["lvef"].apply(lambda lvef: categorize_lvef(lvef, lvef_threshold))

## **Train**

In [21]:
scar_model = XGBClassifier(
    booster="dart",
    tree_method="hist",
    grow_policy="lossguide",
    sample_type="weighted",
    sampling_method="gradient_based",
    normalize_type="forest",
    rate_drop=0.3,
    random_state=SEED,
)

lvef_model = XGBClassifier(
    booster="dart",
    tree_method="hist",
    grow_policy="lossguide",
    sample_type="weighted",
    sampling_method="gradient_based",
    normalize_type="forest",
    rate_drop=0.3,
    random_state=SEED,
)

In [22]:
scar_model.fit(x_train, y_train_scar)
print("Scar accuracy:", scar_model.score(x_valid, y_valid_scar))

lvef_model.fit(x_train, y_train_lvef)
print("LVEF accuracy:", lvef_model.score(x_valid, y_valid_lvef))

Scar accuracy: 0.7564
LVEF accuracy: 0.8276


In [23]:
print("Scar AUC:", roc_auc_score(y_valid_scar, scar_model.predict_proba(x_valid)[:, 1]))
print("LVEF AUC:", roc_auc_score(y_valid_lvef, lvef_model.predict_proba(x_valid)[:, 1]))

Scar AUC: 0.6571506753413132
LVEF AUC: 0.6176889365767088


In [24]:
import joblib
scar_model_save_dir = op.join(save_dir, "scar_model")
lvef_model_save_dir = op.join(save_dir, "lvef_model")
os.makedirs(scar_model_save_dir, exist_ok=True)
os.makedirs(lvef_model_save_dir, exist_ok=True)

joblib.dump(scar_model, op.join(scar_model_save_dir, "model.joblib"))
joblib.dump(lvef_model, op.join(lvef_model_save_dir, "model.joblib"))

['../scripts/mtecg/xgb\\lvef_model\\model.joblib']

In [27]:
# A function to get XGBoost predictions.
from mtecg.evaluation import calculate_metrics_per_task
import pandas as pd
from typing import List

def evaluate_xgb_from_dataframe(
    dataframe: pd.DataFrame,
    model: XGBClassifier,
    feature_columns: List[str],
    label_column_name: str = "scar_cad",
    task="scar",
    ):
    x = dataframe[feature_columns]
    predicted_probability_array = model.predict_proba(x)[:, 1]
    prediction_array = model.predict(x)

    prediction_dataframe = pd.DataFrame(
        {
            f"{task}_label": dataframe[label_column_name].values,
            f"{task}_prediction": prediction_array,
            f"{task}_probability": predicted_probability_array,
        }
    )
    metrics_dataframe = calculate_metrics_per_task(prediction_dataframe, task)
    return prediction_dataframe, metrics_dataframe

In [28]:
scar_prediction_df, scar_metric_df = evaluate_xgb_from_dataframe(valid_df, scar_model, clinical_feature_columns, task="scar")

In [29]:
lvef_prediction_df, lvef_metric_df = evaluate_xgb_from_dataframe(valid_df, lvef_model, clinical_feature_columns, task="lvef")

In [32]:
lvef_metric_df

Unnamed: 0,0
Accuracy,0.7624
Sensitivity,0.001698
Specificity,0.99686
F1,0.66209
AUC,0.630827
FPR,0.00314
FNR,0.998302


In [30]:
calculate_metrics_per_task

<function mtecg.evaluation.calculate_metrics_per_task(result_dataframe, task: str, is_control_population: bool = False, average: str = 'weighted')>