In [1]:
import os
import sys
import torch
import numpy as np
import pandas as pd

sys.path.append("..")
from mtecg.utils import load_ecg_dataframe

SEED = 42
np.random.seed(SEED)

c:\Anaconda3\envs\ecg\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Anaconda3\envs\ecg\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [2]:
train_dev_csv_path = "../../ECG_EF_Clin_train_dev_new.csv"
new_test_csv_path = "../../ECG_EF_Clin_test_new_nocut_noimpute.csv"
imputer_dir = "../trained_models/multi-task-clinical/resnet34d_384_LVEF50_birnn_dim512"

train_dev_image_dir = "../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_images_new/"
new_test_image_dir = "../../ecg/ecg-cnn-local/siriraj_data/ECG_MRI_test_images_new/"

In [3]:
# Old test set.
train_dev_df = load_ecg_dataframe(
    train_dev_csv_path,
    train_dev_image_dir,
    imputer_dir=imputer_dir,
    do_split=True,
    return_lvef_40_column=True,
)
# New test set. No need to impute.
new_test_df = load_ecg_dataframe(
    new_test_csv_path,
    new_test_image_dir,
    # imputer_dir=imputer_dir,
    do_split=False,
    return_lvef_40_column=True,
)

In [4]:
scar_type_excel_path = "../../AI_ECG_CAD_scar_type_221227.xlsx"
scar_type_df = pd.read_excel(scar_type_excel_path)

# select cols from scar
scar_type_df = scar_type_df[["File_Name","Month","Subendocardial_scar","Transmural_scar","Subendocardial_scar_or_Transmural_scar"]]
# Lowercase column names for consistency.
scar_type_df.columns = map(str.lower, scar_type_df.columns)

In [5]:
# merge scar type onto train dev & test
train_dev_df = pd.merge(train_dev_df, scar_type_df, on=["file_name", "month"], how='left')
new_test_df = pd.merge(new_test_df, scar_type_df, on=["file_name", "month"], how='left')

# remove 2 corrupted PDF
# 2010/10/2010_401658221.pdf
# 2016/6/2016_527006041.pdf
train_dev_df = train_dev_df.drop(train_dev_df[train_dev_df['file_name'].isin(["2010_401658221", "2016_527006041"])].index)

In [6]:
train_df = train_dev_df[train_dev_df["split"].isin(["old_train", "new_train"]) == 1].reset_index(drop=True)
dev_df = train_dev_df[train_dev_df["split"].isin(["old_valid", "new_valid"]) == 1].reset_index(drop=True)
old_test_df = train_dev_df[train_dev_df["split"] == "old_test"].reset_index(drop=True)

population_to_df_map_dict = {
    "train": train_df,
    "dev": dev_df,
    "old_test": old_test_df,
    "new_test": new_test_df,
}

## **Get stats for the dataset**

In [8]:
def get_n_and_prevalence(dataframe: pd.DataFrame, col: str, inverse: bool = False) -> tuple:
    total_samples = dataframe.shape[0]
    target_columns = [col]
    target_class_index = 0 if inverse else 1

    target_dataframe = dataframe[target_columns].copy()
    n = target_dataframe.value_counts()[target_class_index]
    prevalence = n/total_samples * 100

    n = round(n, 3)
    prevalence = round(prevalence, 3)
    return n, prevalence

# result in % prevalence 
def get_baseline_stats(
    dataframe: pd.DataFrame,
    lvef_col: str = "lvef",
    lvef_40_col: str = "lvef_40",
    scar_col: str = "scar_cad",
    age_col: str = "age",
    female_gender_col: str = "female_gender",
    smoke_col: str = "smoke",
    dm_col: str = "dm",
    ht_col: str = "ht",
    dlp_col: str = "dlp",
    SubS_col: str = "subendocardial_scar",
    TranS_col: str = "transmural_scar",
    ):
    n_samples = dataframe.shape[0]
    # Default values.
    lvef_prevalence = 0
    scar_prevalence = 0
    # Calculate prevalence of LVEF and scar if the columns are present.
    if lvef_col in dataframe.columns:
        lvef_n, lvef_prevalence = get_n_and_prevalence(dataframe, lvef_col)
    if lvef_40_col in dataframe.columns:
        lvef_40_n, lvef_40_prevalence = get_n_and_prevalence(dataframe, lvef_40_col)
    if scar_col in dataframe.columns:
        scar_n, scar_prevalence = get_n_and_prevalence(dataframe, scar_col)

    # Calculate baseline statistics.
    mean_age = dataframe[age_col].mean()
    std_age = dataframe[age_col].std()

    # Scale to back to original values.
    mean_age = round(mean_age * 100, 3) 
    std_age = round(std_age * 100, 3)

    male_n, male_percent = get_n_and_prevalence(dataframe, female_gender_col, inverse=True)
    smoke_n, smoke_percent = get_n_and_prevalence(dataframe, smoke_col)
    ht_n, ht_percent = get_n_and_prevalence(dataframe, ht_col)
    dm_n, dm_percent = get_n_and_prevalence(dataframe, dm_col)
    dlp_n, dlp_percent = get_n_and_prevalence(dataframe, dlp_col)
    SubEn_n, SubEn_prevalence = get_n_and_prevalence(dataframe, SubS_col)
    TranMu_n, TranMu_prevalence = get_n_and_prevalence(dataframe, TranS_col)

    SubEn_or_TranMu_n = dataframe[(dataframe[SubS_col] == 1) & (dataframe[TranS_col] == 1)].shape[0]
    SubEn_or_TranMu_prevalence = SubEn_or_TranMu_n/n_samples * 100
    SubEn_or_TranMu_prevalence = round(SubEn_or_TranMu_prevalence, 3)

    baseline_stat_tuple_dict = {
        "age": [f"{mean_age} +/- {std_age}"],
        "male": [f"{male_n} ({male_percent})"],
        "smoke": [f"{smoke_n} ({smoke_percent})"],
        "ht": [f"{ht_n} ({ht_percent})"],
        "dm": [f"{dm_n} ({dm_percent})"],
        "dlp": [f"{dlp_n} ({dlp_percent})"],
        "scar": [f"{scar_n} ({scar_prevalence})"],
        "lvef": [f"{lvef_n} ({lvef_prevalence})"],
        "lvef_40": [f"{lvef_40_n} ({lvef_40_prevalence})"],
        "SubEn": [f"{SubEn_n} ({SubEn_prevalence})"],
        "TranMu": [f"{TranMu_n} ({TranMu_prevalence})"],
        "SubEn_or_TranMu": [f"{SubEn_or_TranMu_n} ({SubEn_or_TranMu_prevalence})"],
    }

    baseline_stat_dataframe =  pd.DataFrame(baseline_stat_tuple_dict)
    return baseline_stat_dataframe

In [9]:
statistic_dataframe_list = []
for population_name, dataframe in population_to_df_map_dict.items():
    baseline_stat_dataframe = get_baseline_stats(dataframe)
    baseline_stat_dataframe.index = [population_name]
    statistic_dataframe_list.append(baseline_stat_dataframe)

all_population_df = pd.concat(population_to_df_map_dict.values(), axis=0)
population_stat_df = get_baseline_stats(all_population_df)
population_stat_df.index = ["population"]
statistic_dataframe_list.append(population_stat_df)

In [10]:
pd.concat(statistic_dataframe_list, axis=0).to_csv("../resources/statistics/population_statistics.csv", index=True)

In [11]:
pd.concat(statistic_dataframe_list, axis=0)

Unnamed: 0,age,male,smoke,ht,dm,dlp,scar,lvef,lvef_40,SubEn,TranMu,SubEn_or_TranMu
train,73.205 +/- 13.825,4822 (51.336),1584 (16.864),7063 (75.194),3457 (36.804),6655 (70.851),2655 (28.266),1803 (19.195),1197 (12.744),1978 (21.058),1693 (18.024),1016 (10.817)
dev,70.593 +/- 13.932,1396 (48.055),280 (9.639),1734 (59.69),853 (29.363),1590 (54.733),720 (24.785),489 (16.833),316 (10.878),492 (16.936),460 (15.835),232 (7.986)
old_test,71.592 +/- 14.013,531 (50.813),157 (15.024),815 (77.99),401 (38.373),766 (73.301),277 (26.507),192 (18.373),122 (11.675),188 (17.99),189 (18.086),100 (9.569)
new_test,69.961 +/- 14.494,751 (50.641),47 (3.169),641 (43.223),331 (22.32),528 (35.604),401 (27.04),263 (17.734),171 (11.531),241 (16.251),294 (19.825),134 (9.036)
population,72.255 +/- 13.986,7500 (50.587),2068 (13.948),10253 (69.156),5042 (34.008),9539 (64.34),4053 (27.337),2747 (18.528),1806 (12.181),2899 (19.553),2636 (17.78),1482 (9.996)
