In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import os
import copy
import time

from matplotlib import pyplot as plt
import matplotlib as mpl


import seaborn as sns
sns.set()

from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneGroupOut, GridSearchCV
from sklearn.preprocessing import StandardScaler

from utils import *


def Get_feature_df_list_for_all_ECG_type(data_folder, feature_folder):
    """Collect the per-lead feature tables of every recording listed in the download reports.

    For each ECG type ("SINUS", "JET") the report
    ``<data_folder>/<ECG_type>/download_report_<ECG_type>.csv`` is read. For every report
    row and every lead, the file
    ``patient-<reference>_event-<interval>_block-1_<lead>_<ECG_type>.csv`` is looked up in
    ``feature_folder``; files that do not exist are skipped.

    Args:
        data_folder: Folder with one sub-folder per ECG type holding its download report.
        feature_folder: Folder with the per-recording feature CSV files.

    Returns:
        list of the non-empty feature DataFrames, each with ``r_t`` converted to datetime
        (seconds since epoch) and an added ``r_ID_abs`` column.
    """
    start_time = time.time()
    lead_names = ("ECG-Lead-I", "ECG-Lead-II", "ECG-Lead-III", "ECG-Lead-IV")
    collected = []
    for ECG_type in ("SINUS", "JET"):
        print(f"\nLoading {ECG_type} data...")
        # The download report holds one recording interval per row.
        report_df = pd.read_csv(os.path.join(data_folder, ECG_type, f"download_report_{ECG_type}.csv"))
        for position, (_, record) in enumerate(report_df.iterrows()):
            # Progress message every 60 report rows.
            if position % 60 == 0:
                print(f"{position} / {len(report_df)}. Time: {time.time() - start_time:.4f}", end=", ")

            interval_ID = record["Interval ID#"]
            patient_ID = record[" reference"]  # the column name carries a leading space

            for lead in lead_names:
                candidate_path = os.path.join(
                    feature_folder, f"patient-{patient_ID}_event-{interval_ID}_block-1_{lead}_{ECG_type}.csv")
                if not os.path.exists(candidate_path):
                    continue
                table = pd.read_csv(candidate_path)
                table["r_t"] = pd.to_datetime(table["r_t"], unit="s")
                table["r_ID_abs"] = table["start_ID"] + table["r_ID"] - 1
                if len(table) > 0:
                    collected.append(table)

    return collected


def Get_roc_curve_df_from_model_and_df(model, feature_df, target_fpr=0.1, target_threshold=None):
    """Compute the ROC curve of ``model`` on ``feature_df`` and pick one operating point.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``feature_names_in_``.
        feature_df: DataFrame holding the model's feature columns and a ``label`` column.
        target_fpr: Desired false-positive rate; used only when ``target_threshold`` is None.
        target_threshold: If given, the ROC point whose threshold is closest to this value
            is selected instead of the point closest to ``target_fpr``.

    Returns:
        dict with keys ``roc_curve_df`` (columns ``fpr``, ``tpr``, ``thresholds``),
        ``selected_threshold``, ``selected_fpr`` and ``auroc``.
    """
    y_true = feature_df["label"].values
    positive_scores = model.predict_proba(feature_df[model.feature_names_in_])[:, 1]
    auroc = roc_auc_score(y_true, positive_scores, average="weighted")
    fpr, tpr, thresholds = roc_curve(y_true, positive_scores, pos_label=1)
    roc_curve_df = pd.DataFrame({"fpr": fpr, "tpr": tpr, "thresholds": thresholds})

    # Distance of every ROC point to the requested operating point.
    if target_threshold is None:
        distance = np.abs(roc_curve_df["fpr"] - target_fpr)
    else:
        distance = np.abs(roc_curve_df["thresholds"] - target_threshold)
    closest_idx = np.argsort(distance)[0]

    return {
        "roc_curve_df": roc_curve_df,
        "selected_threshold": roc_curve_df.loc[closest_idx, "thresholds"],
        "selected_fpr": roc_curve_df.loc[closest_idx, "fpr"],
        "auroc": auroc,
    }


def Draw_roc_curve(model, df_train, df_test, target_fpr=0.1, suptitle=""):
    """Plot training and test ROC curves side by side and mark the selected operating point.

    The threshold is chosen on the training data (closest to ``target_fpr``); the test panel
    marks the ROC point whose threshold is closest to that training threshold.

    Args:
        model: Fitted classifier accepted by ``Get_roc_curve_df_from_model_and_df``.
        df_train: Training DataFrame (features + ``label``).
        df_test: Test DataFrame (features + ``label``).
        target_fpr: False-positive rate targeted on the training set.
        suptitle: Prefix of the figure title.

    Returns:
        Tuple ``(train_results, test_results)`` of the dicts produced by
        ``Get_roc_curve_df_from_model_and_df``; each ``roc_curve_df`` gains a ``Dataset`` column.
    """
    sns.set(font_scale=1.5)
    train_results = Get_roc_curve_df_from_model_and_df(model, df_train, target_fpr=target_fpr)
    test_results = Get_roc_curve_df_from_model_and_df(
        model, df_test, target_threshold=train_results["selected_threshold"])
    train_results["roc_curve_df"]["Dataset"] = "Training"
    test_results["roc_curve_df"]["Dataset"] = "Test"
    stacked_curves = pd.concat([train_results["roc_curve_df"], test_results["roc_curve_df"]], axis=0)

    g = sns.relplot(data=stacked_curves.query("thresholds <= 1"), x="fpr", y="tpr", hue="thresholds", col="Dataset",
                    height=6, kind="scatter", palette="magma", linewidth=0.01)
    # Column 0 is the training panel, column 1 the test panel.
    for column, results in enumerate((train_results, test_results)):
        panel = g.axes[0][column]
        panel.axvline(results["selected_fpr"], linestyle=":")
        panel.text(results["selected_fpr"], 0.3, f"Selected threshold = {results['selected_threshold']:.3f}")
        panel.set_title(f"{panel.get_title()}\nAUROC = {results['auroc']:.4f}")
    g.fig.suptitle(f"{suptitle} - ROC curve\n(target train FPR = {target_fpr})")
    plt.show()
    os.makedirs("Figures", exist_ok=True)
    # plt.savefig(os.path.join("Figures", "ROC_curve.png"), dpi=200)
    return train_results, test_results


""" A plot showing decision boundary """


def Get_contour_y_pred(model, feature_df, x1_var, x2_var, threshold=0.5):
    """Evaluate the thresholded model prediction on a 301 x 301 grid over two features.

    The grid spans the observed min..max range of ``x1_var`` and ``x2_var`` in ``feature_df``.

    Args:
        model: sklearn ``LogisticRegression`` (scored with ``predict_proba``) or another model
            whose ``predict`` returns an object supporting ``>`` and ``.values``.
        feature_df: DataFrame providing the value range of both features.
        x1_var: Name of the feature on the x axis.
        x2_var: Name of the feature on the y axis.
        threshold: Scores strictly above this value are predicted as class 1.

    Returns:
        dict with ``contour_x_linspace``, ``contour_y_linspace`` and ``contour_y_pred``
        (0/1 predictions shaped like the mesh grid).
    """
    grid_points = 301
    contour_x_linspace = np.linspace(feature_df[x1_var].min(), feature_df[x1_var].max(), grid_points)
    contour_y_linspace = np.linspace(feature_df[x2_var].min(), feature_df[x2_var].max(), grid_points)
    grid_x, grid_y = np.meshgrid(contour_x_linspace, contour_y_linspace)

    # One row per grid node, columns named like the model's two input features.
    grid_df = pd.DataFrame(np.column_stack([grid_x.ravel(), grid_y.ravel()]), columns=[x1_var, x2_var])

    if isinstance(model, LogisticRegression):
        positive_scores = model.predict_proba(grid_df)[:, 1]
        flat_pred = (positive_scores > threshold).astype(int)
    else:
        flat_pred = (model.predict(grid_df) > threshold).astype(int).values

    return {
        "contour_x_linspace": contour_x_linspace,
        "contour_y_linspace": contour_y_linspace,
        "contour_y_pred": flat_pred.reshape(grid_x.shape),
    }


def Draw_decision_boundary(model, df_train, df_test, colored_by_patient_ID=False, palette="bright", target_fpr=0.1):
    """Scatter the training and test samples over the model's decision boundary.

    The decision threshold is the one selected on the training data for ``target_fpr``;
    the boundary drawn in both panels is computed over the training feature range.

    Args:
        model: sklearn ``LogisticRegression`` or another model exposing ``params`` and
            ``predict`` (presumably a statsmodels-style result object; verify against callers).
        df_train: Training DataFrame with the two model features and ``label``.
        df_test: Test DataFrame with the same columns.
        colored_by_patient_ID: If True, colour points by ``patient_ID`` and skip the contour.
        palette: Palette used when colouring by patient.
        target_fpr: Training false-positive rate used to select the threshold.

    Returns:
        Tuple ``(y_pred_train, y_pred_test, contour_result_dict_train)``.
    """
    uses_predict_proba = isinstance(model, LogisticRegression)
    threshold_train = Get_roc_curve_df_from_model_and_df(model, df_train, target_fpr=target_fpr)["selected_threshold"]
    if uses_predict_proba:
        x1_var, x2_var = model.feature_names_in_[0], model.feature_names_in_[1]
    else:
        # Entry 0 of params is skipped (presumably the intercept; verify against the model type).
        param_names = list(dict(model.params).keys())
        x1_var, x2_var = param_names[1], param_names[2]

    # Predictions on the contour grid.
    contour_result_dict_train = Get_contour_y_pred(model, df_train, x1_var, x2_var, threshold=threshold_train)

    # Predictions on the input data.
    def predict_labels(frame):
        if uses_predict_proba:
            return (model.predict_proba(frame[model.feature_names_in_])[:, 1] > threshold_train).astype(int)
        return (model.predict(frame) > threshold_train).astype(int).values

    y_pred_train = predict_labels(df_train)
    y_pred_test = predict_labels(df_test)

    # Plotting: left panel = training data, right panel = test data.
    sns.set(font_scale=2)
    levels = mpl.ticker.MaxNLocator(nbins=2).tick_values(0, 1)
    fig, axes = plt.subplots(1, 2, figsize=(24, 8))

    panels = ((axes[0], df_train, "Training data"), (axes[1], df_test, "Test data"))
    for panel_ax, frame, panel_title in panels:
        feature_df_renamed = frame.rename(columns={"label": "True label"})
        if colored_by_patient_ID:
            feature_df_renamed["patient_ID"] = feature_df_renamed["patient_ID"].astype(int)
            sns.scatterplot(data=feature_df_renamed, x=x1_var, y=x2_var, alpha=0.04, hue="patient_ID", palette=palette, ax=panel_ax)
        else:
            contour_handle = panel_ax.contour(
                contour_result_dict_train["contour_x_linspace"],
                contour_result_dict_train["contour_y_linspace"],
                contour_result_dict_train["contour_y_pred"],
                levels=levels, cmap="bwr")
            cbar = plt.colorbar(contour_handle, ax=panel_ax)
            cbar.ax.set_title("Prediction")
            sns.scatterplot(data=feature_df_renamed, x=x1_var, y=x2_var, alpha=0.01, hue="True label", palette=[(0, 0, 0.8), (0.8, 0, 0)], ax=panel_ax)
        panel_ax.set_title(panel_title)

    # Fixed view windows, one pair per model family.
    if uses_predict_proba:
        xlim, ylim = [-1, 4.5], [-1, 9]
    else:
        xlim, ylim = [-0.005, 0.12], [-0.02, 1.8]
    for panel_ax in (axes[0], axes[1]):
        panel_ax.set_xlim(xlim)
        panel_ax.set_ylim(ylim)

    plt.show()
    os.makedirs("Figures", exist_ok=True)
    # plt.savefig(os.path.join("Figures", "decision_boundary.png"), dpi=200)
    return y_pred_train, y_pred_test, contour_result_dict_train

def Get_confusion_matrix(model, data_df, threshold=0.5):
    """Count true-versus-predicted labels at a given score threshold.

    Args:
        model: A fitted sklearn ``LogisticRegression`` (scored with ``predict_proba`` on
            ``model.feature_names_in_``), or any other model whose ``predict(data_df)``
            returns an object supporting ``>`` and ``.values``.
        data_df: DataFrame with the model inputs and a binary 0/1 ``label`` column.
        threshold: Scores strictly greater than this value are predicted as class 1.

    Returns:
        2 x 2 float array ``m`` where ``m[true_label, predicted_label]`` is a count.
    """
    if isinstance(model, LogisticRegression):
        y_pred = (model.predict_proba(data_df[model.feature_names_in_])[:, 1] > threshold).astype(int)
    else:
        y_pred = (model.predict(data_df) > threshold).astype(int).values
    # Pin the bin range to the 0/1 label domain. Without it the bin edges are derived from
    # the data, so when all labels (or all predictions) are one class the single value is
    # binned around itself and every count lands in the wrong row/column.
    confusion_mat = np.histogram2d(data_df["label"], y_pred, bins=2, range=[[0, 1], [0, 1]])[0]
    return confusion_mat


def Visualize_confusion_matrix(confusion_mat, ax=None, title=None):
    """Draw a 2 x 2 confusion matrix as an annotated colour image.

    Args:
        confusion_mat: 2 x 2 array of counts with rows = true label and columns = predicted
            label (the layout returned by ``Get_confusion_matrix``); index 0 is SINUS, 1 is JET.
        ax: Optional matplotlib Axes to draw on. When None a new figure is created.
        title: Figure title; defaults to "Confusion matrix".
    """
    if title is None:
        title = "Confusion matrix"
    class_names = ["SINUS", "JET"]
    # Cell names in the same [true, predicted] layout as confusion_mat.
    confusion_name_mat = np.array([["TN", "FP"], ["FN", "TP"]])

    confusion_mat = np.asarray(confusion_mat)
    confusion_mat_percent = 100 * confusion_mat / np.sum(confusion_mat)
    sns.set(font_scale=1.5)
    if ax is None:
        image = plt.matshow(confusion_mat, cmap="summer")
        ax = plt.gca()
    else:
        image = ax.matshow(confusion_mat, cmap="summer")
    ax.figure.colorbar(image, ax=ax)
    for (row, col), count in np.ndenumerate(confusion_mat):
        # text() takes (x, y): the column index is the x position and the row index the
        # y position, so each count is written onto the image cell it belongs to.
        ax.text(col, row, f"{count:.0f}\n({confusion_mat_percent[row, col]:.0f}%, {confusion_name_mat[row, col]})",
                ha="center", va="center", color="#000000")
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    ax.grid(False)
    # Place the ticks explicitly so the class names do not depend on automatic tick positions.
    ax.set_xticks([0, 1])
    ax.set_xticklabels(class_names)
    ax.set_yticks([0, 1])
    ax.set_yticklabels(class_names)
    ax.figure.suptitle(title)

    plt.show()
    os.makedirs("Figures", exist_ok=True)
    # plt.savefig(os.path.join("Figures", "Confusion_matrix.png"), dpi=200)


def Get_download_report(data_folder):
    """Load the JET and SINUS download reports and stack them into one labelled table.

    Args:
        data_folder: Folder containing ``JET/download_report_JET.csv`` and
            ``SINUS/download_report_SINUS.csv``.

    Returns:
        DataFrame with the JET rows followed by the SINUS rows, renamed ID columns
        (``interval_ID``, ``patient_ID``), a ``True_label`` column (1 = JET, 0 = SINUS),
        ``block_ID`` set to 1, and the start/stop time columns passed through
        ``Process_time_string``.
    """
    column_renames = {"Interval ID#": "interval_ID", " reference": "patient_ID"}
    labelled_reports = []
    for ecg_type, class_label in (("JET", 1), ("SINUS", 0)):
        report = pd.read_csv(os.path.join(data_folder, ecg_type, f"download_report_{ecg_type}.csv"))
        report = report.rename(columns=column_renames)
        report["label"] = class_label
        labelled_reports.append(report)

    download_report_all = pd.concat(labelled_reports, axis=0).rename(columns={"label": "True_label"})
    download_report_all["block_ID"] = 1
    # The time column names carry a leading space in the report files.
    for time_column in (" start_time", " stop_time"):
        download_report_all[time_column] = download_report_all[time_column].apply(Process_time_string)
    print(download_report_all.columns)
    return download_report_all

In [None]:
timer = Timer()
""" Load data """
# Feature-folder snapshots that were tried; the last entry is the one in use.
save_location_candidates = ["Data_20220201", "Data_20220105", "Data_20211201", "Data_20211109", "Data_20211028"]
save_location = save_location_candidates[-1]
ecg_data_folder = "JET-Detection"
download_report_all = Get_download_report(ecg_data_folder)

# Model inputs and the columns that identify a sample.
feature_name_list = ["pr_int_iqr", "p_prom_med"]
identifier_list = ["cycle_ID", "patient_ID", "interval_ID", "block_ID", "channel_ID", "r_ID_abs", "label", "r_ID_abs_ref"]

feature_df_list_all = Get_feature_df_list_for_all_ECG_type(data_folder=ecg_data_folder, feature_folder=save_location)
feature_df_all = pd.concat(feature_df_list_all, axis=0)

# Left-merge with the selection table, then keep only the rows that found a match
# (rows without a match have a missing "selected" value).
common_heartbeats_df_selected = pd.read_csv("common_heartbeats_df_selected.csv")
feature_df_all_selected = feature_df_all.merge(common_heartbeats_df_selected, how="left")
has_selection_flag = feature_df_all_selected["selected"].notna()
feature_df_all_selected = feature_df_all_selected[has_selection_flag]
timer.Print("Load data completed.")

""" Data preprocessing """
feature_df_all_nona_more_features_processed = feature_df_all_selected.copy()
# Channel 2 rows without a reference R-peak index fall back to their own absolute index.
lead_II_and_not_aligned = (
    (feature_df_all_nona_more_features_processed["channel_ID"] == 2)
    & feature_df_all_nona_more_features_processed["r_ID_abs_ref"].isna()
)
print("lead_II_and_not_aligned.sum()", lead_II_and_not_aligned.sum())
feature_df_all_nona_more_features_processed.loc[lead_II_and_not_aligned, "r_ID_abs_ref"] = (
    feature_df_all_nona_more_features_processed.loc[lead_II_and_not_aligned, "r_ID_abs"]
)

# Keep the identifier and feature columns only, and drop rows with any missing value.
feature_df_all_nona_more_features_processed_nona = (
    feature_df_all_nona_more_features_processed[identifier_list + feature_name_list].dropna()
)
feature_df_all_nona_more_features_processed_nona["r_ID_abs_ref"] = (
    feature_df_all_nona_more_features_processed_nona["r_ID_abs_ref"].astype(int)
)

# Patient-level split. NOTE(review): patient_ID_list_train / patient_ID_list_test are not
# defined in this cell; they must already exist when it runs.
feature_df_train = feature_df_all_nona_more_features_processed_nona.query(f"patient_ID in {patient_ID_list_train}")
feature_df_test = feature_df_all_nona_more_features_processed_nona.query(f"patient_ID in {patient_ID_list_test}")
print(f"Train: {feature_df_train.shape}, Test: {feature_df_test.shape}, Total: {feature_df_all_nona_more_features_processed_nona.shape}")

# Identifier and feature parts, re-indexed from 0 so they can be concatenated column-wise below.
feature_df_train_identifier = feature_df_train[identifier_list].reset_index(drop=True)
feature_df_test_identifier = feature_df_test[identifier_list].reset_index(drop=True)
feature_df_train_features = feature_df_train[feature_name_list].reset_index(drop=True)
feature_df_test_features = feature_df_test[feature_name_list].reset_index(drop=True)

# The scaler is fitted on the training features only and then applied to both sets.
scaler = StandardScaler().fit(feature_df_train_features)
feature_df_train_features_normalized = pd.DataFrame(scaler.transform(feature_df_train_features), columns=feature_name_list)
feature_df_test_features_normalized = pd.DataFrame(scaler.transform(feature_df_test_features), columns=feature_name_list)
feature_df_train_labels = feature_df_train_identifier["label"]
feature_df_test_labels = feature_df_test_identifier["label"]

feature_df_train_normalized = pd.concat([feature_df_train_identifier, feature_df_train_features_normalized], axis=1)
feature_df_test_normalized = pd.concat([feature_df_test_identifier, feature_df_test_features_normalized], axis=1)
timer.Print("Data preprocessing completed.")

""" Model fitting """
# Fits one L1-regularised logistic regression per ECG channel, reports its scores,
# draws the diagnostic plots and writes the misclassified test samples to CSV.
clf_dict = {}
st = time.time()
channel_ID_list = [2, 4]
default_channel_ID = 2  # only this channel also gets ECG plots of its misclassified samples
path_error = os.path.join("Errors", f"{save_location}_errors")
columns_with_label = feature_name_list + ["label"]
# sklearn's C is the inverse of the regularization strength, so the grid is written as 1 / strength.
regularization_strength_list = [1e-8, 0.03, 0.08, 0.3, 0.8, 3, 8, 30, 80, 300, 800, 3000, 8000, 30000, 80000, 300000, 800000]
tuned_parameters = [{'C': [1 / strength for strength in regularization_strength_list]}]
score = "accuracy"
for channel_ID in channel_ID_list:
    # --- Preparing data ---
    feature_df_train_normalized_lead = feature_df_train_normalized.query(f"channel_ID == {channel_ID}")
    feature_df_test_normalized_lead = feature_df_test_normalized.query(f"channel_ID == {channel_ID}")
    print(f"[Time {time.time() - st:.1f}]", channel_ID, feature_df_train_normalized_lead.shape, feature_df_test_normalized_lead.shape)

    patient_ID_train = feature_df_train_normalized_lead["patient_ID"]
    X_train = feature_df_train_normalized_lead[feature_name_list]
    y_train = feature_df_train_normalized_lead["label"]
    X_test = feature_df_test_normalized_lead[feature_name_list]
    y_test = feature_df_test_normalized_lead["label"]

    # --- Experiment setup: the CV groups are patient IDs, i.e. one held-out patient per fold ---
    leave_one_group_out_cv = LeaveOneGroupOut()
    print(f"# Tuning hyper-parameters for {score}\n")
    clf = GridSearchCV(
        LogisticRegression(solver="saga", random_state=0, penalty="l1"), tuned_parameters, scoring=score, cv=leave_one_group_out_cv)

    # --- Run ---
    clf.fit(X_train, y_train, groups=patient_ID_train.values)

    print("Best parameters set found on development set:\n")
    # The printed value is 1 / C, i.e. the regularization strength, not sklearn's C itself.
    print(f"Regularization strength (1 / C) = {1 / clf.best_params_['C']:.2f}\n")
    print("Grid scores on development set (using leave-one-patient-out cross validation):\n")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print(f"{mean:.4f} (+/-{std * 3:.4f}) for {1 / params['C']:.2f}")

    print("\nDetailed classification report:\n")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    y_true, y_pred = y_test, clf.predict(X_test)
    y_score = clf.best_estimator_.predict_proba(X_test)
    best_auroc = roc_auc_score(y_true, y_score[:, 1])
    print(f"Best roc_auc_score = {best_auroc:.4f}\n")
    print(classification_report(y_true, y_pred))

    clf_dict[f"lead{channel_ID}"] = copy.deepcopy(clf)
    timer.Print(f"Channel {channel_ID} model fitted.")

    # --- Visualize results ---
    best_model = clf_dict[f"lead{channel_ID}"].best_estimator_
    train_df_lead = feature_df_train_normalized_lead[columns_with_label]
    test_df_lead = feature_df_test_normalized_lead[columns_with_label]
    roc_curve_results_dict_train, roc_curve_results_dict_test = Draw_roc_curve(best_model, train_df_lead, test_df_lead, target_fpr=0.05)
    # Both confusion matrices use the threshold selected on the training ROC curve.
    confusion_mat_train = Get_confusion_matrix(best_model, train_df_lead, roc_curve_results_dict_train['selected_threshold'])
    confusion_mat_test = Get_confusion_matrix(best_model, test_df_lead, roc_curve_results_dict_train['selected_threshold'])
    Visualize_confusion_matrix(confusion_mat_train, title="Confusion matrix - train")
    Visualize_confusion_matrix(confusion_mat_test, title="Confusion matrix - test")

    y_pred_train, y_pred_test, contour_result_dict_train = Draw_decision_boundary(best_model, train_df_lead, test_df_lead, target_fpr=0.05)
    print(f"Lead {channel_ID} model coeffcients:")
    print(best_model.coef_, best_model.intercept_)
    timer.Print(f"Channel {channel_ID} model visualized.")

    # --- Save error files for MATLAB ---
    feature_df_test_with_prediction = feature_df_test.query(f"channel_ID == {channel_ID}").copy()
    feature_df_test_with_prediction = feature_df_test_with_prediction.rename(columns={"label": "True_label"})
    feature_df_test_with_prediction["Pred_label"] = y_pred_test
    feature_df_test_with_pred_incorrect = feature_df_test_with_prediction.query("True_label != Pred_label")
    feature_df_test_with_pred_incorrect_with_start_time = pd.merge(feature_df_test_with_pred_incorrect, download_report_all, on=["interval_ID", "patient_ID", "block_ID", "True_label"], how="left")

    os.makedirs(path_error, exist_ok=True)
    # Seed NumPy's global generator so the row shuffles (sample(frac=1)) below are repeatable.
    np.random.seed(0)
    feature_df_test_with_pred_incorrect_with_start_time.query("True_label == 1").sample(frac=1).to_csv(
        os.path.join(path_error, f"feature_df_test_with_pred_incorrect_with_start_time_FN_Lead{channel_ID}.csv"), index=False)
    feature_df_test_with_pred_incorrect_with_start_time.query("True_label == 0").sample(frac=1).to_csv(
        os.path.join(path_error, f"feature_df_test_with_pred_incorrect_with_start_time_FP_Lead{channel_ID}.csv"), index=False)
    timer.Print(f"Channel {channel_ID} model error csv saved.")

    if channel_ID == default_channel_ID:
        feature_df_all_selected_with_ecg = pd.read_csv(os.path.join("feature_df_all_selected_with_ecg_20220210.csv"))
        feature_with_ecg_df_test = feature_df_all_selected_with_ecg.query(f"patient_ID in {patient_ID_list_test}")
        feature_with_ecg_df_test_lead = feature_with_ecg_df_test.query(f"channel_ID == {default_channel_ID}").rename(columns={"label": "True_label"})
        # The feature columns are dropped from the left table before merging with the ECG table.
        columns_to_drop = ['pr_int_iqr', 'p_prom_med']
        feature_df_test_with_pred_incorrect_with_start_time_with_ecg = pd.merge(
            feature_df_test_with_pred_incorrect_with_start_time.drop(columns=columns_to_drop), feature_with_ecg_df_test_lead,
            on=["interval_ID", "patient_ID", "channel_ID", "r_ID_abs", "True_label"], how="left")
        Plot_ecg_for_incorrect_predictions(feature_df_test_with_pred_incorrect_with_start_time_with_ecg, feature_df_all_selected_with_ecg, path_error=path_error)
        timer.Print(f"Channel {channel_ID} model error ecg saved.")


# Save every fitted grid search in clf_dict to one pickle file.
# Use pickle5 when it is installed; otherwise fall back to the standard-library pickle
# module so the models are still written.
try:
    import pickle5 as model_pickle
except ImportError:
    import pickle as model_pickle
    print("Package pickle5 not installed. Using the standard-library pickle module instead.")

# Saving stays best-effort, but a failure is reported with its real cause instead of
# being mistaken for a missing package.
try:
    os.makedirs("Models", exist_ok=True)
    with open(os.path.join("Models", "L1_logistic_regression.pickle"), "wb") as f:
        model_pickle.dump(clf_dict, f)
except Exception as err:
    print(f"Could not save the fitted models: {type(err).__name__}: {err}")
else:
    timer.Print(f"Models saved for: {', '.join(clf_dict)}.")

In [2]:
from torch.utils.data import Dataset


def Normalize(vec, eps=1e-8):
    """Shift a 1d vector so its minimum is 0, then divide by its maximum (padded by eps)."""
    shifted = vec - np.min(vec)
    # eps keeps the divisor non-zero when all entries are equal.
    scale = np.max(shifted + eps)
    return shifted / scale

def Lower(word):
    """Return the lower-case form of ``word`` (used to compare transform names case-insensitively)."""
    lowered = word.lower()
    return lowered

class ECG_classification_dataset_with_peak_features(Dataset):
    """Dataset of fixed-length ECG frames stored row-wise in a feature table.

    Every row of the input DataFrame holds one frame in the columns ``ecg1`` ... ``ecg300``
    plus a label, a peak location, identifier columns and peak features. ``__getitem__``
    returns the frame rescaled with ``Normalize`` (optionally resampled) and its label.
    """

    def __init__(self, feature_df_all_selected_p_ind_with_ecg, ecg_resampling_length_target=300,
                 peak_loc_name="p_ind_resampled", label_name="label", short_identifier_list=None,
                 peak_feature_name_list=None, shift_signal=False, shift_amount=None, normalize_signal=False,
                 transforms=None, dataset_name="tch-ecg-jet-p40"):
        """
        Args:
            feature_df_all_selected_p_ind_with_ecg: DataFrame with the ``ecg1``..``ecg300``
                columns and the label / peak / identifier / feature columns named below.
            ecg_resampling_length_target: Number of samples per frame returned by ``__getitem__``.
            peak_loc_name: Column holding the peak location of each frame.
            label_name: Column holding the class label.
            short_identifier_list: Identifier columns to keep; a default list is used when None.
            peak_feature_name_list: Peak-feature columns; defaults to ``p_prom_med``, ``pr_int_iqr``.
            shift_signal: If True, subtract ``shift_amount`` from every ECG sample.
            shift_amount: Offset removed when ``shift_signal`` is True (None means 0).
            normalize_signal: Normalize each individual signal to 0 - 1 range.
            transforms: Name or list of names of perturbations ('Gaussian', 'FlipAlongY',
                'FlipAlongX'); matched case-insensitively.
            dataset_name: Selects the Gaussian noise scale in ``obtain_perturbed_frame``.
        """
        print(f"ecg_resampling_length_target: {ecg_resampling_length_target}")
        if short_identifier_list is None:
            short_identifier_list = ['patient_ID', 'interval_ID', 'block_ID', 'channel_ID', 'r_ID_abs', 'label',
                                     'r_ID_abs_ref']
        if peak_feature_name_list is None:
            peak_feature_name_list = ["p_prom_med", "pr_int_iqr"]

        if transforms is None:
            self.transforms = []
        else:
            if isinstance(transforms, str):
                transforms = [transforms]
            self.transforms = [Lower(ele) for ele in transforms]

        self.dataset_name = dataset_name
        self.short_identifier_list = short_identifier_list
        self.peak_feature_name_list = peak_feature_name_list
        self.label_name = label_name
        self.device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
        self.feature_df_all_selected_p_ind_with_ecg = feature_df_all_selected_p_ind_with_ecg
        # Length of the stored frames: the table is expected to provide ecg1..ecg300.
        self.ecg_resampling_length = 300
        self.ecg_resampling_length_target = ecg_resampling_length_target
        self.ecg_colnames = [f"ecg{i + 1}" for i in range(self.ecg_resampling_length)]
        self.peak_loc_name = peak_loc_name
        self.ecg_mat = self.feature_df_all_selected_p_ind_with_ecg[self.ecg_colnames].values
        self.peak_label_list = self.feature_df_all_selected_p_ind_with_ecg[self.peak_loc_name].values
        self.label_list = self.feature_df_all_selected_p_ind_with_ecg[self.label_name].values
        self.short_identifier_mat = self.feature_df_all_selected_p_ind_with_ecg[self.short_identifier_list].values
        self.peak_feature_mat = self.feature_df_all_selected_p_ind_with_ecg[self.peak_feature_name_list].values

        self.shift_signal = shift_signal
        self.shift_amount = shift_amount
        self.normalize_signal = normalize_signal
        if self.shift_signal:
            if self.shift_amount is None:
                self.shift_amount = 0
            # Shift ECG to 0 baseline. Done out of place: the array obtained from `.values`
            # may be read-only or integer-typed, in which case an in-place `-=` fails, and
            # it must never write back into the caller's DataFrame.
            self.ecg_mat = self.ecg_mat - self.shift_amount

        if self.normalize_signal:
            ecg_min = np.min(self.ecg_mat, axis=1)[:, np.newaxis]
            ecg_max = np.max(self.ecg_mat, axis=1)[:, np.newaxis]
            ecg_range = ecg_max - ecg_min
            # A flat signal has zero range; dividing by 1 maps it to all zeros instead of NaN.
            ecg_range = np.where(ecg_range == 0, 1, ecg_range)
            self.ecg_mat = (self.ecg_mat - ecg_min) / ecg_range

    def obtain_perturbed_frame(self, frame):
        # Adapted from https://github.com/danikiyasseh/CLOCS

        """ Apply Sequence of Perturbations to Frame
        Args:
            frame (numpy array): frame containing ECG data
        Outputs
            frame (numpy array): perturbed frame, rescaled with Normalize
        Raises:
            NotImplementedError: 'Gaussian' is requested for an unknown ``dataset_name``.
        """
        if Lower('Gaussian') in self.transforms:
            mult_factor = 1
            # NOTE: despite its name, this value is passed to np.random.normal as the scale.
            if self.dataset_name in ['ptb', 'physionet2020']:
                # The ECG frames were normalized in amplitude between the values of 0 and 1.
                variance_factor = 0.01 * mult_factor
            elif self.dataset_name in ['cardiology', 'chapman']:
                variance_factor = 10 * mult_factor
            elif self.dataset_name in ['physionet', 'physionet2017']:
                variance_factor = 100 * mult_factor
            elif self.dataset_name in ["tch-ecg-jet-p40"]:
                variance_factor = 0.01 * mult_factor
            else:
                raise NotImplementedError("Dataset not implemented")
            gauss_noise = np.random.normal(0, variance_factor, size=(self.ecg_resampling_length_target))
            frame = frame + gauss_noise

        if Lower('FlipAlongY') in self.transforms:
            frame = np.flip(frame)

        if Lower('FlipAlongX') in self.transforms:
            frame = -frame

        # Keep data in 0-1 range
        frame = Normalize(frame)
        return frame

    def __len__(self):
        """Number of frames, i.e. rows of the input table."""
        return len(self.feature_df_all_selected_p_ind_with_ecg)

    def __getitem__(self, idx):
        """Return the frame at row ``idx`` and its label.

        Returns:
            ``((X, X_aug), label)`` when no transforms are configured, otherwise
            ``(X, label)``; ``X`` and ``X_aug`` have shape ``(1, ecg_resampling_length_target)``.
        """
        X = self.ecg_mat[idx, :]
        if self.ecg_resampling_length_target != self.ecg_resampling_length:
            # Reduce target/source to the smallest integer ratio so the output has exactly
            # the target length for any target (dividing both by 100 and truncating gave a
            # wrong length whenever the target was not a multiple of 100).
            target_length = int(self.ecg_resampling_length_target)
            source_length = int(self.ecg_resampling_length)
            common_divisor = int(np.gcd(target_length, source_length))
            X = resample_poly(X, target_length // common_divisor, source_length // common_divisor,
                              padtype="line")
        X = Normalize(X)
        X_aug = self.obtain_perturbed_frame(X)
        label = self.label_list[idx]

        # NOTE(review): the perturbed copy is returned only when NO transforms are configured,
        # and is discarded when transforms are set. This looks inverted, but callers depend on
        # the returned shape, so the behavior is kept as is - confirm the intent.
        if len(self.transforms) == 0:
            return (X[np.newaxis, :], X_aug[np.newaxis, :]), label
        else:
            return X[np.newaxis, :], label


# Load data

In [8]:
import os
import platform
import time
import pickle
import numpy as np
import pandas as pd
import argparse
import random
import torch
from torch.utils.data import Dataset
from scipy.signal import resample_poly

# Patient-level data split.
patient_ID_list_train = [
    398573, 462229, 637891, 667681, 537854,
    628521, 642321, 662493, 387479, 624179,
    417349, 551554, 631270, 655769, 678877,
]  # 15
patient_ID_list_test = [
    756172, 424072, 748555, 748900, 759678,
    741235, 595561, 678607, 782501, 510915,
    771495, 740475, 533362, 581650, 803389,
    577874, 681150, 536886, 477589, 844864,
    824744, 515544, 771958, 725860, 609090,
]  # 25
patient_ID_list_val = [462229, 642321, 387479]
# Development set: the training patients that are not held out for validation (order kept).
patient_ID_list_dev = list(filter(lambda patient_ID: patient_ID not in patient_ID_list_val, patient_ID_list_train))



# Machine-specific data locations, selected by operating system name.
system_name = platform.system()
if system_name == "Darwin":
    print("Using MacOS.")
    data_folder = data_folder_2 = os.path.normpath("/Users/yj31/Dropbox/Study/GitHub/JET-Detection")
    # NOTE(review): large_data_folder is not assigned on this branch, unlike the other two.
elif system_name == "Linux":
    large_data_folder = data_folder_2 = data_folder = ""
    print("Using Linux.")
else:
    # Every other system name is treated as Windows.
    print("Using Windows.")
    data_folder = os.path.normpath("D:\\Dropbox\\Study\\GitHub\\JET-Detection")
    # data_folder_2 = os.path.normpath("D:\\Backup\\JET-Detection\\")
    data_folder_2 = data_folder
    large_data_folder = os.path.normpath("D:\\Backup\\JET-Detection\\Heartbeats_dict_20220201\\")

# Results are written below the main data folder.
save_folder = os.path.join(data_folder, "Results")

Using MacOS.
Data shape: (10000, 371), train: (7642, 371), dev: (6690, 371), val: (952, 371), test: (2358, 371)


# Compute cc

$\mathbf{f}$: Time-course function in a particular pixel - *here, one row of the ECG matrix (a single ECG frame)*

$f_i$: $i$th element (tick) of $\mathbf{f}$

$\mu_f$: average of $\mathbf{f}$

$\mathbf{r}$: Time-course function in a reference pixel - *mean of ECG of a given label type*

$r_i$: $i$th element (tick) of $\mathbf{r}$

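$\mu_r$: average of $\mathbf{r}$

The correlation coefficient between $\mathbf{f}$ and $\mathbf{r}$ is then

$$cc = \frac{\sum_i (f_i - \mu_f)(r_i - \mu_r)}{\sqrt{\sum_i (f_i - \mu_f)^2}\,\sqrt{\sum_i (r_i - \mu_r)^2}}$$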

In [12]:
# Mean ECG waveform per class: one row per label value, one column per ECG sample.
class_mean_summary = feature_with_ecg_df_train_single_lead.groupby(["label"])[ecg_colnames].mean()
class_mean_summary

Unnamed: 0_level_0,ecg1,ecg2,ecg3,ecg4,ecg5,ecg6,ecg7,ecg8,ecg9,ecg10,...,ecg291,ecg292,ecg293,ecg294,ecg295,ecg296,ecg297,ecg298,ecg299,ecg300
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-1.755326,-1.953039,-1.875564,-1.65256,-1.458367,-1.383229,-1.393706,-1.40426,-1.357915,-1.258599,...,-1.016271,-1.173605,-1.304049,-1.398137,-1.470227,-1.550712,-1.663186,-1.790479,-1.854859,-1.754734
1,-0.51641,-0.586584,-0.650702,-0.602289,-0.492103,-0.451481,-0.494895,-0.53294,-0.521067,-0.488851,...,-0.595169,-0.637709,-0.658493,-0.653338,-0.633022,-0.625785,-0.637344,-0.621625,-0.541468,-0.440227


In [19]:
# Mean waveform of each class (label 0 -> sinus, label 1 -> jet) and its overall average.
class_mean_sinus, class_mean_jet = (class_mean_summary.loc[label_value].values for label_value in (0, 1))
class_mean_sinus.shape, class_mean_jet.shape
class_mean_sinus_mean = class_mean_sinus.mean()
class_mean_jet_mean = class_mean_jet.mean()
class_mean_sinus_mean, class_mean_jet_mean

(0.01881555115694953, 0.008923398589509102)

In [16]:
# Split the training frames of this lead by label and keep only the ECG sample columns.
sinus_rows = feature_with_ecg_df_train_single_lead.query("label == 0")
jet_rows = feature_with_ecg_df_train_single_lead.query("label == 1")
ecg_mat_sinus = sinus_rows[ecg_colnames].values
ecg_mat_jet = jet_rows[ecg_colnames].values
ecg_mat_sinus.shape, ecg_mat_jet.shape

((1230, 300), (420, 300))

In [36]:
import latexify
# Correlation coefficient between every row of X and the mean row of X.
#   X: 2-D array, one signal per row.
#   Returns an (n_rows, 1) array holding one coefficient per row.
# The documentation is kept in comments so the decorated function body contains statements only.
@latexify.with_latex
def Compute_cc(X):
    ecg_mat_class = X
    # Reference signal: the column-wise mean over all rows, centered on its own average.
    ecg_class_mean = np.mean(ecg_mat_class, axis=0)
    ecg_class_mean_mean = np.mean(ecg_class_mean)
    ecg_class_mean_centered = (ecg_class_mean - ecg_class_mean_mean)[:, np.newaxis]

    # Center every row on its own average.
    ecg_mat_class_centered = ecg_mat_class - np.mean(ecg_mat_class, axis=1)[:, np.newaxis]

    # keepdims=True keeps the row norms as an (n, 1) column. With a flat (n,) vector the
    # division below broadcasts the (n, 1) numerator to an (n, n) matrix instead of
    # dividing row by row.
    row_norms = np.linalg.norm(ecg_mat_class_centered, ord=2, axis=1, keepdims=True)
    reference_norm = np.linalg.norm(ecg_class_mean_centered, ord=2)
    cc = ecg_mat_class_centered @ ecg_class_mean_centered / (row_norms * reference_norm)

    return cc
Compute_cc

<latexify.core.with_latex.<locals>._LatexifiedFunction at 0x7fa7d16e8890>

In [25]:
# Correlation coefficient of every sinus frame with the mean sinus frame.
ecg_mat_class = ecg_mat_sinus
# Reference signal: the column-wise mean over all frames, centered on its own average.
ecg_class_mean = np.mean(ecg_mat_class, axis=0)
ecg_class_mean_mean = np.mean(ecg_class_mean)
ecg_class_mean_centered = (ecg_class_mean - ecg_class_mean_mean)[:, np.newaxis]

# Center every frame on its own average.
ecg_mat_class_centered = ecg_mat_class - np.mean(ecg_mat_class, axis=1)[:, np.newaxis]

# keepdims=True keeps the row norms as an (n, 1) column. With a flat (n,) vector the
# division broadcasts the (n, 1) numerator to an (n, n) matrix instead of dividing row by row.
cc = ecg_mat_class_centered @ ecg_class_mean_centered \
    / (np.linalg.norm(ecg_mat_class_centered, ord=2, axis=1, keepdims=True) * np.linalg.norm(ecg_class_mean_centered, ord=2))



((1230, 300), (300, 1))

In [32]:
# Row-wise Euclidean norm written out by hand: square root of each row's sum of squares.
np.sqrt(np.sum(ecg_mat_class_centered ** 2, axis=1))

array([ 3.16200177,  4.88625773,  7.96143958, ...,  4.21726409,
       14.52191245, 14.38463665])

In [33]:
# Row-wise 2-norm of the centered frames computed with np.linalg.norm.
np.linalg.norm(ecg_mat_class_centered, ord=2, axis=1)

array([ 3.16200177,  4.88625773,  7.96143958, ...,  4.21726409,
       14.52191245, 14.38463665])

In [34]:
# 2-norm of the centered class-mean waveform (a single scalar).
np.linalg.norm(ecg_class_mean_centered, ord=2)

9.024288781570759

# Prepare dataset

In [42]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneGroupOut, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import LinearSVC
import copy

## Load data

In [59]:
# TODO: modify this step to make it use less memory at once
# Flip to True to read the `sample10000` CSV instead of the full one.
debug = False
# debug = args.debug
if debug:
    feature_csv_name = "feature_df_all_selected_with_ecg_20220210_rtfixed_sample10000.csv"
else:
    feature_csv_name = "feature_df_all_selected_with_ecg_20220210_rtfixed.csv"
feature_df_all_selected_with_ecg = pd.read_csv(os.path.join(data_folder_2, feature_csv_name))

# Split rows by patient membership. `isin` accepts lists, sets, arrays and
# Series alike, whereas interpolating the container's repr into a query
# string only works for plain Python lists.
patient_ID_column = feature_df_all_selected_with_ecg["patient_ID"]
feature_with_ecg_df_train = feature_df_all_selected_with_ecg[patient_ID_column.isin(patient_ID_list_train)]
feature_with_ecg_df_test = feature_df_all_selected_with_ecg[patient_ID_column.isin(patient_ID_list_test)]
feature_with_ecg_df_dev = feature_df_all_selected_with_ecg[patient_ID_column.isin(patient_ID_list_dev)]
feature_with_ecg_df_val = feature_df_all_selected_with_ecg[patient_ID_column.isin(patient_ID_list_val)]
print(f"Data shape: {feature_df_all_selected_with_ecg.shape}, "
      f"train: {feature_with_ecg_df_train.shape}, "
      f"dev: {feature_with_ecg_df_dev.shape}, "
      f"val: {feature_with_ecg_df_val.shape}, "
      f"test: {feature_with_ecg_df_test.shape}")


Data shape: (1899356, 370), train: (1447728, 370), dev: (1266878, 370), val: (180850, 370), test: (451628, 370)


## Model fitting

In [60]:
# Fit one LinearSVC per ECG lead on the raw resampled waveform columns,
# tuning C with leave-one-patient-out cross-validation, then report on the
# test split. Fitted searches are stored in `clf_dict` keyed by lead.
clf_dict = {}
st = time.time()
# Only lead 2 is fitted here; use [1, 2, 3, 4] to fit every lead.
channel_ID_list = [2]
for channel_ID in channel_ID_list:
    # --- Get data: restrict train/test rows to the current lead ---
    feature_with_ecg_df_train_single_lead = feature_with_ecg_df_train.query(f"channel_ID == {channel_ID}")
    feature_with_ecg_df_test_single_lead = feature_with_ecg_df_test.query(f"channel_ID == {channel_ID}")

    ecg_resampling_length = 300
    ecg_colnames = [f"ecg{i + 1}" for i in range(ecg_resampling_length)]
    # Not used by the fit below; kept because these are notebook-level names
    # that other cells may read.
    ecg_mat = feature_with_ecg_df_train_single_lead[ecg_colnames].values
    signal_min_train = np.min(ecg_mat.ravel())

    feature_df_train = feature_with_ecg_df_train_single_lead
    feature_df_test = feature_with_ecg_df_test_single_lead
    identifier_list = ["cycle_ID", "patient_ID", "interval_ID", "block_ID", "channel_ID", "r_ID_abs", "label", "r_ID_abs_ref"]
    feature_name_list = ecg_colnames

    # --- Normalizing data ---
    # Indices are reset so identifier and feature frames can be re-joined
    # positionally after scaling.
    feature_df_train_identifier = feature_df_train[identifier_list].reset_index(drop=True)
    feature_df_train_features = feature_df_train[feature_name_list].reset_index(drop=True)
    feature_df_test_identifier = feature_df_test[identifier_list].reset_index(drop=True)
    feature_df_test_features = feature_df_test[feature_name_list].reset_index(drop=True)

    # The scaler is fitted on the training features only and then applied to
    # both splits.
    scaler = MinMaxScaler()
    scaler.fit(feature_df_train_features)
    feature_df_train_features_normalized = scaler.transform(feature_df_train_features)
    feature_df_train_features_normalized = pd.DataFrame(feature_df_train_features_normalized, columns=feature_name_list)
    feature_df_test_features_normalized = scaler.transform(feature_df_test_features)
    feature_df_test_features_normalized = pd.DataFrame(feature_df_test_features_normalized, columns=feature_name_list)
    feature_df_train_labels = feature_df_train_identifier["label"]
    feature_df_test_labels = feature_df_test_identifier["label"]
    feature_df_train_normalized = pd.concat([feature_df_train_identifier, feature_df_train_features_normalized], axis=1)
    feature_df_test_normalized = pd.concat([feature_df_test_identifier, feature_df_test_features_normalized], axis=1)

    # --- Preparing data ---
    # Both frames already contain only rows of the current lead, so no
    # second channel filter is needed.
    feature_df_train_normalized_lead = feature_df_train_normalized
    feature_df_test_normalized_lead = feature_df_test_normalized
    print(f"[Time {time.time() - st:.1f}]", channel_ID, feature_df_train_normalized_lead.shape, feature_df_test_normalized_lead.shape)

    patient_ID_train = feature_df_train_normalized_lead["patient_ID"]
    X_train = feature_df_train_normalized_lead[feature_name_list]
    y_train = feature_df_train_normalized_lead["label"]
    X_test = feature_df_test_normalized_lead[feature_name_list]
    y_test = feature_df_test_normalized_lead["label"]
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

    # --- Experiments setup ---
    leave_one_group_out_cv = LeaveOneGroupOut()
    # A dict literal keeps only the last value of a repeated key, so exactly
    # one 'C' grid is listed. Values are written as 1 / strength because
    # sklearn's C is the inverse of the regularization strength.
    tuned_parameters = [{
        'C': [1 / 1e-2, 1 / 3.16e-2, 1 / 1e-1, 1 / 3.16e-1, 1 / 1, 1 / 3.16,
              1 / 1e1, 1 / 3.16e1, 1 / 1e2, 1 / 3.16e2, 1 / 1e3, 1 / 3.16e3, 1 / 1e4],
    }] # sklearn uses 1 / C where C is the regularization strength

    score = "accuracy"
    print(f"# Tuning hyper-parameters for {score}\n")
    clf = GridSearchCV(
        LinearSVC(random_state=0, dual=False), tuned_parameters, scoring=score, cv=leave_one_group_out_cv)

    # --- Run ---
    # Groups are patient IDs, so every fold holds out all beats of one patient.
    clf.fit(X_train, y_train, groups=patient_ID_train.values)

    print("Best parameters set found on development set:\n")
    # The printed value is 1 / sklearn's C, i.e. the regularization strength.
    print(f"C = {1 / clf.best_params_['C']:.2f}\n")
    print("Grid scores on development set (using leave-one-group-out cross validation over patients):\n")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        # The +/- band is three standard deviations of the per-fold scores.
        print(f"{mean:.4f} (+/-{std * 3:.4f}) for {1 / params['C']:.2f}")

    print("\nDetailed classification report:\n")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))

    # Store an independent copy so later re-fits of `clf` cannot alter it.
    clf_dict[f"lead{channel_ID}"] = copy.deepcopy(clf)
    
    

[Time 6.4] 2 (362268, 308) (114412, 308)
(362268, 300) (362268,) (114412, 300) (114412,)
# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

C = 10000.00

Grid scores on development set (using 5-fold cross validation):

0.6615 (+/-0.8433) for 0.01
0.6591 (+/-0.8452) for 0.03
0.6590 (+/-0.8467) for 0.10
0.6537 (+/-0.8462) for 0.32
0.6556 (+/-0.8392) for 1.00
0.6570 (+/-0.8458) for 3.16
0.6511 (+/-0.8588) for 10.00
0.6403 (+/-0.8715) for 31.60
0.6324 (+/-0.8793) for 100.00
0.6192 (+/-0.8625) for 316.00
0.6199 (+/-0.8054) for 1000.00
0.6402 (+/-0.7326) for 3160.00
0.7069 (+/-0.7808) for 10000.00

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.
              precision    recall  f1-score   support

           0       0.62      1.00      0.76     68589
           1       0.98      0.08      0.14     45823

    accuracy                           0.63    114412
   macro av

In [61]:
# Same per-lead LinearSVC fit as the "Model fitting" cell, but searching a
# different C grid. `clf_dict` is deliberately NOT re-created here, so the
# result is added next to the entries already stored in it.
# clf_dict = {}
st = time.time()
# Only lead 2 is fitted here; use [1, 2, 3, 4] to fit every lead.
channel_ID_list = [2]
for channel_ID in channel_ID_list:
    # --- Get data: restrict train/test rows to the current lead ---
    feature_with_ecg_df_train_single_lead = feature_with_ecg_df_train.query(f"channel_ID == {channel_ID}")
    feature_with_ecg_df_test_single_lead = feature_with_ecg_df_test.query(f"channel_ID == {channel_ID}")

    ecg_resampling_length = 300
    ecg_colnames = [f"ecg{i + 1}" for i in range(ecg_resampling_length)]
    # Not used by the fit below; kept because these are notebook-level names
    # that other cells may read.
    ecg_mat = feature_with_ecg_df_train_single_lead[ecg_colnames].values
    signal_min_train = np.min(ecg_mat.ravel())

    feature_df_train = feature_with_ecg_df_train_single_lead
    feature_df_test = feature_with_ecg_df_test_single_lead
    identifier_list = ["cycle_ID", "patient_ID", "interval_ID", "block_ID", "channel_ID", "r_ID_abs", "label", "r_ID_abs_ref"]
    feature_name_list = ecg_colnames

    # --- Normalizing data ---
    # Indices are reset so identifier and feature frames can be re-joined
    # positionally after scaling.
    feature_df_train_identifier = feature_df_train[identifier_list].reset_index(drop=True)
    feature_df_train_features = feature_df_train[feature_name_list].reset_index(drop=True)
    feature_df_test_identifier = feature_df_test[identifier_list].reset_index(drop=True)
    feature_df_test_features = feature_df_test[feature_name_list].reset_index(drop=True)

    # The scaler is fitted on the training features only and then applied to
    # both splits.
    scaler = MinMaxScaler()
    scaler.fit(feature_df_train_features)
    feature_df_train_features_normalized = scaler.transform(feature_df_train_features)
    feature_df_train_features_normalized = pd.DataFrame(feature_df_train_features_normalized, columns=feature_name_list)
    feature_df_test_features_normalized = scaler.transform(feature_df_test_features)
    feature_df_test_features_normalized = pd.DataFrame(feature_df_test_features_normalized, columns=feature_name_list)
    feature_df_train_labels = feature_df_train_identifier["label"]
    feature_df_test_labels = feature_df_test_identifier["label"]
    feature_df_train_normalized = pd.concat([feature_df_train_identifier, feature_df_train_features_normalized], axis=1)
    feature_df_test_normalized = pd.concat([feature_df_test_identifier, feature_df_test_features_normalized], axis=1)

    # --- Preparing data ---
    # Both frames already contain only rows of the current lead, so no
    # second channel filter is needed.
    feature_df_train_normalized_lead = feature_df_train_normalized
    feature_df_test_normalized_lead = feature_df_test_normalized
    print(f"[Time {time.time() - st:.1f}]", channel_ID, feature_df_train_normalized_lead.shape, feature_df_test_normalized_lead.shape)

    patient_ID_train = feature_df_train_normalized_lead["patient_ID"]
    X_train = feature_df_train_normalized_lead[feature_name_list]
    y_train = feature_df_train_normalized_lead["label"]
    X_test = feature_df_test_normalized_lead[feature_name_list]
    y_test = feature_df_test_normalized_lead["label"]
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

    # --- Experiments setup ---
    leave_one_group_out_cv = LeaveOneGroupOut()
    # Values are written as 1 / strength because sklearn's C is the inverse
    # of the regularization strength; this grid covers strengths 3.16e4 and 1e5.
    tuned_parameters = [{
        'C': [1 / 3.16e4, 1 / 1e5],
    }] # sklearn uses 1 / C where C is the regularization strength

    score = "accuracy"
    print(f"# Tuning hyper-parameters for {score}\n")
    clf = GridSearchCV(
        LinearSVC(random_state=0, dual=False), tuned_parameters, scoring=score, cv=leave_one_group_out_cv)

    # --- Run ---
    # Groups are patient IDs, so every fold holds out all beats of one patient.
    clf.fit(X_train, y_train, groups=patient_ID_train.values)

    print("Best parameters set found on development set:\n")
    # The printed value is 1 / sklearn's C, i.e. the regularization strength.
    print(f"C = {1 / clf.best_params_['C']:.2f}\n")
    print("Grid scores on development set (using leave-one-group-out cross validation over patients):\n")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        # The +/- band is three standard deviations of the per-fold scores.
        print(f"{mean:.4f} (+/-{std * 3:.4f}) for {1 / params['C']:.2f}")

    print("\nDetailed classification report:\n")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))

    # Store an independent copy so later re-fits of `clf` cannot alter it.
    clf_dict[f"lead{channel_ID}_largeC"] = copy.deepcopy(clf)
    
    

[Time 18.2] 2 (362268, 308) (114412, 308)
(362268, 300) (362268,) (114412, 300) (114412,)
# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

C = 100000.00

Grid scores on development set (using 5-fold cross validation):

0.7115 (+/-0.7870) for 31600.00
0.7122 (+/-0.7880) for 100000.00

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.
              precision    recall  f1-score   support

           0       0.60      1.00      0.75     68589
           1       0.48      0.00      0.00     45823

    accuracy                           0.60    114412
   macro avg       0.54      0.50      0.38    114412
weighted avg       0.55      0.60      0.45    114412

