# Replicate CinC2020 Methodology for comparison

In [11]:
import os
import json
import pickle
from glob import glob

import torch
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
import zarr

from utils import ElapsedTimer
from utils.evaluation_helper import evaluate_score_batch
from utils.evaluate_12ECG_score import load_table

In [9]:
# Directory from which the feature table (features.csv) and the label file
# (dxs.txt) are read by the cells below.
experiment_dir = "full_output"
print(experiment_dir)


full_output


In [6]:
# Load the feature table, using its `header_file` column as the index.
print("Loading features...")
with ElapsedTimer() as t:
    features = pd.read_csv(
        os.path.join(experiment_dir, "features.csv"), index_col="header_file"
    )
    # Sort by header file so row positions are deterministic; later cells
    # address rows positionally (e.g. `features.take(...)`).
    features.sort_values(by=["header_file"], inplace=True)
# Report the duration only after the timer context has exited, matching the
# other timed sections in this notebook.
print(f"Took {t.duration:.2f}s")

KeyboardInterrupt: 

In [12]:
# Build the header-file -> diagnoses lookup. Every line of dxs.txt is a JSON
# document that unpacks into exactly one (key, value) pair.
dxs = {}
print("Loading labels...")
with open(os.path.join(experiment_dir, "dxs.txt")) as labels_file:
    dxs.update(json.loads(raw_line) for raw_line in labels_file)

Loading labels...


In [13]:
# Lookup table loaded from JSON and indexed by scored code; element [1] of
# each entry is printed as the label name while training the classifiers.
with open("data/snomed_ct_dx_map.json", "r") as f:
    SNOMED_CODE_MAP = json.load(f)
# SNOMED_CODE_MAP

In [15]:
# Open the ECG store read-only.
root = zarr.open_group("data/ecgs.zarr", mode="r")
root.info

# Spot-check the first 100 rows: the diagnoses stored at each position of
# "raw/dx" must equal the labels loaded for the header file at that same
# position in the feature table.
leading_header_files = features.index[:100].to_list()
for position, header_file in enumerate(leading_header_files):
    assert list(root["raw/dx"][position]) == dxs[header_file], position



In [16]:
# Load the label weight table. Its row and column labels must be identical;
# they define the ordered list of scored codes, and `all_weights[i]` is the
# row of weights belonging to `scored_codes[i]`.
weights_file = "evaluation-2020/weights.csv"
rows, cols, all_weights = load_table(weights_file)
assert rows == cols
scored_codes = rows
# scored_codes

## Load the same training/validation/test splits

In [7]:
# Recover the train/val/test record indices saved in the hyper-parameters of
# each checkpoint, so the classifiers below reuse exactly those splits.
# glob() returns paths in arbitrary order; sort them so the experiment
# numbering derived from this list is reproducible between runs.
torch_checkpoints = sorted(glob("log_beat_autoencoder/*/checkpoints/*.ckpt"))
data_configs = []
for torch_checkpoint in torch_checkpoints:
    # Only the stored hyper-parameters are read, so map all tensors to the CPU:
    # this avoids allocating GPU memory and works on hosts without a GPU.
    state_dict = torch.load(torch_checkpoint, map_location="cpu")
    raw_data_config = state_dict["hyper_parameters"]["data_config"]

    # Keep just the index list of each split object.
    data_config = {k: v.indices for k, v in raw_data_config.items()}
    data_configs.append(data_config)

len(data_configs)

20

In [8]:
# Show which split names the first data config provides.
data_configs[0].keys()

dict_keys(['train_records', 'val_records', 'test_records'])

### Train Classifiers per label

In [9]:
def _determine_sample_weights(
    data_set, scored_codes, label_weights, weight_threshold=0.5
):
    """Using the scoring labels weights to increase the dataset size of positive labels
    """
    data_labels = []
    sample_weights = []
    for dt in data_set:
        sample_weight = None
        for dx in dt:
            if str(dx) in scored_codes:
                _sample_weight = label_weights[scored_codes.index(str(dx))]
                if _sample_weight < weight_threshold:
                    continue
                if sample_weight is None or _sample_weight > sample_weight:
                    sample_weight = _sample_weight

        if sample_weight is None:
            # not a scored label, treat as a negative example (weight of 1)
            sample_weight = 1.0
            data_labels.append(False)
        else:
            data_labels.append(True)
        sample_weights.append(sample_weight)
    return data_labels, sample_weights


def _train_label_classifier(
    sc,
    idx_sc,
    all_weights,
    train_features,
    train_labels,
    eval_features,
    eval_labels,
    scored_codes,
    early_stopping_rounds,
    num_gpus,
):
    """Fit one binary XGBoost classifier for a single scored code.

    Args:
        sc: The scored code being trained; returned unchanged so the caller
            can pair it with the fitted model.
        idx_sc: Row of ``all_weights`` that belongs to ``sc``.
        all_weights: Weight table indexed by scored-code position.
        train_features: Feature rows used for fitting.
        train_labels: Per-record diagnosis codes for ``train_features``.
        eval_features: Feature rows used for early stopping.
        eval_labels: Per-record diagnosis codes for ``eval_features``.
        scored_codes: Ordered scored codes matching the weight table.
        early_stopping_rounds: Forwarded to ``XGBClassifier.fit``.
        num_gpus: Accepted but not used by this function.

    Returns:
        ``(sc, model)`` where ``model`` is the value returned by ``fit``.
    """
    weights_for_label = all_weights[idx_sc]

    # Turn the raw diagnosis lists into booleans plus per-sample weights.
    train_labels, train_weights = _determine_sample_weights(
        train_labels, scored_codes, weights_for_label
    )
    eval_labels, eval_weights = _determine_sample_weights(
        eval_labels, scored_codes, weights_for_label
    )

    # Class-imbalance correction: negatives over positives, see
    # https://machinelearningmastery.com/xgboost-for-imbalanced-classification/
    # The positive count is clamped to 1 to avoid dividing by zero.
    num_positive = max(sum(1 for is_positive in train_labels if is_positive), 1)
    num_negative = len(train_labels) - num_positive

    classifier = XGBClassifier(
        booster="dart",  # gbtree, dart or gblinear
        verbosity=0,
        tree_method="gpu_hist",
        sampling_method="gradient_based",
        scale_pos_weight=num_negative / num_positive,
    )

    # Both splits are monitored during fitting, each with its own weights.
    fitted = classifier.fit(
        train_features,
        train_labels,
        sample_weight=train_weights,
        eval_set=[(train_features, train_labels), (eval_features, eval_labels)],
        sample_weight_eval_set=[train_weights, eval_weights],
        early_stopping_rounds=early_stopping_rounds,
        verbose=False,
    )

    return sc, fitted

def train_experiment(
    data_config,
    all_weights=all_weights,
    scored_codes=scored_codes,
    features=features,
    root=root,
    early_stopping_rounds=20,
):
    """Train one classifier per scored code and score them on the test split.

    Args:
        data_config: Mapping with ``"train_records"``, ``"val_records"`` and
            ``"test_records"`` entries holding positional record indices.
        all_weights: Weight table indexed by scored-code position.
        scored_codes: Ordered scored codes to train classifiers for.
        features: Feature table; rows are selected positionally.
        root: Store whose ``"raw/dx"`` entry holds per-record diagnoses.
        early_stopping_rounds: Forwarded to every classifier fit.

    Returns:
        ``(log, class_output_string, models)``: the dict of aggregate test
        metrics, a CSV-style text block of per-class metrics, and a dict
        mapping each scored code to its fitted model.
    """
    train_idxs = data_config["train_records"]
    val_idxs = data_config["val_records"]
    test_idxs = data_config["test_records"]

    def _select(record_idxs):
        # Pick the feature rows and the matching diagnosis entries.
        return features.take(record_idxs), np.take(root["raw/dx"], record_idxs)

    train_features, train_labels = _select(train_idxs)
    eval_features, eval_labels = _select(val_idxs)
    test_features, test_labels = _select(test_idxs)

    # One entry per scored code, appended in training order.
    classes, labels, scores = [], [], []
    models = {}

    for idx_sc, sc in enumerate(scored_codes):
        with ElapsedTimer() as t:
            print(f"Training {SNOMED_CODE_MAP[sc][1]} classifier...", end="")
            sc, model = _train_label_classifier(
                sc,
                idx_sc,
                all_weights,
                train_features,
                train_labels,
                eval_features,
                eval_labels,
                scored_codes,
                early_stopping_rounds,
                1,
            )
            hard_predictions = model.predict(test_features)
            positive_probabilities = model.predict_proba(test_features)[:, 1]

            classes.append(str(sc))
            labels.append(hard_predictions.tolist())
            scores.append(positive_probabilities.tolist())
            models[sc] = model
        print(f"Took {t.duration:.2f} seconds")

    # The per-class lists are transposed so that each row is one test record.
    (
        classes,
        auroc,
        auprc,
        auroc_classes,
        auprc_classes,
        accuracy,
        f_measure,
        f_measure_classes,
        f_beta_measure,
        g_beta_measure,
        challenge_metric,
    ) = evaluate_score_batch(
        predicted_classes=classes,
        predicted_labels=np.array(labels).T,
        predicted_probabilities=np.array(scores).T,
        raw_ground_truth_labels=test_labels,
    )

    log = {
        "test_auroc": auroc,
        "test_auprc": auprc,
        "test_accuracy": accuracy,
        "test_f_measure": f_measure,
        "test_f_beta_measure": f_beta_measure,
        "test_g_beta_measure": g_beta_measure,
        "test_challenge_metric": challenge_metric,
    }

    def _three_decimals(values):
        # Comma-separated values with three decimal places each.
        return ",".join("{:.3f}".format(x) for x in values)

    class_output_string = "Classes,{}\nAUROC,{}\nAUPRC,{}\nF-measure,{}".format(
        ",".join(classes),
        _three_decimals(auroc_classes),
        _three_decimals(auprc_classes),
        _three_decimals(f_measure_classes),
    )

    print(log)
    return log, class_output_string, models


In [11]:
# Create the output directory up front: otherwise a complete training run
# would be lost when the first result file cannot be opened for writing.
os.makedirs("cinc-2020-redux", exist_ok=True)

# Run one experiment per recovered data split and persist its metrics,
# per-class table and fitted models under the experiment's index.
for dc_idx, data_config in enumerate(data_configs):
    print(f"Experiment {dc_idx}")
    with ElapsedTimer() as t:
        log, class_output_string, models = train_experiment(data_config)

        with open(f"cinc-2020-redux/v{dc_idx}_test_results.json", "w") as f:
            json.dump(log, f)
        with open(f"cinc-2020-redux/v{dc_idx}_test_class_labelwise.csv", "w") as f:
            f.write(class_output_string)
        with open(f"cinc-2020-redux/v{dc_idx}_models.pkl", "wb") as f:
            pickle.dump(models, f)

    print(f"Experiment {dc_idx} Took {t.duration:.2f}s")

Experiment 0
Training 1st degree av block classifier...Took 5.21 seconds
Training atrial fibrillation classifier...Took 5.09 seconds
Training atrial flutter classifier...Took 4.18 seconds
Training bradycardia classifier...Took 5.14 seconds
Training complete right bundle branch block classifier...Took 4.28 seconds
Training incomplete right bundle branch block classifier...Took 5.04 seconds
Training left anterior fascicular block classifier...Took 4.96 seconds
Training left axis deviation classifier...Took 4.99 seconds
Training left bundle branch block classifier...Took 3.71 seconds
Training low qrs voltages classifier...Took 3.98 seconds
Training nonspecific intraventricular conduction disorder classifier...Took 5.08 seconds
Training pacing rhythm classifier...Took 4.60 seconds
Training premature atrial contraction classifier...Took 4.90 seconds
Training premature ventricular contractions classifier...Took 4.90 seconds
Training Prolonged PR interval classifier...Took 5.18 seconds
Traini