In [1]:
import os
# Work from the project root so that relative paths used later in the notebook
# (e.g. the "models" directory) and the project-local imports below resolve.
os.chdir("/opt/project")
# Set before `keras` is imported further down.
# NOTE(review): presumably lowers TensorFlow's native log verbosity — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3"

import h5py
import psutil
import numpy as np
import pandas as pd
from contextlib import suppress
from settings import (LOGS_PATH, MODELS_PATH, PREDICTIONS_PATH,
                      LABEL_DATA_PATH, INPUT_DATA_PATH, TEST_DATA_PATH)

import keras
# Apply the "mixed_float16" policy globally; this runs before the project
# model module is imported and before any model is built.
keras.mixed_precision.set_global_policy("mixed_float16")

from model import get_model
from datasets import ECGSequence
# NOTE(review): `statistics` is also the name of a standard-library module;
# this import only works if the project-local module is found first — confirm.
from statistics import ECGStatistics

In [None]:
def align_entries(indices: np.ndarray, data: np.ndarray, num_classes=6, label_offset=4):
    """Build a label matrix whose rows follow the order of ``indices``.

    Args:
        indices: Sequence of exam identifiers. Each one is converted with
            ``str()`` before being looked up in ``data``.
        data: Iterable of rows (for example ``DataFrame.values`` read with
            ``dtype=object``) where ``row[0]`` is the identifier as a string
            and the label columns hold the text ``"True"`` for a positive
            label.
        num_classes: Number of consecutive label columns to extract.
        label_offset: Position of the first label column inside a row.
            Defaults to 4, the layout that used to be hard-coded here.

    Returns:
        An object-dtype array of shape ``(len(indices), num_classes)``.
        Rows for identifiers found in ``data`` contain booleans; rows for
        identifiers that are absent from ``data`` are left as ``None`` so the
        caller can detect them.
    """
    # Index the rows by identifier; when an identifier is repeated the last
    # occurrence wins.
    rows_by_id = {row[0]: row for row in data}
    label_columns = slice(label_offset, label_offset + num_classes)

    # Object arrays created by np.empty start out filled with None.
    result = np.empty(shape=(len(indices), num_classes), dtype=object)
    for position, exam_id in enumerate(indices):
        row = rows_by_id.get(str(exam_id))
        if row is not None:
            # Elementwise comparison: only the exact text "True" maps to True.
            result[position] = row[label_columns] == "True"

    return result

In [None]:
def process_and_export_file_data(input_file: str, output_file: str):
    """Write the label CSV that corresponds to one HDF5 tracing file.

    Reads ``exams.csv`` from the parent directory of ``LABEL_DATA_PATH``,
    aligns its rows to the ``exam_id`` dataset of the given HDF5 file with
    ``align_entries`` and saves the resulting labels as integers.

    Args:
        input_file: Name of the HDF5 file inside ``INPUT_DATA_PATH``. It must
            contain the datasets ``"tracings"`` and ``"exam_id"``.
        output_file: Name of the CSV file to create inside ``LABEL_DATA_PATH``.

    Note:
        ``align_entries`` leaves ``None`` in rows whose exam id is missing from
        ``exams.csv``; the ``astype(int)`` conversion below will fail in that
        case rather than silently exporting incomplete labels.
    """
    y = pd.read_csv(LABEL_DATA_PATH.parent / "exams.csv", dtype=object).values
    # The file is only read here, so open it read-only: this cannot modify the
    # tracings and does not require write permission on the data file.
    with h5py.File(INPUT_DATA_PATH / input_file, "r") as file:
        x, ids = file["tracings"], file["exam_id"]
        y_curr = align_entries(ids, y)

        # Log info
        print(f" FILE: {input_file} -> {output_file} ".center(60, "*"))
        print(f"X SHAPE: {x.shape}".center(60, " "))
        print(f"Y SHAPE: {y_curr.shape}".center(60, " "))
        print(f"I SHAPE: {ids.shape}\n".center(60, " "))

        # Save labels (booleans are stored as 0/1)
        pd.DataFrame(y_curr).astype(int).to_csv(
            LABEL_DATA_PATH / output_file,
            sep=",", encoding="utf-8", index=False, header=True
        )

In [None]:
# Export labels for the consecutively numbered parts exams_part0.hdf5,
# exams_part1.hdf5, ... and stop at the first number that has no file.
inp = os.listdir(INPUT_DATA_PATH)
for i in range(len(inp)):
    part_name = f"exams_part{i}"
    if f"{part_name}.hdf5" not in inp:
        break
    process_and_export_file_data(f"{part_name}.hdf5", f"{part_name}.csv")

In [None]:
# Model settings
val_split = 0.02
dataset_name = "tracings"
# NOTE(review): the id is derived from the number of entries in the relative
# "models" directory (resolved against the working directory), not from
# MODELS_PATH — confirm both refer to the same location.
model_id = len(os.listdir("models")) - 1
# Leave two logical cores free, but always request at least one worker:
# cpu_count() can return None or a value of 2 or less.
worker_num = max(1, (psutil.cpu_count(logical=True) or 1) - 2)

# Optimization settings
lr = 0.001
batch_size = 64
opt = keras.optimizers.Adam(lr)
loss = keras.losses.BinaryCrossentropy()

callbacks = [
    # Learning Optimizers
    keras.callbacks.EarlyStopping(patience=9, min_delta=0.00001),
    keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.1,
                                      patience=7, min_lr=lr / 100),
    # Logs
    keras.callbacks.TensorBoard(log_dir=LOGS_PATH, write_graph=False),
    keras.callbacks.CSVLogger(LOGS_PATH / "training.log", append=False),
    # Checkpoints
    # "last" is overwritten after every epoch.
    keras.callbacks.ModelCheckpoint(MODELS_PATH / f"backup/model_last_{model_id}.keras"),
    # "best" is only overwritten when the validation loss improves; without
    # save_best_only it would be an exact copy of the "last" checkpoint.
    keras.callbacks.ModelCheckpoint(MODELS_PATH / f"backup/model_best_{model_id}.keras",
                                    monitor="val_loss", save_best_only=True),
]

# os.listdir returns entries in arbitrary order, so both listings are sorted to
# keep each tracing file lined up with the label file of the same name.
# NOTE(review): assumes ECGSequence pairs the two lists by position — confirm.
train_seq, valid_seq = ECGSequence.get_train_and_val(
    [INPUT_DATA_PATH / file for file in sorted(os.listdir(INPUT_DATA_PATH))],
    [LABEL_DATA_PATH / file for file in sorted(os.listdir(LABEL_DATA_PATH))],
    dataset_name, batch_size, val_split,
    workers=worker_num, use_multiprocessing=True
)

# If you are continuing an interrupted session, uncomment the line below:
# model = keras.models.load_model(PATH_TO_MODEL, compile=False)
model = get_model(train_seq.n_classes)
model.compile(loss=loss, optimizer=opt)

In [None]:
# Fit the network on the training sequence and validate on the validation one.
# When resuming an interrupted run, set `first_epoch` to the epoch to resume at.
total_epochs = 70
first_epoch = 0
history = model.fit(
    train_seq,
    validation_data=valid_seq,
    callbacks=callbacks,
    initial_epoch=first_epoch,
    epochs=total_epochs,
    verbose=1,
)
# Persist the model as it stands once training has finished
final_model_path = MODELS_PATH / f"model_{model_id}.keras"
model.save(final_model_path)

In [None]:
# Run the trained model over the test tracings (batch size 1, no labels)
# and store the raw scores next to the other predictions.
test_file = TEST_DATA_PATH / "ecg_tracings.hdf5"
seq = ECGSequence([test_file], None, dataset_name, batch_size=1)
y_score = model.predict(seq, verbose=1)

predictions_file = PREDICTIONS_PATH / f"predictions_{model_id}.npy"
np.save(predictions_file, y_score)

In [None]:
# Generate figures
# Build the statistics helper for the current `model_id`, then call its two
# report generators in order.
# NOTE(review): judging by the names these produce "table two" and
# "supplementary figure one"; what they read and write is defined in the
# project-local statistics module — confirm there.
stats = ECGStatistics(model_id)

stats.generate_table_two()
stats.generate_supplementary_figure_one()