# Import Data

In [None]:
import pickle as pkl
import numpy as np
import matplotlib.pyplot as plt

def _load_pickle(path):
    """Return the object deserialized from the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pkl.load(handle)


# Signals, peak positions and beat labels, each stored once for the 128Hz
# recordings (*_128) and once for the downsampled recordings (*_down).
cleaned_128 = _load_pickle('cleaned_128')
cleaned_down = _load_pickle('cleaned_down')
clean_peak_128 = _load_pickle('clean_peak_128')
clean_peak_down = _load_pickle('clean_peak_down')
clean_label_128 = _load_pickle('clean_label_128')
clean_label_down = _load_pickle('clean_label_down')

# Build Sequences

In [None]:
# Define a Patient class so that train-validation-test split can be performed easily later
class Patient:
    """Container for one patient's windowed data.

    Attributes are plain lists with one entry per window:
    ``sequences`` (signal windows), ``masks`` (per-sample class masks),
    ``sequences_labels`` (beat labels) and ``sequences_peaks`` (peak positions).
    Any argument left as ``None`` is replaced by a fresh empty list, so
    instances never share a default list.
    """

    def __init__(self,
                 sequences=None, masks=None,
                 sequences_labels=None, sequences_peaks=None):
        # Signal windows and their masks
        self.sequences = [] if sequences is None else sequences
        self.masks = [] if masks is None else masks
        # Beat labels and peak locations for each window
        self.sequences_labels = [] if sequences_labels is None else sequences_labels
        self.sequences_peaks = [] if sequences_peaks is None else sequences_peaks

In [None]:
# Create one empty Patient per recording: the 128Hz recordings plus the downsampled ones
NUM_PATIENTS = len(cleaned_128) + len(cleaned_down)

patient_instances = [Patient() for _ in range(NUM_PATIENTS)]

In [None]:
def build_sequences(signal, peaks, labels, sampling_frequency, sequence_length):
    """Split a recording into fixed-length, zero-padded windows.

    Args:
        signal: 1-D sequence of samples.
        peaks: Sample indices of the beats in ``signal`` (array or list).
        labels: Beat labels, aligned element-wise with ``peaks`` (array or list).
        sampling_frequency: Samples per second of ``signal``.
        sequence_length: Window duration in seconds.

    Returns:
        A tuple ``(sequences, corresponding_peaks, peak_labels)`` of lists with
        one entry per window: the window samples (the last window is
        zero-padded to the full window length), the peak positions relative to
        the start of the window, and the labels of those peaks.

    Raises:
        ValueError: If the window length works out to zero or fewer samples.
    """
    sequence_length_samples = int(sequence_length * sampling_frequency)
    if sequence_length_samples <= 0:
        raise ValueError(
            "sequence_length * sampling_frequency must be at least one sample"
        )

    # Boolean masking below needs arrays; this also lets callers pass plain lists.
    peaks = np.asarray(peaks)
    labels = np.asarray(labels)

    sequences = []
    corresponding_peaks = []
    peak_labels = []
    num_samples = len(signal)
    num_sequences = int(np.ceil(num_samples / sequence_length_samples))

    for j in range(num_sequences):
        start_index = j * sequence_length_samples
        end_index = min(start_index + sequence_length_samples, num_samples)

        # Extract the window and zero-pad it when the recording ends early
        sequence = signal[start_index:end_index]
        padding = np.zeros(sequence_length_samples - len(sequence))
        sequences.append(np.concatenate((sequence, padding)))

        # One mask selects both the peaks inside this window and their labels
        in_window = (peaks >= start_index) & (peaks < end_index)
        peak_indices = np.flatnonzero(in_window)

        # Peak positions are stored relative to the start of the window
        corresponding_peaks.append(peaks[peak_indices] - start_index)
        peak_labels.append(labels[peak_indices])

    return sequences, corresponding_peaks, peak_labels

In [None]:
def generate_mask_sequences(output_sequences, corresponding_peaks, labels, sampling_frequency,
                            before_samples=15, after_samples=25):
    """Build a per-sample class mask for every window.

    Each beat paints the samples from ``before_samples`` before its peak up to
    (but excluding) ``after_samples`` after it: 1 for "N", 2 for "S", 3 for "V".
    All other samples, and beats with any other label, stay 0.

    Args:
        output_sequences: List of signal windows.
        corresponding_peaks: Per window, the peak positions relative to the
            window start.
        labels: Per window, the beat labels aligned with the peaks.
        sampling_frequency: Not used by the computation; kept so existing
            callers keep working.
        before_samples: Samples painted before each peak.
        after_samples: Samples painted from each peak onwards.

    Returns:
        List of integer mask arrays, one per window, shaped like the window.
    """
    beat_classes = (("N", 1), ("S", 2), ("V", 3))
    mask_sequences = []

    for i, sequence in enumerate(output_sequences):
        peaks = corresponding_peaks[i]
        label = labels[i]

        mask_sequence = np.zeros_like(sequence, dtype=int)

        for j, peak in enumerate(peaks):
            peak = int(peak)
            # Clamp at 0: a negative start would be read as an offset from the
            # end of the array and leave beats near the window start unmasked.
            start = max(0, peak - before_samples)
            # Slicing already clips a stop beyond the end of the window.
            stop = peak + after_samples

            for symbol, class_id in beat_classes:
                if np.any(label[j] == symbol):
                    mask_sequence[start:stop] = class_id
                    break

        mask_sequences.append(mask_sequence)

    return mask_sequences

In [None]:
import tqdm
print("Build sequences of 128Hz signals...")
# The 128Hz recordings fill the first len(cleaned_128) patient instances.
for i, patient_instance in tqdm.tqdm(enumerate(patient_instances[:len(cleaned_128)]), total=len(cleaned_128)):
    # Cut the recording into 25-second windows at 128 samples per second
    sequences_128, sequences_peaks_128, sequences_labels_128 = build_sequences(
        cleaned_128[i],
        clean_peak_128[i],
        clean_label_128[i],
        128,
        25,
    )
    # Derive the per-sample class mask of every window
    mask_sequences_128 = generate_mask_sequences(
        sequences_128,
        sequences_peaks_128,
        sequences_labels_128,
        128,
    )

    # Attach windows, masks, peak positions and labels to the patient
    patient_instance.sequences = sequences_128
    patient_instance.masks = mask_sequences_128
    patient_instance.sequences_peaks = sequences_peaks_128
    patient_instance.sequences_labels = sequences_labels_128


In [None]:
import tqdm
print("Build sequences of Downsampled signals...")
# The first len(cleaned_128) instances belong to the 128Hz recordings, so the
# downsampled recordings are stored in the instances that follow them.
# Slicing from 0 here would overwrite the 128Hz patients and leave the
# trailing instances empty.
down_offset = len(cleaned_128)
for i, patient_instance in tqdm.tqdm(
        enumerate(patient_instances[down_offset:down_offset + len(cleaned_down)]),
        total=len(cleaned_down)):
    # Build sequences of signal
    # (128 is passed as the sampling frequency — presumably the rate after
    # downsampling; confirm against the preprocessing step)
    sequences_down, sequences_peaks_down, sequences_labels_down = build_sequences(cleaned_down[i], clean_peak_down[i], clean_label_down[i], 128, 25)
    # Generate Masks for Downsampled
    mask_sequences_down = generate_mask_sequences(sequences_down, sequences_peaks_down, sequences_labels_down, 128)

    # Store the sequences and masks in the patient instance
    patient_instance.sequences = sequences_down
    patient_instance.masks = mask_sequences_down
    patient_instance.sequences_peaks = sequences_peaks_down
    # Store the labels in the patient instance
    patient_instance.sequences_labels = sequences_labels_down

In [None]:
# Visual check: mask of window index 4 for `patient_instance`, which is the
# last patient bound by the most recent sequence-building loop
plt.plot(patient_instance.masks[4])

In [None]:
# Visual check: signal of window index 4 for the same `patient_instance`
# (the last patient bound by the most recent sequence-building loop)
plt.plot(patient_instance.sequences[4])

# Train-Validation-Test Split

In [None]:
# Define a function to check class distribution
def calculate_label_distribution(labels):
    """Count the N, S and V beats across a collection of label arrays.

    Args:
        labels: Sequence of label arrays, one per window.

    Returns:
        Tuple ``(n_count, s_count, v_count)``; other labels are ignored.
    """
    symbols = ('N', 'S', 'V')
    totals = [0, 0, 0]
    for window_labels in labels:
        for position, symbol in enumerate(symbols):
            totals[position] += np.count_nonzero(window_labels == symbol)
    return tuple(totals)

def calculate_class_distribution(patient_instances):
    """
    Prints the total number of N, S and V beats over all given patients.

    Args:
        patient_instances (list): The list of patient instances; each one's
            ``sequences_labels`` is counted.
    """
    per_patient_counts = [
        calculate_label_distribution(patient.sequences_labels)
        for patient in patient_instances
    ]
    tot_count_n = sum(counts[0] for counts in per_patient_counts)
    tot_count_s = sum(counts[1] for counts in per_patient_counts)
    tot_count_v = sum(counts[2] for counts in per_patient_counts)
    print(f"Label Distribution: {tot_count_n} N beats, {tot_count_s} S beats, {tot_count_v} V beats")

# Print the N/S/V beat counts over all patients
calculate_class_distribution(patient_instances)

In [None]:
# Define a function to compute class proportions
def calculate_class_proportions(patient_instances):
    """
    Calculates and prints the share of N, V and S beats over the given patients.

    Args:
        patient_instances (list): The list of patient instances; each one's
            ``sequences_labels`` is counted.

    Returns:
        tuple: ``(n_ratio, v_ratio, s_ratio)`` — note the N, V, S order. All
        three are 0.0 when no N, S or V beat is present (for example an empty
        list), rather than failing with a division by zero.
    """
    tot_count_n = 0
    tot_count_s = 0
    tot_count_v = 0
    for patient in patient_instances:
        count_n, count_s, count_v = calculate_label_distribution(patient.sequences_labels)
        tot_count_n += count_n
        tot_count_s += count_s
        tot_count_v += count_v

    total = tot_count_n + tot_count_v + tot_count_s
    if total == 0:
        # Nothing to normalise by: report zero shares instead of dividing by zero
        n_ratio = v_ratio = s_ratio = 0.0
    else:
        n_ratio = tot_count_n / total
        v_ratio = tot_count_v / total
        s_ratio = tot_count_s / total
    print(f"Label proportions: {round(n_ratio, 4)} N beats, {round(v_ratio, 4)} V beats, {round(s_ratio, 4)} S beats")
    return n_ratio, v_ratio, s_ratio

# Overall class proportions over all patients, unpacked in the (N, V, S) order
# the function returns; the split search below compares each subset against them
n_ratio, v_ratio, s_ratio = calculate_class_proportions(patient_instances)

In [None]:
from sklearn.model_selection import train_test_split

# Class ratios of each subset; they stay at zero until the first split is evaluated
n_ratio_train = s_ratio_train = v_ratio_train = 0
n_ratio_val = s_ratio_val = v_ratio_val = 0
n_ratio_test = s_ratio_test = v_ratio_test = 0

random_state = 999
max_iterations = 100
iteration = 0

# Closest split seen so far, used as a fallback if no split meets the tolerance
best_diff = float('inf')
best_split = None


def _ratio_deviations():
    """Return the nine absolute gaps between subset ratios and overall ratios."""
    return [
        abs(n_ratio_train - n_ratio), abs(s_ratio_train - s_ratio), abs(v_ratio_train - v_ratio),
        abs(n_ratio_val - n_ratio), abs(s_ratio_val - s_ratio), abs(v_ratio_val - v_ratio),
        abs(n_ratio_test - n_ratio), abs(s_ratio_test - s_ratio), abs(v_ratio_test - v_ratio),
    ]


# Try a new seed until every subset ratio is within 0.001 of the overall ratio,
# or the iteration budget runs out
while any(gap > 0.001 for gap in _ratio_deviations()) and iteration < max_iterations:

    # Patient-level split: 15% test, then a validation set of the same size as the test set
    X_train_val, X_test = train_test_split(patient_instances, test_size=0.15, random_state=random_state)
    X_train, X_val = train_test_split(X_train_val, test_size=len(X_test), random_state=random_state)

    # Report counts and proportions for the train set
    print("Train set:")
    calculate_class_distribution(X_train)
    n_ratio_train, v_ratio_train, s_ratio_train = calculate_class_proportions(X_train)

    # Same for the validation set
    print("Validation set:")
    calculate_class_distribution(X_val)
    n_ratio_val, v_ratio_val, s_ratio_val = calculate_class_proportions(X_val)

    # Same for the test set
    print("Test set:")
    calculate_class_distribution(X_test)
    n_ratio_test, v_ratio_test, s_ratio_test = calculate_class_proportions(X_test)

    # Score the split by the summed deviation over all subsets and classes
    total_diff = sum(_ratio_deviations())

    # Remember the split with the smallest summed deviation
    if total_diff < best_diff:
        best_diff = total_diff
        best_split = (X_train, X_val, X_test)

    random_state += 1
    iteration += 1

# When the budget was exhausted, fall back to the best split found
if iteration >= max_iterations:
    print("Max iterations reached")
    X_train, X_val, X_test = best_split

In [None]:
# Print the class proportions of the final train, validation and test sets
# (each call prints the N/V/S shares and returns them as a tuple)
print("Train set:")
calculate_class_proportions(X_train)
print("Validation set:")
calculate_class_proportions(X_val)
print("Test set:")
calculate_class_proportions(X_test)

In [None]:

def _collect(patients, attribute):
    """Concatenate the per-patient lists stored under *attribute* into one flat list."""
    collected = []
    for patient in patients:
        collected.extend(getattr(patient, attribute))
    return collected


# Inputs: every window of every patient in each subset
X_train_seq = _collect(X_train, "sequences")
X_val_seq = _collect(X_val, "sequences")
X_test_seq = _collect(X_test, "sequences")

# Targets: the matching masks, in the same patient and window order
y_train = _collect(X_train, "masks")
y_val = _collect(X_val, "masks")
y_test = _collect(X_test, "masks")

# Number of windows in each subset
print(f"Train dim.: {len(X_train_seq)}")
print(f"Validation dim.: {len(X_val_seq)}")
print(f"Test dim.: {len(X_test_seq)}")

# Number of masks in each subset (should match the window counts)
print("-> Labels")
print(f"Train labels dim.: {len(y_train)}")
print(f"Validation labels dim.: {len(y_val)}")
print(f"Test labels dim.: {len(y_test)}")


In [None]:
# Imported here because this cell uses `tf` before the later cell that imports it
import tensorflow as tf

# Replace the patient lists with tensors of stacked windows.
# NOTE: X_train / X_val / X_test no longer hold Patient objects after this cell.
X_train = tf.convert_to_tensor(X_train_seq)
X_val = tf.convert_to_tensor(X_val_seq)
X_test = tf.convert_to_tensor(X_test_seq)

# Separate statements, so the cell does not also display a tuple of Nones
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

In [None]:
# `to_categorical` is not imported anywhere else in the notebook
from keras.utils import to_categorical

# One-hot encode the integer masks over four classes (mask values 0-3)
y_train_encoded = to_categorical(y_train, num_classes=4)
y_val_encoded = to_categorical(y_val, num_classes=4)
y_test_encoded = to_categorical(y_test, num_classes=4)

# 1D U-Net Architecture

In [None]:
from keras.models import Model
from keras.layers import Input, Conv1D, MaxPooling1D, UpSampling1D, concatenate, BatchNormalization, Activation
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def conv_block(x, filters, kernel_size=9, dilation_rate=3, strides=1):
    """Apply Conv1D ('same' padding), batch normalization and ReLU to ``x``.

    Args:
        x: Input tensor.
        filters: Number of convolution filters.
        kernel_size: Convolution kernel size.
        dilation_rate: Dilation rate of the convolution.
        strides: Stride of the convolution.

    Returns:
        The activated output tensor.
    """
    convolution = Conv1D(
        filters,
        kernel_size,
        strides=strides,
        padding='same',
        dilation_rate=dilation_rate,
    )
    features = convolution(x)
    normalized = BatchNormalization()(features)
    return Activation('relu')(normalized)

def unet(input_shape, num_classes):
    """Build a 1D U-Net that outputs one score per class for every time step.

    The encoder applies four conv blocks (64, 128, 256, 512 filters), each
    followed by max pooling with pool size 2. A 1024-filter bottleneck follows,
    then four decoder stages that upsample by 2, concatenate the matching
    encoder output and apply a conv block (512, 256, 128, 64 filters).

    Args:
        input_shape: Shape of one input without the batch axis, e.g.
            ``(length, channels)``. The length must be divisible by 16 so the
            upsampled tensors line up with the encoder outputs they are
            concatenated with.
        num_classes: Number of output channels of the final 1x1 convolution.

    Returns:
        The uncompiled Keras ``Model``.
    """
    inputs = Input(input_shape)

    # Contracting Path (Encoder): keep each block's output for the skip connections
    skips = []
    x = inputs
    for filters in (64, 128, 256, 512):
        x = conv_block(x, filters)
        skips.append(x)
        x = MaxPooling1D(pool_size=2)(x)

    # Bottleneck
    x = conv_block(x, 1024)

    # Expanding Path (Decoder): deepest skip connection first
    for filters, skip in zip((512, 256, 128, 64), reversed(skips)):
        x = UpSampling1D(size=2)(x)
        x = concatenate([x, skip], axis=-1)
        x = conv_block(x, filters)

    # Output Layer: honour num_classes instead of a hard-coded channel count.
    # NOTE(review): the activation is sigmoid although the classes are mutually
    # exclusive and the model is compiled with categorical cross-entropy;
    # softmax may be intended. Left unchanged because the downstream
    # thresholds operate on these scores.
    output = Conv1D(num_classes, kernel_size=1, activation='sigmoid')(x)
    model = Model(inputs=inputs, outputs=output)

    return model

# Define the input shape and create the model
# 3200 samples per window with one channel; 3200 = 25 * 128, the
# sequence_length * sampling_frequency passed to build_sequences
input_shape = (3200, 1)

# Build the U-Net with four output classes (mask values 0-3)
model = unet(input_shape, 4)

# Print a summary of the model architecture
model.summary()

In [None]:
import tensorflow.keras as tfk
import tensorflow as tf
# Compile the model
# Recall for class ids 0-3 first, then precision for class ids 0-3
per_class_metrics = [tf.keras.metrics.Recall(class_id=class_id) for class_id in range(4)]
per_class_metrics += [tf.keras.metrics.Precision(class_id=class_id) for class_id in range(4)]

model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tfk.optimizers.Adam(),
    metrics=per_class_metrics,
)

In [None]:
# Stop training when the validation loss stops improving and keep the best weights
early_stopping = tfk.callbacks.EarlyStopping(
    monitor='val_loss',  # quantity watched for improvement
    patience=15,  # epochs without improvement tolerated before stopping
    mode='min',  # lower val_loss is better
    min_delta=1e-7,  # smallest decrease that counts as an improvement
    restore_best_weights=True,  # roll back to the best epoch's weights on stop
    start_from_epoch = 20  # warm-up: improvement is not monitored before this epoch
)

In [None]:
# Integer (non one-hot) masks as tensors.
# NOTE(review): these *_tensor variables are not referenced anywhere else in the
# visible source; the fit call uses the one-hot encoded arrays instead.
y_train_tensor = tf.convert_to_tensor(y_train)
y_val_tensor = tf.convert_to_tensor(y_val)
y_test_tensor = tf.convert_to_tensor(y_test)

In [None]:
# Calculation of class weights
# Sum the one-hot training targets over axes 0 and 1, leaving one total per class
# (assumes y_train_encoded is (windows, samples, classes) — TODO confirm)
w = np.sum(y_train_encoded,axis=(0,1))
print(w)


In [None]:
NUM_CLASSES = 4

# Inverse-frequency weight of each class: total count divided by the class count
total_count = sum(w)
weights = {class_id: total_count / w[class_id] for class_id in range(NUM_CLASSES)}
print(weights)

# The same weights rescaled so that they add up to 1
# (`somma` is the sum of the raw weights)
somma = sum(weights[class_id] for class_id in range(NUM_CLASSES))
weights_norm = {class_id: weights[class_id] / somma for class_id in range(NUM_CLASSES)}

In [None]:
batch_size = 64
epochs = 50

# Train with early stopping on the validation loss.
# batch_size is passed explicitly; without it fit() falls back to its default
# and the value defined above is never used.
# NOTE(review): `weights` holds the raw inverse-frequency weights; the
# normalised `weights_norm` is not used here — confirm which one is intended.
history = model.fit(
    x=X_train,
    y=y_train_encoded,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val_encoded),
    callbacks=[early_stopping],
    class_weight=weights,
)

In [None]:
# Get the model's per-class scores for the test windows
predictions = model.predict(X_test)


In [None]:
# Stack the list of test masks into one array so it can be flattened for the metrics
y_test_np = np.array(y_test)

In [None]:
# Pick the highest-scoring class at every time step (argmax over axis 2, the class axis)
aggregated_predictions = predictions.argmax(axis=2)

# Collapse labels and predictions to 1-D so that every time step is one sample
true_labels_flat = y_test_np.reshape(-1)
aggregated_predictions_flat = aggregated_predictions.reshape(-1)

# Confusion matrix normalised over the true labels (each row sums to 1)
cm = confusion_matrix(true_labels_flat, aggregated_predictions_flat, normalize='true')

print("Confusion Matrix:")
plt.figure(figsize=(10,8))
sns.heatmap(cm, annot=True)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# Accuracy over all time steps
accuracy = accuracy_score(true_labels_flat, aggregated_predictions_flat)

# Precision, recall and F1, each macro-averaged over the classes
precision, recall, f1 = (
    scorer(true_labels_flat, aggregated_predictions_flat, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
)

for metric_name, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_name, metric_value)


In [None]:
# Visual check: ground-truth mask of test window index 17
plt.plot(y_test[17])

In [None]:
# Visual check: predicted per-class scores of test window index 17 (one line per class)
plt.plot(predictions[17])

In [None]:
# Define thresholds for each label to do a post processing
# thresholds[k] is the minimum score for class k + 1; class 0 is the default
thresholds = [0.8, 0.4, 0.4]

# One label per (sample, time step). Allocating with the full predictions.shape
# would add a class axis and store every label four times.
num_samples, num_steps = predictions.shape[:2]
post_processed_predictions = np.zeros((num_samples, num_steps))

# Iterate through each sample
for sample_idx in range(num_samples):
    for idx in range(num_steps):
        # Neighbourhood painted around a detection, clipped to the sequence bounds
        window = slice(max(0, idx - 10), min(num_steps, idx + 15))

        # Classes are checked from 3 down to 1; the first one over its threshold wins
        if predictions[sample_idx, idx, 3] > thresholds[2]:
            post_processed_predictions[sample_idx, window] = 3
        elif predictions[sample_idx, idx, 2] > thresholds[1]:
            post_processed_predictions[sample_idx, window] = 2
        elif predictions[sample_idx, idx, 1] > thresholds[0]:
            post_processed_predictions[sample_idx, window] = 1
        else:
            # NOTE(review): this reset also clears labels painted by the
            # look-ahead part of an earlier detection's window, so the
            # `idx + 15` extension only survives where later steps are
            # detections too — confirm this is intended.
            post_processed_predictions[sample_idx, idx] = 0


In [None]:
# Visual check: post-processed labels of test window index 17
plt.plot(post_processed_predictions[17])