# Main Pipeline File

The pipeline consists of the following steps:
<ol>
   <li>Read in the files and deal with the missing data. </li>
   <li>Preprocess the signals</li>
   <li>Complete feature extraction</li>
   <li>Put data into a model.</li>
   <li>Optimize, compare, and iterate - try other models.</li>
</ol>

In [None]:
# Import libraries. 
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt 
import os
import wfdb
import pickle
import sys
import glob
from scipy.signal import butter, lfilter
import pprint
from collections import Counter


In [None]:
# Load helper files.
import dataloaders
import visualize
import preprocess
import segment 

In [None]:
# Call the project's device-info helper (presumably reports the available
# compute device — see dataloaders.get_device_info to confirm). Its return
# value, if any, is not used here.
dataloaders.get_device_info()

In [None]:
# Root directory of the MIT-BIH Arrhythmia Database records.
# Set the MITBIH_DATA_DIR environment variable to point at a local copy instead
# of editing this cell; when it is unset the original hard-coded location is used.
# Locations used on other machines so far:
# 'C:/Users/henry/OneDrive/Desktop/ELEC 872 - AI and Interactive Systems/Project/mit-bih-arrhythmia-database-1.0.0/'
# 'G:/Datasets/mit-bih-arrhythmia-database-1.0.0/'
# 'D:/Datasets/mit-bih-arrhythmia-database-1.0.0/'
file_path = os.environ.get('MITBIH_DATA_DIR', 'G:/Datasets/mit-bih-arrhythmia-database-1.0.0/')

# Every path used so far ends with a separator, so keep that invariant for
# overridden values too (the loader may build record paths by concatenation —
# TODO confirm against dataloaders.load_all_records).
if not file_path.endswith(('/', '\\')):
    file_path += '/'

In [None]:
# Load the records found under file_path. Later cells index the result by
# patient/record ID string (e.g. patient_data['103']).
patient_data = dataloaders.load_all_records(file_path)

In [None]:
# Pretty-printer with 2-space indentation; reused by later cells to dump nested records.
pp = pprint.PrettyPrinter(indent=2)

# Print the structure of the patient_data dictionary by showing only the
# first three entries of record '103' (spot check, not a full dump).
pp.pprint(patient_data['103'][:3])

### Print out the rhythm count (as beats per rhythm type)

In [None]:

# for i in range (100,125):
#     visualize.summarize_rhythm_counts(patient_data, str(i))

# for i in range (200,225):
#     visualize.summarize_rhythm_counts(patient_data, str(i))

#visualize.summarize_rhythm_counts(patient_data, "203")


# Preprocessing Stage

Note: The signals are downsampled prior to this stage. Downsampling is typically done AFTER preprocessing; I've elected to do it before because of the way the annotation object is structured.

<h2>Method 1</h2>

0. Convert Dictionary to array value style for use with the filter functions from scipy.
1. High-Pass Filter to remove baseline wander
2. Notch Filter to remove powerline interference (if any?)
3. Low-Pass Filter to remove high-frequency noise (set for 40 hz for now?)
4. Moving Average Filter to smooth the remaining signal 
5. Normalization for 0-->1 because the leads all act differently

<h2>Method 2</h2>

0. Convert Dictionary to array value style for use with the filter functions from the pywavelet
1. apply wavelet using the equation + methods from the paper
2. normalization for 0-->1

<p> @TODO: possibly a dual channel arch. if we have the time, implement the feature creation (R-R, HRV, beat) that we have already created</p>

In [None]:
# aggregate data into arrays from the dict first, note this is a progress test.
# aggregated_patient_data = preprocess.aggregate_signals(patient_data)
# pp.print(aggregated_data['103'])

In [None]:
# Run the preprocessing step over all loaded patient data, then print record
# '103' as a spot check. The first few samples shown should all be labelled normal.
# Later cells read 'signals_lead_1', 'signals_lead_2' and 'labels' from each
# patient's entry in processed_data.
processed_data = preprocess.preprocess_patient_data(patient_data)
pp.pprint(processed_data['103'])

In [None]:
#without labels. 
# visualize.visualize_patient_data(processed_data,'100',10)

In [None]:
# with labels - note i just did very basic math and put it in the general area of the segment. might not lineup 1:1
# visualize.visualize_patient_data_with_rhythm(processed_data, patient_id='103', display_seconds=10)

## Feature Extractions, Selection

<p> Regarding feature extraction and selection: we've got HRV and R-R peak calculations. Do we want to actually use them? </p>

- P.S., other models performing fine w/out, can try if we have time. If not, that's in our next steps / how to improve.

<br>

<p>Now that the signals have been filtered and normalized - we have 2 leads. we can select which one is more important to use. The database website on PhysioNet recommends certain leads for specific tasks, however, in our case, we want generalizability - so we will consider the lead that has the most information / least noise.
</p>

Considerations are as follows and are weighted based on what we felt was best:

1. Signal Noise Ratio (SNR)
2. Entropy
3. Standard Deviation
4. High Frequency Power
5. ICA

Other considerations were made and baselines were tested; however, we felt that these 4/5 had the highest impact. The leads that were not chosen were also tested, and the results were similar.

In [None]:
# Build a dictionary that stores, for each patient, only the better lead's
# signal together with that patient's labels.
best_leads_data = {}

# Maps the lead name returned by visualize.compute_noise_metrics_for_patient to
# the key under which that lead's signal is stored in processed_data[patient_id].
LEAD_SIGNAL_KEYS = {
    'Lead 1': 'signals_lead_1',
    'Lead 2': 'signals_lead_2',
}

# Iterate through all patients and determine the better lead for each.
for patient_id, patient_record in processed_data.items():

    better_lead = visualize.compute_noise_metrics_for_patient(processed_data, patient_id)
    # print(f"Patient {patient_id}: Best Lead is {better_lead}")

    # Fail loudly on an unexpected lead name. Without this check an unmatched
    # value would leave the selected signal undefined for the first patient, or
    # silently reuse the previous patient's signal for this one.
    if better_lead not in LEAD_SIGNAL_KEYS:
        raise ValueError(
            f"Unexpected best lead {better_lead!r} for patient {patient_id}; "
            f"expected one of {list(LEAD_SIGNAL_KEYS)}"
        )

    best_leads_data[patient_id] = {
        'signal': patient_record[LEAD_SIGNAL_KEYS[better_lead]],
        'labels': patient_record['labels'],
    }

print("new dictionary with specifc labels + lead for given patient added.")

# Test Models

<p><strong>Note:</strong> See the sub folder/module, named models, for individual model information. Some use the same train file (see reshape parameter in model class). Some have their own train. They all use the same preprocessing pipeline (with some minor adjustments to assist with the shape.) </p>

Note: 

``` 
LABEL_ENCODING = 
{
        'normal': 0,
        'atrial_fibrillation': 1,
        'atrial_flutter': 2,
        'ventricular_bigeminy': 3,
        'Other': 4 
}
```

    

In [None]:
# libraries from the model sub folder
from models.rnn import ECG_RNN
# from models.rnn2 import RNN2
from models.resnet import ResNet1D
from models.cnnlstm import CNNLSTM
from models.seq2seq import ECGSeq2Seq

from train import train_model, evaluate_model
from traincnnlstm import train_cnn_lstm

# functions from the reg files
from segment import prepare_rnn_data, split_data
from dataloaders import create_dataloaders
# from preprocess import encode_labels

# Device used for model training/evaluation below: CUDA when available, otherwise CPU.
# Defined explicitly here; the device value from the earlier EDA/device-info step is not reused.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def balance_dataset(signals, labels, label_encoding, max_normal_samples=1500):
    """
    Balance the dataset by randomly subsampling the 'normal' class.

    At most ``max_normal_samples`` samples of the 'normal' class are kept
    (chosen without replacement via ``np.random.choice``, so results follow
    NumPy's global random state). Samples of every other class are kept in
    full. If fewer 'normal' samples exist than requested, all of them are kept
    instead of raising.

    Args:
        signals (array-like): The segmented signal data, indexable along axis 0.
        labels (array-like): The encoded labels corresponding to the signals.
        label_encoding (dict): Dictionary mapping class names to numeric labels.
            Must contain the key 'normal'.
        max_normal_samples (int): Upper bound on the number of samples taken
            from the 'normal' class.

    Returns:
        balanced_signals (np.array): Subsampled signal data (selected 'normal'
            samples first, followed by all other samples in original order).
        balanced_labels (np.array): Corresponding labels after subsampling.

    Raises:
        KeyError: If ``label_encoding`` has no 'normal' entry.
        ValueError: If ``max_normal_samples`` is negative.
    """
    if max_normal_samples < 0:
        raise ValueError("max_normal_samples must be non-negative")

    # Accept plain lists as well as arrays; a no-op for ndarray inputs.
    signals = np.asarray(signals)
    labels = np.asarray(labels)

    # Identify indices
    normal_label = label_encoding['normal']
    normal_indices = np.where(labels == normal_label)[0]
    other_indices = np.where(labels != normal_label)[0]

    # Clamp the request: sampling without replacement cannot draw more items
    # than are available.
    n_keep = min(max_normal_samples, normal_indices.size)
    if n_keep > 0:
        selected_normal_indices = np.random.choice(normal_indices, n_keep, replace=False)
    else:
        # Empty slice keeps the integer index dtype for the concatenation below.
        selected_normal_indices = normal_indices[:0]

    # Combine with all other class samples
    balanced_indices = np.concatenate([selected_normal_indices, other_indices])

    # Extract balanced data
    balanced_signals = signals[balanced_indices]
    balanced_labels = labels[balanced_indices]

    # Reverse lookup (numeric label -> class name). setdefault keeps the first
    # name when several names share a code.
    decode = {}
    for class_name, code in label_encoding.items():
        decode.setdefault(code, class_name)

    # Print updated label counts
    updated_counts = Counter(balanced_labels.tolist())
    print("\n========== UPDATED ENCODED LABEL COUNTS ==========")
    for encoded_label, count in updated_counts.items():
        decoded_label = decode.get(encoded_label, 'unknown')
        print(f"Encoded Label: {encoded_label} ({decoded_label}) - Count: {count}")
    print("===================================================\n")

    return balanced_signals, balanced_labels


In [None]:
# Segmentation settings handed to prepare_rnn_data:
# window length in seconds and the sampling frequency of the signals.
segment_length_sec = 10
fs = 250

# Accumulators for the segments of every patient and their (string) labels.
rnn_signals, rnn_labels = [], []

# Segment each patient's best-lead signal and pool the results.
for patient_id, data in best_leads_data.items():
    segments, segment_labels = prepare_rnn_data(
        data['signal'], data['labels'], segment_length_sec, fs
    )
    rnn_signals.extend(segments)
    rnn_labels.extend(segment_labels)

# Class name -> integer code used as the training target.
LABEL_ENCODING = {
    'normal': 0,
    'atrial_fibrillation': 1,
    'atrial_flutter': 2,
    'ventricular_bigeminy': 3,
    'Other': 4,
}

# Convert the pooled segments to an array and encode the labels (after segmentation).
rnn_signals = np.array(rnn_signals)
rnn_labels_encoded = np.array(list(map(LABEL_ENCODING.__getitem__, rnn_labels)))

# Quick sanity print of the encoded data.
print(f"Labels Before Encoding (First 10): {rnn_labels[:10]}")
print(f"Encoded Labels (First 10): {rnn_labels_encoded[:10]}")
print(f"Segmented Signals Shape: {rnn_signals.shape}")
print(f"Encoded Labels Shape: {rnn_labels_encoded.shape}")

# Class distribution before balancing.
encoded_label_counts = Counter(rnn_labels_encoded)
print("\n========== ENCODED LABEL COUNTS ==========")
for encoded_label, count in encoded_label_counts.items():
    decoded_label = next(k for k, v in LABEL_ENCODING.items() if v == encoded_label)
    print(f"Encoded Label: {encoded_label} ({decoded_label}) - Count: {count}")
print("==========================================\n")

# Subsample the normal class; without this the model just overfits right away.
rnn_signals, rnn_labels_encoded = balance_dataset(
    rnn_signals,
    rnn_labels_encoded,
    LABEL_ENCODING,
    max_normal_samples=800,
)

# Train / validation / test split.
data_splits = split_data(rnn_signals, rnn_labels_encoded)

# Wrap the splits in PyTorch DataLoaders.
batch_size = 32
train_loader, val_loader, test_loader = create_dataloaders(data_splits, batch_size=batch_size)

# Report how many batches each loader yields.
for split_name, loader in (
    ("Train", train_loader),
    ("Validation", val_loader),
    ("Test", test_loader),
):
    print(f"{split_name} Loader Size: {len(loader)} batches")


In [None]:
# Define model configurations
# input size is just 1 because the signal is 1D; the output size is the number of classes (5).
# RNN might need to be updated later with the num layers; could add GRU / LSTM layers
# to it or maybe just an attention mechanism.

# Peek at one training batch so the CNNLSTM input shape follows the sequence
# length in the event that it changes.
sample_data, _ = next(iter(train_loader))
sequence_length = sample_data.shape[1]  # assumes axis 1 of a batch is the sequence axis — TODO confirm


model_configs = {
    'RNN': ECG_RNN(input_size=1, hidden_size=64, num_layers=2, num_classes=5),
    # 'RNN2': RNN2(input_size=1, hidden_size=128, num_layers=4, num_classes=5),
    #'ResNet': ResNet1D(input_channels=1, num_classes=5), 
    # 'RNN' : ECG_RNN(input_size=1, hidden_size=128, num_layers=4, num_classes=5), #<--- mod this one if playground -ing 
    'CNNLSTM': CNNLSTM(input_shape=(1, sequence_length)),
    'Seq2Seq': ECGSeq2Seq(input_size=1, hidden_size=1, num_layers=2, num_classes=5), 
}

# Select model to train: must be one of the keys of model_configs above.
# Defaults to 'CNNLSTM' (an empty name made the lookup below fail with KeyError).
# The config might bug out sometimes if the kernel is not reset before re-running the notebook.
selected_model_name = 'CNNLSTM'

if selected_model_name not in model_configs:
    raise ValueError(
        f"Unknown model {selected_model_name!r}; choose one of {sorted(model_configs)}"
    )

# Initialize model, criterion, and optimizer <-- adam / n-adam worked best, feel free to mess w it
model = model_configs[selected_model_name].to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 25

# CNNLSTM has its own training routine; every other model goes through the
# shared train_model. Both are called with the same keyword arguments.
train_fn = train_cnn_lstm if selected_model_name == 'CNNLSTM' else train_model
trained_model = train_fn(
    model, train_loader, val_loader, num_epochs=num_epochs, criterion=criterion, 
    optimizer=optimizer, device=device, patience=5, min_delta=0.001
)

# Evaluate the trained model
test_acc, test_loss = evaluate_model(trained_model, test_loader, criterion, device)
print(f"{selected_model_name} Test Accuracy: {test_acc:.4f}, Test Loss: {test_loss:.4f}")


In [None]:
# # Parameters for segmentation
# segment_length_sec = 10  # Segment length in seconds
# fs = 250  # Sampling frequency

# # Prepare segmented data
# rnn_signals = []
# rnn_labels = []

# for patient_id, data in best_leads_data.items():
#     signal = data['signal']  # The best lead's signal
#     labels = data['labels']  # Corresponding labels

#     # Segment the data
#     segments, segment_labels = segment.prepare_rnn_data(signal, labels, segment_length_sec, fs)
#     rnn_signals.extend(segments)
#     rnn_labels.extend(segment_labels)

# # Convert to numpy arrays
# rnn_signals = np.array(rnn_signals)
# rnn_labels = np.array(rnn_labels)

# print(f"RNN Signals Shape: {rnn_signals.shape}")
# print(f"RNN Labels Shape: {rnn_labels.shape}")


In [None]:
# print(rnn_labels[:10])

In [None]:
# # putting it in segment file wasn't working so trying it here 
# LABEL_ENCODING = {
#     'normal': 0,
#     'atrial_fibrillation': 1,
#     'atrial_flutter': 2,
#     'ventricular_bigeminy': 3,
#     'Other': 4  
# }

# print(f"Labels Before Encoding (First 10): {rnn_labels[:10]}")

# # Encode Labels After Segmentation
# rnn_labels_encoded = np.array([LABEL_ENCODING[label] for label in rnn_labels])

# print(f"Encoded Labels (First 10): {rnn_labels_encoded[:10]}")
# print(f"Segmented Signals Shape: {rnn_signals.shape}")
# print(f"Encoded Labels Shape: {rnn_labels_encoded.shape}")

# # Split and Create DataLoaders
# data_splits = segment.split_data(rnn_signals, rnn_labels_encoded)
# train_loader, val_loader, test_loader = dataloaders.create_dataloaders(data_splits, batch_size=32)

# # Print DataLoader Sizes to double check, was throwing an issue earlier because I forgot to categorically encode it 
# print(f"Train Loader Size: {len(train_loader)} batches")
# print(f"Validation Loader Size: {len(val_loader)} batches")
# print(f"Test Loader Size: {len(test_loader)} batches")

In [None]:
# # Split data into train, validation, and test sets
# data_splits = segment.split_data(rnn_signals, rnn_labels)

# # Create DataLoaders for PyTorch
# train_loader, val_loader, test_loader = dataloaders.create_dataloaders(data_splits, batch_size=32)

# print(f"Train Loader Size: {len(train_loader)} batches")
# print(f"Validation Loader Size: {len(val_loader)} batches")
# print(f"Test Loader Size: {len(test_loader)} batches")


In [None]:
# import torch.nn as nn

# class ECG_RNN(nn.Module):
#     def __init__(self, input_size=1, hidden_size=64, num_layers=2, num_classes=5):
#         super(ECG_RNN, self).__init__()
#         self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
#         self.fc = nn.Linear(hidden_size, num_classes)

#     def forward(self, x):
#         out, _ = self.rnn(x)
#         out = out[:, -1, :]  # Take the last hidden state
#         out = self.fc(out)
#         return out

# def train_model(model, train_loader, val_loader, num_epochs, criterion, optimizer, device):
#     model.to(device)

#     for epoch in range(num_epochs):
#         model.train()
#         running_loss = 0.0

#         for inputs, labels in train_loader:
#             inputs, labels = inputs.to(device), labels.to(device)

#             optimizer.zero_grad()
#             outputs = model(inputs)
#             loss = criterion(outputs, labels)
#             loss.backward()
#             optimizer.step()
#             running_loss += loss.item()

#         # Validation
#         model.eval()
#         val_loss, correct, total = 0.0, 0, 0
#         with torch.no_grad():
#             for inputs, labels in val_loader:
#                 inputs, labels = inputs.to(device), labels.to(device)
#                 outputs = model(inputs)
#                 val_loss += criterion(outputs, labels).item()
#                 _, predicted = torch.max(outputs, 1)
#                 total += labels.size(0)
#                 correct += (predicted == labels).sum().item()

#         print(f"Epoch {epoch+1}/{num_epochs}, "
#               f"Train Loss: {running_loss/len(train_loader):.4f}, "
#               f"Val Loss: {val_loss/len(val_loader):.4f}, "
#               f"Val Accuracy: {100 * correct / total:.2f}%")

#     return model

# def evaluate_model(model, test_loader, device):
#     model.eval()
#     correct, total = 0, 0

#     with torch.no_grad():
#         for inputs, labels in test_loader:
#             inputs, labels = inputs.to(device), labels.to(device)
#             outputs = model(inputs)
#             _, predicted = torch.max(outputs, 1)
#             total += labels.size(0)
#             correct += (predicted == labels).sum().item()

#     print(f"Test Accuracy: {100 * correct / total:.2f}%")



In [None]:
# # Hyperparameters
# input_size = 1  # ECG data is 1D
# hidden_size = 64
# num_layers = 2
# num_classes = len(set(rnn_labels))
# num_epochs = 20
# batch_size = 32
# learning_rate = 0.001

# # Device setup
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# # Create the model
# model = ECG_RNN(input_size, hidden_size, num_layers, num_classes)

# # Define loss and optimizer
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# # Train the model
# train_loader, val_loader, test_loader = dataloaders.create_dataloaders(data_splits, batch_size)
# model = train_model(model, train_loader, val_loader, num_epochs, criterion, optimizer, device)

# # Evaluate the model
# evaluate_model(model, test_loader, device)
