# Main Pipeline File

The pipeline consists of the following steps:
<ol>
   <li>Read in the files and deal with the missing data. </li>
   <li>Preprocess the signals</li>
   <li>Complete feature extraction</li>
   <li>Put data into a model.</li>
   <li>Optimize, compare, iterate - try other models.</li>
</ol>

In [None]:
# Import libraries. 
import numpy as np
import torch
import pandas as pd
import matplotlib.pyplot as plt 
import os
import wfdb
import pickle
import sys
import glob
from scipy.signal import butter, lfilter
import pprint


In [None]:
# Load helper files.
import dataloaders
import visualize
import preprocess

In [None]:
# Query device information through the project helper in dataloaders.
# NOTE(review): presumably reports which CPU/GPU torch will use — confirm in
# dataloaders.get_device_info. Kept as a bare expression so the notebook
# displays whatever the helper returns.
dataloaders.get_device_info()

In [None]:
# Root directory of the MIT-BIH Arrhythmia Database records.
# Defaults to the original local path; set the MITBIH_DATA_DIR environment
# variable to point the notebook at another copy without editing this cell.
# An unset or empty variable falls back to the default.
# os.path.join(..., '') guarantees the trailing separator that the default
# path already carries, so an override without one behaves the same way.
file_path = os.path.join(
    os.environ.get('MITBIH_DATA_DIR') or 'G:/Datasets/mit-bih-arrhythmia-database-1.0.0/',
    '',
)

In [None]:
# Load data.
# Hand the dataset directory to the project loader and keep the result as
# patient_data for the rest of the notebook.
# NOTE(review): later cells index the result as patient_data['100'][:3], so it
# appears to map record-id strings to sequences of per-beat entries — confirm
# against dataloaders.load_all_records.
patient_data = dataloaders.load_all_records(file_path)

# Debugging aids (disabled):
# print(patient_data)
# print(patient_data['102'][0])  # Prints the first beat entry for patient 102

In [None]:
# Load raw ECG and annotations for a single record (record 114).
# The record path is built with os.path.join: file_path already ends with a
# separator, so the previous `file_path + '/114'` produced a doubled '/'.
signal, annotations = dataloaders.load_record(os.path.join(file_path, '114'))
# print(signal.shape)

# record = wfdb.rdrecord(os.path.join(file_path, '100'))
# test check for loading lead count, weird bug
# print("Number of leads:", record.p_signal.shape[1])


In [None]:
# Pretty-printer configured with a 2-space indent per nesting level.
pp = pprint.PrettyPrinter(indent=2)

# Print the structure of the patient_data dictionary:
# only the first three entries stored under key '100' are shown, to keep the
# output short.
pp.pprint(patient_data['100'][:3])

In [None]:
# Plot the raw ECG of the single record loaded above together with its
# annotations.
# NOTE(review): duration=15 is presumably seconds at fs=360 Hz — confirm in
# visualize.visualize_ecg_with_labels.
visualize.visualize_ecg_with_labels(
    signal,
    annotations,
    fs=360,
    duration=15,
)

In [None]:
# Check for missing values now, before preprocessing.
# The helper's return value is stored in missing_values_report.
# NOTE(review): the report's structure is defined in
# preprocess.check_for_missing_values — confirm there before relying on it.
missing_values_report = preprocess.check_for_missing_values(patient_data)

In [None]:
# # R-R interval calculation. 
# rr_intervals_report = preprocess.calculate_rr_intervals(patient_data, fs=360)
# # print("R-R intervals for patient 100:", rr_intervals_report.get('100', []))

# unique_rr_counts = {patient_id: len(set(rr_intervals)) for patient_id, rr_intervals in rr_intervals_report.items()}
# print(unique_rr_counts)


# Preprocessing Stage

1. High-Pass Filter to remove baseline wander
2. Notch Filter to remove powerline interference (if any?)
3. Low-Pass Filter to remove high-frequency noise (cutoff set to 40 Hz for now?)
4. Moving Average Filter to smooth the remaining signal 
5. Normalization to the 0 → 1 range, because the leads all behave differently

<p> We may want to consider an FFT or a wavelet transform because this is time-series data; this can be decided later on. </p>

In [None]:
# TESTING NEW -1 AND 1 NORMALIZATION, COULDN'T ACCESS EARLY AHAHA

# for patient_id, beats in patient_data.items():
#     for beat in beats:
#         # Apply preprocessing to each lead in the beat entry
#         for key in beat:
#             if key.startswith('signal_lead_'):
#                 beat[key] = preprocess.test_signal(beat[key])

# patient_id = '100'
# if patient_id in patient_data:
#     print("After preprocessing:")
#     pp.pprint(patient_data[patient_id][:3])  # Print first 3 beats for verification
# else:
#     print(f"Patient ID {patient_id} not found in data.")

In [None]:
# Visual sanity check of the first beats of record '100' before the
# filtering/normalization step is run.
visualize.visualize_ecg_with_annotationsV2(
    patient_data,
    patient_id='100',
    num_beats=5,
    fs=360,
)

In [None]:
# Run the project's filtering + normalization step and rebind patient_data to
# its result, so every later cell sees the preprocessed data under this name.
# NOTE(review): fs=360 matches the rate passed to the plotting helpers; whether
# the helper also mutates its input in place is not visible here — confirm in
# preprocess.apply_filters_and_normalization.
patient_data = preprocess.apply_filters_and_normalization(patient_data, fs=360)


In [None]:
# Post-preprocessing visual check.
# fs=360 is passed explicitly, matching the pre-preprocessing plot and the
# filtering call, so both plots are drawn with the same sampling rate
# regardless of the helper's default for fs.
visualize.visualize_ecg_with_annotationsV2(patient_data, patient_id='100', num_beats=5, fs=360)