## Main script to perform heart rate estimation of wearable PPG

This script uses both PPG and accelerometer and performs the following steps:
1. Loading all metadata of PPG and IMU
2. Query on data availability + synchronization
3. Loading relevant segment sensor data using tsdf wrapper (start for loop over synchronized segment indices)
4. Synchronize the data (correct indices etc)
5. Data preprocessing
6. Feature extraction
7. Classification


## Architecture overview
The script implements the following steps:
 - Step 1: IMU and PPG preprocessing
 - Step 2: IMU and PPG feature extraction
 - Step 3: Signal quality assessment

In [86]:
# Automatically reload modules
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from datetime import datetime
from pathlib import Path
import json
import os
from typing import List

import tsdf
from dbpd.constants import DataColumns


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Constants
sensor_imu = 'IMU'
sensor_ppg = 'PPG'

imu_meta_filename = f'{sensor_imu}_meta.json'
imu_values_filename = f'{sensor_imu}_samples.bin'
imu_time_filename = f'{sensor_imu}_time.bin'

ppg_meta_filename = f'{sensor_ppg}_meta.json'
ppg_values_filename = f'{sensor_ppg}_samples.bin'
ppg_time_filename = f'{sensor_ppg}_time.bin'

rotation_units = 'deg/s'
acceleration_units = 'm/s^2'

d_channels_units = {
    DataColumns.ACCELEROMETER_X: acceleration_units,
    DataColumns.ACCELEROMETER_Y: acceleration_units,
    DataColumns.ACCELEROMETER_Z: acceleration_units,
    DataColumns.GYROSCOPE_X: rotation_units,
    DataColumns.GYROSCOPE_Y: rotation_units,
    DataColumns.GYROSCOPE_Z: rotation_units,

}

# filtering
sampling_frequency = 100
lower_cutoff_frequency = 0.3
filter_order = 4

In [87]:
# Module methods

def tsdf_scan_meta(tsdf_data_full_path : str) -> List[dict]:
    """
    For each given TSDF directory, transcribe TSDF metadata contents to a list of dictionaries.
    
    Parameters
    ----------
    tsdf_data_full_path : str
        Full path to the directory containing TSDF metadata files.

    Returns
    -------
    List[Dict]
        List of dictionaries with metadata from each JSON file in the directory.

    Examples
    --------
    >>> tsdf_scan_meta('/path/to/tsdf_data')
    [{'start_iso8601': '2021-06-27T16:52:20Z', 'end_iso8601': '2021-06-27T17:52:20Z'}, ...]
    """
    tsdf = []
    
    # Collect all metadata JSON files in the specified directory
    meta_list = list(Path(tsdf_data_full_path).rglob('*_meta.json'))
    for meta_file in meta_list:
        with open(meta_file, 'r') as file:
            json_obj = json.load(file)
            meta_data = {
                'tsdf_meta_fullpath': str(meta_file),
                'subject_id': json_obj['subject_id'],
                'start_iso8601': json_obj['start_iso8601'],
                'end_iso8601': json_obj['end_iso8601']
            }
            tsdf.append(meta_data)
    
    return tsdf

In [88]:
# Constants
UNIX_TICKS_MS = 1000.0
FS_PPG = 30  # Sampling rate for PPG
FS_IMU = 100  # Sampling rate for IMU

# Paths
raw_data_root = '../../../tests/data/1.sensor_data/'
ppp_data_path_ppg = os.path.join(raw_data_root, 'PPG')
ppp_data_path_imu = os.path.join(raw_data_root, 'IMU')

In [89]:
# 1. Loading all metadata of PPG and IMU

meta_ppg = tsdf_scan_meta(ppp_data_path_ppg)
meta_imu = tsdf_scan_meta(ppp_data_path_imu)

In [90]:
from datetime import datetime, timedelta
import numpy as np

def convert_iso8601_to_datetime(date_str):
        """
        Convert a date string to a datetime object.

        Parameters
        ----------
        date_str : str
            Date string in the format '%d-%b-%Y %H:%M:%S %Z'.

        Returns
        -------
        datetime
            A datetime object corresponding to the input date string.

        Examples
        --------
        >>> convert_to_datetime('27-Jun-2021 16:52:20 UTC')
        datetime.datetime(2021, 6, 27, 16, 52, 20, tzinfo=<UTC>)
        """
        return datetime.strptime(date_str, '%d-%b-%Y %H:%M:%S %Z')

def synchronization(ppg_meta, imu_meta):
    """
    Synchronize PPG and IMU data segments based on their start and end times.

    Parameters
    ----------
    ppg_meta : list of dict
        List of dictionaries containing 'start_iso8601' and 'end_iso8601' keys for PPG data.
    imu_meta : list of dict
        List of dictionaries containing 'start_iso8601' and 'end_iso8601' keys for IMU data.

    Returns
    -------
    segment_ppg_total : list of int
        List of synchronized segment indices for PPG data.
    segment_imu_total : list of int
        List of synchronized segment indices for IMU data.
    """
    ppg_start_time = [convert_iso8601_to_datetime(t['start_iso8601']) for t in ppg_meta]
    imu_start_time = [convert_iso8601_to_datetime(t['start_iso8601']) for t in imu_meta]
    ppg_end_time = [convert_iso8601_to_datetime(t['end_iso8601']) for t in ppg_meta]
    imu_end_time = [convert_iso8601_to_datetime(t['end_iso8601']) for t in imu_meta]

    # Create a time vector covering the entire range
    time_vector_total = []
    current_time = min(min(ppg_start_time), min(imu_start_time))
    end_time = max(max(ppg_end_time), max(imu_end_time))
    while current_time <= end_time:
        time_vector_total.append(current_time)
        current_time += timedelta(seconds=1)
    
    time_vector_total = np.array(time_vector_total)

    # Initialize variables
    data_presence_ppg = np.zeros(len(time_vector_total), dtype=int)
    data_presence_ppg_idx = np.zeros(len(time_vector_total), dtype=int)
    data_presence_imu = np.zeros(len(time_vector_total), dtype=int)
    data_presence_imu_idx = np.zeros(len(time_vector_total), dtype=int)

    # Mark the segments of PPG data with 1
    for i, (start, end) in enumerate(zip(ppg_start_time, ppg_end_time)):
        indices = np.where((time_vector_total >= start) & (time_vector_total < end))[0]
        data_presence_ppg[indices] = 1
        data_presence_ppg_idx[indices] = i

    # Mark the segments of IMU data with 1
    for i, (start, end) in enumerate(zip(imu_start_time, imu_end_time)):
        indices = np.where((time_vector_total >= start) & (time_vector_total < end))[0]
        data_presence_imu[indices] = 1
        data_presence_imu_idx[indices] = i

    # Find the indices where both PPG and IMU data are present
    corr_indices = np.where((data_presence_ppg == 1) & (data_presence_imu == 1))[0]

    # Find the start and end indices of each segment
    corr_start_end = []
    if len(corr_indices) > 0:
        start_idx = corr_indices[0]
        for i in range(1, len(corr_indices)):
            if corr_indices[i] - corr_indices[i - 1] > 1:
                end_idx = corr_indices[i - 1]
                corr_start_end.append((start_idx, end_idx))
                start_idx = corr_indices[i]
        # Add the last segment
        corr_start_end.append((start_idx, corr_indices[-1]))

    # Extract the synchronized indices for each segment
    segment_ppg_total = []
    segment_imu_total = []
    for start_idx, end_idx in corr_start_end:
        segment_ppg = np.unique(data_presence_ppg_idx[start_idx:end_idx + 1])
        segment_imu = np.unique(data_presence_imu_idx[start_idx:end_idx + 1])
        if len(segment_ppg) > 1 and len(segment_imu) == 1:
            segment_ppg_total.extend(segment_ppg)
            segment_imu_total.extend([segment_imu[0]] * len(segment_ppg))
        elif len(segment_ppg) == 1 and len(segment_imu) > 1:
            segment_ppg_total.extend([segment_ppg[0]] * len(segment_imu))
            segment_imu_total.extend(segment_imu)
        elif len(segment_ppg) == len(segment_imu):
            segment_ppg_total.extend(segment_ppg)
            segment_imu_total.extend(segment_imu)
        else:
            continue

    return segment_ppg_total, segment_imu_total


In [91]:
# 2. Query on data availability + synchronization
segment_ppg, segment_imu = synchronization(meta_ppg, meta_imu)  # Define `synchronization`

In [97]:
import re

def tsdf_values_idx(metadata_list: List[dict], suffix: str) -> List[int]:
    """
    This does not work.
    Searches for indices in the metadata list where the file name ends with a specified suffix. 

    Parameters
    ----------
    metadata_list : list of dict
        A list where each item is a dictionary containing metadata, including a 'file_name' key with the file's name as its value.
    suffix : str
        The suffix to search for in the file names. It should include the extension, such as '.bin'.

    Returns
    -------
    list of int
        A list of indices where the file names end with the specified suffix.
    
    Examples
    --------
    >>> metadata_list = [{'file_name': 'data1.bin'}, {'file_name': 'data2.txt'}, {'file_name': 'data3.bin'}]
    >>> tsdf_values_idx(metadata_list, '.bin')
    [0, 2]
    """
    indices = []
    pattern = re.compile(rf".*{re.escape(suffix)}$")
    
    for i, metadata in enumerate(metadata_list):
        # Ensure the metadata is a dictionary and contains the 'file_name' key
        if isinstance(metadata, dict) and 'file_name' in metadata:
            # Check if the 'file_name' matches the pattern
            if pattern.search(metadata['file_name']):
                indices.append(i)
    
    return indices

In [105]:
# 3. Loading relevant segment sensor data
n = 0  # Assuming we're only looking at the first synchronized segment
meta_path_ppg = meta_ppg[segment_ppg[n]]['tsdf_meta_fullpath']
meta_path_imu = meta_imu[segment_imu[n]]['tsdf_meta_fullpath']

metadata_list_ppg = tsdf.load_metadata_from_path(meta_path_ppg)
metadata_list_imu = tsdf.load_metadata_from_path(meta_path_imu)

# Extract indices for time and samples (TSDF should support extracting data based on the channel names)
time_idx_ppg = metadata_list_ppg["PPG_time.bin"]
time_idx_imu = metadata_list_imu["IMU_time.bin"]
values_idx_ppg = metadata_list_ppg["PPG_samples.bin"]
values_idx_imu = metadata_list_imu["IMU_samples.bin"]

# Process time data
datetime_ppg = datetime.strptime(time_idx_ppg.start_iso8601, '%d-%b-%Y %H:%M:%S %Z')
datetime_imu = datetime.strptime(time_idx_imu.start_iso8601, '%d-%b-%Y %H:%M:%S %Z')

ts_ppg = int(datetime_ppg.timestamp() * UNIX_TICKS_MS)
ts_imu = int(datetime_imu.timestamp() * UNIX_TICKS_MS)

# Calculating continuous time vectors
t_ppg = np.cumsum(data_list_ppg[time_idx_ppg]) + ts_ppg
t_imu = np.cumsum(data_list_imu[time_idx_imu]) + ts_imu

NameError: name 'data_list_ppg' is not defined

In [None]:
# 4. Data synchronization on right indices
ppg_indices, imu_indices = extract_overlapping_segments(t_ppg, t_imu)  # Define this function

# Update data vectors based on synchronized indices
v_ppg = data_list_ppg[values_idx_ppg][ppg_indices[0]:ppg_indices[1]]
v_imu = data_list_imu[values_idx_imu][imu_indices[0]:imu_indices[1]]

In [None]:

# 5. Data preprocessing
# Implement `preprocessing_ppg` and `preprocessing_imu` to suit your data format
v_ppg_pre, tr_ppg_pre = preprocessing_ppg(v_ppg, FS_PPG)
v_imu_pre, tr_imu_pre = preprocessing_imu(v_imu, FS_IMU)

# Save preprocessed data
location = "../../tests/data/2.preprocessed_data/ppg"
save_preprocessed_data(v_ppg_pre, tr_ppg_pre, v_imu_pre, tr_imu_pre, location)

# Feature extraction and Classification
# Assume feature extraction and classification functions are implemented
features_ppg, features_imu = extract_features(v_ppg_pre, v_imu_pre)
classification_results = classify_signals(features_ppg, features_imu)

# Save the classification results
save_classification_data(classification_results, location)

# We need to implement:
# - synchronization: to find overlapping segments between PPG and IMU data based on metadata
# - extract_overlapping_segments: to calculate the correct indices for synchronized data segments
# - preprocessing_ppg, preprocessing_imu: functions to preprocess the raw PPG and IMU data
# - extract_features: to extract relevant features from the preprocessed data
# - classify_signals: to perform the classification on the extracted features
# - save_preprocessed_data, save_classification_data: functions to save data to files in a suitable format


NameError: name 'synchronization' is not defined