# Postprocessing Template (Tutorial)

Before running this notebook, extract the example data into the `data/raw` folder in the analysis directory of this repository.

In [None]:
import sys
sys.path.append("..") # Allow imports from project directory
print(sys.executable) # Verify conda environment is active
print(sys.version)

# Load postprocessing modules
from bangle_process import data_access, pipelines, reporting, utils

## Configuration

Configuration variables are set in the file: `config.yml`.

Here, the processed data and results will be output into the `template_example` folder.

In [None]:
# Load the run configuration from config.yml (print_config=True is passed,
# presumably so the loaded values are echoed for inspection).
cfg = utils.load_config(
    "config.yml",
    print_config=True,
)

## Initialize directories

Ensure each folder is created to store the processed data and results.

In [None]:
utils.init_directories(cfg)

## Summarize raw data files in directory

First, we check for valid files in the raw data directory.

Files containing raw watch data (in the configured directory) are read and stored in a dataframe.

We can then manually inspect each watch's `RecordStart` and `RecordFinish` time, its `Duration`, and the number of `Samples` collected.

The file information is also exported to a `.csv` file in the `../template_example/results/summary/` folder.

In [None]:
files_watch_summary = data_access.summarise_files_in_directory(cfg)
files_watch_summary

# TODO: fix this warning []

We can automatically flag any records whose length or sample rate falls outside the configured limits.

In [None]:
flagged_data = data_access.flag_records(files_watch_summary, cfg)

In this example, the following watch is flagged:

In [None]:
flagged_data

Now we can exclude this record from the watch file summary:

In [None]:
# Remove the flagged records from the summary, in place.
# errors="ignore" keeps this cell safe to re-run: once the flagged rows are
# gone, dropping the same labels again would otherwise raise KeyError.
files_watch_summary.drop(flagged_data.index, inplace=True, errors="ignore")
files_watch_summary

## Read raw data

Now all of the raw data that meet our criteria can be read and combined into a single dataframe.

In [None]:
raw_data_full = data_access.get_raw_watch_data(files_watch_summary, cfg, save_data=True)

In [None]:
raw_data_full.head()

Note: The first value in the `timeDifference` column is `NaN`, presumably because the first sample has no preceding sample to compute a difference from. [TODO: confirm]

### Trim raw data to time period

Optionally, data outside of a specified time window can be removed from the dataframe.

This time range can be specified in the config file.

For example, you will likely want to trim all records to the global start and end times of data collection.

In [None]:
raw_data_trimmed = data_access.trim_raw_watch_data(raw_data_full, cfg, save_data=True)

In [None]:
raw_data_trimmed

## Visualization

Coarse overview of the time series.
TODO

In [None]:
reporting.plotly_data(raw_data_trimmed, cfg)

#TODO 4 second bins (efficiency)

In [None]:
# Datashader
import xarray as xr
import datashader as ds
import datashader.transfer_functions as tf


def get_ds_aggs(df, name_y, h=1500, w=4000):
    """Build one datashader line aggregate per watch on a shared canvas.

    Args:
        df: DataFrame with ``time`` and ``watchId`` columns plus the column
            named by ``name_y``. It is copied, not modified.
        name_y: Name of the column drawn on the y-axis.
        h: Canvas height, passed to ``ds.Canvas`` as ``plot_height``.
        w: Canvas width, passed to ``ds.Canvas`` as ``plot_width``.

    Returns:
        dict mapping each unique ``watchId`` value to the result of
        ``Canvas.line`` for that watch's rows.
    """
    # datashader has no native date support, so the x-axis uses an integer
    # version of the timestamps stored in a helper column.
    frame = df.copy()
    frame.loc[:, "ITime"] = frame["time"].astype("int64")
    frame = frame.sort_values("ITime")

    # Axis limits are fixed by hand over the whole data set; the y-range is
    # padded by 2 on each side.
    time_axis = frame["ITime"]
    value_axis = frame[name_y]
    x_limits = (time_axis.min(), time_axis.max())
    y_limits = (value_axis.min() - 2, value_axis.max() + 2)

    # A single canvas is reused so every per-watch image has the same size
    # and the same axis ranges.
    canvas = ds.Canvas(
        plot_width=w, plot_height=h, x_range=x_limits, y_range=y_limits
    )
    return {
        watch: canvas.line(frame[frame["watchId"] == watch], "ITime", name_y)
        for watch in frame["watchId"].unique()
    }


def plot_raw_individual_watches(df, config_dat, value="heartRate"):
    """Export one datashader image per watch for the column ``value``.

    Args:
        df: Raw watch data; forwarded unchanged to ``get_ds_aggs``.
        config_dat: Configuration mapping. The output directory is read from
            ``config_dat["directories"]["figures"]``.
        value: Name of the column to plot on the y-axis (default
            ``"heartRate"``).

    Returns:
        None. Images are written via ``ds.utils.export_image`` with the base
        name ``fig_raw_{value}_{watchId}`` inside the figures directory.
    """
    import os  # local import: only needed to build the output path

    dir_fig_out = config_dat["directories"]["figures"]
    for watch_id, agg in get_ds_aggs(df, value).items():
        img = tf.shade(agg)
        # Join the components instead of concatenating strings, so the file
        # lands inside the figures directory whether or not the configured
        # path ends with a separator.
        out_name = os.path.join(dir_fig_out, f"fig_raw_{value}_{watch_id}")
        ds.utils.export_image(img, out_name)

plot_raw_individual_watches(raw_data_trimmed, cfg, value="ppgRaw")

In [None]:
# Plot heartRate by default
reporting.plot_raw_individual_watches(raw_data_trimmed, cfg)

In [None]:
# Plot ppgRaw
reporting.plot_raw_individual_watches(raw_data_trimmed, cfg, value='ppgRaw')

## Heart rate processing

### Resampling
The raw data must be resampled at a constant rate for all watches.

In [None]:
resampled_data_HR_1000ms = pipelines.resample_HR(raw_data_trimmed, cfg, save_data=True)

In [None]:
# Move the 'time' index level back into a regular column, modifying the
# frame in place.
# NOTE(review): because this mutates the frame, re-running this cell without
# re-running the resampling cell will presumably fail, as the 'time' level
# no longer exists -- confirm.
resampled_data_HR_1000ms.reset_index('time', inplace=True)
# TODO: fix plotting for resampled df


In [None]:
reporting.plotly_data(resampled_data_HR_1000ms, cfg)

## PPG processing

In [None]:
# Reset the index of the trimmed raw data in place before resampling
# (workaround, see the TODO on the next line).
# NOTE(review): this mutates raw_data_trimmed, so each re-run of this cell
# resets the index again and presumably adds another index column -- confirm.
raw_data_trimmed.reset_index(inplace=True) # TODO: Fix in resample_PPG()

# Resample the PPG signal; save_data=True is passed through to the pipeline.
resampled_data_PPG_25ms = pipelines.resample_PPG(
    raw_data_trimmed, cfg, save_data=True
)
# TODO: Fix warning

In [None]:
resampled_data_PPG_25ms

# Plot output -- compare to original signal

## Peak finding

In [None]:
PPG_peaks = pipelines.PPG_find_peaks(resampled_data_PPG_25ms, cfg)
PPG_peaks

In [None]:
# Rescale PPG_Peaks by three standard deviations of PPG_Clean (computed over
# all rows) and store the result as PPG_PeaksAlt, which is plotted next to
# the cleaned signal below.
PPG_peaks['PPG_PeaksAlt'] = PPG_peaks['PPG_Peaks'] * PPG_peaks['PPG_Clean'].std() * 3
# figsize is given in inches. A width of 1000 in exceeds matplotlib's limit of
# 2**16 pixels per side at common DPIs and fails to render, so use a figure
# that is still very wide but renderable.
PPG_peaks.loc[
    PPG_peaks['watchId'] == 'W020', ['PPG_Clean', 'PPG_PeaksAlt']
].plot(figsize=(100, 5))

TODO:

Plot comparison of PPG HR and heartRate (from bangle)

more meaningful function names

consistent index axis names

rename plot_raw_data? generalize to any data? also handle resampling.

final data frame PPG-based HRV

# More

## Read processed data