In [1]:
# Sensor biomarker data are stored as CSV files, one per biomarker, each covering several sessions for all participants.
# Our goal is to reorganize the data by participant: each participant's readings from all sensors are resampled and merged into a single table for that individual.
# This layout facilitates training various anomaly detection models (a population-based model, a purely personalized model, or a personalized model obtained via adaptation).
# With the data separated by participant, features can be engineered per participant, and the per-participant tables can then be easily combined to obtain a general population dataset.

In [18]:
import pandas as pd
import numpy as np
import os

In [19]:
# 1) Map each signal name to its combined-by-signal CSV file.
#    Every file follows the pattern "<signal name in lowercase>_processed.csv".
signal_files = {
    name: f"../data/processed/{name.lower()}_processed.csv"
    for name in ("HR", "EDA", "ACC", "BVP", "TEMP")
}

In [20]:

# 2) For each signal, load its CSV and split the rows by participant
data_by_participant = {}  # nested dict: {participant: {signal: DataFrame}}

for signal, path in signal_files.items():
    # Read participant IDs as strings. The IDs mix numeric-looking values
    # (e.g. "15") with alphanumeric ones (e.g. "5C"); letting pandas infer the
    # dtype per file could yield int keys for one signal and str keys for
    # another, which would split one participant across two dict entries.
    signal_df = pd.read_csv(
        path,
        parse_dates=["timestamp"],
        header=0,
        dtype={"participant": str},
    )

    # Store one time-indexed, chronologically sorted frame per (participant, signal)
    for pid, participant_rows in signal_df.groupby("participant"):
        data_by_participant.setdefault(pid, {})[signal] = (
            participant_rows.set_index("timestamp").sort_index()
        )




In [21]:
# 3) Sanity-check the grouping: list each participant's signals and preview HR
for participant_id, per_signal in data_by_participant.items():
    available_signals = list(per_signal)
    print(f"Participant {participant_id}: signals = {available_signals}")
    # Fall back to an empty frame when this participant has no HR data
    hr_preview = per_signal["HR"] if "HR" in per_signal else pd.DataFrame()
    print(hr_preview.head())
    print("---")

Participant 15: signals = ['HR', 'EDA']
                        HR participant  session_ts
timestamp                                         
2020-07-07 16:43:05  83.00          15  1594140175
2020-07-07 16:43:06  83.00          15  1594140175
2020-07-07 16:43:07  83.67          15  1594140175
2020-07-07 16:43:08  87.25          15  1594140175
2020-07-07 16:43:09  81.80          15  1594140175
---
Participant 5C: signals = ['HR', 'EDA']
                         HR participant  session_ts
timestamp                                          
2020-04-14 17:50:36  121.00          5C  1586886626
2020-04-14 17:50:37   85.00          5C  1586886626
2020-04-14 17:50:38   79.67          5C  1586886626
2020-04-14 17:50:39   77.00          5C  1586886626
2020-04-14 17:50:40   74.40          5C  1586886626
---
Participant 6B: signals = ['HR', 'EDA']
                         HR participant  session_ts
timestamp                                          
2020-04-22 15:29:43  175.00          6B  158756

In [24]:
# 4) Resample each participant's signals to a common rate and merge them
resampled_data_by_participant = {}  # {participant: merged DataFrame indexed by timestamp}

target_freq = "1s"

# Only the signals that are actually merged below are required. Requiring
# signals that are never used (ACC, BVP, TEMP) silently skipped every
# participant that lacked them, even though their data was not needed.
required_signals = ("HR", "EDA")

for pid, signals in data_by_participant.items():
    if any(name not in signals for name in required_signals):
        continue

    # Keep only the measurement column of each signal; the remaining columns
    # are dropped so that the mean is computed on the readings alone.
    hr = signals["HR"][["HR"]]
    eda = signals["EDA"][["EDA"]]

    # Resample to 1 Hz by averaging the readings within each one-second bin
    hr_resampled = hr.resample(target_freq).mean()
    eda_resampled = eda.resample(target_freq).mean()

    # Outer merge on timestamp keeps seconds where only one signal has data
    merged = pd.merge(
        hr_resampled,
        eda_resampled,
        left_index=True,
        right_index=True,
        how="outer",
    )

    # Drop rows where every merged signal is missing (empty resampling bins)
    merged = merged.dropna(how="all", subset=list(required_signals))

    resampled_data_by_participant[pid] = merged



In [23]:
# 5) Write one merged CSV per participant into the output directory
output_dir = "processed_participant_data"
os.makedirs(output_dir, exist_ok=True)

for participant_id, merged_frame in resampled_data_by_participant.items():
    csv_name = f"{participant_id}_merged.csv"
    file_path = os.path.join(output_dir, csv_name)
    merged_frame.to_csv(file_path)
    print(f"✅ Saved: {file_path}")


✅ Saved: processed_participant_data/15_merged.csv
✅ Saved: processed_participant_data/5C_merged.csv
✅ Saved: processed_participant_data/6B_merged.csv
✅ Saved: processed_participant_data/6D_merged.csv
✅ Saved: processed_participant_data/7A_merged.csv
✅ Saved: processed_participant_data/7E_merged.csv
✅ Saved: processed_participant_data/83_merged.csv
✅ Saved: processed_participant_data/8B_merged.csv
✅ Saved: processed_participant_data/94_merged.csv
✅ Saved: processed_participant_data/BG_merged.csv
✅ Saved: processed_participant_data/CE_merged.csv
✅ Saved: processed_participant_data/DF_merged.csv
✅ Saved: processed_participant_data/E4_merged.csv
✅ Saved: processed_participant_data/EG_merged.csv
✅ Saved: processed_participant_data/F5_merged.csv
