In [1]:
import pandas as pd
import numpy as np

In [3]:
# Sensor biomarker data are stored as one CSV file per biomarker, each covering several sessions for all participants.
# Our goal is to organize this data properly. First, we want to group the readings from all sensors by participant (each participant's sensor readings are resampled and merged individually).
# This layout facilitates training various anomaly detection models (a population-based model, a purely personalized model, or a personalized model obtained via adaptation).
# With the data separated by participant, features can be engineered on each participant's data, and the per-participant data can then be easily merged to obtain a general population dataset.

In [65]:
# Load the processed sensor CSV files; row 0 of each file is the header.
hr_df_orig = pd.read_csv(
    "../data/processed/hr_processed.csv",
    header=0,
    low_memory=False,
)
temp_df_orig = pd.read_csv(
    "../data/processed/temp_processed.csv",
    header=0,
    low_memory=False,
)

In [66]:
# Build working frames without the 'participant' and 'session_ts' columns;
# `drop` returns a new frame, so the *_orig frames stay untouched.
# NOTE(review): once 'participant' is dropped, the later resample/merge steps
# treat all rows as one shared timeline — confirm this is intended, given the
# stated goal of organizing the data per participant.
hr_df = hr_df_orig.drop(columns=["participant", "session_ts"])
temp_df = temp_df_orig.drop(columns=["participant", "session_ts"])

In [67]:
# Parse the 'timestamp' column of each frame into pandas datetime values.
hr_df = hr_df.assign(timestamp=pd.to_datetime(hr_df['timestamp']))
temp_df = temp_df.assign(timestamp=pd.to_datetime(temp_df['timestamp']))

In [68]:
# Promote 'timestamp' to the row index of each frame; the time-based
# resampling further down is driven by this index.
hr_df = hr_df.set_index('timestamp')
temp_df = temp_df.set_index('timestamp')

In [69]:
# Keep only the numeric columns of each sensor frame; the later
# resample(...).mean() step operates on these frames.
# `select_dtypes` returns a new frame that retains the original row index, so
# the timestamp index carries over and does not need to be re-assigned.
hr_numeric = hr_df.select_dtypes(include='number')
temp_numeric = temp_df.select_dtypes(include='number')

In [72]:
def _to_one_second_grid(readings):
    """Average `readings` into 1-second bins, then fill empty bins.

    The frame is resampled on its index with rule '1s' and each bin is
    reduced with the mean; bins without any reading come out as NaN and are
    then filled by `interpolate()` with its default arguments.

    NOTE(review): the default `interpolate()` places no limit on gap length,
    so long stretches without readings are filled as well — confirm that
    this is intended.
    """
    per_second_mean = readings.resample('1s').mean()
    return per_second_mean.interpolate()


# Put both sensors on a regular 1-second time grid.
hr_df = _to_one_second_grid(hr_numeric)
temp_df = _to_one_second_grid(temp_numeric)

In [73]:
# (rows, columns) of the 1 s-resampled HR frame.
hr_df.shape

(22179058, 1)

In [74]:
# (rows, columns) of the 1 s-resampled TEMP frame; compare with hr_df.shape —
# the two frames need not cover exactly the same time span.
temp_df.shape

(22179066, 1)

In [75]:
# Preview the first rows of the resampled HR frame.
hr_df.head()

Unnamed: 0_level_0,HR
timestamp,Unnamed: 1_level_1
2020-04-13 14:32:03,109.0
2020-04-13 14:32:04,83.5
2020-04-13 14:32:05,83.0
2020-04-13 14:32:06,83.0
2020-04-13 14:32:07,83.0


In [76]:
# Preview the first rows of the resampled TEMP frame.
temp_df.head()

Unnamed: 0_level_0,TEMP
timestamp,Unnamed: 1_level_1
2020-04-13 14:31:53,31.73
2020-04-13 14:31:54,31.73
2020-04-13 14:31:55,31.71
2020-04-13 14:31:56,31.71
2020-04-13 14:31:57,31.71


In [77]:
# Outer-join the HR and TEMP frames on their timestamp index, so a timestamp
# present in only one frame is kept and the other sensor's column is NaN.
# Both names are bound to the same merged frame.
merged_df = pd.merge(
    hr_df,
    temp_df,
    how="outer",
    left_index=True,
    right_index=True,
)
hr_temp_df = merged_df

In [78]:
# Preview the merged frame; because the merge is an outer join, rows where
# only one sensor has a value show NaN in the other sensor's column.
hr_temp_df.head()

Unnamed: 0_level_0,HR,TEMP
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-13 14:31:53,,31.73
2020-04-13 14:31:54,,31.73
2020-04-13 14:31:55,,31.71
2020-04-13 14:31:56,,31.71
2020-04-13 14:31:57,,31.71


In [79]:
# Inspect the first 20 rows of the merged HR/TEMP frame.
hr_temp_df.head(20)

Unnamed: 0_level_0,HR,TEMP
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-13 14:31:53,,31.73
2020-04-13 14:31:54,,31.73
2020-04-13 14:31:55,,31.71
2020-04-13 14:31:56,,31.71
2020-04-13 14:31:57,,31.71
2020-04-13 14:31:58,,31.73
2020-04-13 14:31:59,,31.71
2020-04-13 14:32:00,,31.71
2020-04-13 14:32:01,,31.73
2020-04-13 14:32:02,,31.71
