<a href="https://colab.research.google.com/github/santteegt/om-fol-timeseries/blob/master/WESAD_Data_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WESAD - A Multimodal Dataset for Wearable Stress and Affect Detection

This notebook segments the WESAD chest-device data. The raw signals are filtered first, and a sliding window with a window size of 1 second and a shift of 0.25 seconds is then applied to them.

## Required Dependencies

In [3]:
!pip install neurokit2 pyhrv pyarrow



In [4]:
import concurrent.futures
from datetime import timedelta

import gzip
import logging
import matplotlib as plt
import neurokit2 as nk
import numpy as np
import os
import pandas as pd
import pyhrv
import scipy.signal as scisig
import scipy.stats
import shutil
import time
from urllib.request import Request, urlopen
import zipfile

import cvxEDA

%matplotlib inline
plt.rcParams['figure.figsize'] = [10, 7]  # Bigger images

In [5]:
# Fetch the single-file cvxEDA module next to the notebook when it is missing.
# Pure Python is used instead of shelling out to wget, which is not installed
# on every platform.
if not os.path.exists('cvxEDA.py'):
    with urlopen('https://raw.githubusercontent.com/lciti/cvxEDA/master/src/cvxEDA.py') as response, \
            open('cvxEDA.py', 'wb') as out_file:
        shutil.copyfileobj(response, out_file)

In [6]:
# List the contents of the working directory (e.g. to check that cvxEDA.py is present)
!ls -l

total 1112
-rw-r--r--  1 santteegt  staff    5351 Jun 25 21:39 BASELINE_EXPERIMENTS.md
-rw-r--r--  1 santteegt  staff   71111 Jun 25 21:39 LSTM_Model_Chest_device.ipynb
-rw-r--r--  1 santteegt  staff  291153 Jun 25 21:39 ML Classifiers - Chest Device.ipynb
-rw-r--r--  1 santteegt  staff   75156 Jun 25 21:39 ML Classifiers per Modality.ipynb
-rw-r--r--  1 santteegt  staff    3022 Jun 25 21:39 README.md
-rw-r--r--  1 santteegt  staff   95150 Jun 28 20:54 WESAD_Data_Exploration.ipynb
drwxr-xr-x  3 santteegt  staff      96 Jun 11 10:13 __pycache__
-rw-r--r--  1 santteegt  staff    1481 Jun 21 21:53 chest_scores_per_mod.csv
-rw-r--r--  1 santteegt  staff    5876 Jun 11 10:12 cvxEDA.py
-rw-r--r--  1 santteegt  staff       0 Jun 11 11:45 process.log
drwxr-xr-x  5 santteegt  staff     160 Jun 28 20:48 segmented_data


## Data Loader

In [7]:
class WesadDataLoader():
    """Downloads (when needed) and loads the data of one WESAD subject.

    Source URI: https://uni-siegen.sciebo.de/s/pYjSgfOVs6Ntahr/download

    The loader expects the layout ``<basepath>/WESAD/<subject>/<subject>.pkl``.
    When ``<basepath>/WESAD`` is missing, the archive ``<basepath>/WESAD.zip``
    is downloaded (unless it is already there) and extracted into ``basepath``.
    """

    # Top-level keys of the per-subject pickle
    LABEL = 'label'
    SIGNAL = 'signal'
    SUBJECT = 'subject'

    # Device keys below the 'signal' entry
    WRIST_DEV = 'wrist'
    CHEST_DEV = 'chest'

    DATASET_NAME = 'WESAD'
    DATASET_URI = 'https://uni-siegen.sciebo.de/s/pYjSgfOVs6Ntahr/download'

    def __init__(self, subject, basepath='.'):
        """Load the pickle file of ``subject`` found below ``basepath``.

        Args:
            subject: subject folder name, also used as the pickle file name.
            basepath: directory that contains (or will contain) the dataset folder.

        Raises:
            FileNotFoundError: if the subject folder or its pickle file is missing.
        """
        self.logger = logging.getLogger(WesadDataLoader.__name__)
        self.logger.info('Init...')
        self.chest_modalities = ['ACC', 'ECG', 'EDA', 'EMG', 'Resp', 'Temp']
        self.wrist_modalities = ['ACC', 'BVP', 'EDA', 'TEMP']
        self.mod_samp_rate = {'ACC': 32, 'BVP': 64, 'EDA': 4, 'TEMP': 4, 'chest': 700}  # Hz
        WesadDataLoader.download(basepath)
        subject_dir = os.path.join(os.path.abspath(basepath), WesadDataLoader.DATASET_NAME, subject)
        if not os.path.isdir(subject_dir):
            raise FileNotFoundError(f'Dataset path does not exist or is not a directory: {subject_dir}')
        data_file = os.path.join(subject_dir, f'{subject}.pkl')
        if not os.path.exists(data_file):
            raise FileNotFoundError(f'Data file does not exists: {data_file}')
        # NOTE: unpickling executes arbitrary code; only load archives obtained from DATASET_URI.
        self.data = pd.read_pickle(data_file)

    @staticmethod
    def download(basepath):
        """Make sure the extracted dataset folder exists below ``basepath``.

        Nothing is done when ``<basepath>/WESAD`` already is a directory.
        Otherwise ``<basepath>/WESAD.zip`` is downloaded if it is not present
        yet, validated, and extracted into ``basepath``.

        Raises:
            zipfile.BadZipFile: if the archive on disk is not a valid zip file.
        """
        base_dir = os.path.abspath(basepath)
        filename = os.path.join(base_dir, f'{WesadDataLoader.DATASET_NAME}.zip')
        data_folder = os.path.join(base_dir, WesadDataLoader.DATASET_NAME)
        if os.path.isdir(data_folder):
            return

        if not os.path.exists(filename):
            print('Downloading dataset...')
            start = time.time()
            # Stream into a temporary name first, so that an interrupted download
            # is never mistaken for a complete archive on the next run.
            partial = f'{filename}.part'
            with urlopen(WesadDataLoader.DATASET_URI) as response, open(partial, 'wb') as out_file:
                print('Saving dataset locally...')
                shutil.copyfileobj(response, out_file)
            os.replace(partial, filename)
            print(f'Elapsed: {time.time() - start} secs')

        # Fail fast instead of spinning forever on a corrupt archive
        if not zipfile.is_zipfile(filename):
            raise zipfile.BadZipFile(f'Not a valid zip archive: {filename}')
        print('Found Zip...')
        with zipfile.ZipFile(filename) as zf:
            print('Extracting files...')
            start = time.time()
            # Extract next to the archive: that is where __init__ looks for the data
            zf.extractall(base_dir)
        print(f'Elapsed: {time.time() - start} secs')
        print('Done!')

    def get_labels(self):
        """Return the entry stored under the 'label' key of the loaded data."""
        return self.data[WesadDataLoader.LABEL]

    def get_wrist_data(self):
        """Return the wrist-device signals plus the 'Resp' signal of the chest device.

        A new dict is returned, so the loaded data itself is left untouched.
        """
        signal = self.data[WesadDataLoader.SIGNAL]
        # Adding Resp modality from chest device
        return {**signal[WesadDataLoader.WRIST_DEV], 'Resp': signal[WesadDataLoader.CHEST_DEV]['Resp']}

    def get_chest_data(self):
        """Return the signals recorded by the chest device."""
        signal = self.data[WesadDataLoader.SIGNAL]
        return signal[WesadDataLoader.CHEST_DEV]

## Data Exploration - Initial settings

In [8]:
%%time
BASE_PATH = '../'
# WesadDataLoader.download('.')
DATASET_PATH = os.path.join(BASE_PATH, WesadDataLoader.DATASET_NAME)
subjects = [dir_ for dir_ in os.listdir(DATASET_PATH) if os.path.isdir(os.path.join(DATASET_PATH, dir_))]
# subjects = ['S3']
obj_data = {}

for subject in subjects:
    obj_data[subject] = WesadDataLoader(subject=subject, basepath=BASE_PATH)

CPU times: user 55 s, sys: 28.7 s, total: 1min 23s
Wall time: 1min 26s


In [41]:
# Checking dataset size: labelled samples per condition and the number of
# sliding windows the segmentation is expected to produce per subject
sampling_rate=700
window_size=1
window_shift=0.25

baseline_rec = 0
stress_rec = 0
amusement_rec = 0
total_segmented = 0
print('Subjects', obj_data.keys())
for sub in obj_data.keys():
    data = obj_data[sub].get_chest_data()
    labels = np.asarray(obj_data[sub].get_labels())
    # Vectorised index lookup; a Python-level scan of every label is far slower
    baseline = np.flatnonzero(labels == 1)
    stress = np.flatnonzero(labels == 2)
    amusement = np.flatnonzero(labels == 3)

    baseline_rec += baseline.shape[0]
    stress_rec += stress.shape[0]
    amusement_rec += amusement.shape[0]
    conditions = [baseline, stress, amusement]

    # Same window start positions as the feature extraction uses
    subtotal = 0
    for cond in conditions:
        subtotal += len(range(0, data['ECG'][cond].shape[0] - (sampling_rate * window_size), int(sampling_rate * window_shift)))
    print('Subject', sub, subtotal)
    total_segmented += subtotal

(baseline_rec + stress_rec + amusement_rec), total_segmented

Subjects dict_keys(['S5', 'S2', 'S3', 'S4', 'S17', 'S10', 'S11', 'S16', 'S8', 'S6', 'S7', 'S9', 'S13', 'S14', 'S15'])
Subject S5 8856
Subject S2 8472
Subject S3 8608
Subject S4 8649
Subject S17 9092
Subject S10 9096
Subject S11 8900
Subject S16 8873
Subject S8 8824
Subject S6 8796
Subject S7 8781
Subject S9 8776
Subject S13 8893
Subject S14 8897
Subject S15 8920


(23206404, 132433)

### Compute Features

In [46]:
def compute_features(data, condition, sampling_rate=700, window_size=1, window_shift=0.25):
    """Clean the chest signals of one condition and average them over sliding windows.

    Args:
        data: mapping with the entries 'ACC', 'ECG', 'EDA', 'EMG', 'Resp' and
            'Temp'. Every entry is indexed with ``condition`` along its first
            axis; 'ACC' must provide three columns (x, y, z).
        condition: sample indices that belong to the condition to process.
        sampling_rate: sampling rate of the signals in Hz.
        window_size: window length in seconds.
        window_shift: distance between two window starts in seconds.

    Returns:
        Tuple ``(chest_df, chest_df_5)``. ``chest_df`` has one row per window
        with the per-window means in the columns ACC_x, ACC_y, ACC_z, ECG,
        EDA, EMG, RESP and TEMP. ``chest_df_5`` is always an empty DataFrame;
        it is only kept so that callers can keep unpacking two values.
    """
    feature_columns = ['ACC_x', 'ACC_y', 'ACC_z', 'ECG', 'EDA', 'EMG', 'RESP', 'TEMP']
    init = time.time()

    # --- Signal cleaning on the whole condition segment ---
    ## ECG
    ecg_cleaned = nk.ecg_clean(data["ECG"][condition].flatten(), sampling_rate=sampling_rate)
    ## EDA: 5Hz lowpass filter followed by standardization
    eda_highcut = 5
    eda_filtered = nk.signal_filter(data['EDA'][condition].flatten(), sampling_rate=sampling_rate, highcut=eda_highcut)
    eda_cleaned = nk.standardize(eda_filtered)
    # TODO: a phasic/tonic decomposition (nk.eda_phasic / nk.eda_peaks) was computed
    # here before, but none of the features used it, so the costly calls were dropped.
    ## EMG: only a lower cutoff is set, to remove the DC bias
    ## More on DC Bias https://www.c-motion.com/v3dwiki/index.php/EMG:_Removing_DC_Bias
    emg_lowcut = 50
    emg_filtered_dc = nk.signal_filter(data['EMG'][condition].flatten(), sampling_rate=sampling_rate, lowcut=emg_lowcut)
    ## Resp: method 'biosppy' is chosen for its 0.1 - 0.35 Hz bandpass filter
    resp_processed, _ = nk.rsp_process(data['Resp'][condition].flatten(), sampling_rate=sampling_rate, method='biosppy')
    # Plain array, so that slicing is positional and end-exclusive like for the
    # other signals (label-based .loc slicing would include one extra sample)
    resp_cleaned = resp_processed['RSP_Clean'].to_numpy()

    print('Elapsed Preprocess', str(timedelta(seconds=time.time() - init)))
    init = time.time()

    # Select the condition samples once instead of once per window
    acc_data = data['ACC'][condition]
    temp_data = data['Temp'][condition]

    window = int(sampling_rate * window_size)
    step = int(sampling_rate * window_shift)

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append is quadratic in a loop and no longer exists in pandas 2.x
    records = []
    for start in range(0, acc_data.shape[0] - window, step):
        stop = start + window
        acc_x_mean, acc_y_mean, acc_z_mean = np.mean(acc_data[start:stop], axis=0)
        records.append({
            'ACC_x': acc_x_mean, 'ACC_y': acc_y_mean, 'ACC_z': acc_z_mean,
            'ECG': np.mean(ecg_cleaned[start:stop]),
            'EDA': np.mean(eda_cleaned[start:stop]),
            'EMG': np.mean(emg_filtered_dc[start:stop]),
            'RESP': np.mean(resp_cleaned[start:stop]),
            'TEMP': np.mean(temp_data[start:stop]),
        })

    # Explicit columns keep the schema stable even when no window fits
    chest_df = pd.DataFrame(records, columns=feature_columns)
    chest_df_5 = pd.DataFrame()  # For 5 sec window size (not implemented)

    print('Elapsed Process', condition.shape[0], str(timedelta(seconds=time.time() - init)))
    return chest_df, chest_df_5


## Chest-worn device - Dataset Generation

In [49]:
def process_subject(subject_data, cond_to_process, max_workers=6):
    """Run compute_features for several conditions of one subject in worker threads.

    Args:
        subject_data: signal mapping handed to compute_features unchanged.
        cond_to_process: iterable of ``(label, condition)`` pairs.
        max_workers: size of the thread pool.

    Returns:
        Dict that maps each label to the first element returned by
        compute_features. A label whose computation raised is reported on
        stdout and left out of the dict.
    """
    results = {}

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which label every submitted job belongs to
        pending = {}
        for name, indices in cond_to_process:
            job = pool.submit(compute_features, subject_data, indices)
            pending[job] = name

        # Collect the results in completion order
        for finished in concurrent.futures.as_completed(pending):
            name = pending[finished]
            try:
                features, _ = finished.result()
                print(name, features.shape)
                results[name] = features
            except Exception as err:
                print('%r generated an exception: %s' % (name, err))
    return results

In [50]:
base_path = './raw_segmented'
subjects = ['S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10', 'S11', 'S13', 'S14', 'S15', 'S16', 'S17']

for subject in subjects:
    print('Subject', subject)
    chest_data_dict = obj_data[subject].get_chest_data()
    labels = obj_data[subject].get_labels()
    chest_dict_length = {key: len(value) for key, value in chest_data_dict.items()}
    print(chest_dict_length)

    # Get labels
    baseline = np.asarray([idx for idx,val in enumerate(labels) if val == 1])
    stress = np.asarray([idx for idx,val in enumerate(labels) if val == 2])
    amusement = np.asarray([idx for idx,val in enumerate(labels) if val == 3])

    print("Baseline:", chest_data_dict['ECG'][baseline].shape)
    print("Stress:", chest_data_dict['ECG'][stress].shape)
    print("Amusement:", chest_data_dict['ECG'][amusement].shape)

    # Process Subject
    to_process = zip(['baseline', 'stress', 'amusement'], [baseline, stress, amusement])
    # to_process = zip(['baseline'], [baseline])
    %time subject_data = process_subject(chest_data_dict, cond_to_process=to_process)

    ## Labeling
    subject_data['baseline']['label'] = 1
    subject_data['baseline']['subject'] = subject
    subject_data['stress']['label'] = 2
    subject_data['stress']['subject'] = subject
    subject_data['amusement']['label'] = 3
    subject_data['amusement']['subject'] = subject
    ## Storing
    dfs = [v for k, v in subject_data.items()]
    df_subject = pd.concat(dfs)
    print('Generated dataset for', subject, df_subject.shape)
    df_subject.head()
    df_subject.reset_index().to_feather(f'{base_path}/{subject}.feather')

Subject S2
{'ACC': 4255300, 'ECG': 4255300, 'EMG': 4255300, 'EDA': 4255300, 'Temp': 4255300, 'Resp': 4255300}
Baseline: (800800, 1)
Stress: (430500, 1)
Amusement: (253400, 1)
Elapsed Preprocess 0:00:01.837103
Elapsed Preprocess 0:00:02.356836
Elapsed Preprocess 0:00:03.485379
Elapsed Process 253400 0:00:11.439508
amusement (1444, 8)
Elapsed Process 430500 0:00:26.130266
stress (2456, 8)
Elapsed Process 800800 0:01:18.545360
baseline (4572, 8)
CPU times: user 1min 45s, sys: 7.6 s, total: 1min 53s
Wall time: 1min 20s
Generated dataset for S2 (8472, 10)
Subject S3
{'ACC': 4545100, 'ECG': 4545100, 'EMG': 4545100, 'EDA': 4545100, 'Temp': 4545100, 'Resp': 4545100}
Baseline: (798000, 1)
Stress: (448000, 1)
Amusement: (262500, 1)
Elapsed Preprocess 0:00:02.385835
Elapsed Preprocess 0:00:02.534810
Elapsed Preprocess 0:00:05.054465
Elapsed Process 262500 0:00:13.044552
amusement (1496, 8)
Elapsed Process 448000 0:00:31.412621
stress (2556, 8)
Elapsed Process 798000 0:01:23.630023
baseline (4556,

Elapsed Process 826700 0:01:34.061676
baseline (4720, 8)
CPU times: user 2min 14s, sys: 10.5 s, total: 2min 24s
Wall time: 1min 37s
Generated dataset for S17 (9092, 10)


In [16]:
# Download files If running in Google Colab
# from google.colab import files
# [files.download(file) for file in os.listdir('.') if file.endswith('feather')]

In [57]:
# Merge the per-subject feather files into one CSV.
# Only *.feather files are read (in sorted order): the CSV written below lives
# in the same folder and would otherwise break read_feather on a re-run.
feather_files = []
for f in sorted(os.listdir(base_path)):
    if not f.endswith('.feather'):
        continue
    feather_files.append(pd.read_feather(os.path.join(base_path, f)).drop(columns=['index']))

df_all = pd.concat(feather_files)
print(df_all.shape)
df_all.to_csv(os.path.join(base_path, 'all_raw.csv'), index=False)

(132433, 10)
