Unzipping Dataset

In [1]:
import pickle
import numpy as np
import pandas as pd
import os

# Archive to unpack and the folder that receives its contents.
directory = '../data/WESAD.zip'
output_directory = '../data/'

# Listing the target folder raises immediately if it does not exist;
# the listing itself is not used.
os.listdir(output_directory)

import zipfile

with zipfile.ZipFile(file=directory, mode='r') as archive:
    archive.extractall(path=output_directory)

Processing Individual Participant Data (plotting code is currently commented out)

In [18]:
import pickle
import os
import numpy as np
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
from ecgdetectors import Detectors
from scipy import signal
from scipy.stats import skew,kurtosis,iqr
import pickle
from peak_valley import compute_peak_valley
from respiration_feature import rip_cycle_feature_computation
# One pickle path per entry of the WESAD folder, following the
# '<name>/<name>.pkl' layout. Entries whose name ends in 's' or 'f' are skipped.
filelists = []
for entry in os.listdir('../data/WESAD/'):
    if entry[-1] in ['s','f']:
        continue
    filelists.append('../data/WESAD/'+entry+'/'+entry+'.pkl')

In [19]:
%matplotlib notebook
def get_ecg_rr(ecg_data,fs=700):
    """Detect R-peaks in an ECG trace and return the RR-interval series.

    Parameters
    ----------
    ecg_data : np.ndarray
        2-D array; column 0 holds the sample times and column 1 the ECG values.
    fs : int, optional
        Sampling rate handed to ``Detectors``. Defaults to 700, the value that
        was previously hard-coded.

    Returns
    -------
    pd.DataFrame
        One row per pair of consecutive R-peaks with columns
        ``time`` (time of the later peak), ``rr`` (difference between the two
        peak times, in the units of column 0) and ``timestamp`` (``time``
        converted with ``datetime.utcfromtimestamp``). The frame is empty when
        fewer than two peaks are detected.
    """
    detectors = Detectors(fs)
    rpeaks = detectors.hamilton_detector(ecg_data[:,1])
    # Cast to an integer index array: np.array([]) is float64, which NumPy
    # refuses as an index, so an empty detection result used to raise here.
    peak_idx = np.asarray(rpeaks,dtype=int)
    ecg_r_ts = ecg_data[peak_idx,0]
    # Each interval is stamped with the time of the peak that closes it.
    ecg_rr_ts = ecg_r_ts[1:]
    ecg_rr_sample = np.diff(ecg_r_ts)
    ecg_rr = pd.DataFrame(np.vstack([ecg_rr_ts,ecg_rr_sample]).T,columns=['time','rr'])
    ecg_rr['timestamp'] = ecg_rr['time'].apply(lambda a:datetime.utcfromtimestamp(a))
    return ecg_rr

def bandpass_filter(data,Fs=64,fil_type='ppg'):
    """Band-pass a single-channel signal with a zero-phase FIR filter.

    Column 1 of ``data`` is mean-removed and then filtered forwards and
    backwards (``filtfilt``) with a 219-tap least-squares FIR. The design
    passes 0.7-3 and stops 0-0.6 and 3.5-Nyquist, in the same frequency units
    as ``Fs``; the low stop band carries ten times the weight of the others.

    Parameters
    ----------
    data : np.ndarray
        2-D array; column 0 is copied through untouched, column 1 is the
        signal to filter. ``filtfilt``'s default padding needs more than
        3*219 = 657 rows.
    Fs : float, optional
        Sampling rate of the signal.
    fil_type : str, optional
        Unused; kept so existing callers keep working.

    Returns
    -------
    np.ndarray
        Array with the same shape as ``data``: column 0 is ``data[:,0]``,
        column 1 the filtered signal, any further columns are zero.
    """
    X0 = data[:,1]
    X1 = signal.detrend(X0,axis=0,type='constant')
    X2 = np.zeros((np.shape(X1)[0],data.shape[1]))
    nyq = Fs/2
    # `weight` must be given by keyword: SciPy 1.14+ makes every firls
    # argument after `desired` keyword-only, so the positional form fails there.
    b = signal.firls(219,np.array([0,0.6,0.7,3,3.5,nyq]),
                     np.array([0,0,1,1,0,0]),weight=np.array([10,1,1]),fs=nyq*2)
    a = [1]
    X2[:,0] = data[:,0]
    X2[:,1] = signal.filtfilt(b, a, X1)
    return X2

def bandpass_filter_acc(data,Fs=32,fil_type='acl'):
    """Band-pass every accelerometer axis with a zero-phase FIR filter.

    Uses the same 219-tap least-squares design as the PPG filter (pass
    0.7-3, stop 0-0.6 and 3.5-Nyquist, in the units of ``Fs``), applied
    forwards and backwards with ``filtfilt``. Unlike the PPG filter, the
    signal is not mean-removed first.

    Parameters
    ----------
    data : np.ndarray
        2-D array; column 0 is copied through untouched and every remaining
        column is treated as one axis to filter. ``filtfilt``'s default
        padding needs more than 3*219 = 657 rows.
    Fs : float, optional
        Sampling rate of the signal.
    fil_type : str, optional
        Unused; kept so existing callers keep working.

    Returns
    -------
    np.ndarray
        Array with the same shape as ``data``: column 0 is ``data[:,0]`` and
        column ``k+1`` is the filtered version of axis ``k``.
    """
    X1 = data[:,1:]
    X2 = np.zeros((np.shape(X1)[0],data.shape[1]))
    nyq = Fs/2
    # `weight` must be given by keyword: SciPy 1.14+ makes every firls
    # argument after `desired` keyword-only, so the positional form fails there.
    b = signal.firls(219,np.array([0,0.6,0.7,3,3.5,nyq]),
                     np.array([0,0,1,1,0,0]),weight=np.array([10,1,1]),fs=nyq*2)
    a = [1]
    X2[:,0] = data[:,0]
    # Filter each axis on its own instead of assuming exactly three of them.
    for axis_idx in range(X1.shape[1]):
        X2[:,axis_idx+1] = signal.filtfilt(b, a, X1[:,axis_idx])
    return X2

def bandpass_filter_respiration(data,Fs=700,fil_type='ppg'):
    """Band-pass a respiration signal with a zero-phase FIR filter.

    Column 1 of ``data`` is mean-removed and then filtered forwards and
    backwards (``filtfilt``) with a 219-tap least-squares FIR. The design
    passes 0.05-2 and stops 0-0.03 and 2.5-Nyquist, in the same frequency
    units as ``Fs``; the low stop band carries ten times the weight of the
    others.

    Parameters
    ----------
    data : np.ndarray
        2-D array; column 0 is copied through untouched, column 1 is the
        signal to filter. ``filtfilt``'s default padding needs more than
        3*219 = 657 rows.
    Fs : float, optional
        Sampling rate of the signal.
    fil_type : str, optional
        Unused; kept so existing callers keep working.

    Returns
    -------
    np.ndarray
        Array with the same shape as ``data``: column 0 is ``data[:,0]``,
        column 1 the filtered signal, any further columns are zero.
    """
    X0 = data[:,1]
    X1 = signal.detrend(X0,axis=0,type='constant')
    X2 = np.zeros((np.shape(X1)[0],data.shape[1]))
    nyq = Fs/2
    # NOTE(review): at the default Fs=700 the 219 taps span only ~0.31 s, far
    # shorter than one period at the 0.03-0.05 band edge, so the realised
    # response is presumably much softer than these edges suggest - verify.
    # `weight` must be given by keyword: SciPy 1.14+ makes every firls
    # argument after `desired` keyword-only, so the positional form fails there.
    b = signal.firls(219,np.array([0,0.03,0.05,2,2.5,nyq]),
                     np.array([0,0,1,1,0,0]),weight=np.array([10,1,1]),fs=nyq*2)
    a = [1]
    X2[:,0] = data[:,0]
    X2[:,1] = signal.filtfilt(b, a, X1)
    return X2

def get_quality_features(ppg_data,ppg_fs=64,window_size=2.5):
    """Trim the PPG array to its interior and append four placeholder columns.

    The per-sample quality statistics this function is named after are
    currently disabled; the four trailing columns are filled with -1 (the
    caller labels them skew, kurtosis, iqr and relative_power).

    Parameters
    ----------
    ppg_data : np.ndarray
        2-D array with at least three columns; only columns 0, 1 and 2 are used.
    ppg_fs : float, optional
        Sampling rate used to turn ``window_size`` into a sample count.
    window_size : float, optional
        Window length in seconds. Half of it,
        ``n = int(ppg_fs*window_size/2)`` samples, is dropped from each end of
        the input.

    Returns
    -------
    np.ndarray
        Array of shape ``(max(0, len(ppg_data) - 2*n), 7)``: the first three
        columns are rows ``n .. len-n-1`` of the input, the last four are -1.
        An input that is too short yields an empty ``(0, 7)`` array, so the
        caller can still build its 7-column frame from it.
    """
    n = int(ppg_fs*window_size/2)
    # Clamp the stop index so a too-short input gives an empty slice rather
    # than a negative (wrap-around) stop.
    stop = max(n,ppg_data.shape[0]-n)
    core = ppg_data[n:stop][:,[0,1,2]]
    placeholders = np.full((core.shape[0],4),-1)
    return np.hstack([core,placeholders])


def save_participant_data(filename,ecg_fs = 700,ppg_fs = 64,acc_fs=32,window_size=8):
    """Turn one participant pickle into overlapping PPG windows and save them.

    Steps, in order:
      1. Load the pickle and pull out wrist BVP/ACC and chest ECG/Resp.
      2. Attach synthetic timestamps to every stream. They start at the moment
         of the call and advance at the stream's sampling rate.
      3. Band-pass the accelerometer, PPG and respiration signals, detect ECG
         R-peaks, and compute respiration-cycle features.
      4. Cut the PPG frame into windows of ``window_size`` seconds that advance
         by a quarter window, and attach the per-window respiration samples,
         mean RR interval, mean respiration-cycle features and accelerometer rows.
      5. Pickle the resulting frame to
         ``output_directory + str(window_size) + '/' + basename(filename)``.

    Parameters
    ----------
    filename : str
        '/'-separated path of the participant pickle to read.
    ecg_fs, ppg_fs, acc_fs : int, optional
        Sampling rates of the chest streams (ECG and respiration share
        ``ecg_fs``), the wrist PPG and the wrist accelerometer.
    window_size : int, optional
        Window length in seconds. ``window_size*ppg_fs`` is used as a
        ``range`` bound and step, so it must be an integer.

    Returns
    -------
    None
        The result is written to disk, not returned.

    Notes
    -----
    Reads the module-level ``output_directory``. The columns of the saved
    frame are ``start_time``, ``end_time``, ``data``, ``respiration``,
    ``ecg_rr``, ``inspiration_duration``, ``expiration_duration``,
    ``respiration_duration`` and ``acc_window``.
    """
    # NOTE(review): unpickling executes arbitrary code - only load trusted files.
    with open(filename,'rb') as source_file:
        data = pickle.load(source_file,encoding='latin1')
    ppg_data = data['signal']['wrist']['BVP']
    # presumably rescales raw accelerometer counts to g - TODO confirm
    acc_data = data['signal']['wrist']['ACC']/64
    ecg_data = data['signal']['chest']['ECG']
    respiration_data = data['signal']['chest']['Resp']
    total_seconds = ppg_data.shape[0]/ppg_fs

    # Synthetic time base shared by all streams of this participant.
    # NOTE(review): utcnow() is naive, so .timestamp() reads it as local time;
    # harmless while only relative times matter - confirm nothing relies on it.
    start_ts = datetime.utcnow().timestamp()
    ecg_ts = start_ts + np.arange(0,total_seconds,1/ecg_fs)
    acc_ts = start_ts + np.arange(0,total_seconds,1/acc_fs)

    # Accelerometer columns: time, raw axes, then the band-passed axes.
    acc_data = np.concatenate([acc_ts.reshape(-1,1),acc_data],axis=1)
    acc_data1 = bandpass_filter_acc(acc_data.copy(),Fs=acc_fs)
    acc_data = np.concatenate([acc_data,acc_data1[:,1:]],axis=1)

    respiration_ts = ecg_ts
    respiration_data = np.vstack([respiration_ts,respiration_data.reshape(-1)]).T
    ecg_data = np.vstack([ecg_ts,ecg_data.reshape(-1)]).T
    ecg_rr1 = get_ecg_rr(ecg_data)
    ecg_rr = ecg_rr1.values

    # PPG columns: time, raw signal, band-passed signal.
    ppg_ts = start_ts + np.arange(0,total_seconds,1/ppg_fs)
    ppg_data = np.vstack([ppg_ts,ppg_data.reshape(-1)]).T
    ppg_data1 = bandpass_filter(ppg_data.copy(),Fs=ppg_fs,fil_type='ppg')
    ppg_data = np.vstack([ppg_ts,ppg_data[:,1].reshape(-1),ppg_data1[:,1].reshape(-1)]).T

    respiration_data = bandpass_filter_respiration(respiration_data,Fs=ecg_fs,fil_type='ppg')
    # The peak/valley helpers are fed times scaled by 1000 (presumably
    # milliseconds - TODO confirm); the scaling is undone again below.
    respiration_data[:,0] = respiration_data[:,0]*1000
    peak_index,valley_index = compute_peak_valley(respiration_data)
    peak_data = respiration_data[peak_index]
    valley_data = respiration_data[valley_index]
    rip_feature = rip_cycle_feature_computation(peak_data,valley_data)[:,:5]
    rip_feature[:,:2] = rip_feature[:,:2]/1000

    # Forward ppg_fs so the trimming matches the rate used everywhere else
    # (it previously always fell back to the helper's default of 64).
    ppg_data = get_quality_features(ppg_data,ppg_fs=ppg_fs)
    ppg_data = pd.DataFrame(ppg_data,columns=['time','ppg','filtered_ppg','skew','kurtosis','iqr','relative_power']).dropna().sort_values('time').reset_index(drop=True)
    respiration_data[:,0] = respiration_data[:,0]/1000

    # Sliding windows over the PPG frame, advancing a quarter window per step.
    samples_per_window = window_size*ppg_fs
    all_data = []
    for i in range(0,ppg_data.shape[0]-samples_per_window,samples_per_window//4):
        # .loc slicing is end-inclusive, hence the -1.
        a = ppg_data.loc[i:i+samples_per_window-1]
        b = respiration_data[np.where((respiration_data[:,0]>=a['time'].min())&(respiration_data[:,0]<a['time'].max()))[0],1].reshape(-1,1)
        all_data.append([a['time'].min(),a['time'].max(),
                         a[['time','ppg','filtered_ppg','skew','kurtosis',
                            'iqr','relative_power']].sort_values('time').reset_index(drop=True),b])

    def _mean_rip_feature(row,column):
        # Mean of one rip_feature column over the rows where rip_feature
        # column 1 >= window start and column 0 < window end (presumably the
        # cycle end/start times - TODO confirm). NaN when no row matches.
        hits = np.where((rip_feature[:,1]>=row['start_time'])&(rip_feature[:,0]<row['end_time']))[0]
        return np.mean(rip_feature[hits,column])

    ppg_windows = pd.DataFrame(all_data,columns=['start_time','end_time','data','respiration'])
    ppg_windows['ecg_rr'] = ppg_windows.apply(lambda a:np.mean(ecg_rr[np.where((ecg_rr[:,0]>=a['start_time'])&(ecg_rr[:,0]<a['end_time']))[0],1]),axis=1)
    ppg_windows['inspiration_duration'] = ppg_windows.apply(lambda a:_mean_rip_feature(a,2),axis=1)
    ppg_windows['expiration_duration'] = ppg_windows.apply(lambda a:_mean_rip_feature(a,3),axis=1)
    ppg_windows['respiration_duration'] = ppg_windows.apply(lambda a:_mean_rip_feature(a,4),axis=1)
    ppg_windows['acc_window'] = ppg_windows.apply(lambda a: acc_data[np.where((acc_data[:,0]>=a['start_time'])&(acc_data[:,0]<a['end_time']))[0],:],axis=1)

    # Summary: number of windows before and after dropping rows with NaNs.
    print(ppg_windows.shape,ppg_windows.dropna().shape)
    print(ppg_windows.shape)

    output_folder = output_directory+str(window_size)
    # exist_ok avoids the check-then-create race when several parallel workers
    # reach this point at the same time.
    os.makedirs(output_folder,exist_ok=True)
    final_path = output_folder+'/'
    participant_name = filename.split('/')[-1]
    with open(final_path+participant_name,'wb') as target_file:
        pickle.dump(ppg_windows,target_file)
    
from joblib import Parallel,delayed

# save_participant_data reads this module-level folder when writing its output.
output_directory = '../data/'

# One delayed call per participant file, spread over up to 20 workers.
participant_jobs = (delayed(save_participant_data)(f,window_size=8) for f in filelists)
final = Parallel(n_jobs=20,verbose=2)(participant_jobs)

# Show the first input path as the cell's output.
filelists[0]

[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   8 out of  15 | elapsed:  2.5min remaining:  2.2min
[Parallel(n_jobs=20)]: Done  15 out of  15 | elapsed:  2.9min finished


'../data/WESAD/S13/S13.pkl'

In [20]:
# Paths of the files saved under '../data/8/'; entries whose name ends in
# 's' or 'f' are skipped.
filelists = []
for entry in os.listdir('../data/8/'):
    if entry[-1] in ['s','f']:
        continue
    filelists.append('../data/8/'+entry)

In [23]:
# Sanity check: print the distinct accelerometer-window shapes found in each
# saved participant file.
for saved_path in filelists:
    # The context manager closes each file; the bare open() used before
    # leaked one handle per participant.
    # NOTE(review): unpickling executes arbitrary code - only load trusted files.
    with open(saved_path,'rb') as saved_file:
        data = pickle.load(saved_file)
    data['shape'] = data['acc_window'].apply(lambda a:a.shape)
    print(data['shape'].unique())

[(256, 7)]
[(256, 7)]
[(256, 7)]
[(256, 7)]
[(256, 7)]
[(256, 7)]
[(256, 7)]
[(256, 7)]
[(256, 7)]
[(256, 7)]
[(256, 7)]
[(256, 7)]
[(256, 7)]
[(256, 7)]
[(256, 7)]


In [32]:
import os
import numpy as np
import pandas as pd

from datetime import datetime
import pickle

# Folder holding the per-participant window files; entries whose name ends in
# 's' or 'f' are skipped.
filepath = '../data/8/'
filelists = [filepath+a for a in os.listdir(filepath) if a[-1] not in ['s','f']]

# Accumulators, one entry per window across all participants.
X_ppg = []          # (1, samples, 2) arrays: columns 1-2 of each window's PPG frame
X_acl = []          # (1, samples, 6) arrays: accelerometer columns without the time column
y = []              # mean ECG RR interval of each window
y_participant = []  # index of the participant file each window came from
X_hr_windows = []   # currently never filled
X_time = []         # start time of each window

for i,f in enumerate(filelists):
    # The context manager closes each file; the bare open() used before
    # leaked one handle per participant.
    # NOTE(review): unpickling executes arbitrary code - only load trusted files.
    with open(f,'rb') as participant_file:
        data = pickle.load(participant_file)
    data = data.sort_values('start_time').reset_index(drop=True)
    X_time.extend(list(data['start_time'].values))
    X_ppg.extend([a.values[:,1:3].reshape(1,-1,2) for a in data['data'].values])
    X_acl.extend([a[:,1:].reshape(1,-1,6) for a in data['acc_window'].values])
    y.extend(list(data['ecg_rr'].values))
    y_participant.extend([i]*data.shape[0])
    data['participant'] = i
    data['timestamp'] = data['start_time'].apply(lambda a:datetime.utcfromtimestamp(a))

# Stack the per-window pieces into single arrays.
X_acl = np.concatenate(X_acl)
X_ppg = np.concatenate(X_ppg)
y = np.array(y)
y_participant = np.array(y_participant)
X_time = np.array(X_time)
print(X_time.shape,X_acl.shape,X_ppg.shape,y.shape,y_participant.shape)

# Write through a context manager so the file is flushed and closed right away.
with open('../data/tabular_data.p','wb') as tabular_file:
    pickle.dump([X_acl,X_ppg,y,y_participant,X_time],tabular_file)

(43363,) (43363, 256, 6) (43363, 512, 2) (43363,) (43363,)


In [33]:
# Shapes of the five arrays written to the tabular pickle, in a fixed order.
tuple(arr.shape for arr in (X_acl,X_ppg,y,y_participant,X_time))

((43363, 256, 6), (43363, 512, 2), (43363,), (43363,), (43363,))

In [31]:
# Preview the first rows of whatever frame `data` currently holds.
# NOTE(review): the recorded output shows a `ts` column that no visible cell
# creates, and this cell's execution count (31) precedes the loading cell's
# (32), so the output presumably reflects an earlier kernel state - re-run
# after a restart to confirm.
data.head()

Unnamed: 0,start_time,end_time,data,respiration,ecg_rr,inspiration_duration,expiration_duration,respiration_duration,acc_window,ts
0,1609814000.0,1609814000.0,time ppg filtered_ppg skew ...,"[[-1.8670659995837127], [-1.8696326910072862],...",0.646786,1.84,1.369048,3.209048,"[[1609814417.204355, -0.890625, -0.5, 0.140625...",2021-01-05 02:40:17.204355
1,1609814000.0,1609814000.0,time ppg filtered_ppg skew ...,"[[-0.4141992476995746], [-0.4171445701153571],...",0.641548,1.84,1.369048,3.209048,"[[1609814419.204355, -0.890625, -0.484375, 0.1...",2021-01-05 02:40:19.204355
2,1609814000.0,1609814000.0,time ppg filtered_ppg skew ...,"[[2.2667497283872255], [2.2700182464424996], [...",0.63369,1.84,1.369048,3.209048,"[[1609814421.204355, -0.59375, 1.09375, 0.0781...",2021-01-05 02:40:21.204355
3,1609814000.0,1609814000.0,time ppg filtered_ppg skew ...,"[[2.4371848089886443], [2.44602861407473], [2....",0.64131,1.747143,1.478571,3.225714,"[[1609814423.204355, -0.359375, 0.921875, 0.10...",2021-01-05 02:40:23.204355
4,1609814000.0,1609814000.0,time ppg filtered_ppg skew ...,"[[-1.842251649003148], [-1.8406390461597588], ...",0.640879,1.781905,1.517619,3.299524,"[[1609814425.204355, -0.3125, 0.921875, 0.2968...",2021-01-05 02:40:25.204355
