In [1]:
# List the study names found under the given CerebralCortex configuration directory.
# NOTE(review): the config paths in this cell are absolute and machine-specific — confirm before running elsewhere.
from cerebralcortex.util.helper_methods import get_study_names
sn = get_study_names("/home/jupyter/cc3_conf/")
print(sn)
from pyspark.sql import functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructField, StructType, DoubleType,MapType, StringType,ArrayType, FloatType, TimestampType, IntegerType
from pyspark.sql.functions import minute, second, mean, window
from pyspark.sql import functions as F
import numpy as np
import pandas as pd
from cerebralcortex.core.datatypes import DataStream
from cerebralcortex.core.metadata_manager.stream.metadata import Metadata, DataDescriptor, \
ModuleMetadata
from typing import List
import numpy as np
from scipy import signal
import pandas as pd
from cerebralcortex import Kernel
from pyspark.sql import functions as F
# Kernel handle for the 'moods' study; the later cells read and save streams through `CC`.
CC = Kernel("/home/jupyter/cc3_moods_conf/", study_name='moods')

['admin', 'dartmouth_sobc', 'default', 'demo', 'jhu_cocaine', 'mars_study', 'mcontain', 'md2k_aa_rice', 'md2k_affsci', 'md2k_labtest', 'md2k_ses_utah', 'memphis-test', 'memphis_test_study', 'moffitt', 'moffitt-test', 'moods', 'moral', 'mperf', 'mperf-alabsi', 'mperf-buder', 'mperf-mit-ll', 'mperf-test', 'northwestern_smoking', 'nu', 'opioid_study', 'osu', 'rice', 'robas', 'robas_study', 'sobclab', 'test', 'utah', 'utah_p01', 'vermont', 'vermont_smoking']


Bandpass filter the PPG data

In [None]:
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructField, StructType, DoubleType, StringType, TimestampType, IntegerType
import numpy as np
from cerebralcortex.core.datatypes import DataStream
from cerebralcortex.core.metadata_manager.stream.metadata import Metadata, DataDescriptor, \
    ModuleMetadata
from scipy import signal
def filter_data(X,
                Fs=100,
                low_cutoff=.4,
                high_cutoff=3.0,
                filter_order=65):
    """
    Bandpass Filter of single channel

    The input is mean-removed and then convolved (``mode='same'``) with a
    least-squares FIR design from ``signal.firls`` whose pass band is
    [low_cutoff, high_cutoff].

    :param X: input data (1-D array-like; it is flattened to a single channel)
    :param Fs: sampling freq.
    :param low_cutoff: low passband
    :param high_cutoff: high passband
    :param filter_order: no of taps in FIR filter

    :return: filtered version of input data, same length as the input
    """
    # np.asarray lets plain sequences through; ndarrays are used as-is.
    samples = np.asarray(X).reshape(-1, 1)
    samples = signal.detrend(samples, axis=0, type='constant')

    # Band edges: stop band up to (low_cutoff - .1), pass band, stop band from
    # (high_cutoff + .5) up to Nyquist. The first stop band is weighted 100x.
    band_edges = np.array([0, low_cutoff-.1, low_cutoff, high_cutoff, high_cutoff+.5, Fs/2])
    desired_gain = np.array([0, 0, 1, 1, 0, 0])
    band_weights = np.array([100*0.02, 0.02, 0.02])

    # `weight` must be passed by keyword: recent SciPy releases made it
    # keyword-only, so the former positional call raises TypeError there.
    taps = signal.firls(filter_order, band_edges, desired_gain,
                        weight=band_weights, fs=Fs)
    return signal.convolve(samples.reshape(-1), taps, mode='same')

def get_metadata(data,
                 wrist='left',
                 sensor_name='motionsensehrv',
                 ppg_columns=('red','infrared','green'),
                 acl_columns=('aclx','acly','aclz')):
    """
    Build the metadata object describing the bandpass-filtered output stream.

    :param data: input stream (not read by this function)
    :param wrist: which wrist the data was collected from
    :param sensor_name: name of sensor
    :param ppg_columns: columns in the input dataframe referring to multiple ppg channels
    :param acl_columns: columns in the input dataframe referring to accelerometer channels

    :return: metadata of output stream
    """
    stream_metadata = Metadata()
    stream_metadata.set_name(
        "org.md2k."+str(sensor_name)+"."+str(wrist)+".wrist.bandpass.filtered"
    ).set_description("Bandpass Filtered PPG data")

    # Bookkeeping columns shared by every stream.
    base_descriptors = (("timestamp", "datetime"),
                        ("localtime", "datetime"),
                        ("version", "int"),
                        ("user", "string"))
    for column_name, column_type in base_descriptors:
        stream_metadata.add_dataDescriptor(
            DataDescriptor().set_name(column_name).set_type(column_type))

    # One "double" descriptor per signal channel: PPG channels first, then accelerometer.
    channel_groups = ((ppg_columns, "ppg channel "),
                      (acl_columns, "accelerometer channel "))
    for channel_names, description_prefix in channel_groups:
        for c in channel_names:
            descriptor = DataDescriptor().set_name(c).set_type("double")
            descriptor = descriptor.set_attribute("description", description_prefix+c)
            stream_metadata.add_dataDescriptor(descriptor)

    # NOTE(review): the module is named "ecg data quality" although this stream holds
    # bandpass-filtered PPG — looks copied from another module; confirm before renaming.
    module = ModuleMetadata().set_name("ecg data quality")
    module = module.set_attribute("url", "http://md2k.org/")
    module = module.set_author("Md Azim Ullah", "mullah@memphis.edu")
    stream_metadata.add_module(module)
    return stream_metadata


def bandpass_filter(
                   data,
                   Fs = 25,
                   low_cutoff = 0.4,
                   high_cutoff = 3.0,
                   filter_order = 65,
                   ppg_columns=('red','infrared','green'),
                   acl_columns=('aclx','acly','aclz'),
                   wrist='left',
                   sensor_name='motionsensehrv'):

    """
    Bandpass filter every PPG channel of a stream, window by window.

    :param data: PPG & ACL data stream
    :param Fs: sampling frequency
    :param low_cutoff: minimum frequency of pass band
    :param high_cutoff: Maximum Frequency of pass band
    :param filter_order: no. of taps in FIR filter
    :param ppg_columns: columns in the input dataframe referring to multiple ppg channels
    :param acl_columns: columns in the input dataframe referring to accelerometer channels
    :param wrist: which wrist the data was collected from
    :param sensor_name: name of sensor

    :return: Bandpass filtered version of input PPG data
    :raises Exception: if any required column is missing from ``data``
    """

    ## check if all columns exist

    # Normalise to lists first: the defaults are tuples, and list + tuple raises
    # TypeError, so the function previously failed when called with its own defaults.
    ppg_columns = list(ppg_columns)
    acl_columns = list(acl_columns)
    default_columns = ['user','version','localtime','timestamp']
    required_columns = default_columns+acl_columns+ppg_columns
    missing_columns = set(required_columns)-set(data.columns)
    if len(missing_columns)>0:
        raise Exception("Columns missing in input dataframe! " + str(list(missing_columns)))

    ## select the columns from input dataframe

    data = data.select(*required_columns)

    ## udf

    default_schema = [StructField("timestamp", TimestampType()),
                      StructField("localtime", TimestampType()),
                      StructField("version", IntegerType()),
                      StructField("user", StringType())]
    schema = StructType(default_schema+[StructField(c, DoubleType()) for c in ppg_columns+acl_columns])
    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def ppg_bandpass(data):
        # Samples must be in time order before convolving with the FIR taps.
        data = data.sort_values('timestamp').reset_index(drop=True)
        # Only the PPG channels are filtered; accelerometer columns pass through unchanged.
        for c in ppg_columns:
            data[c] = filter_data(data[c].values,Fs=Fs,low_cutoff=low_cutoff,high_cutoff=high_cutoff,filter_order=filter_order)
        return data

    ## steps
    # windowDuration is 60*60*10 = 36000 (presumably seconds, i.e. 10-hour groups — confirm
    # against DataStream.compute).
    ppg_bandpass_filtered = data.compute(ppg_bandpass,windowDuration=60*60*10,startTime='0 seconds')
    output_data = ppg_bandpass_filtered._data
    ds = DataStream(data=output_data,metadata=get_metadata(data,wrist=wrist,sensor_name=sensor_name,
                                                           ppg_columns=ppg_columns,acl_columns=acl_columns))
    return ds

# Load the raw PPG stream of one participant.
data = CC.get_stream('ppg--org.md2k.watch--fossil_watch_sport',user_id='afcfc1b5-365f-409b-918e-2f0ce8056ff9')
# 'yyyy' is the calendar year. The previous 'YYYY' is the week-based year, which yields the
# wrong year around New Year and is rejected by the Spark 3 datetime formatter.
# Note: bandpass_filter selects only its required columns, so 'day' does not reach the output.
data  = data.withColumn('day',F.date_format('localtime',"yyyyMMdd"))
# Single PPG channel ('ppg1') sampled at 100 Hz; no accelerometer columns in this stream.
filtered_data = bandpass_filter(
                   data,
                   Fs = 100,
                   low_cutoff = 0.4,
                   high_cutoff = 3.0,
                   filter_order = 65,
                   ppg_columns=['ppg1'],
                   acl_columns=[],
                   wrist='left',
                   sensor_name='fossil')
CC.save_stream(filtered_data,overwrite=True)

In [None]:
# Pull two days of raw accelerometer data for one participant into pandas.
data = CC.get_stream('accelerometer--org.md2k.watch--fossil_watch_sport',user_id='afcfc1b5-365f-409b-918e-2f0ce8056ff9')
# 'yyyy' (calendar year) instead of 'YYYY' (week-based year): the latter gives the wrong
# year around New Year and is rejected by the Spark 3 datetime formatter.
data  = data.withColumn('day',F.date_format('localtime',"yyyyMMdd"))
data = data.filter(F.col('day').isin(['20200922','20200921']))._data.toPandas()
# data.sort(F.col('timestamp').desc()).show(5,False)

In [None]:
import pickle
# NOTE(review): `data` is whatever the previously executed cell left behind; in notebook
# order that is the accelerometer frame, while the file name says "ppg" — confirm.
# The context manager guarantees the file is flushed and closed.
with open('../data/ppg2122sep.p','wb') as pickle_file:
    pickle.dump(data,pickle_file)

Compute the mean RR interval and signal-quality features over 5-second windows (advanced every 2 seconds)

In [None]:
from scipy.stats import skew,kurtosis
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from scipy import signal
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructField, StructType, DoubleType, StringType, TimestampType, IntegerType, ArrayType
import numpy as np
import pandas as pd
from cerebralcortex.core.datatypes import DataStream
from cerebralcortex.core.metadata_manager.stream.metadata import Metadata, DataDescriptor, \
    ModuleMetadata

### Peak Detection codes ##

def _datacheck_peakdetect(x_axis, y_axis):
    """
    check data for peak detection

    :param x_axis: time
    :param y_axis: values
    :return: same as input data
    """
    if x_axis is None:
        x_axis = range(len(y_axis))

    if len(y_axis) != len(x_axis):
        raise ValueError("Input vectors y_axis and x_axis must have same length")

    #needs to be a numpy array
    y_axis = np.array(y_axis)
    x_axis = np.array(x_axis)
    return x_axis, y_axis


def peakdetect(y_axis, x_axis = None, lookahead = 200, delta=0):
    """
    Find local maxima and minima of a signal.

    A candidate extremum is accepted only when none of the next ``lookahead``
    samples goes beyond it, which filters out jitter.

    :param y_axis: values
    :param x_axis: time (positions reported for each peak); defaults to sample indices
    :param lookahead: steps ahead to look for; also the number of trailing samples
        that are never scanned for candidates
    :param delta: minimum amount the signal must move away from a candidate
        extremum before the candidate is examined
    :return: ``[max_peaks, min_peaks]``, each a list of ``[position, value]`` pairs
    :raises ValueError: if ``lookahead`` < 1, ``delta`` is not a non-negative scalar,
        or the inputs differ in length

    """
    max_peaks = []
    min_peaks = []
    dump = []   #Used to pop the first hit

    # check input data
    x_axis, y_axis = _datacheck_peakdetect(x_axis, y_axis)
    # store data length for later use
    length = len(y_axis)


    #perform some checks
    if lookahead < 1:
        raise ValueError("Lookahead must be '1' or above in value")
    if not (np.isscalar(delta) and delta >= 0):
        raise ValueError("delta must be a positive number")

    #maxima and minima candidates are temporarily stored in
    #mx and mn respectively
    # np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0 and
    # raised AttributeError here.
    mn, mx = np.inf, -np.inf

    #Only detect peak if there is 'lookahead' amount of points after it
    for index, (x, y) in enumerate(zip(x_axis[:-lookahead],
                                       y_axis[:-lookahead])):
        if y > mx:
            mx = y
            mxpos = x
        if y < mn:
            mn = y
            mnpos = x

        ####look for max####
        # mx == inf means "currently searching for a minimum only"
        if y < mx-delta and mx != np.inf:
            #Maxima peak candidate found
            #look ahead in signal to ensure that this is a peak and not jitter
            if y_axis[index:index+lookahead].max() < mx:
                max_peaks.append([mxpos, mx])
                dump.append(True)
                #set algorithm to only find minima now
                mx = np.inf
                mn = np.inf
                if index+lookahead >= length:
                    #end is within lookahead no more peaks can be found
                    break
                continue

        ####look for min####
        # mn == -inf means "currently searching for a maximum only"
        if y > mn+delta and mn != -np.inf:
            #Minima peak candidate found
            #look ahead in signal to ensure that this is a peak and not jitter
            if y_axis[index:index+lookahead].min() > mn:
                min_peaks.append([mnpos, mn])
                dump.append(False)
                #set algorithm to only find maxima now
                mn = -np.inf
                mx = -np.inf
                if index+lookahead >= length:
                    #end is within lookahead no more peaks can be found
                    break

    #Remove the false hit on the first value of the y_axis
    if dump:
        if dump[0]:
            max_peaks.pop(0)
        else:
            min_peaks.pop(0)

    return [max_peaks, min_peaks]

### CQP quality features and heart rate estimation

def get_predict_prob(window, fs=100):
    """
    Get CQP quality features

    The window is robust-scaled and linearly detrended per channel; the features
    are then computed on that scaled copy (the caller's array is left untouched).

    :param window: Numpy array of PPG data, shape (samples, channels)
    :param fs: sampling frequency passed to the Welch spectrum estimate
        (default 100, the value that used to be hard-coded)
    :return: quality features, shape (channels, 4) with columns
        [skewness, kurtosis, relative power in 0.8-2.5 Hz, standard deviation]
    """
    no_channels = window.shape[1]
    # Work on a new array: writing the result back into `window` overwrote the
    # caller's buffer (and would truncate values for an integer-typed input).
    scaled = signal.detrend(RobustScaler().fit_transform(window),axis=0)

    # Single-segment Welch spectrum, zero-padded to 10000 FFT points.
    f,pxx = signal.welch(scaled,fs=fs,nperseg=len(scaled),nfft=10000,axis=0)
    pxx = MinMaxScaler().fit_transform(np.abs(pxx))

    skews = skew(scaled,axis=0).reshape(no_channels,1)
    kurs = kurtosis(scaled,axis=0).reshape(no_channels,1)
    stds = np.std(scaled,axis=0).reshape(no_channels,1)

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use
    # whichever this NumPy provides.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz
    in_band = np.where((f>=.8)&(f<=2.5))[0]
    # Fraction of (min-max scaled) spectral power inside the 0.8-2.5 Hz band.
    rps = np.divide(integrate(pxx[in_band],axis=0),integrate(pxx,axis=0)).reshape(no_channels,1)

    features = np.concatenate([skews,kurs,rps,stds],axis=1)
    return features

def get_rr_value(values,fs=100):
    """
    Get Mean RR interval

    The dominant spectral peak strictly between 0.8 Hz and 2.5 Hz is converted
    to an interval in milliseconds (60000 / beats-per-minute).

    :param values: single channel ppg data
    :param fs: sampling frequency
    :return: Mean RR interval Information; 0 when anything fails
        (for example when no spectral peak lies in the band)
    """
    try:
        freqs, power = signal.welch(values,fs=fs,nperseg=values.shape[0],nfft=10000,axis=0)
        freqs = freqs.reshape(-1)
        power = power.reshape(-1,1)

        # Positions (spectrum indices) of the local maxima of the power spectrum.
        maxima = peakdetect(power[:,0],lookahead=2)[0]
        peak_idx = np.array([peak[0] for peak in maxima])

        # Keep only the peaks inside the band of interest.
        peak_idx = peak_idx[freqs[peak_idx]>.8]
        peak_idx = peak_idx[freqs[peak_idx]<2.5]

        band_freqs = freqs[peak_idx]
        band_power = power[peak_idx,0]
        return 60000/(60*band_freqs[np.argmax(band_power)])
    except Exception:
        # Best-effort contract: every failure is reported as an RR value of 0.
        return 0


def get_rr_and_features(window):
    """
    Compute the per-channel mean RR interval and the quality features of one window.

    :param window: 2-D array of PPG samples, one column per channel
    :return: tuple of mean RR interval and Quality features calculated:
        an array with one RR value per channel and an array of shape (channels, 4)
    """
    n_channels = window.shape[1]
    # NOTE(review): RR uses every sample from `begin` onwards, while the quality features
    # use only samples [begin, stop) = the first 125 — confirm this fixed length is
    # intended for the sampling rate in use.
    segment_bounds = [(0, 125)]
    rr_values = []
    feature_blocks = []
    for begin, stop in segment_bounds:
        rr_values.extend(get_rr_value(window[begin:,channel]) for channel in range(n_channels))
        segment_features = get_predict_prob(window[begin:stop,:])
        feature_blocks.append(segment_features.reshape(1,n_channels,4))
    return np.array(rr_values),np.concatenate(feature_blocks).reshape(n_channels,4)



def get_metadata_features_rr(data,
                 wrist='left',
                 sensor_name='motionsensehrv'):
    """
    Build the metadata object for the quality-feature / mean-RR output stream.

    :param data: input stream (not read by this function)
    :param wrist: which wrist the data was collected from
    :param sensor_name: name of sensor

    :return: metadata of output stream
    """
    stream_metadata = Metadata()
    stream_metadata.set_name(
        "org.md2k."+str(sensor_name)+"."+str(wrist)+".wrist.features.activity.std"
    ).set_description("PPG data quality features and mean RR interval computed from fixed window")

    # (column name, declared type) for every column of the output stream, in order.
    column_types = (("timestamp", "datetime"),
                    ("localtime", "datetime"),
                    ("version", "int"),
                    ("user", "string"),
                    ("features", "array"),
                    ("rr", "array"),
                    ("activity", "double"),
                    ("start", "timestamp"),
                    ("end", "timestamp"))
    for column_name, column_type in column_types:
        stream_metadata.add_dataDescriptor(
            DataDescriptor().set_name(column_name).set_type(column_type))

    module = ModuleMetadata().set_name("PPG data quality features and  mean RR Interval computed from PPG")
    module = module.set_attribute("url", "http://md2k.org/")
    module = module.set_author("Md Azim Ullah", "mullah@memphis.edu")
    stream_metadata.add_module(module)
    return stream_metadata


def compute_quality_features_and_rr(data,
                                    Fs=100,
                                    window_size=5.0,
                                    acceptable_percentage=0.8,
                                    ppg_columns=['red','infrared','green'],
                                    acl_columns=['aclx','acly','aclz'],
                                    wrist='left',
                                    sensor_name='motionsensehrv',
                                    slide_duration=2):
    """
    Compute PPG quality features, mean RR interval and an accelerometer activity
    measure for every sliding window of the input stream.

    :param data: Input data
    :param Fs: Sampling Frequency
    :param window_size: Window size to compute features from
    :param acceptable_percentage: minimum acceptable data fraction
    :param ppg_columns: columns in input data belonging to PPG
    :param acl_columns: columns in input data belonging to Accelerometer
    :param wrist: wrist on which the sensor was worn
    :param sensor_name: name of sensor
    :param slide_duration: step between consecutive windows, passed to
        ``data.compute`` as ``slideDuration`` (default 2, the value that used to be
        hard-coded)
    :return: Dataframe containing PPG data quality features and mean RR interval information
    :raises Exception: if any required column is missing from ``data``
    """

    ## check if all columns exist

    default_columns = ['user','version','localtime','timestamp']
    required_columns = default_columns+list(acl_columns)+list(ppg_columns)
    missing_columns = set(required_columns)-set(data.columns)
    if len(missing_columns)>0:
        raise Exception("Columns missing in input dataframe! " + str(list(missing_columns)))

    ## select the columns from input dataframe

    data = data.select(*required_columns)

    ## udf
    default_schema = [StructField("timestamp", TimestampType()),
                      StructField("localtime", TimestampType()),
                      StructField("version", IntegerType()),
                      StructField("user", StringType())]
    output_schema = [StructField("features", ArrayType(DoubleType())),
                     StructField("rr", ArrayType(DoubleType())),
                     StructField("activity", DoubleType()),
                     StructField("start", TimestampType()),
                     StructField("end", TimestampType())]
    schema = StructType(default_schema+output_schema)

    # Column labels of the single-row frame returned per window; the order matches
    # the order in which the row values are appended below.
    output_columns = ['user','version',
                      'timestamp','localtime',
                      'rr','features','activity',
                      'start','end']

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def ppg_features_compute(key,data):
        # Skip windows holding too few samples relative to a full window.
        if data.shape[0]>window_size*Fs*acceptable_percentage:
            data = data.sort_values('timestamp').reset_index(drop=True)
            rows = []
            rows.append(data['user'].loc[0])
            rows.append(data['version'].loc[0])
            rows.append(data['timestamp'].loc[0])
            rows.append(data['localtime'].loc[0])
            # NOTE(review): Fs is not forwarded here, so the RR / feature helpers run
            # with their own default sampling frequency — confirm for Fs != 100.
            rrs , features = get_rr_and_features(data[list(ppg_columns)].values.reshape(-1,len(ppg_columns)))
            rows.append(rrs)
            rows.append(features.reshape(-1))
            # Activity: Euclidean norm of the per-axis accelerometer standard deviations.
            values_acl = data[list(acl_columns)].values
            acl_std = np.std(values_acl,axis=0)
            acl_std = np.sqrt(np.sum(np.square(acl_std)))
            rows.append(acl_std)
            # key[2] is the third grouping key; it is read as the window with
            # 'start' and 'end' entries.
            rows.append(key[2]['start'])
            rows.append(key[2]['end'])
            return pd.DataFrame([rows],columns=output_columns)

        else:
            return pd.DataFrame([],columns=output_columns)

    ppg_features_and_rr = data.compute(ppg_features_compute,windowDuration=window_size,slideDuration=slide_duration,startTime='0 seconds')
    output_data = ppg_features_and_rr._data
    ds = DataStream(data=output_data,metadata=get_metadata_features_rr(data,wrist=wrist,sensor_name=sensor_name))

    return ds


In [None]:
# Load the bandpass-filtered PPG stream saved earlier in this notebook.
data = CC.get_stream('org.md2k.fossil.left.wrist.bandpass.filtered')

# The filtered stream was saved without accelerometer columns, but
# compute_quality_features_and_rr requires them, so constant placeholders are added.
# The resulting 'activity' value carries no information and is dropped below.
data = data.withColumn('aclx',F.lit(1))
data = data.withColumn('acly',F.lit(2))
data = data.withColumn('aclz',F.lit(3))

rr_data = compute_quality_features_and_rr(data,
                                    Fs=100,
                                    window_size=5.0,
                                    acceptable_percentage=0.8,
                                    ppg_columns=['ppg1'],
                                    acl_columns=['aclx','acly','aclz'],
                                    wrist='left',
                                    sensor_name='fossil')
# Only one PPG channel was used, so 'rr' is reduced from an array to its first element.
final_data = rr_data.drop('activity')._data.withColumn('rr',F.col('rr').getItem(0))
# Build the stream metadata from the Spark schema: one descriptor per output column.
schema = final_data.schema
stream_metadata = Metadata()
# NOTE(review): the description / module name below mention "ECG Rpeak" although this
# stream holds PPG features — looks like copied boilerplate; confirm.
stream_metadata.set_name("org.md2k.fossil.left.wrist.features.rr").set_description("Bandpass Filtered PPG, ECG Rpeak")
for field in schema.fields:
    stream_metadata.add_dataDescriptor(
        DataDescriptor().set_name(str(field.name)).set_type(str(field.dataType))
    )
stream_metadata.add_module(
    ModuleMetadata().set_name("Bandpass Filtered PPG, ECG Rpeak") \
    .set_attribute("url", "https://md2k.org").set_author(
        "Md Azim Ullah", "mullah@memphis.edu"))
# The return value of is_valid() is not checked here.
stream_metadata.is_valid()
ds = DataStream(data=final_data,metadata=stream_metadata)
CC.save_stream(ds,overwrite=True)

Compute the standard deviation of the accelerometer signal and merge it with the quality features

In [7]:
# Load the raw accelerometer stream of one participant; the activity cell below reads `data`.
data = CC.get_stream('accelerometer--org.md2k.watch--fossil_watch_sport' ,user_id='afcfc1b5-365f-409b-918e-2f0ce8056ff9')

In [10]:
# Inspect the most recent samples (sorted by localtime, newest first, untruncated).
# NOTE(review): this prints up to 1000 rows into the notebook output — consider a smaller number.
data.sort(F.col('localtime').desc()).show(1000,False)

+-----------------------+-----------------------+-------------------+-------------------+-----------------+-------+------------------------------------+
|timestamp              |localtime              |x                  |y                  |z                |version|user                                |
+-----------------------+-----------------------+-------------------+-------------------+-----------------+-------+------------------------------------+
|2020-09-23 16:10:25.351|2020-09-23 10:10:25.351|-1.2678544521331787|-1.2678544521331787|9.563812255859375|2      |afcfc1b5-365f-409b-918e-2f0ce8056ff9|
|2020-09-23 16:10:25.331|2020-09-23 10:10:25.331|-1.3229509592056274|-1.3229509592056274|9.568602561950684|2      |afcfc1b5-365f-409b-918e-2f0ce8056ff9|
|2020-09-23 16:10:25.312|2020-09-23 10:10:25.312|-1.3133690357208252|-1.3133690357208252|9.544648170471191|2      |afcfc1b5-365f-409b-918e-2f0ce8056ff9|
|2020-09-23 16:10:25.292|2020-09-23 10:10:25.292|-1.318160057067871 |-1.3181600570

In [2]:
# Same windowing as the RR features (5 s windows, 2 s slide) so that the window
# start/end values can be used as join keys further down.
# NOTE(review): this cell relies on `data` being the accelerometer stream loaded in a
# separate cell — it must be run after that cell.
win = F.window("timestamp", windowDuration='5 seconds',slideDuration='2 seconds',startTime='0 seconds')
groupbycols = ["user","version"] + [win]

import numpy as np
schema2 = StructType([
    StructField("version", IntegerType()),
    StructField("user", StringType()),
    StructField("start", TimestampType()),
    StructField("end", TimestampType()),
    StructField("activity", DoubleType())
])
@pandas_udf(schema2, PandasUDFType.GROUPED_MAP)
def compute_std(key,data1):
    """Return one row per window with the norm of the per-axis accelerometer std."""
    # NOTE(review): Series.std() is pandas' default (sample) standard deviation, whereas
    # compute_quality_features_and_rr uses np.std — confirm the difference is intended.
    activity = np.sqrt(data1['x'].std()**2 + data1['y'].std()**2 + data1['z'].std()**2)
    # key[2] is the third grouping key (the window), read for its 'start' and 'end'.
    temp = [data1.version.values[0],
           data1.user.values[0],
           key[2]['start'],
           key[2]['end'],
           activity]
    return pd.DataFrame([temp],columns=['version','user','start','end','activity'])
activity_data = data._data.groupBy(groupbycols).apply(compute_std)


rr_data = CC.get_stream("org.md2k.fossil.left.wrist.features.rr")._data

# Inner join: only windows present in both the activity and the RR feature data are kept.
final_data = activity_data.join(rr_data.drop('version'),how='inner',on=['user','start','end'])

# Build the stream metadata from the Spark schema: one descriptor per output column.
schema = final_data.schema
stream_metadata = Metadata()
stream_metadata.set_name("org.md2k.fossil.left.wrist.features.rr.activity.std").set_description("Bandpass Filtered PPG, ECG Rpeak")
for field in schema.fields:
    stream_metadata.add_dataDescriptor(
        DataDescriptor().set_name(str(field.name)).set_type(str(field.dataType))
    )
stream_metadata.add_module(
    ModuleMetadata().set_name("Bandpass Filtered PPG, ECG Rpeak") \
    .set_attribute("url", "https://md2k.org").set_author(
        "Md Azim Ullah", "mullah@memphis.edu"))
# The return value of is_valid() is not checked here.
stream_metadata.is_valid()
ds = DataStream(data=final_data,metadata=stream_metadata)
CC.save_stream(ds,overwrite=True)

Left join, so that every accelerometer window is kept even when it has no matching PPG features

In [None]:
# Load the raw accelerometer stream of one participant.
data = CC.get_stream('accelerometer--org.md2k.watch--fossil_watch_sport',user_id='afcfc1b5-365f-409b-918e-2f0ce8056ff9')

# 5 s windows sliding by 2 s, so the window start/end values can be used as join keys.
win = F.window("timestamp", windowDuration='5 seconds',slideDuration='2 seconds',startTime='0 seconds')
groupbycols = ["user","version"] + [win]

import numpy as np
schema2 = StructType([
    StructField("version", IntegerType()),
    StructField("user", StringType()),
    StructField("start", TimestampType()),
    StructField("end", TimestampType()),
    StructField("activity", DoubleType())
])
# NOTE(review): compute_std is defined a second time here with the same body as in the
# inner-join cell; whichever cell ran last provides the active definition.
@pandas_udf(schema2, PandasUDFType.GROUPED_MAP)
def compute_std(key,data1):
    """Return one row per window with the norm of the per-axis accelerometer std."""
    activity = np.sqrt(data1['x'].std()**2 + data1['y'].std()**2 + data1['z'].std()**2)
    # key[2] is the third grouping key (the window), read for its 'start' and 'end'.
    temp = [data1.version.values[0],
           data1.user.values[0],
           key[2]['start'],
           key[2]['end'],
           activity]
    return pd.DataFrame([temp],columns=['version','user','start','end','activity'])
activity_data = data._data.groupBy(groupbycols).apply(compute_std)


# This stream is written by a cell that appears later in the notebook, so that cell has
# to be run before this one.
rr_data = CC.get_stream("org.md2k.fossil.left.wrist.features.rr.quality.likelihood")._data ## see below

# Left join: every activity window is kept; windows without PPG features get nulls.
final_data = activity_data.join(rr_data.drop('version'),how='left',on=['user','start','end'])

# Build the stream metadata from the Spark schema: one descriptor per output column.
schema = final_data.schema
stream_metadata = Metadata()
stream_metadata.set_name("org.md2k.fossil.left.wrist.features.rr.activity.std.quality.likelihood.with.null").set_description("Bandpass Filtered PPG, ECG Rpeak")
for field in schema.fields:
    stream_metadata.add_dataDescriptor(
        DataDescriptor().set_name(str(field.name)).set_type(str(field.dataType))
    )
stream_metadata.add_module(
    ModuleMetadata().set_name("Bandpass Filtered PPG, ECG Rpeak") \
    .set_attribute("url", "https://md2k.org").set_author(
        "Md Azim Ullah", "mullah@memphis.edu"))
# The return value of is_valid() is not checked here.
stream_metadata.is_valid()
ds = DataStream(data=final_data,metadata=stream_metadata)
CC.save_stream(ds,overwrite=True)

match with ecg rr

In [None]:
# ECG-derived RR; its 'rr' column is renamed so it does not clash with the PPG 'rr'.
ecg_rr = CC.get_stream("org.md2k.autosense.ecg.rr.final.hamiltonian.5secs.average").withColumnRenamed('rr','ecg_rr')

ppg_rr = CC.get_stream("org.md2k.fossil.left.wrist.features.rr.activity.std")

# Keep only the windows (same user, start and end) present in both streams.
rr = ppg_rr.join(ecg_rr.drop('version','timestamp','localtime'),how='inner',on=['user','start','end'])

data = rr._data.toPandas()

import pickle
# The context manager guarantees the file is flushed and closed.
with open('../data/ecg_ppg_rr_matched_5secs_motion.p','wb') as pickle_file:
    pickle.dump(data,pickle_file)

plot ecg ppg rr based on motion

In [None]:
import pickle
# Load the matched ECG/PPG frame written by the matching cell; close the file afterwards.
with open('../data/ecg_ppg_rr_matched_5secs_motion.p','rb') as pickle_file:
    data = pickle.load(pickle_file)
# Keep PPG RR values between 400 and 1200. The explicit copy makes the column
# assignments below operate on an independent frame (no SettingWithCopyWarning).
data = data[(data.rr>400)&(data.rr<1200)].copy()
data['difference'] = np.abs(data['rr']-data['ecg_rr'])

# Snap every activity value to the nearest of 60 levels between the minimum and 0.5.
levels = np.linspace(data['activity'].min(),.5,60)
data['level'] = data['activity'].apply(lambda a:min(levels, key=lambda x:abs(x-a)))
data['level'] = data['level'].apply(lambda a:'{:.4f}'.format(a))

import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size':20})
plt.figure(figsize=(25,15))
sns.boxplot(x='level',y='difference',data=data,showfliers = False)
plt.xlabel('Magnitude of standard deviation across 3 acl channels')
plt.ylabel('Absolute difference in milliseconds (5 second level mean rr int.)')
plt.ylim([0,400])
plt.xticks(rotation=60)
plt.show()

compute signal quality likelihood

In [None]:
import pickle
def get_quality_likelihood(data,
                           clf,
                           no_of_ppg_channels = 3,
                           no_of_quality_features = 4):
    """
    Score every feature row with a quality classifier and keep the best channel's RR.

    :param data: stream whose ``_data`` frame has 'features' and 'rr' array columns
    :param clf: classifier exposing ``predict_proba``; column 1 of its output is used
        as the quality likelihood
    :param no_of_ppg_channels: number of PPG channels packed into each 'features' value
    :param no_of_quality_features: number of quality features per channel
    :return: Spark dataframe with the per-channel likelihoods ('likelihood_max_array'),
        the per-channel RR values ('rr_array'), the highest likelihood
        ('likelihood_max') and the RR of that channel ('rr')
    """
    ## helper method
    def as_channel_matrix(flat_values):
        # flat feature list -> (channels, features) matrix
        return np.array(flat_values).reshape(no_of_ppg_channels,no_of_quality_features)

    ## udf
    schema = StructType([
        StructField("version", IntegerType()),
        StructField("user", StringType()),
        StructField("localtime", TimestampType()),
        StructField("timestamp", TimestampType()),
        StructField("likelihood_max", DoubleType()),
        StructField("rr", DoubleType()),
        StructField("likelihood_max_array", ArrayType(DoubleType())),
        StructField("rr_array", ArrayType(DoubleType())),
        StructField("activity", DoubleType()),
        StructField("start", TimestampType()),
        StructField("end", TimestampType()),
        StructField("features", ArrayType(DoubleType())),
    ])
    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def ppg_likelihood_compute(data):
        # Guard clause: nothing to score for an empty group.
        if not data.shape[0]>0:
            return pd.DataFrame([],columns=['user','version','timestamp','localtime','likelihood_max',
                                            'rr','activity','likelihood_max_array','rr_array','start','end','features'])

        data['features'] = data['features'].apply(as_channel_matrix)
        stacked = np.concatenate(list(data['features'])).reshape(-1, no_of_ppg_channels,no_of_quality_features)

        # One likelihood column per channel; NaN features are replaced before scoring.
        per_channel = [
            clf.predict_proba(
                np.nan_to_num(stacked[:,channel,:]).reshape(-1,no_of_quality_features)
            )[:,1].reshape(-1,1)
            for channel in range(stacked.shape[1])
        ]
        likelihood = np.concatenate(per_channel,axis=1)

        # Per-row RR arrays, read before the 'rr' column is overwritten below.
        rrs = np.array([np.array(a) for a in data['rr'].values])
        row_ids = range(likelihood.shape[0])

        data['likelihood_max'] = [np.max(likelihood[i,:]) for i in row_ids]
        # RR of the channel with the highest likelihood in each row.
        data['rr'] = [rrs[i][np.argmax(likelihood[i,:])] for i in row_ids]
        data['likelihood_max_array'] = [list(likelihood[i,:]) for i in row_ids]
        data['rr_array'] = [list(rrs[i]) for i in row_ids]
        # Flatten the features back to the 1-D layout declared in the schema.
        data['features'] = data['features'].apply(lambda a:a.reshape(-1))
        return data

    ppg_likelihood = data._data.groupBy(['user','version']).apply(ppg_likelihood_compute)
    return ppg_likelihood


without activity merging first

In [None]:
# NOTE(review): pickle.load executes arbitrary code from the file — only load classifier
# files from a trusted location. The path is absolute and machine-specific.
# The context manager closes the file once the classifier is loaded.
with open('/home/jupyter/mullah/Test/data_yield/classifier_sqi/classifier.p','rb') as classifier_file:
    quality_clf  = pickle.load(classifier_file)

data = CC.get_stream("org.md2k.fossil.left.wrist.features.rr")

# get_quality_likelihood indexes 'rr' per channel, so the scalar is wrapped in an array.
data = data.withColumn('rr',F.array('rr'))

# Placeholder column: the UDF schema of get_quality_likelihood contains 'activity';
# it is dropped again right after the call.
data = data.withColumn('activity',F.lit(1))

final_data = get_quality_likelihood(data,
                           quality_clf,
                           no_of_ppg_channels = 1,
                           no_of_quality_features = 4).drop('activity')

# Build the stream metadata from the Spark schema: one descriptor per output column.
schema = final_data.schema
stream_metadata = Metadata()
stream_metadata.set_name("org.md2k.fossil.left.wrist.features.rr.quality.likelihood").set_description("Bandpass Filtered PPG, ECG Rpeak")
for field in schema.fields:
    stream_metadata.add_dataDescriptor(
        DataDescriptor().set_name(str(field.name)).set_type(str(field.dataType))
    )
stream_metadata.add_module(
    ModuleMetadata().set_name("Bandpass Filtered PPG, ECG Rpeak") \
    .set_attribute("url", "https://md2k.org").set_author(
        "Md Azim Ullah", "mullah@memphis.edu"))
# The return value of is_valid() is not checked here.
stream_metadata.is_valid()
ds = DataStream(data=final_data,metadata=stream_metadata)
CC.save_stream(ds,overwrite=True)

with activity merging done already

In [None]:
# NOTE(review): pickle.load executes arbitrary code from the file — only load classifier
# files from a trusted location. The path is absolute and machine-specific.
# The context manager closes the file once the classifier is loaded.
with open('/home/jupyter/mullah/Test/data_yield/classifier_sqi/classifier.p','rb') as classifier_file:
    quality_clf  = pickle.load(classifier_file)

data = CC.get_stream("org.md2k.fossil.left.wrist.features.rr.activity.std")

# get_quality_likelihood indexes 'rr' per channel, so the scalar is wrapped in an array.
data = data.withColumn('rr',F.array('rr'))

final_data = get_quality_likelihood(data,
                           quality_clf,
                           no_of_ppg_channels = 1,
                           no_of_quality_features = 4)

# Build the stream metadata from the Spark schema: one descriptor per output column.
schema = final_data.schema
stream_metadata = Metadata()
stream_metadata.set_name("org.md2k.fossil.left.wrist.features.rr.activity.std.quality.likelihood").set_description("Bandpass Filtered PPG, ECG Rpeak")
for field in schema.fields:
    stream_metadata.add_dataDescriptor(
        DataDescriptor().set_name(str(field.name)).set_type(str(field.dataType))
    )
stream_metadata.add_module(
    ModuleMetadata().set_name("Bandpass Filtered PPG, ECG Rpeak") \
    .set_attribute("url", "https://md2k.org").set_author(
        "Md Azim Ullah", "mullah@memphis.edu"))
# The return value of is_valid() is not checked here.
stream_metadata.is_valid()
ds = DataStream(data=final_data,metadata=stream_metadata)
CC.save_stream(ds,overwrite=True)

match with ecg rr

In [None]:
# ECG-derived RR; its 'rr' column is renamed so it does not clash with the PPG 'rr'.
ecg_rr = CC.get_stream("org.md2k.autosense.ecg.rr.final.hamiltonian.5secs.average").withColumnRenamed('rr','ecg_rr')

ppg_rr = CC.get_stream("org.md2k.fossil.left.wrist.features.rr.activity.std.quality.likelihood")

# Keep only the windows (same user, start and end) present in both streams.
rr = ppg_rr.join(ecg_rr.drop('version','timestamp','localtime'),how='inner',on=['user','start','end'])

data = rr._data.toPandas()

import pickle
# The context manager guarantees the file is flushed and closed.
with open('../data/ecg_ppg_rr_matched_5secs.p','wb') as pickle_file:
    pickle.dump(data,pickle_file)

Plot the ECG-PPG RR difference by quality likelihood

In [None]:
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(security): pickle.load can execute code stored in the file; only load trusted files.
# The context manager closes the file handle as soon as loading finishes.
with open('../data/ecg_ppg_rr_matched_5secs.p', 'rb') as in_file:
    data = pickle.load(in_file)

# Keep rows whose PPG and ECG RR values lie in (300, 1200) and whose activity is below 1.
# .copy() makes the filtered frame independent, so the column assignments below do not
# write into a slice of the loaded frame (avoids SettingWithCopyWarning).
data = data[(data.rr>300)&(data.rr<1200)&(data.activity<1)]
data = data[(data.ecg_rr>300)&(data.ecg_rr<1200)].copy()

data['difference'] = np.abs(data['rr']-data['ecg_rr'])
data['power'] = data['features'].apply(lambda a:a[2])

# Eleven evenly spaced levels 0.0, 0.1, ..., 1.0. The endpoint 1.0 is included so that
# likelihoods above 0.95 get their own bin instead of being merged into 0.9, and the
# rounding removes float noise (e.g. 0.30000000000000004) from the tick labels.
levels = np.round(np.linspace(0, 1, 11), 1)
# Snap each likelihood to its nearest level (the first level wins on ties).
data['level'] = data['likelihood_max'].apply(lambda a:min(levels, key=lambda x:abs(x-a)))

plt.rcParams.update({'font.size':20})
plt.figure(figsize=(25,15))
sns.boxplot(x='level',y='difference',data=data,showfliers = False)
plt.xlabel('Likelihood')
plt.ylabel('Absolute difference in milliseconds')
# plt.ylim([0,400])
plt.xticks(rotation=60)
plt.show()

Compute PPG RR over 60-second windows

In [None]:
# Load the quality-likelihood stream (same stream name as the one saved earlier in this notebook).
data = CC.get_stream("org.md2k.fossil.left.wrist.features.rr.activity.std.quality.likelihood")

In [None]:
# Keep only rows whose 'rr' value lies strictly between 300 and 1500.
rr_value = F.col('rr')
data = data.filter((rr_value > 300) & (rr_value < 1500))

In [None]:
# NOTE(review): this rebinds `data` to the ".with.null" variant of the stream, so when the
# cells run in order the stream loaded and range-filtered in the two preceding cells is discarded.
data = CC.get_stream("org.md2k.fossil.left.wrist.features.rr.activity.std.quality.likelihood.with.null")

In [None]:
# Non-overlapping 60-second windows on 'timestamp' (slide duration equals window duration).
win = F.window("timestamp",
               windowDuration='60 seconds',
               slideDuration='60 seconds',
               startTime='0 seconds')

# Group rows per user, per version and per 60-second window.
groupbycols = ["user", "version", win]

# Output schema of the grouped-map UDF: (column name, Spark type) in output order.
_output_fields = [
    ("version", IntegerType()),
    ("user", StringType()),
    ("start", TimestampType()),
    ("end", TimestampType()),
    ("length", DoubleType()),
    ("rr_array", ArrayType(DoubleType())),
    ("timestamp", TimestampType()),
    ("localtime", TimestampType()),
    ("activity", ArrayType(DoubleType())),
    ("likelihood", ArrayType(DoubleType())),
]
schema2 = StructType([StructField(name, dtype) for name, dtype in _output_fields])
@pandas_udf(schema2, PandasUDFType.GROUPED_MAP)
def compute_60_secs(key,df):
    """
    Collapse all rows of one group into a single summary row.

    :param key: grouping key; the third element is indexed with 'start' and 'end'
                to obtain the window bounds
    :param df: pandas DataFrame holding the rows of the group; must provide the columns
               version, user, rr, timestamp, localtime, activity and likelihood_max
    :return: one-row pandas DataFrame with columns version, user, start, end, length,
             rr_array, timestamp, localtime, activity, likelihood

    No minimum-row threshold is applied: every group produces a row, and ``length``
    records how many input rows the group contained.
    """
    window_bounds = key[2]
    output_columns = ['version', 'user', 'start', 'end',
                      'length', 'rr_array', 'timestamp', 'localtime',
                      'activity', 'likelihood']
    summary_row = [
        df['version'].values[0],                 # scalars are taken from the first row of the group
        df['user'].values[0],
        window_bounds['start'],
        window_bounds['end'],
        df.shape[0],                             # number of rows in the group
        np.array(list(df['rr'])),                # per-row values gathered into arrays
        df['timestamp'].values[0],
        df['localtime'].values[0],
        np.array(list(df['activity'])),
        np.array(list(df['likelihood_max'])),
    ]
    return pd.DataFrame([summary_row], columns=output_columns)
# Run the grouped-map UDF over every group defined by groupbycols.
grouped_rows = data._data.groupby(groupbycols)
final_data = grouped_rows.apply(compute_60_secs)

# Build the metadata for the 60-second stream from the result's schema.
schema = final_data.schema
stream_metadata = Metadata()
# NOTE(review): the description and module name below are the same text used for other
# streams in this notebook -- confirm they are intended for this stream.
stream_metadata.set_name(
    "org.md2k.fossil.left.wrist.rr.activity.likelihood.60.secs"
).set_description("Bandpass Filtered PPG, ECG Rpeak")

# One data descriptor per output column.
for field in schema.fields:
    descriptor = DataDescriptor().set_name(str(field.name)).set_type(str(field.dataType))
    stream_metadata.add_dataDescriptor(descriptor)

module_info = (
    ModuleMetadata()
    .set_name("Bandpass Filtered PPG, ECG Rpeak")
    .set_attribute("url", "https://md2k.org")
    .set_author("Md Azim Ullah", "mullah@memphis.edu")
)
stream_metadata.add_module(module_info)

# NOTE(review): the result of is_valid() is not inspected here.
stream_metadata.is_valid()

# Save the stream with overwrite=True.
ds = DataStream(data=final_data, metadata=stream_metadata)
CC.save_stream(ds, overwrite=True)

In [None]:
# Reload the saved 60-second stream and display its row count as the cell output.
data = CC.get_stream("org.md2k.fossil.left.wrist.rr.activity.likelihood.60.secs")
data.count()

Match with minute-level RR from ECG

In [None]:
import pickle

data = CC.get_stream("org.md2k.fossil.left.wrist.rr.activity.likelihood.60.secs")

ecg_data = CC.get_stream("org.md2k.autosense.ecg.rr.final.hamiltonian.60secs.average")

# Left join: every PPG window is kept, whether or not a matching ECG window exists.
# The ECG copies of version/timestamp/localtime are dropped before joining.
all_data = data.join(ecg_data.drop('version','timestamp','localtime'),how='left',on=['start','end','user'])

# Collect the joined result to the driver as a pandas DataFrame.
data = all_data._data.toPandas()

# Write inside a context manager so the file is flushed and closed once the dump finishes.
with open('../data/60_seconds_ecg_ppg.p', 'wb') as out_file:
    pickle.dump(data, out_file)

Experiment with the 60-second data

In [None]:
import pickle

import matplotlib.pyplot as plt
import seaborn as sns


def _without_none(values):
    """Return the entries of `values` that are not None, in their original order."""
    return [v for v in values if v is not None]


def _nearest_level(value, levels):
    """Return the entry of `levels` closest to `value` (the first one wins on ties)."""
    return min(levels, key=lambda level: abs(level - value))


# NOTE(security): pickle.load can execute code stored in the file; only load trusted files.
# The context manager closes the file handle as soon as loading finishes.
with open('../data/60_seconds_ecg_ppg.p', 'rb') as in_file:
    data = pickle.load(in_file)

# Per-window summaries of the array columns. None entries are removed BEFORE any numeric
# comparison, because comparing None with a number raises TypeError in Python 3.
data['activity_std'] = data['activity'].apply(
    lambda a: np.nanpercentile(_without_none(a), 75))
# Mean PPG RR using only values strictly between 300 and 1500 (NaN fails both comparisons).
data['ppg_rr'] = data['rr_array'].apply(
    lambda a: np.nanmean([b for b in _without_none(a) if 300 < b < 1500]))
data['mean_likelihood'] = data['likelihood'].apply(
    lambda a: np.nanmean(_without_none(a)))

# Rows with any missing value in these columns are dropped (e.g. windows with no ECG match).
df = data[['ecg_rr','ppg_rr','activity_std','mean_likelihood','length']].dropna()
df['difference'] = np.abs(df['ecg_rr']-df['ppg_rr'])

# Scatter of ECG vs PPG mean RR, coloured by the activity statistic.
plt.figure(figsize=(20,10))
plt.scatter(df['ecg_rr'],df['ppg_rr'],c=df['activity_std'])
plt.xlabel('ecg mean rr (60 seconds)')
plt.ylabel('PPG mean rr (60 seconds)')
# sns.lineplot(x='mean_likelihood',y='difference',data=df)
plt.colorbar()
plt.show()

# Boxplot of the absolute difference, binned by the activity statistic (100 levels in [0, 1]).
levels = np.linspace(0,1,100)
df['level'] = df['activity_std'].apply(lambda a: _nearest_level(a, levels))
df['level'] = df['level'].apply(lambda a:'{:.4f}'.format(a))
plt.rcParams.update({'font.size':20})
plt.figure(figsize=(25,15))
sns.boxplot(x='level',y='difference',data=df,showfliers = False)
plt.xlabel('75th percentile of activity standard deviation')
plt.ylabel('Absolute difference in milliseconds')
# plt.ylim([0,400])
plt.xticks(rotation=60)
plt.show()


# Keep windows built from at least 10 rows. .copy() makes the filtered frame independent,
# so the assignments below do not write into a slice (avoids SettingWithCopyWarning).
df = df[df['length']>=10].copy()
# df = df[df['activity_std']>.0204]
df['difference'] = np.abs(df['ecg_rr']-df['ppg_rr'])

# Boxplot of the absolute difference, binned by mean likelihood (10 levels in [0, 1]).
levels = np.linspace(0,1,10)
df['level'] = df['mean_likelihood'].apply(lambda a: _nearest_level(a, levels))
plt.rcParams.update({'font.size':20})
plt.figure(figsize=(25,15))
sns.boxplot(x='level',y='difference',data=df,showfliers = False)
plt.xlabel('Mean Likelihood in a minute')
plt.ylabel('Absolute difference in milliseconds')
# plt.ylim([0,400])
plt.xticks(rotation=60)
plt.show()

# Cell output: range of the activity statistic in the remaining rows.
df['activity_std'].min(),df['activity_std'].max()

In [None]:
# Histogram of the activity statistic (400 bins), zoomed to x in [0.05, 10] and y in [0, 50].
plt.figure(figsize=(20, 10))
data['activity_std'].hist(bins=400)
plt.xlim(0.05, 10)
plt.ylim(0, 50)
plt.show()


In [None]:
import pickle

# NOTE(security): pickle.load can execute code stored in the file; only load trusted files.
# Context managers close each file handle as soon as its object has been loaded.
with open('../data/ppg.p', 'rb') as ppg_file:
    ppg = pickle.load(ppg_file)
with open('../data/acl.p', 'rb') as acl_file:
    acl = pickle.load(acl_file)

In [None]:
from scipy import signal
def preProcessing(X1,Fs=100,fil_type='ppg'):
    """
    Remove the mean of a single-channel signal and apply a least-squares FIR bandpass filter.

    :param X1: input samples; any array-like (numpy array, list, pandas Series) is accepted
               and flattened to one dimension
    :param Fs: sampling frequency; the filter band edges are given in the same unit and the
               last edge is Fs/2, so Fs/2 must exceed 3.5
    :param fil_type: unused; kept so existing callers that pass it keep working
    :return: 1-D numpy array with the same number of samples as the input
    """
    # np.asarray lets callers pass lists or Series, not only ndarrays with .reshape.
    column = np.asarray(X1).reshape(-1, 1)
    # type='constant' subtracts the mean (no linear trend removal).
    centered = signal.detrend(column, axis=0, type='constant')

    # 65-tap least-squares design: stop band 0-0.2, pass band 0.3-3, stop band 3.5-Fs/2.
    band_edges = np.array([0, 0.2, 0.3, 3, 3.5, Fs/2])
    band_gains = np.array([0, 0, 1, 1, 0, 0])
    # One weight per band; the lower stop band is weighted 100x more than the other two.
    band_weights = np.array([100*0.02, 0.02, 0.02])
    # `weight` is passed by keyword: it works on every SciPy release that has `fs`,
    # whereas newer releases reject it as a positional argument.
    b = signal.firls(65, band_edges, band_gains, weight=band_weights, fs=Fs)

    # mode='same' keeps the output the same length as the input signal.
    X2 = signal.convolve(centered.reshape(-1), b, mode='same')
    return X2

In [None]:
# Order the samples by time and renumber the index 0..n-1 (the old index is discarded).
ppg = ppg.sort_values('timestamp').reset_index(drop=True)
# Filter the 'ppg1' channel with preProcessing at its default Fs=100 and store it as a new column.
ppg['filtered_ppg'] = preProcessing(ppg['ppg1'].values)

In [None]:
# Plot the filtered PPG for rows 10000-29999 (positional slice) against time.
segment = ppg.iloc[10000:30000]
plt.figure(figsize=(20, 10))
plt.plot(segment['timestamp'], segment['filtered_ppg'])
plt.show()

In [None]:
# Split the recording into consecutive 5-second groups keyed on 'timestamp' and keep
# only the per-group frames (the group labels are discarded).
five_second_bins = pd.Grouper(key='timestamp', freq='5S')
ppg_col = [window_df for _, window_df in ppg.groupby(five_second_bins)]

In [None]:
# Draw one figure per 5-second window of filtered PPG.
# NOTE(review): the loop variable is named `df`, so after this cell the notebook-level
# `df` refers to the last window rather than the analysis frame built earlier.
for df in ppg_col:
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.plot(df['timestamp'], df['filtered_ppg'])
    plt.show()