In [1]:
%%capture

import audata as aud
import h5py as h5
from tsfresh import extract_features
import pandas as pd
import numpy as np
import scipy.stats
import concurrent.futures
import threading
import traceback
import biosignalsnotebooks as bsnb
from multiprocessing import Pool, TimeoutError
from sklearn.linear_model import LinearRegression

import rpy2
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr
robjects.r['source']('pf.R')
robjects.r['source']('feature_extract_alert_p3.R')

files_dir = '/home/auvdata/projects/conditionc-new/originals/'

In [2]:
labels = pd.read_csv('../finalLabels.csv')

# Filter alerts with less than 3 minute timespan
labels['time'] = labels['right'] - labels['left']
before = labels.shape[0]
labels = labels[labels['time'] > 180]
labels.drop(columns=['time'], inplace=True)
print(f"Removed {before-labels.shape[0]} labels with less than a 3-minute timespan.")

Removed 80 labels with less than a 3-minute timespan.


# Function Definitions

In [59]:
def run(featurize, featurizeThese, columns):
    
    featurizedData = pd.DataFrame(columns=columns)

    i = 0
    print("Featurizing (sync)...\n")
    for row in featurizeThese.iterrows():
        i += 1
        print(f"{i} of {featurizeThese.shape[0]}")
        newRow = featurize(row)
        if newRow is not None:
            featurizedData = featurizedData.append(pd.DataFrame([newRow], columns=featurizedData.columns), ignore_index=True)
        else:
            print("None returned. Skipping!")
                
    return finalizeDataset(featurizedData)

def run_async(featurize, featurizeThese, columns, numProcs):
    
    featurizedData = pd.DataFrame(columns=columns)

    with Pool(numProcs) as p:

        i = 0
        num = len(featurizeThese)

        print("Featurizing (async)...\n")
        it = p.imap_unordered(featurize, featurizeThese.iterrows())
        while i < num:

            i += 1

            try:
                #newRow = it.next(timeout=600)
                newRow = it.next()
                print(f"{i} of {num} returned.")
                print("New row:", newRow, sep="\n")
                if newRow is not None:
                    featurizedData = featurizedData.append(pd.DataFrame([newRow], columns=featurizedData.columns), ignore_index=True)
                else:
                    print("None returned. Skipping!")
            except TimeoutError as te:
                print(f"Timed out. {te}")
                traceback.print_exc()
            except StopIteration:
                print("Pool iteration ended.")
                break
            except Exception as e:
                print(f"Other exception... {e}")
                traceback.print_exc()
            
    return finalizeDataset(featurizedData)
            
def finalizeDataset(ds):

    # Replace any inf values with nan
    return ds.replace([np.inf, -np.inf], np.nan)

# Open the file in both audata and h5py
def openFiles(fn):
    return aud.File.open(files_dir+fn, readonly=True), h5.File(files_dir+fn, 'r')
    
# Validate that data is at least minSamplesToBeValid, not a 0 timespan
def validateDataset(data):
    return False if data.shape[0] < minSamplesToBeValid or data.iloc[-1].time - data.iloc[0].time == 0 else True
   
# Convert a list/vector(?) response from R into a dict
def getDictFromR(o):
    return dict(zip(o.names, map(list,list(o))))

def isWaveform(s):
    return s.startswith('/data/waveforms')

# Return basic statistics for a pandas series
def getBasicStats(ds):
    
    features = []
    
    _mean = ds.mean()
    _median = ds.median()
    _std = ds.std()
    _min = ds.min()
    _max = ds.max()
    
    features.append(_mean) # mean
    features.append(_std) # sd
    features.append(_std/_mean if _mean != 0 else np.nan)# cv
    features.append((ds-_mean).median()) # mad
    features.append(hrrr.shape[0]) # n
    features.append(_min) # min
    features.append(_max) # max
    features.append(_median) # median
    features.append(_max-_min) # range
    features.append((_max-_min)/_median if _median != 0 else np.nan)# range_ratio
    
    return features

# Get best numeric series from available options
# f=file, so=series options, left=left timestamp
def getBestSeries(f, h5f, so, left):
    
    printout and print(f"getBestSeries(f, h5f, {so}, {left}) started.")
    
    finalSeries = None
    finalSeriesSampleCount = -1
    finalSeriesTREF = None
    finalSeriesDF = None
    finalSeriesData = None

    # Iterate through the series options
    for s in so:

        # If the series is not available in the file, skip
        if s not in f:
            continue
            
        tref = f[s].time_reference.timestamp()

        if isWaveform(s):
            
            df = h5f[s]
            data = pd.DataFrame(df[(df['time'] > left-tref) & (df['time'] < left+60*3-tref)])
        
        else:
            
            df = pd.DataFrame(f[s].get(raw=True)).drop_duplicates(subset=['time'], ignore_index=True)

            # Downsample if requested. Note we do not downsample NBP because it is already a low-rate measurement.
            if downsample_to_ppinnc1 and sk in ['hr', 'rr', 'spo2']:
                df = df.iloc[::20].reset_index(drop=True)

            data = df[ (df.time > left-tref) & (df.time < left+60*3-tref) ]

        # If data validates and has more samples than current final series, it's our new final series
        if validateDataset(data) and data.shape[0] > finalSeriesSampleCount:

            finalSeries = s
            finalSeriesSampleCount = data.shape[0]
            finalSeriesTREF = tref
            finalSeriesDF = df
            finalSeriesData = data
            
    printout and print(f"getBestSeries(f, h5f, {so}, {left}) returning.")

    return finalSeries, finalSeriesTREF, finalSeriesDF, finalSeriesData

def featurize(row):

    # Grab the row content
    row = row[1]
    
    printout and print(f"File: {row['filename']}\nAlert Series: {row['series']}\nLabel: {'REAL' if row['real_vs_art']==1 else 'ARTIFACT'}")

    # Open the file in both audata and h5py
    (f, h5f) = openFiles(row['filename'])
    
    # This will hold this alert's computed features
    features = [row['filename'], row['series'], row['left'], row['right'], row['real_vs_art']]
    
    # Iterate through the signal types (each signal type e.g. hr, spo2 will get one set of features)
    for sk, so in seriesOptions.items():
        
        # TODO(gus): If this sk is for the label target series category, use the label target series.
        
        (series, tref, df, data) = getBestSeries(f, h5f, so, row['left'])
        
        # If we have no available series for this signal type, add empty values for the features and move on.
        if series is None:
            print(f"No data available for {sk}!")
            
            # TODO(gus): temp
            if sk == 'pleth' or sk == 'ecgii':
                # this is not temp, just pull it out of the if
                features = features + ( [np.nan]*(len(featuresList) + (3 if sk == 'pleth' else 0)) )
            
            # If we have no data for our CRI target series (e.g. if we've invalidated it), skip this CRI alert entirely
            if row['series'].split(':')[0] in seriesOptions[sk]:
                print(f"Not enough / invalid data for for CRI target series. Discluding this CRI alert!")
                return None
            
            continue
        
        # TODO(gus): temp
        if sk != 'pleth' and sk != 'ecgii':
            print(f"Temporarily skipping {sk}")
            continue
        
        printout and print(f"Featurizing series {series}, timespan: {round((data.iloc[-1].time - data.iloc[0].time) / 60, 2)} minutes, sample count: {data.shape[0]}")
        
        _mean = data.value.mean()
        _median = data.value.median()
        _std = data.value.std()
        _min = data.value.min()
        _max = data.value.max()
        features.append(_mean) # mean
        features.append(_std) # sd
        features.append(_std/_mean if _mean != 0 else np.nan) # cv
        features.append((data.value - _mean).median()) # mad
        features.append(data.shape[0]) # n
        features.append(_min) # min
        features.append(_max) # max
        features.append(_median) # median
        features.append(_max-_min) # range
        features.append((_max-_min)/_median if _median != 0 else np.nan) # range_ratio
        
        printout and print("Retrieving trail.")
        trail = df[ (df['time'] < row['left']-tref) & (df['time'] > row['left']-tref-15*60) ]
        if isWaveform(series):
            trail = pd.DataFrame(trail)
        printout and print("Retrieving trail complete.")

        features.append( (data.shape[0])/(row['right']-row['left']) ) # data_den. note: use cri timespan, not sample timespan

        trail_count = trail.shape[0]
        if not validateDataset(trail):
            features = features + ( [np.nan]*2 ) # data_den_trail, data_den_trail2
        else:
            features.append(trail_count / (trail.time.iloc[-1]-trail.time.iloc[0])) # data_den_trail
            features.append(trail_count / (15*60)) # data_den_trail2

        # TODO(gus): Ignoring this for waveforms for now
        if isWaveform(series):
            features.append(np.nan) # delta_t
        else:
            features.append(df.loc[data.index[0], 'time'] - df.loc[data.index[0]-1, 'time'] if data.index[0] > 0 else np.nan) # delta_t
        features.append(data.time.diff().max()) # max_gap

        # skipping for now... spec_ratio
        # skipping for now... max_spec
#         with localconverter(robjects.default_converter + pandas2ri.converter):
#             ret = getDictFromR(robjects.r['get_spectral_lomb'](data['time'], data['value']))
#         print('spec:', ret)

        printout and print("Retrieving dl4.")
        dl4 = df[(df['time'] >= row['left']-tref-4*60) & (df['time'] < row['left']-tref)]
        if isWaveform(series):
            dl4 = pd.DataFrame(dl4)
        printout and print("Retrieving dl4 complete.")

        if not validateDataset(dl4):

            features = features + ( [np.nan]*10 ) # delta_mean, delta_std, MWW-test statistic & p-value, KS-test statistic & p-value, t-test statistic & p-value, f-test statistic & p-value

        else:

            features.append(abs(dl4.value.mean()-_mean)) # delta_mean
            features.append(abs(dl4.value.std()-_std)) # delta_std

            #print(data['value'], dl4['value'])
            try:
                (mww_stat, mww_pval) = scipy.stats.mannwhitneyu(data['value'], dl4['value'])
                features.append(mww_stat)# MWW-test statistic
                features.append(mww_pval)# MWW-test p-value
            except:
                features = features + ( [np.nan]*2 )

            (ks_stat, ks_pval) = scipy.stats.ks_2samp(data['value'], dl4['value'])
            features.append(ks_stat) # KS-test statistic
            features.append(ks_pval) # KS-test p-value

            (tt_stat, tt_pval) = scipy.stats.ttest_ind(data['value'], dl4['value'])
            features.append(tt_stat) # t-test statistic
            features.append(tt_pval) # t-test p-value

            try:
                with localconverter(robjects.default_converter + pandas2ri.converter):
                    ret = getDictFromR(robjects.r['var.test'](data['value'], dl4['value']))
                features.append(ret['statistic'][0]) # f-test statistic
                features.append(ret['p.value'][0]) # f-test p-value
            except Exception:
                traceback.print_exc()
                print(row, data['value'], dl4['value'], sep="\n")
                features.append(np.nan)
                features.append(np.nan)

        # this section calculates f-test in python, but results often differ from r
#         x = data['value']
#         y = dl4['value']
#         f = x.var()/y.var()
#         p = 2*scipy.stats.f.cdf(f, x.shape[0]-1, y.shape[0]-1)
#         print(f, p)
#         print('p-python/p-r', vtr['p.value'][0]/p)

        slrm = LinearRegression()
        slrm.fit(data[['time']], data['value'])
        features.append(slrm.coef_[0]) # slope

        with localconverter(robjects.default_converter + pandas2ri.converter):
            ret = robjects.r['get_robust_slope'](data['time'], data['value'])
        features.append(ret[0]) # rslope

        with localconverter(robjects.default_converter + pandas2ri.converter):
            ret = robjects.r['get_slope_break'](data['time'], data['value'])
        features.append(ret[0][0]) # slope_before
        features.append(ret[1][0]) # slope_after
        features.append(ret[2][0]) # num_breakpoint

        data_dropduptimes = data.drop_duplicates(subset=['time'], keep='last')
        dydx = (data_dropduptimes.value.diff() / data_dropduptimes.time.diff())[1:]
        features.append(dydx.max()) # diff1_max
        features.append(dydx.min()) # diff1_min

        # TODO(gus): Ignoring this for waveforms for now
        if isWaveform(series):
            features = features + ([np.nan]*2)
        else:
            with localconverter(robjects.default_converter + pandas2ri.converter):
                ret = getDictFromR(robjects.r['comp_grads'](data['time'], data['value']))
            features.append(ret['max_grad'][0]) # max_grad
            features.append(ret['min_grad'][0]) # min_grad

        with localconverter(robjects.default_converter + pandas2ri.converter):
            ret = getDictFromR(robjects.r['get_rsq'](data['time'], data['value']))
        features.append(ret['rsq'][0]) # quad_rsq
        features.append(ret['coef1'][0]) # quad_coef1
        features.append(ret['coef2'][0]) # quad_coef2
        features.append(ret['var.res'][0]) # quad_resvar

        with localconverter(robjects.default_converter + pandas2ri.converter):
            ret = getDictFromR(robjects.r['get.osci.index'](data['time'], data['value']))
        features.append(ret['up'][0]) # osi_up
        features.append(ret['down'][0]) # osi_down
        features.append(ret['ratio'][0]) # osi_ratio
        
        if sk == 'pleth':
            
            data['time'] = pd.to_datetime(data['time']+tref, unit='s')
            data.set_index('time', inplace=True)
            features.append(data.resample('3s').agg(lambda x: ((x-x.mean()/x.std())**3).sum()/x.shape[0]).mean()[0]) # skew
            features.append(data.resample('3s').agg(lambda x: ((x-x.mean()/x.std())**4).sum()/x.shape[0]).mean()[0]) # kurtosis
            features.append(data.resample('3s').agg(lambda x: (x**2 * np.log(x**2)).sum()*-1).mean()[0]) # entropy

    print("File done.")
    printout and print()
    
    return features

def featurize_targeted(row):
    
    # Grab the row content
    row = row[1]
    
    # Open the file in both audata and h5py
    (f, h5f) = openFiles(row['filename'])
    
    # This will hold this alert's computed features
    features = [row['filename'], row['series'], row['left'], row['right'], row['label']]
    
    sk = 'ecgii'
    
    (series, tref, df, data) = getBestSeries(f, h5f, seriesOptions[sk], row['left'])
    
    # If we have no available series for this signal type, add empty values for the features and move on.
    if series is None:
        
        print(f"No data available for {sk}!")
        features = features + [np.nan]
        
    else:
        
        try:
        
            freq = int(round(1/(data.loc[1, 'time']-data.loc[0, 'time'])))

            time_r_peaks, amplitude_r_peaks = bsnb.detect_r_peaks(data['value'], freq, time_units=True, plot_result=False)

            vpp_signal_ecg = np.ptp(data['value'])

            # For this task, we will follow the same procedure as shown before, but store the values in a list, so that we can then calculate the mean value.
            vpp_noise_ecg = []
            for t in time_r_peaks:
                start = int((t + 0.5) * freq) # 0.5 - time between a peak and a flat 
                end = int((t + 0.65)* freq) # 0.65 time between a peak and the end of the flat
                interval = data['value'][start:end]
                if interval.shape[0] == 0:
                    print("Skipping len=0 noise segment.")
                    continue
                vpp = np.ptp(interval)
                vpp_noise_ecg.append(vpp)

            vpp_noise_ecg = np.mean(vpp_noise_ecg)

            features.append(vpp_signal_ecg/vpp_noise_ecg)
            
        except Exception as e:
            
            print(f"Exception occurred while computing SNR: {e}")
            features.append(np.nan)
        
    (seriesHR, trefHR, dfHR, dataHR) = getBestSeries(f, h5f, seriesOptions['hr'], row['left'])
    (seriesRR, trefRR, dfRR, dataRR) = getBestSeries(f, h5f, seriesOptions['rr'], row['left'])
    if seriesHR is None or seriesRR is None:
        print(f"No data available for hr+rr!")
        features = features + [np.nan]*10
        
    else:
        
        dataHR.set_index('time', inplace=True)
        dataRR.set_index('time', inplace=True)
        hrrr = pd.concat([dataHR, dataRR], axis=1).dropna()
        hrrr.columns = ['hr', 'rr']
        features = features + getBasicStats(hrrr['hr'] / hrrr['rr'])

    return features

# Action

In [62]:
downsample_to_ppinnc1 = False
printout = False
minSamplesToBeValid = 5

# Standard list of features to compute for each series
featuresList = [
    'mean', 'sd', 'cv', 'mad', 'n', 'min', 'max', 'median', 'range', 'range_ratio',
    'data_den', 'data_den_trail', 'data_den_trail2', 'delta_t', 'max_gap',
    # 'spec_ratio', 'max_spec',
    'delta_mean', 'delta_sd', 'MWW_stat', 'MWW_pvalue', 'KS_stat', 'KS_pvalue', 't_stat', 't_pvalue', 'F_stat', 'F_pvalue',
    'slope', 'rslope', 'slope_before', 'slope_after', 'num_breakpoint', 'diff1_max', 'diff1_min', 'max_grad', 'min_grad', 'quad_rsq', 'quad_coef1', 'quad_coef2', 'quad_resvar', 'osi_up', 'osi_down', 'osi_ratio',
]

# List of series to compute features for, by signal type
seriesOptions = {
    'hr':    ['/data/numerics/HR.HR'],
    'rr':    ['/data/numerics/RR.RR'],
    'spo2':  ['/data/numerics/SpO₂.SpO₂', '/data/numerics/SpO₂T.SpO₂T'],
    'bpd':   ['/data/numerics/NBP.NBPd', '/data/numerics/NBP.NBP-D'],
    'bps':   ['/data/numerics/NBP.NBPs', '/data/numerics/NBP.NBP-S'],
}
if not downsample_to_ppinnc1:
    seriesOptions['abpd'] = ['/data/numerics/AR1-D', '/data/numerics/ART.Diastolic']
    seriesOptions['abps'] = ['/data/numerics/AR1-S', '/data/numerics/ART.Systolic']
    seriesOptions['ecgii'] = ['/data/waveforms/II']
    seriesOptions['pleth'] = ['/data/waveforms/Pleth', '/data/waveforms/PlethT']

# The columns end up being filename, series, left, right, label, and then one set of featuresList column names for each series.
columns = \
    ['filename', 'series', 'left', 'right', 'label']+ \
    [ser +'_' + feat for ser in ['hr', 'rr', 'spo2', 'bpd', 'bps'] for feat in featuresList]+ \
    ([ser +'_' + feat for ser in ['abpd', 'abps'] for feat in featuresList] if not downsample_to_ppinnc1 else [])+ \
    ['ecgii_' + feat for feat in featuresList]+ \
    ['pleth_' + feat for feat in featuresList+['skew', 'kurtosis', 'entropy']]
    
# Label set to featurize
# k=0; featurizeThese = labels.iloc[k:k+1]
# featurizeThese = labels[20:45]
# featurizeThese = labels.drop(labels.index[97])[95:]
featurizeThese = labels

# run(featurize, featurizeThese, columns)


# Label set to featurize
prevFeatureSet = pd.read_csv('data_full_pleth2.csv')[317:318]
columns = \
    ['filename', 'series', 'left', 'right', 'label', 'ecgii_snr']+ \
    ['hrrr_ratio_'+feat for feat in ['mean', 'sd', 'cv', 'mad', 'n', 'min', 'max', 'median', 'range', 'range_ratio']]
# featurizedData = run(featurize_targeted, prevFeatureSet, columns)
featurizedData = run_async(featurize_targeted, prevFeatureSet, columns, 18)




print("Run complete.")

with pd.option_context("display.max_columns", 1000):
    print("Featurized Data:")
    display(featurizedData)

Featurizing (async)...

Exception occurred while computing SNR: The length of the input vector x must be greater than padlen, which is 150.
No data available for hr+rr!
1 of 1 returned.
New row:
['20190805_1157669_1498169.h5', '/data/numerics/HR.HR:value', 1539680314, 1539681809, 0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
Run complete.
Featurized Data:


Unnamed: 0,filename,series,left,right,label,ecgii_snr,hrrr_ratio_mean,hrrr_ratio_sd,hrrr_ratio_cv,hrrr_ratio_mad,hrrr_ratio_n,hrrr_ratio_min,hrrr_ratio_max,hrrr_ratio_median,hrrr_ratio_range,hrrr_ratio_range_ratio
0,20190805_1157669_1498169.h5,/data/numerics/HR.HR:value,1539680314,1539681809,0,,,,,,,,,,,


In [None]:
labels

In [None]:
prevFeatureSet

# Export

In [63]:
featurizedDataMissingOneSafe = featurizedDataMissingOne.copy()

In [None]:
# ### Export to model-specific datasets

# suf = '_pleth'

# ds = featurizedData_PLETH

# ds.to_csv(f'data_full{suf}.csv', index=False)
# ds.drop(columns=['series', 'left', 'right']).to_csv(f'data_withfn{suf}.csv', index=False)

# ds[ds['series'] == '/data/numerics/HR.HR:value'].drop(columns=['series', 'left', 'right']).to_csv(f'data_hr_withfn{suf}.csv', index=False)
# ds[ds['series'] == '/data/numerics/RR.RR:value'].drop(columns=['series', 'left', 'right']).to_csv(f'data_rr_withfn{suf}.csv', index=False)
# ds[ds['series'].isin(['/data/numerics/SpO₂.SpO₂:value', '/data/numerics/SpO₂T.SpO₂T:value'])].drop(columns=['series', 'left', 'right']).to_csv(f'data_spo2_withfn{suf}.csv', index=False)

In [44]:
featurizedData.sort_values(by=['filename', 'series', 'left'])

Unnamed: 0,filename,series,left,right,label,ecgii_snr,hrrr_ratio_mean,hrrr_ratio_sd,hrrr_ratio_cv,hrrr_ratio_mad,hrrr_ratio_n,hrrr_ratio_min,hrrr_ratio_max,hrrr_ratio_median,hrrr_ratio_range,hrrr_ratio_range_ratio
16,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1515595508,1515596030,1,14.909494,8.677217,1.638820,0.188865,0.197783,141.0,4.800000,11.916667,8.875000,7.116667,0.801878
17,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1515638422,1515643674,1,14.332886,7.477863,1.204421,0.161065,-0.007275,141.0,5.111111,10.538462,7.470588,5.427350,0.726496
9,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1515912676,1515930671,1,39.233044,,,,,,,,,,
8,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1516035392,1516037744,1,,,,,,,,,,,
5,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1516039795,1516052674,1,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,20200630_1934279_1011611.h5,/data/numerics/HR.HR:value,1535443106,1535443609,1,1.945634,5.836121,0.431140,0.073874,0.063879,141.0,4.640000,6.823529,5.900000,2.183529,0.370090
589,20200630_1934279_1011611.h5,/data/numerics/RR.RR:value,1535421926,1535422231,1,17.361591,2.456410,0.502780,0.204681,-0.087632,141.0,1.652174,3.545455,2.368778,1.893281,0.799265
590,20200630_1934279_1011611.h5,/data/numerics/RR.RR:value,1535597897,1535599266,1,32.575806,4.291524,0.379917,0.088527,-0.041524,141.0,3.432432,5.565217,4.250000,2.132785,0.501832
592,20200630_1934279_1011611.h5,/data/numerics/SpO₂.SpO₂:value,1535465918,1535466465,1,29.606163,3.319307,0.085594,0.025787,0.014026,141.0,3.136364,3.500000,3.333333,0.363636,0.109091


In [42]:
prevFeatureSet.sort_values(by=['filename', 'series', 'left'])

Unnamed: 0,filename,series,left,right,label,hr_mean,hr_sd,hr_cv,hr_mad,hr_n,...,pleth_mad,pleth_n,pleth_min,pleth_max,pleth_median,pleth_range,pleth_range_ratio,pleth_skew,pleth_kurtosis,pleth_entropy
0,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1515595508,1515596030,1,139.865248,3.468676,0.024800,1.134752,141.0,...,,,,,,,,,,
1,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1515638422,1515643674,1,138.085106,3.485663,0.025243,-0.085106,141.0,...,-0.001627,22500.0,0.000000,1.000000,0.492796,1.000000,2.029237,-859.174011,14581.756836,116.537605
2,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1515912676,1515930671,1,63.564286,3.393203,0.053382,0.435714,140.0,...,,,,,,,,,,
3,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1516035392,1516037744,1,111.957447,5.856464,0.052310,-0.957447,141.0,...,,,,,,,,,,
4,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1516039795,1516052674,1,161.765957,1.830840,0.011318,0.234043,141.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,20200630_1934279_1011611.h5,/data/numerics/HR.HR:value,1535443106,1535443609,1,115.426136,2.403616,0.020824,-0.426136,176.0,...,0.014405,22500.0,0.250305,0.750183,0.514774,0.499878,0.971063,-39.487469,136.033524,118.272507
596,20200630_1934279_1011611.h5,/data/numerics/RR.RR:value,1535421926,1535422231,1,79.392045,3.450837,0.043466,-0.392045,176.0,...,0.004972,22500.0,0.250305,0.750183,0.505006,0.499878,0.989845,-29.736944,93.264023,116.483162
597,20200630_1934279_1011611.h5,/data/numerics/RR.RR:value,1535597897,1535599266,1,127.142045,1.236236,0.009723,-0.142045,176.0,...,0.007035,22500.0,0.250305,0.750183,0.505739,0.499878,0.988411,-51.701866,200.224915,119.124802
598,20200630_1934279_1011611.h5,/data/numerics/SpO₂.SpO₂:value,1535465918,1535466465,1,69.863636,0.712048,0.010192,0.136364,176.0,...,-0.014382,22500.0,0.250305,0.750183,0.485958,0.499878,1.028643,-35.061020,115.517738,117.092415


In [46]:
a=featurizedData.sort_values(by=['filename', 'series', 'left']).reset_index()
b=prevFeatureSet.sort_values(by=['filename', 'series', 'left']).reset_index()

In [47]:
a[~a.isin(b).all(1)]

Unnamed: 0,index,filename,series,left,right,label,ecgii_snr,hrrr_ratio_mean,hrrr_ratio_sd,hrrr_ratio_cv,hrrr_ratio_mad,hrrr_ratio_n,hrrr_ratio_min,hrrr_ratio_max,hrrr_ratio_median,hrrr_ratio_range,hrrr_ratio_range_ratio
0,16,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1515595508,1515596030,1,14.909494,8.677217,1.638820,0.188865,0.197783,141.0,4.800000,11.916667,8.875000,7.116667,0.801878
1,17,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1515638422,1515643674,1,14.332886,7.477863,1.204421,0.161065,-0.007275,141.0,5.111111,10.538462,7.470588,5.427350,0.726496
2,9,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1515912676,1515930671,1,39.233044,,,,,,,,,,
3,8,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1516035392,1516037744,1,,,,,,,,,,,
4,5,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1516039795,1516052674,1,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,591,20200630_1934279_1011611.h5,/data/numerics/HR.HR:value,1535443106,1535443609,1,1.945634,5.836121,0.431140,0.073874,0.063879,141.0,4.640000,6.823529,5.900000,2.183529,0.370090
595,589,20200630_1934279_1011611.h5,/data/numerics/RR.RR:value,1535421926,1535422231,1,17.361591,2.456410,0.502780,0.204681,-0.087632,141.0,1.652174,3.545455,2.368778,1.893281,0.799265
596,590,20200630_1934279_1011611.h5,/data/numerics/RR.RR:value,1535597897,1535599266,1,32.575806,4.291524,0.379917,0.088527,-0.041524,141.0,3.432432,5.565217,4.250000,2.132785,0.501832
597,592,20200630_1934279_1011611.h5,/data/numerics/SpO₂.SpO₂:value,1535465918,1535466465,1,29.606163,3.319307,0.085594,0.025787,0.014026,141.0,3.136364,3.500000,3.333333,0.363636,0.109091


In [48]:
a=a[['filename', 'series', 'left']]
b=b[['filename', 'series', 'left']]

In [52]:
a.values.difference(b.values)

AttributeError: 'numpy.ndarray' object has no attribute 'difference'

In [53]:
ds1 = set([tuple(line) for line in a.values])
ds2 = set([tuple(line) for line in b.values])

In [55]:
ds2.difference(ds1)

{('20190805_1157669_1498169.h5', '/data/numerics/HR.HR:value', 1539680314)}

In [69]:
pd.concat([featurizedDataMissingOne, featurizedData], ignore_index=True).copy()[['filename', 'series', 'left', 'right']].drop_duplicates()

Unnamed: 0,filename,series,left,right
0,20190523_1070382_1193704.h5,/data/numerics/HR.HR:value,1501725628,1501726167
1,20190523_1070382_1193704.h5,/data/numerics/HR.HR:value,1501605430,1501612529
2,20190523_1070382_1193704.h5,/data/numerics/HR.HR:value,1501660023,1501660527
3,20190523_1070382_1193704.h5,/data/numerics/SpO₂T.SpO₂T:value,1501591667,1501592233
4,20190523_1070382_1193704.h5,/data/numerics/HR.HR:value,1501698845,1501705251
...,...,...,...,...
595,20191112_1903642_1927237.h5,/data/numerics/SpO₂.SpO₂:value,1530421160,1530421684
596,20191112_1903642_1927237.h5,/data/numerics/SpO₂.SpO₂:value,1530390852,1530391282
597,20191112_1903642_1927237.h5,/data/numerics/SpO₂.SpO₂:value,1530316252,1530316656
598,20191112_1903642_1927237.h5,/data/numerics/SpO₂.SpO₂:value,1530512687,1530513128


In [89]:
for f in list(nd.columns):
    print(f)

ecgii_snr
hrrr_ratio_mean
hrrr_ratio_sd
hrrr_ratio_cv
hrrr_ratio_mad
hrrr_ratio_n
hrrr_ratio_min
hrrr_ratio_max
hrrr_ratio_median
hrrr_ratio_range
hrrr_ratio_range_ratio


In [72]:
### Combine datasets (for targeted featurization)

# new data
#nd = featurizedData.copy()
nd = pd.concat([featurizedDataMissingOne, featurizedData], ignore_index=True).copy()
nd.set_index(['filename', 'series', 'left', 'right'], inplace=True)
nd.sort_index(inplace=True)
nd.drop(columns=['label'], inplace=True)

# old data
#od = pd.read_csv('data_full_pleth15s.csv')
#od = prevFeatureSet.copy()
od = pd.read_csv('data_full_pleth2.csv')
od.set_index(['filename', 'series', 'left', 'right'], inplace=True)
od.sort_index(inplace=True)
#od.drop(columns=['pleth_skew', 'pleth_kurtosis', 'pleth_entropy'], inplace=True)

combined = pd.concat([od, nd], axis=1).reset_index()

In [78]:
with pd.option_context("display.max_columns", 1000):
    display(combined)

Unnamed: 0,filename,series,left,right,label,hr_mean,hr_sd,hr_cv,hr_mad,hr_n,hr_min,hr_max,hr_median,hr_range,hr_range_ratio,hr_data_den,hr_data_den_trail,hr_data_den_trail2,hr_delta_t,hr_max_gap,hr_delta_mean,hr_delta_sd,hr_MWW_stat,hr_MWW_pvalue,hr_KS_stat,hr_KS_pvalue,hr_t_stat,hr_t_pvalue,hr_F_stat,hr_F_pvalue,hr_slope,hr_rslope,hr_slope_before,hr_slope_after,hr_num_breakpoint,hr_diff1_max,hr_diff1_min,hr_max_grad,hr_min_grad,hr_quad_rsq,hr_quad_coef1,hr_quad_coef2,hr_quad_resvar,hr_osi_up,hr_osi_down,hr_osi_ratio,rr_mean,rr_sd,rr_cv,rr_mad,rr_n,rr_min,rr_max,rr_median,rr_range,rr_range_ratio,rr_data_den,rr_data_den_trail,rr_data_den_trail2,rr_delta_t,rr_max_gap,rr_delta_mean,rr_delta_sd,rr_MWW_stat,rr_MWW_pvalue,rr_KS_stat,rr_KS_pvalue,rr_t_stat,rr_t_pvalue,rr_F_stat,rr_F_pvalue,rr_slope,rr_rslope,rr_slope_before,rr_slope_after,rr_num_breakpoint,rr_diff1_max,rr_diff1_min,rr_max_grad,rr_min_grad,rr_quad_rsq,rr_quad_coef1,rr_quad_coef2,rr_quad_resvar,rr_osi_up,rr_osi_down,rr_osi_ratio,spo2_mean,spo2_sd,spo2_cv,spo2_mad,spo2_n,spo2_min,spo2_max,spo2_median,spo2_range,spo2_range_ratio,spo2_data_den,spo2_data_den_trail,spo2_data_den_trail2,spo2_delta_t,spo2_max_gap,spo2_delta_mean,spo2_delta_sd,spo2_MWW_stat,spo2_MWW_pvalue,spo2_KS_stat,spo2_KS_pvalue,spo2_t_stat,spo2_t_pvalue,spo2_F_stat,spo2_F_pvalue,spo2_slope,spo2_rslope,spo2_slope_before,spo2_slope_after,spo2_num_breakpoint,spo2_diff1_max,spo2_diff1_min,spo2_max_grad,spo2_min_grad,spo2_quad_rsq,spo2_quad_coef1,spo2_quad_coef2,spo2_quad_resvar,spo2_osi_up,spo2_osi_down,spo2_osi_ratio,bpd_mean,bpd_sd,bpd_cv,bpd_mad,bpd_n,bpd_min,bpd_max,bpd_median,bpd_range,bpd_range_ratio,bpd_data_den,bpd_data_den_trail,bpd_data_den_trail2,bpd_delta_t,bpd_max_gap,bpd_delta_mean,bpd_delta_sd,bpd_MWW_stat,bpd_MWW_pvalue,bpd_KS_stat,bpd_KS_pvalue,bpd_t_stat,bpd_t_pvalue,bpd_F_stat,bpd_F_pvalue,bpd_slope,bpd_rslope,bpd_slope_before,bpd_slope_after,bpd_num_breakpoint,bpd_diff1_max,bpd_diff1_min,bpd_max_grad,bpd_min_grad,bpd_quad_rsq,bpd_quad_coef1,bpd_quad_coef2,bpd_quad_resvar,bpd_osi_up,bpd_osi_down,bpd_osi_ratio,bps_mean,bps_sd,bps_cv,bps_mad,bps_n,bps_min,bps_max,bps_median,bps_range,bps_range_ratio,bps_data_den,bps_data_den_trail,bps_data_den_trail2,bps_delta_t,bps_max_gap,bps_delta_mean,bps_delta_sd,bps_MWW_stat,bps_MWW_pvalue,bps_KS_stat,bps_KS_pvalue,bps_t_stat,bps_t_pvalue,bps_F_stat,bps_F_pvalue,bps_slope,bps_rslope,bps_slope_before,bps_slope_after,bps_num_breakpoint,bps_diff1_max,bps_diff1_min,bps_max_grad,bps_min_grad,bps_quad_rsq,bps_quad_coef1,bps_quad_coef2,bps_quad_resvar,bps_osi_up,bps_osi_down,bps_osi_ratio,ecgii_mean,ecgii_sd,ecgii_cv,ecgii_mad,ecgii_n,ecgii_min,ecgii_max,ecgii_median,ecgii_range,ecgii_range_ratio,pleth_mean,pleth_sd,pleth_cv,pleth_mad,pleth_n,pleth_min,pleth_max,pleth_median,pleth_range,pleth_range_ratio,pleth_skew,pleth_kurtosis,pleth_entropy,ecgii_snr,hrrr_ratio_mean,hrrr_ratio_sd,hrrr_ratio_cv,hrrr_ratio_mad,hrrr_ratio_n,hrrr_ratio_min,hrrr_ratio_max,hrrr_ratio_median,hrrr_ratio_range,hrrr_ratio_range_ratio
0,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1515595508,1515596030,1,139.865248,3.468676,0.024800,1.134752,141.0,126.0,146.0,141.0,20.0,0.141844,0.270115,0.782809,0.781111,2.048,2.048,52.464179,0.724842,0.0,1.161034e-54,1.000000,0.000000e+00,120.650057,1.440056e-272,0.684180,1.830132e-02,-0.011264,-0.016833,-0.068257,-0.066920,1.0,9.765626,-1.953125,6.616793,-1.135976,0.028826,1.934979e+03,-0.011264,11.684885,0.0,0.0,,16.865248,4.144221,0.245725,-0.865248,141.0,12.0,30.0,16.0,18.0,1.125000,0.270115,0.782809,0.781111,2.048,2.048,1.501612,1.345150,10750.5,1.971148e-03,0.222589,5.499935e-04,3.911740,1.115378e-04,2.192088,6.087411e-07,-0.059093,-0.051371,-0.204129,-0.020814,1.0,5.859375,-2.929688,1.621756,-0.939216,0.555763,9.434094e+03,-0.059093,7.629583,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.459110,0.309493,0.674116,0.010890,45000.0,-1.300,2.150,0.470,3.450,7.340425,,,,,,,,,,,,,,14.909494,8.677217,1.638820,0.188865,0.197783,141.0,4.800000,11.916667,8.875000,7.116667,0.801878
1,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1515638422,1515643674,1,138.085106,3.485663,0.025243,-0.085106,141.0,117.0,144.0,138.0,27.0,0.195652,0.026847,0.776544,0.586667,1.024,2.048,56.192058,0.294731,0.0,6.993467e-55,1.000000,0.000000e+00,151.718107,1.944322e-304,1.193262,2.600455e-01,-0.012794,-0.018803,-0.021840,0.049788,1.0,19.531247,-9.765626,5.024068,-2.704814,0.036836,2.726082e+03,-0.012794,11.702299,1.0,0.0,1.0,18.971631,3.253445,0.171490,-0.971631,141.0,13.0,27.0,18.0,14.0,0.777778,0.026847,0.774769,0.584444,1.024,2.048,1.607995,0.367932,10009.5,8.864709e-05,0.183790,7.337787e-03,4.156996,4.126091e-05,0.807123,1.813444e-01,-0.029295,-0.032894,-0.111594,-0.087204,1.0,2.441406,-2.929688,1.371091,-1.531050,0.221674,5.944728e+03,-0.029295,8.238511,0.0,0.0,,88.338636,4.005962,0.045348,1.211364,132.0,79.9,93.6,89.55,13.7,0.152987,0.025133,0.777834,0.556667,1.024,7.168,3.750668,1.757404,5782.0,3.047274e-16,0.411765,2.802647e-12,-10.649352,7.668429e-23,3.173989,5.937473e-13,-0.037619,-3.764631e-02,-7.535343e-02,-1.351417e-01,1.0,5.566405,-5.371093,3.133064e+00,-3.063096e+00,0.240654,7697.806195,-3.761896e-02,1.218578e+01,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.494740,0.164845,0.333195,0.005260,45000.0,-1.050,1.760,0.500,2.810,5.620000,0.494423,0.136606,0.276293,-0.001627,22500.0,0.000000,1.000000,0.492796,1.000000,2.029237,-859.174011,14581.756836,116.537605,14.332886,7.477863,1.204421,0.161065,-0.007275,141.0,5.111111,10.538462,7.470588,5.427350,0.726496
2,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1515912676,1515930671,1,63.564286,3.393203,0.053382,0.435714,140.0,55.0,70.0,64.0,15.0,0.234375,0.007780,0.783030,0.782222,2.048,2.048,0.930395,0.003225,11892.0,6.686202e-02,0.096809,4.068422e-01,-2.457539,1.450944e-02,1.001903,9.842280e-01,0.020824,0.017530,0.467584,0.022988,1.0,2.929688,-2.929688,1.267112,-1.259496,0.101484,-9.859979e+03,0.020824,10.345360,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.498979,0.075012,0.150331,0.006021,45000.0,0.055,1.105,0.505,1.050,2.079208,,,,,,,,,,,,,,39.233044,,,,,,,,,,
3,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1516035392,1516037744,1,111.957447,5.856464,0.052310,-0.957447,141.0,102.0,124.0,111.0,22.0,0.198198,0.059949,0.781917,0.781111,1.024,2.048,19.775629,3.733466,1708.5,7.289977e-42,0.780749,3.900219e-52,21.628953,5.746822e-65,0.372941,2.407997e-09,-0.062851,-0.061547,0.175011,-0.164305,1.0,1.953125,-1.953125,1.199176,-1.072285,0.314895,3.777537e+04,-0.062851,23.497845,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1516039795,1516052674,1,161.765957,1.830840,0.011318,0.234043,141.0,158.0,165.0,162.0,7.0,0.043210,0.010948,,,1903.941,2.048,,,,,,,,,,,-0.004349,-0.004759,-0.153117,0.001938,1.0,1.953125,-2.929687,0.732543,-0.942657,0.015426,2.786945e+03,-0.004349,3.300268,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,20200630_1934279_1011611.h5,/data/numerics/HR.HR:value,1535443106,1535443609,1,115.426136,2.403616,0.020824,-0.426136,176.0,112.0,127.0,115.0,15.0,0.130435,0.349901,0.977675,0.976667,1.024,1.760,33.434683,1.535831,0.0,5.391275e-68,1.000000,0.000000e+00,99.505896,3.019458e-288,0.372271,2.245009e-11,-0.023485,-0.018539,0.077921,-0.009226,1.0,31.249988,-3.472224,0.582559,-1.051217,0.259943,4.173479e+03,-0.023485,4.275586,0.0,0.0,,19.903409,1.784630,0.089665,-0.903409,176.0,17.0,25.0,19.0,8.0,0.421053,0.349901,0.977675,0.976667,1.024,1.760,1.036762,0.142322,13201.5,1.025505e-10,0.413607,2.220446e-15,-5.564834,4.760130e-08,0.857738,2.835905e-01,-0.007596,-0.002622,0.159754,-0.006482,1.0,2.929688,-2.604166,0.018913,-1.104058,0.049326,1.332406e+03,-0.007596,3.027804,0.0,0.0,,99.992614,0.033840,0.000338,0.007386,176.0,99.8,100.0,100.00,0.2,0.002000,0.349901,0.977675,0.976667,1.024,1.760,0.031930,0.074172,18488.0,4.882792e-04,0.098436,2.625434e-01,3.783605,1.777246e-04,0.098156,1.018921e-46,-0.000146,-5.922519e-09,6.181072e-17,3.359473e-03,1.0,0.097656,-0.097656,3.300989e-02,-4.652267e-02,0.050785,125.245270,-1.461418e-04,1.086975e-03,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.485182,0.403606,0.831865,-0.025182,90000.0,-1.875,2.845,0.460,4.720,10.260870,0.500369,0.129312,0.258434,0.014405,22500.0,0.250305,0.750183,0.514774,0.499878,0.971063,-39.487469,136.033524,118.272507,1.945634,5.836121,0.431140,0.073874,0.063879,141.0,4.640000,6.823529,5.900000,2.183529,0.370090
596,20200630_1934279_1011611.h5,/data/numerics/RR.RR:value,1535421926,1535422231,1,79.392045,3.450837,0.043466,-0.392045,176.0,76.0,93.0,79.0,17.0,0.215190,0.577049,,,1.024,1.760,,,,,,,,,,,-0.019995,-0.018556,0.008151,-0.113329,1.0,6.944448,-13.888895,2.857029,-1.825827,0.413745,-1.855242e+07,244.749810,6.981283,0.0,0.0,,33.642045,6.646352,0.197561,0.357955,176.0,22.0,46.0,34.0,24.0,0.705882,0.577049,,,1.024,1.760,,,,,,,,,,,0.079324,0.082224,-0.116522,0.042194,1.0,10.416663,-13.888895,3.107215,-2.151029,0.389297,2.495987e+06,-33.003941,26.977194,0.0,0.0,,100.000000,0.000000,0.000000,0.000000,176.0,100.0,100.0,100.00,0.0,0.000000,0.577049,,,1.024,1.760,,,,,,,,,,,0.000000,2.569851e-18,-9.579663e-18,2.729429e-16,1.0,0.000000,0.000000,6.869778e-16,-6.805813e-16,,100.000000,-5.085772e-13,5.761134e-28,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.506923,0.276200,0.544856,-0.021923,90000.0,-0.245,2.490,0.485,2.735,5.639175,0.500034,0.139870,0.279721,0.004972,22500.0,0.250305,0.750183,0.505006,0.499878,0.989845,-29.736944,93.264023,116.483162,17.361591,2.456410,0.502780,0.204681,-0.087632,141.0,1.652174,3.545455,2.368778,1.893281,0.799265
597,20200630_1934279_1011611.h5,/data/numerics/RR.RR:value,1535597897,1535599266,1,127.142045,1.236236,0.009723,-0.142045,176.0,123.0,129.0,127.0,6.0,0.047244,0.128561,0.952093,0.951111,1.024,1.184,0.721202,0.210418,16481.0,1.787891e-04,0.216735,1.237551e-04,-5.313310,1.774726e-07,0.730253,2.837761e-02,-0.002935,-0.002563,-0.005924,0.110142,1.0,1.157408,-1.157408,0.849449,-0.815164,0.015345,1.088670e+03,-0.002935,1.504828,0.0,0.0,,29.846591,2.544302,0.085246,0.153409,176.0,23.0,37.0,30.0,14.0,0.466667,0.128561,0.945419,0.944444,1.024,1.184,0.632916,3.159140,17820.5,9.567834e-03,0.232226,2.936324e-05,1.372730,1.705905e-01,0.199005,8.059221e-26,-0.008123,-0.010543,-0.004047,0.348581,1.0,6.835938,-3.906250,3.223265,-2.014541,0.027746,2.690871e+03,-0.008123,6.293862,0.0,0.0,,99.494318,0.276036,0.002774,0.005682,176.0,98.2,100.0,99.50,1.8,0.018090,0.128561,0.977675,0.976667,1.024,1.184,0.262519,0.135879,9427.5,2.209440e-22,0.591540,7.002516e-34,-7.309133,1.429320e-12,0.449073,4.405554e-08,-0.000873,-6.775469e-04,-8.419840e-04,-4.644952e-02,1.0,0.292969,-0.347222,2.719849e-01,-1.879956e-01,0.027221,385.449400,-8.729135e-04,7.412199e-02,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.017968,2.530071,-140.813751,-0.122033,90000.0,-6.050,7.510,-0.140,13.560,-96.857147,0.498704,0.121720,0.244072,0.007035,22500.0,0.250305,0.750183,0.505739,0.499878,0.988411,-51.701866,200.224915,119.124802,32.575806,4.291524,0.379917,0.088527,-0.041524,141.0,3.432432,5.565217,4.250000,2.132785,0.501832
598,20200630_1934279_1011611.h5,/data/numerics/SpO₂.SpO₂:value,1535465918,1535466465,1,69.863636,0.712048,0.010192,0.136364,176.0,69.0,72.0,70.0,3.0,0.042857,0.321755,0.978478,0.975556,1.760,1.760,0.794483,0.641138,13805.5,2.784331e-10,0.255002,2.957748e-06,-7.084753,6.151344e-12,0.276888,1.330018e-17,0.002394,0.002365,0.000320,-0.052086,1.0,3.472221,-6.944442,0.505167,-0.418071,0.030762,-3.983732e+02,0.002394,0.491416,0.0,0.0,,21.062500,0.615572,0.029226,-0.062500,176.0,20.0,22.0,21.0,2.0,0.095238,0.321755,0.978478,0.975556,1.760,1.760,1.434295,0.728073,4867.0,2.670425e-44,0.729798,7.187776e-55,13.158046,3.304572e-33,0.209888,2.045124e-24,-0.002677,-0.000002,-0.004816,-0.060342,1.0,3.472221,-0.976563,0.463604,-0.583128,0.051483,5.447350e+02,-0.002677,0.359420,0.0,0.0,,89.328977,0.517037,0.005788,-0.028977,176.0,87.9,90.7,89.30,2.8,0.031355,0.321755,0.978478,0.975556,1.760,1.760,2.277433,0.427294,446.0,6.365932e-65,0.909091,1.465504e-104,-28.896865,9.610131e-101,0.299774,8.116501e-16,-0.002521,-1.993900e-03,2.809373e-03,2.643335e-02,1.0,0.347222,-0.694444,2.534762e-01,-3.987057e-01,0.064692,582.384316,-2.520635e-03,2.500330e-01,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.503277,0.242900,0.482636,-0.018277,90000.0,0.030,2.305,0.485,2.275,4.690722,0.500340,0.133728,0.267274,-0.014382,22500.0,0.250305,0.750183,0.485958,0.499878,1.028643,-35.061020,115.517738,117.092415,29.606163,3.319307,0.085594,0.025787,0.014026,141.0,3.136364,3.500000,3.333333,0.363636,0.109091


In [83]:
with pd.option_context("display.max_columns", 1000):
    #display(combined.drop(columns=[pre+'_'+feat for pre in ['pleth', 'ecgii'] for feat in ['mean', 'sd', 'cv', 'mad', 'n', 'min', 'max', 'median', 'range', 'range_ratio']]))
    display(combined.drop(columns=[pre+'_'+feat for pre in ['bps', 'bpd'] for feat in featuresList]+[pre+'_'+feat for pre in ['pleth', 'ecgii'] for feat in ['mean', 'sd', 'cv', 'mad', 'n', 'min', 'max', 'median', 'range', 'range_ratio']]))

Unnamed: 0,filename,series,left,right,label,hr_mean,hr_sd,hr_cv,hr_mad,hr_n,hr_min,hr_max,hr_median,hr_range,hr_range_ratio,hr_data_den,hr_data_den_trail,hr_data_den_trail2,hr_delta_t,hr_max_gap,hr_delta_mean,hr_delta_sd,hr_MWW_stat,hr_MWW_pvalue,hr_KS_stat,hr_KS_pvalue,hr_t_stat,hr_t_pvalue,hr_F_stat,hr_F_pvalue,hr_slope,hr_rslope,hr_slope_before,hr_slope_after,hr_num_breakpoint,hr_diff1_max,hr_diff1_min,hr_max_grad,hr_min_grad,hr_quad_rsq,hr_quad_coef1,hr_quad_coef2,hr_quad_resvar,hr_osi_up,hr_osi_down,hr_osi_ratio,rr_mean,rr_sd,rr_cv,rr_mad,rr_n,rr_min,rr_max,rr_median,rr_range,rr_range_ratio,rr_data_den,rr_data_den_trail,rr_data_den_trail2,rr_delta_t,rr_max_gap,rr_delta_mean,rr_delta_sd,rr_MWW_stat,rr_MWW_pvalue,rr_KS_stat,rr_KS_pvalue,rr_t_stat,rr_t_pvalue,rr_F_stat,rr_F_pvalue,rr_slope,rr_rslope,rr_slope_before,rr_slope_after,rr_num_breakpoint,rr_diff1_max,rr_diff1_min,rr_max_grad,rr_min_grad,rr_quad_rsq,rr_quad_coef1,rr_quad_coef2,rr_quad_resvar,rr_osi_up,rr_osi_down,rr_osi_ratio,spo2_mean,spo2_sd,spo2_cv,spo2_mad,spo2_n,spo2_min,spo2_max,spo2_median,spo2_range,spo2_range_ratio,spo2_data_den,spo2_data_den_trail,spo2_data_den_trail2,spo2_delta_t,spo2_max_gap,spo2_delta_mean,spo2_delta_sd,spo2_MWW_stat,spo2_MWW_pvalue,spo2_KS_stat,spo2_KS_pvalue,spo2_t_stat,spo2_t_pvalue,spo2_F_stat,spo2_F_pvalue,spo2_slope,spo2_rslope,spo2_slope_before,spo2_slope_after,spo2_num_breakpoint,spo2_diff1_max,spo2_diff1_min,spo2_max_grad,spo2_min_grad,spo2_quad_rsq,spo2_quad_coef1,spo2_quad_coef2,spo2_quad_resvar,spo2_osi_up,spo2_osi_down,spo2_osi_ratio,pleth_skew,pleth_kurtosis,pleth_entropy,ecgii_snr,hrrr_ratio_mean,hrrr_ratio_sd,hrrr_ratio_cv,hrrr_ratio_mad,hrrr_ratio_n,hrrr_ratio_min,hrrr_ratio_max,hrrr_ratio_median,hrrr_ratio_range,hrrr_ratio_range_ratio
0,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1515595508,1515596030,1,139.865248,3.468676,0.024800,1.134752,141.0,126.0,146.0,141.0,20.0,0.141844,0.270115,0.782809,0.781111,2.048,2.048,52.464179,0.724842,0.0,1.161034e-54,1.000000,0.000000e+00,120.650057,1.440056e-272,0.684180,1.830132e-02,-0.011264,-0.016833,-0.068257,-0.066920,1.0,9.765626,-1.953125,6.616793,-1.135976,0.028826,1.934979e+03,-0.011264,11.684885,0.0,0.0,,16.865248,4.144221,0.245725,-0.865248,141.0,12.0,30.0,16.0,18.0,1.125000,0.270115,0.782809,0.781111,2.048,2.048,1.501612,1.345150,10750.5,1.971148e-03,0.222589,5.499935e-04,3.911740,1.115378e-04,2.192088,6.087411e-07,-0.059093,-0.051371,-0.204129,-0.020814,1.0,5.859375,-2.929688,1.621756,-0.939216,0.555763,9.434094e+03,-0.059093,7.629583,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,14.909494,8.677217,1.638820,0.188865,0.197783,141.0,4.800000,11.916667,8.875000,7.116667,0.801878
1,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1515638422,1515643674,1,138.085106,3.485663,0.025243,-0.085106,141.0,117.0,144.0,138.0,27.0,0.195652,0.026847,0.776544,0.586667,1.024,2.048,56.192058,0.294731,0.0,6.993467e-55,1.000000,0.000000e+00,151.718107,1.944322e-304,1.193262,2.600455e-01,-0.012794,-0.018803,-0.021840,0.049788,1.0,19.531247,-9.765626,5.024068,-2.704814,0.036836,2.726082e+03,-0.012794,11.702299,1.0,0.0,1.0,18.971631,3.253445,0.171490,-0.971631,141.0,13.0,27.0,18.0,14.0,0.777778,0.026847,0.774769,0.584444,1.024,2.048,1.607995,0.367932,10009.5,8.864709e-05,0.183790,7.337787e-03,4.156996,4.126091e-05,0.807123,1.813444e-01,-0.029295,-0.032894,-0.111594,-0.087204,1.0,2.441406,-2.929688,1.371091,-1.531050,0.221674,5.944728e+03,-0.029295,8.238511,0.0,0.0,,88.338636,4.005962,0.045348,1.211364,132.0,79.9,93.6,89.55,13.7,0.152987,0.025133,0.777834,0.556667,1.024,7.168,3.750668,1.757404,5782.0,3.047274e-16,0.411765,2.802647e-12,-10.649352,7.668429e-23,3.173989,5.937473e-13,-0.037619,-3.764631e-02,-7.535343e-02,-1.351417e-01,1.0,5.566405,-5.371093,3.133064e+00,-3.063096e+00,0.240654,7697.806195,-3.761896e-02,1.218578e+01,0.0,0.0,,-859.174011,14581.756836,116.537605,14.332886,7.477863,1.204421,0.161065,-0.007275,141.0,5.111111,10.538462,7.470588,5.427350,0.726496
2,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1515912676,1515930671,1,63.564286,3.393203,0.053382,0.435714,140.0,55.0,70.0,64.0,15.0,0.234375,0.007780,0.783030,0.782222,2.048,2.048,0.930395,0.003225,11892.0,6.686202e-02,0.096809,4.068422e-01,-2.457539,1.450944e-02,1.001903,9.842280e-01,0.020824,0.017530,0.467584,0.022988,1.0,2.929688,-2.929688,1.267112,-1.259496,0.101484,-9.859979e+03,0.020824,10.345360,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,39.233044,,,,,,,,,,
3,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1516035392,1516037744,1,111.957447,5.856464,0.052310,-0.957447,141.0,102.0,124.0,111.0,22.0,0.198198,0.059949,0.781917,0.781111,1.024,2.048,19.775629,3.733466,1708.5,7.289977e-42,0.780749,3.900219e-52,21.628953,5.746822e-65,0.372941,2.407997e-09,-0.062851,-0.061547,0.175011,-0.164305,1.0,1.953125,-1.953125,1.199176,-1.072285,0.314895,3.777537e+04,-0.062851,23.497845,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,20190523_1053954_1173378.h5,/data/numerics/HR.HR:value,1516039795,1516052674,1,161.765957,1.830840,0.011318,0.234043,141.0,158.0,165.0,162.0,7.0,0.043210,0.010948,,,1903.941,2.048,,,,,,,,,,,-0.004349,-0.004759,-0.153117,0.001938,1.0,1.953125,-2.929687,0.732543,-0.942657,0.015426,2.786945e+03,-0.004349,3.300268,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,20200630_1934279_1011611.h5,/data/numerics/HR.HR:value,1535443106,1535443609,1,115.426136,2.403616,0.020824,-0.426136,176.0,112.0,127.0,115.0,15.0,0.130435,0.349901,0.977675,0.976667,1.024,1.760,33.434683,1.535831,0.0,5.391275e-68,1.000000,0.000000e+00,99.505896,3.019458e-288,0.372271,2.245009e-11,-0.023485,-0.018539,0.077921,-0.009226,1.0,31.249988,-3.472224,0.582559,-1.051217,0.259943,4.173479e+03,-0.023485,4.275586,0.0,0.0,,19.903409,1.784630,0.089665,-0.903409,176.0,17.0,25.0,19.0,8.0,0.421053,0.349901,0.977675,0.976667,1.024,1.760,1.036762,0.142322,13201.5,1.025505e-10,0.413607,2.220446e-15,-5.564834,4.760130e-08,0.857738,2.835905e-01,-0.007596,-0.002622,0.159754,-0.006482,1.0,2.929688,-2.604166,0.018913,-1.104058,0.049326,1.332406e+03,-0.007596,3.027804,0.0,0.0,,99.992614,0.033840,0.000338,0.007386,176.0,99.8,100.0,100.00,0.2,0.002000,0.349901,0.977675,0.976667,1.024,1.760,0.031930,0.074172,18488.0,4.882792e-04,0.098436,2.625434e-01,3.783605,1.777246e-04,0.098156,1.018921e-46,-0.000146,-5.922519e-09,6.181072e-17,3.359473e-03,1.0,0.097656,-0.097656,3.300989e-02,-4.652267e-02,0.050785,125.245270,-1.461418e-04,1.086975e-03,0.0,0.0,,-39.487469,136.033524,118.272507,1.945634,5.836121,0.431140,0.073874,0.063879,141.0,4.640000,6.823529,5.900000,2.183529,0.370090
596,20200630_1934279_1011611.h5,/data/numerics/RR.RR:value,1535421926,1535422231,1,79.392045,3.450837,0.043466,-0.392045,176.0,76.0,93.0,79.0,17.0,0.215190,0.577049,,,1.024,1.760,,,,,,,,,,,-0.019995,-0.018556,0.008151,-0.113329,1.0,6.944448,-13.888895,2.857029,-1.825827,0.413745,-1.855242e+07,244.749810,6.981283,0.0,0.0,,33.642045,6.646352,0.197561,0.357955,176.0,22.0,46.0,34.0,24.0,0.705882,0.577049,,,1.024,1.760,,,,,,,,,,,0.079324,0.082224,-0.116522,0.042194,1.0,10.416663,-13.888895,3.107215,-2.151029,0.389297,2.495987e+06,-33.003941,26.977194,0.0,0.0,,100.000000,0.000000,0.000000,0.000000,176.0,100.0,100.0,100.00,0.0,0.000000,0.577049,,,1.024,1.760,,,,,,,,,,,0.000000,2.569851e-18,-9.579663e-18,2.729429e-16,1.0,0.000000,0.000000,6.869778e-16,-6.805813e-16,,100.000000,-5.085772e-13,5.761134e-28,0.0,0.0,,-29.736944,93.264023,116.483162,17.361591,2.456410,0.502780,0.204681,-0.087632,141.0,1.652174,3.545455,2.368778,1.893281,0.799265
597,20200630_1934279_1011611.h5,/data/numerics/RR.RR:value,1535597897,1535599266,1,127.142045,1.236236,0.009723,-0.142045,176.0,123.0,129.0,127.0,6.0,0.047244,0.128561,0.952093,0.951111,1.024,1.184,0.721202,0.210418,16481.0,1.787891e-04,0.216735,1.237551e-04,-5.313310,1.774726e-07,0.730253,2.837761e-02,-0.002935,-0.002563,-0.005924,0.110142,1.0,1.157408,-1.157408,0.849449,-0.815164,0.015345,1.088670e+03,-0.002935,1.504828,0.0,0.0,,29.846591,2.544302,0.085246,0.153409,176.0,23.0,37.0,30.0,14.0,0.466667,0.128561,0.945419,0.944444,1.024,1.184,0.632916,3.159140,17820.5,9.567834e-03,0.232226,2.936324e-05,1.372730,1.705905e-01,0.199005,8.059221e-26,-0.008123,-0.010543,-0.004047,0.348581,1.0,6.835938,-3.906250,3.223265,-2.014541,0.027746,2.690871e+03,-0.008123,6.293862,0.0,0.0,,99.494318,0.276036,0.002774,0.005682,176.0,98.2,100.0,99.50,1.8,0.018090,0.128561,0.977675,0.976667,1.024,1.184,0.262519,0.135879,9427.5,2.209440e-22,0.591540,7.002516e-34,-7.309133,1.429320e-12,0.449073,4.405554e-08,-0.000873,-6.775469e-04,-8.419840e-04,-4.644952e-02,1.0,0.292969,-0.347222,2.719849e-01,-1.879956e-01,0.027221,385.449400,-8.729135e-04,7.412199e-02,0.0,0.0,,-51.701866,200.224915,119.124802,32.575806,4.291524,0.379917,0.088527,-0.041524,141.0,3.432432,5.565217,4.250000,2.132785,0.501832
598,20200630_1934279_1011611.h5,/data/numerics/SpO₂.SpO₂:value,1535465918,1535466465,1,69.863636,0.712048,0.010192,0.136364,176.0,69.0,72.0,70.0,3.0,0.042857,0.321755,0.978478,0.975556,1.760,1.760,0.794483,0.641138,13805.5,2.784331e-10,0.255002,2.957748e-06,-7.084753,6.151344e-12,0.276888,1.330018e-17,0.002394,0.002365,0.000320,-0.052086,1.0,3.472221,-6.944442,0.505167,-0.418071,0.030762,-3.983732e+02,0.002394,0.491416,0.0,0.0,,21.062500,0.615572,0.029226,-0.062500,176.0,20.0,22.0,21.0,2.0,0.095238,0.321755,0.978478,0.975556,1.760,1.760,1.434295,0.728073,4867.0,2.670425e-44,0.729798,7.187776e-55,13.158046,3.304572e-33,0.209888,2.045124e-24,-0.002677,-0.000002,-0.004816,-0.060342,1.0,3.472221,-0.976563,0.463604,-0.583128,0.051483,5.447350e+02,-0.002677,0.359420,0.0,0.0,,89.328977,0.517037,0.005788,-0.028977,176.0,87.9,90.7,89.30,2.8,0.031355,0.321755,0.978478,0.975556,1.760,1.760,2.277433,0.427294,446.0,6.365932e-65,0.909091,1.465504e-104,-28.896865,9.610131e-101,0.299774,8.116501e-16,-0.002521,-1.993900e-03,2.809373e-03,2.643335e-02,1.0,0.347222,-0.694444,2.534762e-01,-3.987057e-01,0.064692,582.384316,-2.520635e-03,2.500330e-01,0.0,0.0,,-35.061020,115.517738,117.092415,29.606163,3.319307,0.085594,0.025787,0.014026,141.0,3.136364,3.500000,3.333333,0.363636,0.109091


In [85]:
### Export to model-specific datasets

suf = '_try2'

ds = combined.drop(columns=[pre+'_'+feat for pre in ['bps', 'bpd'] for feat in featuresList]+[pre+'_'+feat for pre in ['pleth', 'ecgii'] for feat in ['mean', 'sd', 'cv', 'mad', 'n', 'min', 'max', 'median', 'range', 'range_ratio']])

ds.to_csv(f'data_full{suf}.csv', index=False)
ds.drop(columns=['series', 'left', 'right']).to_csv(f'data_withfn{suf}.csv', index=False)

ds[ds['series'] == '/data/numerics/HR.HR:value'].drop(columns=['series', 'left', 'right']).to_csv(f'data_hr_withfn{suf}.csv', index=False)
ds[ds['series'] == '/data/numerics/RR.RR:value'].drop(columns=['series', 'left', 'right']).to_csv(f'data_rr_withfn{suf}.csv', index=False)
ds[ds['series'].isin(['/data/numerics/SpO₂.SpO₂:value', '/data/numerics/SpO₂T.SpO₂T:value'])].drop(columns=['series', 'left', 'right']).to_csv(f'data_spo2_withfn{suf}.csv', index=False)