In [1]:
import audata as aud
import h5py as h5
import pandas as pd
from pathlib import Path

In [2]:
# Directory containing the original .h5 recording files that the labels refer to.
# Alternate location kept for reference: '/home/auvdata/projects/conditionc-new/originals/'
files_dir = Path('/media/zfsmladi/originals/')
# Label table, one row per labelled time window. The loop below reads the
# 'filename', 'series', 'left' and 'right' columns of each row.
labels = pd.read_csv('../finalLabels.csv')

In [3]:
# We'll perform two tasks:
#   - Task 1: Read in the sample data for the target series
#   - Task 2: Read in sample data from a waveform series in the equivalent time period
#
# Both tasks open the same .h5 file (first via audata, then via h5py). Each handle
# is closed as soon as its data has been copied into memory, so no file handles
# are left open once the cell finishes.
for _, label in labels.iterrows():

    left = label['left']
    right = label['right']
    file_path = files_dir / label['filename']

    print(f"Our label data:\n\n{label}\n")



    ### TASK 1

    # Grab the series name (the series in the labels dataset also includes a column name, so we remove that)
    series_name = label['series'].split(':')[0]

    # Open the file using the audata lib; the try/finally guarantees the handle is
    # released even if reading the series fails.
    aufile = aud.File.open(file_path, readonly=True)
    try:
        # Our label timestamps do not include the basetime, so let's grab the basetime for the file
        basetime = aufile.time_reference.timestamp()

        # Get a reference to the audata series object
        audata_series = aufile[series_name]

        # Read the entire series into memory as a pandas dataframe. We ask for raw time values (meaning
        # the time column will be a numerical time offset in seconds, instead of datetime objects).
        # Also, the get() function returns a numpy ndarray, so we convert it to a dataframe.
        df = pd.DataFrame(audata_series.get(raw=True))
    finally:
        aufile.close()

    # The label window expressed as offsets from the file's basetime, i.e. in the
    # same units as the series' raw 'time' column. Computed once, used by both tasks.
    window_start = left - basetime
    window_end = right - basetime

    print("Our overall series dataframe:")
    display(df)

    # Clean the series of any duplicate timestamps
    df = df.drop_duplicates(subset=['time'], ignore_index=True)

    # Now, let's grab the data from the alert time period (bounds are exclusive)
    sample_data = df[(df.time > window_start) & (df.time < window_end)]

    print("Our sample dataframe:")
    display(sample_data)



    ### TASK 2

    # NOTE: In task 1, we simply used the audata library to work with the file. When reading in waveform
    # data, however, the problem with the above approach is that it requires reading in the entire series
    # into memory before slicing. With h5py we can instead read only the 'time' column, build a boolean
    # mask from it, and then let h5py fetch just the selected rows of the full dataset.

    # Open the file using the h5py lib; the context manager closes it when we are done.
    with h5.File(file_path, mode='r') as h5file:

        # Get a reference to the h5py waveform series object
        h5py_wf_series = h5file['/data/waveforms/II']

        # Read the time column from disk ONCE (indexing the dataset by field name
        # performs a read), then build the in-window mask in memory.
        wf_time = h5py_wf_series['time']
        in_window = (wf_time > window_start) & (wf_time < window_end)

        # Read only the rows inside the time boundaries into memory
        waveform_sample_data = pd.DataFrame(h5py_wf_series[in_window])

    print("Our waveform sample dataframe:")
    display(waveform_sample_data)



    # We'll do this for just one file for now, but if you wanted to process all labels, you would continue.
    break

Our label data:

filename       20190523_1628452_1284339.h5
series          /data/numerics/HR.HR:value
left                            1524281417
right                           1524281836
real_vs_art                              1
Name: 0, dtype: object

Our overall series dataframe:


Unnamed: 0,time,value
0,1652560.602,63.0
1,1652561.626,63.0
2,1652562.650,64.0
3,1652563.674,64.0
4,1652565.722,68.0
...,...,...
769345,2573994.023,84.0
769346,2573996.071,87.0
769347,2573997.095,92.0
769348,2573998.119,92.0


Our sample dataframe:


Unnamed: 0,time,value
19342,1677438.682,54.0
19343,1677439.706,54.0
19344,1677440.730,55.0
19345,1677441.754,55.0
19346,1677443.802,56.0
...,...,...
19664,1677850.330,60.0
19665,1677851.354,60.0
19666,1677853.402,60.0
19667,1677854.426,62.0


Our waveform sample dataframe:


Unnamed: 0,time,value
0,1.677437e+06,0.800
1,1.677437e+06,0.805
2,1.677437e+06,0.805
3,1.677437e+06,0.805
4,1.677437e+06,0.800
...,...,...
104745,1.677856e+06,0.175
104746,1.677856e+06,0.250
104747,1.677856e+06,0.325
104748,1.677856e+06,0.370
