In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import wfdb
from wfdb import processing

from gc import collect as collect_garbage
from psutil import virtual_memory
from os import scandir

In [2]:
%matplotlib widget

In [3]:
#folder = "../../Deidentified-Raw-Waveforms/"
folder = "C:/Users/aidan/Box/Deidentified-Raw-Waveforms/"
coldict = {
    "raw_waves_data_1a.csv": ["time", "257"], "raw_waves_data_1b.csv": ["time", "257", "258"], "raw_waves_data_1c.csv": ["time", "257", "258"], "raw_waves_data_1d.csv": ["time", "257", "258", "317"], 
    "raw_waves_data_1e.csv": ["time", "258"],

    "raw_waves_data_2a.csv": ["time", "257", "258"], "raw_waves_data_2b.csv": ["time", "258"], "raw_waves_data_2c.csv": ["time", "257"], "raw_waves_data_2d.csv": ["time", "257", "258"], 
    "raw_waves_data_2e.csv": ["time", "257", "258"],

    "raw_waves_data_3a.csv": ["time", "258"], "raw_waves_data_3b.csv": ["time", "258"], "raw_waves_data_3c.csv": ["time", "258"], "raw_waves_data_3d.csv": ["time", "258"], 
    "raw_waves_data_3e.csv": ["time", "257", "258", "317"],

    "raw_waves_data_4a.csv": ["time", "257", "258"], "raw_waves_data_4b.csv": ["time", "257", "258"], "raw_waves_data_4c.csv": ["time", "257"], "raw_waves_data_4d.csv": ["time", "257", "258"], 
    "raw_waves_data_4e.csv": ["time", "257", "258"],

    "raw_waves_data_5a.csv": ["time", "258"], "raw_waves_data_5b.csv": ["time", "258"], "raw_waves_data_5c.csv": ["time", "258"], "raw_waves_data_5d.csv": ["time", "258", "317"],
    "raw_waves_data_5e.csv": ["time", "258"],

    "raw_waves_data_6a.csv": ["time", "257", "258"], "raw_waves_data_6b.csv": ["time", "258"], "raw_waves_data_6c.csv": ["time", "258"], "raw_waves_data_6d.csv": ["time", "258"], "raw_waves_data_6e.csv": ["time", "258"],
    
    "raw_waves_data_7a.csv": ["time", "257", "258"], "raw_waves_data_7b.csv": ["time", "258"], "raw_waves_data_7c.csv": ["time", "258"], "raw_waves_data_7d.csv": ["time", "257", "258", "317"], 
    "raw_waves_data_7e.csv": ["time", "258"]
}

namedict = {
    "raw_waves_data_1a.csv": "1a", "raw_waves_data_1b.csv": "1b", "raw_waves_data_1c.csv": "1c", "raw_waves_data_1d.csv": "1d", "raw_waves_data_1e.csv": "1e",
    "raw_waves_data_2a.csv": "2a", "raw_waves_data_2b.csv": "2b", "raw_waves_data_2c.csv": "2c", "raw_waves_data_2d.csv": "2d", "raw_waves_data_2e.csv": "2e",
    "raw_waves_data_3a.csv": "3a", "raw_waves_data_3b.csv": "3b", "raw_waves_data_3c.csv": "3c", "raw_waves_data_3d.csv": "3d", "raw_waves_data_3e.csv": "3e",
    "raw_waves_data_4a.csv": "4a", "raw_waves_data_4b.csv": "4b", "raw_waves_data_4c.csv": "4c", "raw_waves_data_4d.csv": "4d", "raw_waves_data_4e.csv": "4e",
    "raw_waves_data_5a.csv": "5a", "raw_waves_data_5b.csv": "5b", "raw_waves_data_5c.csv": "5c", "raw_waves_data_5d.csv": "5d", "raw_waves_data_5e.csv": "5e",
    "raw_waves_data_6a.csv": "6a", "raw_waves_data_6b.csv": "6b", "raw_waves_data_6c.csv": "6c", "raw_waves_data_6d.csv": "6d", "raw_waves_data_6e.csv": "6e",
    "raw_waves_data_7a.csv": "7a", "raw_waves_data_7b.csv": "7b", "raw_waves_data_7c.csv": "7c", "raw_waves_data_7d.csv": "7d", "raw_waves_data_7e.csv": "7e"    
}

In [4]:
# Run a garbage-collection pass, then display the system memory statistics
# reported by psutil (shown as the cell output).
collect_garbage()
virtual_memory()

svmem(total=12655771648, available=5637320704, percent=55.5, used=7018450944, free=5637320704)

In [5]:
# Build the list of input CSV paths.
#
# os.scandir yields directory entries in arbitrary order, so the names are
# sorted and the substitutions below are made by file name rather than by list
# position (the previous positional overrides at indices 5, 6, 10 and 11 were
# only correct when the listing happened to come back in alphabetical order).
csv_names = sorted(
    entry.name for entry in scandir(folder) if entry.name.endswith(".csv")
)

# We need to replace some of the files with the shifted files produced in notebook 06
shifted_subfolder = "06-shifted-2-and-3/"
shifted_names = {
    "raw_waves_data_2a.csv",
    "raw_waves_data_2b.csv",
    "raw_waves_data_3a.csv",
    "raw_waves_data_3b.csv",
}

all_files = [
    folder + (shifted_subfolder + name if name in shifted_names else name)
    for name in csv_names
]

In [6]:
# This is the code that detects QRS complexes in the ECG. For every input file
# it builds one ECG signal, removes out-of-bounds spikes, runs wfdb's XQRS
# detector chunk by chunk, and writes the time stamps of the detected R peaks
# to 01-outputs/00-rpeaks/rpeaks_<tag>.csv.


def _combine_leads(df, cols):
    """Merge the ECG columns of ``df`` into a single numeric signal.

    ``cols[0]`` is the time column and is not used here. The remaining columns
    are taken in the order listed (257 (ECG1), then 258 (ECG2), then 317): the
    first one is the base signal and each later one only fills values that are
    still missing. Whatever is still missing afterwards is forward filled.
    """
    signal = pd.Series(df[cols[1]])
    for col in cols[2:]:
        signal = signal.combine_first(df[col])
    return pd.to_numeric(signal.ffill())


def _remove_spikes(signal, bound=10, delta=125):
    """Blank out spikes/troughs and forward fill over them.

    Every sample with value <= -bound or >= bound is pinpointed, and that
    sample plus up to ``delta`` samples on either side of it are replaced by
    the last value before the erased stretch.
    """
    out_of_bounds = (signal <= -bound) | (signal >= bound)
    # 1.0 at the pinpoints and NaN elsewhere, so that limited ffill/bfill can
    # spread the marker delta samples to the right and to the left.
    marker = out_of_bounds.astype(float).where(out_of_bounds)
    marker = marker.ffill(limit=delta).bfill(limit=delta)
    return signal.mask(marker.notna()).ffill()


def _detect_rpeaks(signal, fs, chunk=10000, tail=20000):
    """Return the positions (0-based, relative to ``signal``) of the R peaks.

    The signal is processed in consecutive full chunks of ``chunk`` samples.
    The end of the signal (the last, partial chunk) is covered by one extra
    detection pass over the final ``tail`` samples; only peaks lying after the
    last peak already found are taken from that pass.
    """
    values = signal.to_numpy()
    n_samples = len(values)
    n_full_chunks = n_samples // chunk
    num_chunks = n_full_chunks + 1
    print("Signal broken into " + str(num_chunks) + " chunks")

    rpeaks = []
    for idx in range(n_full_chunks):
        if idx % 1000 == 0:
            # I've found this choice of progress marker works for this chunk
            # size and signal length. If those values change, then this
            # condition will need to be modified too
            print( str(round(100*idx/num_chunks, 2)) + "% percent done" )

        lo = idx*chunk
        try:
            xqrs = processing.XQRS(sig=values[lo:lo+chunk], fs=fs)
            xqrs.detect(verbose=False)
        except (IndexError, ValueError) as err:
            # A failure in one chunk must not silently drop the rest of the
            # recording, so report it and carry on with the next chunk.
            print("Skipping chunk " + str(idx) + ": " + repr(err))
            continue

        # xqrs recognized the chunk as starting from 0, so we have to shift
        # the R peaks according to the left endpoint of the chunk
        rpeaks += (lo + np.asarray(xqrs.qrs_inds, dtype=int)).tolist()
    print("R peaks outside of the last chunk located")

    # Delineate an ending chunk of `tail` indices that gets the end of the
    # signal, find R peaks. These indices are chunk-relative as well, so they
    # need the same shift before they can be compared with the earlier peaks.
    lo = max(0, n_samples - tail)
    xqrs = processing.XQRS(sig=values[lo:n_samples], fs=fs)
    xqrs.detect(verbose=False)

    last_peak = max(rpeaks, default=-1)
    tail_peaks = (lo + np.asarray(xqrs.qrs_inds, dtype=int)).tolist()
    rpeaks += [peak for peak in tail_peaks if peak > last_peak]
    print("Peaks in final chunk located")
    return rpeaks


# Sampling frequency handed to XQRS as ``fs``
freq = 250

for infant in range(1,8):
    files = [file for file in all_files if "_"+str(infant) in file]

    for file in files:
        # Preliminaries
        key = file.split("/")[-1]
        cols = coldict[key]
        print("Starting now with " + key)

        # Read in the data
        df = pd.read_csv(file, usecols=cols)
        print("Data loaded in")

        signal = _combine_leads(df, cols)
        print("Signals combined and filled in")

        signal = _remove_spikes(signal)
        print("Troughs and spikes removed")

        rpeaks = _detect_rpeaks(signal, freq)

        # Grab the time stamps, write them to a file
        df.loc[rpeaks, "time"].to_csv("01-outputs/00-rpeaks/rpeaks_" + namedict[key] + ".csv")
        print("Output file written")

        # Delete all of the variables to save space
        del df
        del signal
        del rpeaks
        collect_garbage()

        print(str(virtual_memory()[2]) + " memory usage")




Starting now with raw_waves_data_1a.csv
Data loaded in
Signals combined and filled in
Troughs and spikes removed
Signal broken into 6585 chunks
0.0% percent done
15.190000000000001% percent done
30.37% percent done
45.56% percent done
60.74% percent done
75.92999999999999% percent done
91.12% percent done
R peaks outside of the last chunk located
Peaks in final chunk located
Output file written
58.6 memory usage
Starting now with raw_waves_data_1b.csv
Data loaded in
Signals combined and filled in
Troughs and spikes removed
Signal broken into 6524 chunks
0.0% percent done
15.329999999999998% percent done
30.659999999999997% percent done
45.98% percent done
61.309999999999995% percent done
76.64% percent done
91.97% percent done
R peaks outside of the last chunk located
Peaks in final chunk located
Output file written
56.8 memory usage
Starting now with raw_waves_data_1c.csv
Data loaded in
Signals combined and filled in
Troughs and spikes removed
Signal broken into 6480 chunks
0.0% perce

In [8]:
# Display the system memory statistics reported by psutil again, for
# comparison with the reading taken earlier in the notebook.
virtual_memory()

svmem(total=12655771648, available=5862969344, percent=53.7, used=6792802304, free=5862969344)

In [9]:
# Turn the R-peak time stamps of each infant into cleaned RR intervals and
# write them to <output_folder>rr_intervals_<infant><part>.csv.
input_folder = "01-outputs/00-rpeaks/"
output_folder = "01-outputs/01-rr-intervals/"


def _split_multiple_intervals(times, intervals, min_interval=0.25):
    """Break up RR intervals that look like a multiple of the preceding one.

    Parameters
    ----------
    times : list of float
        Time stamp of the beat that ends each interval, in ascending order.
    intervals : list of float
        RR interval ending at the corresponding time stamp.
    min_interval : float
        Smallest interval an imputed piece is allowed to have.

    Returns
    -------
    cleaned : list of float
        ``intervals`` with every split interval replaced by its piece length.
    new_times, new_intervals : list of float
        Time stamps and interval lengths of the beats inserted by the splits.
    imputed : int
        Sum of the number of pieces over all split intervals.
    max_imputed : int
        Largest number of pieces any single interval was split into.

    An interval is split when it is roughly ``k >= 2`` times the previous
    (possibly already split) interval; ``k`` is reduced until each piece is at
    least ``min_interval`` long, and no split happens if that leaves one piece.
    """
    cleaned = list(intervals)
    new_times = []
    new_intervals = []
    imputed = 0
    max_imputed = 0

    total = len(cleaned)
    if total == 0:
        return cleaned, new_times, new_intervals, imputed, max_imputed

    # Never 0, so the modulo below is safe for short inputs
    progress_chunk = max(1, total//20)
    prev_ivl = intervals[0]

    for pos in range(1, total):
        counter = pos - 1
        if counter % progress_chunk == 0:
            print( str(round(100*counter/total, 2)) + "% Complete" )

        curr_t = times[pos]
        curr_ivl = intervals[pos]

        pieces = round(curr_ivl/prev_ivl)
        if pieces >= 2: # Then it is likely that the current interval is a multiple interval
            fill_value = curr_ivl / pieces
            while fill_value < min_interval and pieces > 1: # Too many pieces, fill value too small
                pieces -= 1
                fill_value = curr_ivl / pieces

        if pieces < 2:
            # Nothing to split; the current interval becomes the reference
            prev_ivl = curr_ivl
            continue

        imputed += pieces
        max_imputed = max(max_imputed, pieces)

        # The interval starts at curr_t - curr_ivl. That is the previous row's
        # time stamp in the usual case, but not when the beat that started the
        # interval was itself removed by the > 5 s filter, so the new beats are
        # anchored on the interval itself rather than on the previous row.
        start = curr_t - curr_ivl
        new_times.extend(start + k*fill_value for k in range(1, pieces))
        new_intervals.extend([fill_value]*(pieces - 1))
        cleaned[pos] = fill_value

        prev_ivl = fill_value

    return cleaned, new_times, new_intervals, imputed, max_imputed


# NOTE: the loop variable is deliberately called `i`; the export cell that
# follows this one reads `i`, `df` and `output_folder`.
for i in range(2,8):
    print("Starting now with infant " + str(i))

    # Get the names of the rpeak files (written as rpeaks_<infant><part>.csv)
    rpeak_files = sorted(
        entry.name for entry in scandir(input_folder)
        if entry.name.startswith("rpeaks_" + str(i))
    )
    if not rpeak_files:
        raise FileNotFoundError("No R peak files for infant " + str(i) + " in " + input_folder)

    # Concatenate all of these dataframes in one go
    df = pd.concat(
        [pd.read_csv(input_folder + name) for name in rpeak_files],
        ignore_index=True,
    )
    print("Infant " + str(i) + " R peaks all loaded in")

    # Clean up df, calculate the RR intervals
    df = df.drop(columns="Unnamed: 0")
    df["interval"] = df["time"].diff()
    print("Uncleaned RR Intervals Written")

    # Recalculate RR intervals while ignoring beats whose RR intervals are < 0.25 seconds
    df = df.loc[df["interval"] >= 0.25].reset_index(drop=True)
    df["interval"] = df["time"].diff()

    # Remove intervals of length greater than 5 seconds (arbitrary threshold)
    # The resulting gaps will just be treated as missing data
    df = df.loc[df["interval"] <= 5].set_index("time")
    print("Intervals outside of [0.25, 5] filtered out")

    # Get rid of any remaining multiple intervals
    cleaned, new_times, new_intervals, imputed, max_imputed = _split_multiple_intervals(
        df.index.tolist(), df["interval"].tolist()
    )
    df["interval"] = cleaned
    if new_times:
        inserted = pd.DataFrame(
            {"interval": new_intervals},
            index=pd.Index(new_times, name="time"),
        )
        df = pd.concat([df, inserted])
    print("Multiple intervals all broken up")
    df = df.sort_index()

    # Now, we write the new dataframe to 5 different csvs so that they can be pushed to GitHub
    N = len(df["interval"])
    file_bounds = [k*N//5 for k in range(0,6)]
    parts = ["a", "b", "c", "d", "e"]

    for k in range(5):
        df.iloc[file_bounds[k]:file_bounds[k+1]].to_csv(output_folder + "rr_intervals_"+str(i)+parts[k]+".csv")

    print("Imputed:", imputed)
    print("Max Imputed:", max_imputed)
    del df
    collect_garbage()
    print("Cleaned RR intervals written to file, "+str(virtual_memory()[2])+"% memory usage\n")

    # Deliberate early exit: only the first infant of the range is processed so
    # that its output can be checked before spending tons of time on the
    # others. Remove this `break` to process the whole range.
    break
            


Starting now with infant 1
Infant 1 R peaks all loaded in
Uncleaned RR Intervals Written
Intervals outside of [0.25, 5] filtered out
0.0% Complete
5.0% Complete
10.0% Complete
14.99% Complete
19.98% Complete
24.97% Complete
29.959999999999997% Complete
34.94% Complete
39.93% Complete
44.91% Complete
49.88% Complete
54.86% Complete
59.830000000000005% Complete
64.8% Complete
69.76% Complete
74.72999999999999% Complete
79.67999999999999% Complete
84.63000000000001% Complete
89.59% Complete
94.53% Complete
99.47% Complete
Multiple intervals all broken up


KeyError: 'time'

In [10]:
# Stand-alone repeat of the export step from the end of the previous cell:
# split df into five consecutive, nearly equal slices and write one CSV each.
# NOTE(review): this relies on `df`, `i` and `output_folder` still being
# defined by the previous cell; that cell deletes `df` when it finishes
# normally, so this only works after it was interrupted - confirm whether this
# cell is still needed.
N = len(df["interval"])
parts = ["a", "b", "c", "d", "e"]
file_bounds = [k*N//5 for k in range(0,6)]

for k, part in enumerate(parts):
    piece = df.iloc[file_bounds[k]:file_bounds[k+1]]
    piece.to_csv(output_folder + "rr_intervals_" + str(i) + part + ".csv")