![alt text](./Cerny_logo_1.jpg)

# The effect of ambulance acceleration on mechanical ventilation during neonatal transport

#### Author: Dr Gusztav Belteki

### 1. Import the required libraries and set options

In [None]:
import IPython
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt

import os
import sys
import pickle

from pandas import Series, DataFrame
from datetime import datetime, timedelta

# Render figures inside the notebook, using the 'classic' style on a white figure background
%matplotlib inline
matplotlib.style.use('classic')
matplotlib.rcParams['figure.facecolor'] = 'w'

# Let pandas display up to 250 rows and 100 columns before truncating the output
pd.set_option('display.max_rows', 250)
pd.set_option('display.max_columns', 100)

In [None]:
# Report the interpreter and library versions used for this run
for label, version in (
    ("Python", sys.version),
    ("pandas", pd.__version__),
    ("matplotlib", matplotlib.__version__),
    ("NumPy", np.__version__),
    ("SciPy", sp.__version__),
    ("IPython", IPython.__version__),
):
    print("{} version: {}".format(label, version))

### 2. List and set the working directory and the directory to write out data

In [None]:
# Topic of the notebook; also the name of the subfolder that receives the results
TOPIC = 'accelerometer_ventilated'

# Name of the external hard drive
DRIVE = 'GUSZTI'

# Directory containing clinical and blood gas data
CWD = os.path.join('/Users', 'guszti', 'ventilation_fabian')

# Directory on the external drive to read the accelerometer recordings from
DIR_READ = os.path.join('/Volumes', DRIVE, 'Fabian', 'accelerometer_data')

# Directory for the results of this notebook.
# os.makedirs(..., exist_ok=True) also creates a missing 'Analyses' parent folder
# (a plain os.mkdir raises FileNotFoundError in that case) and is safe to re-run.
DIR_WRITE = os.path.join(CWD, 'Analyses', TOPIC)
os.makedirs(DIR_WRITE, exist_ok=True)

# Directory on the external drive that holds the pickle archives
DATA_DUMP = os.path.join('/Volumes', DRIVE, 'data_dump', 'fabian')

In [None]:
# Show the resolved input, output and archive directories
DIR_READ, DIR_WRITE, DATA_DUMP

### 3. Import the start time of the acceleration recordings

In [None]:
# Will map each accelerometer file name to the start time of its recording
start_time_accel = {}

# Files in DIR_READ, ignoring hidden ones, ordered by the integer that follows
# the double underscore in the file name (e.g. 'default__102.txt' -> 102)
flist = sorted(
    (name for name in os.listdir(DIR_READ) if not name.startswith('.')),
    key = lambda name: int(name.split('.')[0].split('__')[1]),
)
len(flist)

In [None]:
%%time
# Get the start time of the acceleration recordings as Timestamp
for fle in flist:
    with open(os.path.join(DIR_READ, fle), 'r') as infile:
        infile.readline()
        line = infile.readline().split('@')[1].split()
        line = ' '.join(line[:4] + line[5:])
        start_time_accel[fle] = pd.to_datetime(line)     

### 4. Calculate stop time of accelerometer recordings

In [None]:
%%time

stop_time_accel = {}
for fle in flist:
    if int(fle[9:-4]) % 10 == 0:
        print('Working on %s' % fle)
    temp_frame = pd.read_csv('%s/%s' % (DIR_READ, fle), header = None, 
        names = ['X', 'Y', 'Z', 'timedelta'], delim_whitespace = True, comment = '#', low_memory = False)
    cum_time = temp_frame['timedelta'].sum()
    stop_time_accel[fle] = start_time_accel[fle] + pd.to_timedelta(cum_time, unit='ms')
    temp_frame = None

In [None]:
# One row per accelerometer file (file name as index) with its start and stop time,
# ordered by start time
accel_time_frame = (
    DataFrame([start_time_accel, stop_time_accel])
    .T
    .rename(columns = {0: 'accelerometer_start_time', 1: 'accelerometer_stop_time'})
    .sort_values('accelerometer_start_time')
)

### 5. Remove or correct wrong timestamps

`default__100.txt` and `default__101.txt` have time stamps that are completely wrong and cannot be reconstructed. Remove them.

In [None]:
# Recordings whose time stamps are wrong and are therefore excluded
bad_recordings = ['default__100.txt', 'default__101.txt']

# pop(..., None) and errors = 'ignore' keep this cell safe to re-run:
# plain `del` / `drop` raise KeyError once the entries are already gone
for rec in bad_recordings:
    start_time_accel.pop(rec, None)
    stop_time_accel.pop(rec, None)

accel_time_frame.drop(index = bad_recordings, errors = 'ignore', inplace = True)

Recordings `default__102.txt` - `default__122.txt` were recorded with time stamps one year ahead of the actual time. Shift their time stamps back by one year.

In [None]:
# Order by start time, then look at rows 375-429 (by position) of the sorted frame
accel_time_frame.sort_values(by = 'accelerometer_start_time', inplace = True)
accel_time_frame.iloc[375:430]

In [None]:
# File names of the accelerometer recordings, in the current row order of accel_time_frame
rec_list = list(accel_time_frame.index)

On inspection of the start and stop times it is clear that for recordings `default__102.txt` to `default__122.txt` the year is one year ahead of the actual time of the recording (2019 instead of 2018). Therefore, the time stamps need to be shifted back by one year (365 days).

In [None]:
# Recordings default__102.txt ... default__122.txt (both inclusive)
to_shift = ['default__%d.txt' % number for number in range(102, 123)]

In [None]:
# Move both ends of every affected recording back by 365 days.
# NOTE: each execution of this cell shifts the times again, so run it only once.
one_year = pd.Timedelta(value = 365, unit = 'D')
for rec in to_shift:
    start_time_accel[rec] -= one_year
    stop_time_accel[rec] -= one_year

In [None]:
# Rebuild the frame from the dictionaries, which now hold the corrected start and stop times

accel_time_frame = (
    DataFrame([start_time_accel, stop_time_accel])
    .T
    .rename(columns = {0: 'accelerometer_start_time', 1: 'accelerometer_stop_time'})
    .sort_values('accelerometer_start_time')
)

accel_time_frame

### 6. Import ventilator recordings with mechanical ventilation

In [None]:
%%time

# Import ventilator parameters, settings and alarms

with open('%s/%s.pickle' % (DATA_DUMP, 'data_pars_measurements_ventilated_1_300'), 'rb') as handle:
    data_pars_measurements_ventilated_1_300 = pickle.load(handle)    
with open('%s/%s.pickle' % (DATA_DUMP, 'data_pars_settings_ventilated_1_300'), 'rb') as handle:
    data_pars_settings_ventilated_1_300 = pickle.load(handle)   
with open('%s/%s.pickle' % (DATA_DUMP, 'data_pars_alarms_ventilated_1_300'), 'rb') as handle:
    data_pars_alarms_ventilated_1_300 = pickle.load(handle)
    
with open('%s/%s.pickle' % (DATA_DUMP, 'data_pars_measurements_ventilated_301_600'), 'rb') as handle:
    data_pars_measurements_ventilated_301_600 = pickle.load(handle)    
with open('%s/%s.pickle' % (DATA_DUMP, 'data_pars_settings_ventilated_301_600'), 'rb') as handle:
    data_pars_settings_ventilated_301_600 = pickle.load(handle)    
with open('%s/%s.pickle' % (DATA_DUMP, 'data_pars_alarms_ventilated_301_600'), 'rb') as handle:
    data_pars_alarms_ventilated_301_600 = pickle.load(handle)
    
with open('%s/%s.pickle' % (DATA_DUMP, 'data_pars_measurements_ventilated_601_900'), 'rb') as handle:
    data_pars_measurements_ventilated_601_900 = pickle.load(handle)   
with open('%s/%s.pickle' % (DATA_DUMP, 'data_pars_settings_ventilated_601_900'), 'rb') as handle:
    data_pars_settings_ventilated_601_900 = pickle.load(handle)    
with open('%s/%s.pickle' % (DATA_DUMP, 'data_pars_alarms_ventilated_601_900'), 'rb') as handle:
    data_pars_alarms_ventilated_601_900 = pickle.load(handle)
        
data_pars_measurements_ventilated = {**data_pars_measurements_ventilated_1_300, 
                                     **data_pars_measurements_ventilated_301_600,
                                     **data_pars_measurements_ventilated_601_900}

data_pars_settings_ventilated = {**data_pars_settings_ventilated_1_300, 
                                 **data_pars_settings_ventilated_301_600,
                                 **data_pars_settings_ventilated_601_900}

data_pars_alarms_ventilated = {**data_pars_alarms_ventilated_1_300, 
                               **data_pars_alarms_ventilated_301_600,
                               **data_pars_alarms_ventilated_601_900}

In [None]:
# Number of recordings in the merged measurement, setting and alarm dictionaries
tuple(len(merged) for merged in (data_pars_measurements_ventilated,
                                 data_pars_settings_ventilated,
                                 data_pars_alarms_ventilated))

### 7. Generate dictionaries with the start and stop time of ventilator recordings

In [None]:
# First and last index entry of every ventilator recording
start_time_vent = {rec_name: frame.index[0]
                   for rec_name, frame in data_pars_measurements_ventilated.items()}
stop_time_vent = {rec_name: frame.index[-1]
                  for rec_name, frame in data_pars_measurements_ventilated.items()}

In [None]:
# One row per ventilator recording (recording name as index) with its start and stop time,
# ordered by start time

vent_time_frame = (
    DataFrame([start_time_vent, stop_time_vent])
    .T
    .rename(columns = {0: 'ventilator_start_time', 1: 'ventilator_stop_time'})
    .sort_values('ventilator_start_time')
)

In [None]:
# Save the accelerometer and ventilator recording times in an Excel file, one sheet each.
# The context manager writes and closes the workbook on exit (ExcelWriter.save() no longer
# exists in pandas 2.x); sheet_name is passed by keyword because positional use is deprecated.
with pd.ExcelWriter(os.path.join(DIR_WRITE, 'accelerometer_ventilator_times.xlsx')) as writer:
    accel_time_frame.to_excel(writer, sheet_name = 'accelerometer')
    vent_time_frame.to_excel(writer, sheet_name = 'ventilator')

### 8. Identify matches between accelerometer and ventilator recordings

A ventilator recording and an accelerometer recording overlap only if neither of them starts after the other has ended (equivalently, neither of them ends before the other has started).

In [None]:
# All (accelerometer file, ventilator recording) pairs whose time ranges overlap.
# A pair is left out when the accelerometer recording starts after the ventilator
# recording has stopped, or stops before the ventilator recording has started.
matches_ventilated = [
    (accel_key, vent_key)
    for accel_key in start_time_accel
    for vent_key in start_time_vent
    if not (start_time_accel[accel_key] > stop_time_vent[vent_key]
            or stop_time_accel[accel_key] < start_time_vent[vent_key])
]

len(matches_ventilated)

In [None]:
# Table of matching pairs, with the start/stop times of both recordings joined on
matches_ventilated_frame = (
    DataFrame(matches_ventilated, columns = ['accel_rec', 'vent_rec'])
    .merge(right = accel_time_frame, how = 'left', left_on = 'accel_rec', right_index = True)
    .merge(right = vent_time_frame, how = 'left', left_on = 'vent_rec', right_index = True)
)

In [None]:
# Save the matched accelerometer / ventilator recording times in an Excel file.
# The context manager writes and closes the workbook on exit (ExcelWriter.save() no longer
# exists in pandas 2.x); sheet_name is passed by keyword because positional use is deprecated.
with pd.ExcelWriter(os.path.join(DIR_WRITE, 'accel_vent_matching_times.xlsx')) as writer:
    matches_ventilated_frame.to_excel(writer, sheet_name = 'matches')

### 9. Import relevant accelerometer data

In [None]:
# Number of distinct accelerometer recordings that have a matching ventilator recording
len(set(matches_ventilated_frame['accel_rec']))

In [None]:
%%time

# Recordings with mechanical ventilation
accelero_ventilated = {}

for i, fle in enumerate(sorted(set(matches_ventilated_frame['accel_rec']), key = lambda x : int(x[9:-4]))):
    if i % 10 == 0:
        print('Imported %d recordings' % (i+1))
    accelero_ventilated[fle] = pd.read_csv('%s/%s' % (DIR_READ, fle), header = None, 
        names = ['X', 'Y', 'Z', 'timedelta'], delim_whitespace = True, comment = '#', low_memory = False)

len(accelero_ventilated)

### 10. Generate acceleration DataFrames with timestamps

In [None]:
%%time

# Recordings with mechanical ventilation
for i, rec in enumerate(accelero_ventilated):
    if i % 10 == 0:
        print('Processed %d recordings' % i)
        
    # Calculate cumulative times from timedelta [ms]
    accelero_ventilated[rec]['ms'] = accelero_ventilated[rec]['timedelta'].cumsum()
    
    # Create timestamps from milisec data, this starts at '1970:00:00:00 00:00:00'
    accelero_ventilated[rec]['timestamp'] = pd.to_datetime(accelero_ventilated[rec]['ms'], unit='ms')
    
    # Correct timestamp to actual one
    init_time = pd.Timestamp('1970-01-01 00:00:00.000')
    delta = start_time_accel[rec] - init_time
    accelero_ventilated[rec]['timestamp_2'] = accelero_ventilated[rec]['timestamp'] + delta
    
    # use times as index
    accelero_ventilated[rec].index = accelero_ventilated[rec]['timestamp_2']
    accelero_ventilated[rec].index.name = 'time'
    
    # Remove unnecessary columns
    accelero_ventilated[rec] = accelero_ventilated[rec][['X', 'Y', 'Z', 'timedelta', 'ms']]  

In [None]:
%%time

# Drop rows with na values
for accel_rec in accelero_ventilated:
    accelero_ventilated[accel_rec].dropna(how = 'any', inplace = True)

len(accelero_ventilated)

### 11. Limit the accelerometer and ventilator recordings to the parts when data are available for both

In [None]:
# For every matched pair: the later of the two first time stamps and
# the earlier of the two last time stamps
start_time_comb_vent = {}
stop_time_comb_vent = {}

for pair in matches_ventilated:
    accel_rec, vent_rec = pair
    if vent_rec not in data_pars_measurements_ventilated:
        continue

    accel_index = accelero_ventilated[accel_rec].index
    vent_index = data_pars_measurements_ventilated[vent_rec].index

    start_time_comb_vent[pair] = max(accel_index[0], vent_index[0])
    stop_time_comb_vent[pair] = min(accel_index[-1], vent_index[-1])

### 12. Trim ventilator and accelerometer data to contain only the overlapping regions

Some accelerometer recordings are long and pair up with several consecutive ventilator recordings

In [None]:
%%time

accelero_ventilated_trimmed = {}
data_pars_measurements_ventilated_accelero = {}
data_pars_settings_ventilated_accelero = {}
data_pars_alarms_ventilated_accelero = {}

for accel_rec, vent_rec in matches_ventilated:
    
    if vent_rec in data_pars_measurements_ventilated:
        
        start_time = start_time_comb_vent[accel_rec, vent_rec]
        stop_time = stop_time_comb_vent[accel_rec, vent_rec]
    
        accelero_ventilated_trimmed[accel_rec, vent_rec] = \
            accelero_ventilated[accel_rec][start_time : stop_time].copy()
        
        data_pars_measurements_ventilated_accelero[accel_rec, vent_rec] =  \
            data_pars_measurements_ventilated[vent_rec][start_time : stop_time].copy()
        
        data_pars_settings_ventilated_accelero[accel_rec, vent_rec] =  \
            data_pars_settings_ventilated[vent_rec][start_time : stop_time].copy()
        
        data_pars_alarms_ventilated_accelero[accel_rec, vent_rec] =  \
            data_pars_alarms_ventilated[vent_rec][start_time : stop_time].copy()

#### Remove those recordings where there is too much (>10%) missing data

As the sampling rate of the ventilator data is 0.5 Hz and that of the accelerometer data is 100 Hz, the ratio of the number of accelerometer data points to the number of ventilator data points should ideally be ~200. Because of missing data this does not hold exactly. Allow for ~10% mismatch, that is, a ratio between 180 and 220.

In [None]:
# Per pair: number of ventilator rows, number of accelerometer rows and their ratio
for rec, vent_frame in data_pars_measurements_ventilated_accelero.items():
    n_vent = len(vent_frame)
    n_accel = len(accelero_ventilated_trimmed[rec])
    print(rec, n_vent, n_accel, n_accel / n_vent, sep = ' ' * 8)

In [None]:
# Number of matched pairs before the selection by data-point ratio
len(data_pars_measurements_ventilated_accelero)

In [None]:
# Accepted range (exclusive bounds) for the number of accelerometer rows per ventilator row
RATIO_MIN, RATIO_MAX = 180, 220

def _sample_ratio(key):
    """Return accelerometer rows per ventilator row for the pair `key`.

    Returns NaN when the trimmed ventilator frame is empty, so that such a pair fails
    the range check instead of raising ZeroDivisionError.
    """
    n_vent = len(data_pars_measurements_ventilated_accelero[key])
    if n_vent == 0:
        return float('nan')
    return len(accelero_ventilated_trimmed[key]) / n_vent

# Decide once which pairs to keep, then apply the same selection to all four dictionaries
selected_keys = [key for key in data_pars_measurements_ventilated_accelero
                 if RATIO_MIN < _sample_ratio(key) < RATIO_MAX]

data_pars_measurements_ventilated_accelero_sel = {
    key : data_pars_measurements_ventilated_accelero[key] for key in selected_keys }

data_pars_settings_ventilated_accelero_sel = {
    key : data_pars_settings_ventilated_accelero[key] for key in selected_keys }

data_pars_alarms_ventilated_accelero_sel = {
    key : data_pars_alarms_ventilated_accelero[key] for key in selected_keys }

accelero_ventilated_trimmed_sel = {
    key : accelero_ventilated_trimmed[key] for key in selected_keys }

(len(data_pars_measurements_ventilated_accelero_sel), len(data_pars_settings_ventilated_accelero_sel),
len(data_pars_alarms_ventilated_accelero_sel), len(accelero_ventilated_trimmed_sel))

In [None]:
# Same listing as before the selection, now for the pairs that were kept
for rec, vent_frame in data_pars_measurements_ventilated_accelero_sel.items():
    n_vent = len(vent_frame)
    n_accel = len(accelero_ventilated_trimmed_sel[rec])
    print(rec, n_vent, n_accel, n_accel / n_vent, sep = ' ' * 8)

### 13. Correct data types as appropriate

In [None]:
# Column dtypes and non-null counts of the accelerometer data for one example pair
accelero_ventilated_trimmed_sel[('default__393.txt', 'AL000628')].info()

In [None]:
# Column dtypes of the ventilator measurements for the same example pair, before conversion
data_pars_measurements_ventilated_accelero_sel[('default__393.txt', 'AL000628')].info()

In [None]:
# Convert every column of every selected measurement frame to float, in place
for meas_frame in data_pars_measurements_ventilated_accelero_sel.values():
    for col in meas_frame.columns:
        meas_frame[col] = meas_frame[col].astype('float')

In [None]:
# Column dtypes of the ventilator measurements for the example pair, after conversion to float
data_pars_measurements_ventilated_accelero_sel[('default__393.txt', 'AL000628')].info()

In [None]:
# Column dtypes of the ventilator settings for the example pair, before conversion
data_pars_settings_ventilated_accelero_sel['default__393.txt', 'AL000628'].info()

In [None]:
# Settings columns that are converted to float
numeric = [
    'PIP_set',
    'PEEP_set',
    'FiO2_set',
    'Flow_insp_set',
    'Flow_exp_set',
    'Ti_set',
    'Te_set',
    'RR_set',
    'IE_I_set',
    'IE_E_set',
    'VG_set',
    'Trigger_sens_set',
    'PIP_lim_high_set',
    'PIP_lim_low_set',
    'P_man_breath_duoPAP_NCPAP_set',
    'FiO2_flush_time_set',
    'FiO2_flush_set',
    'VG_set_kg',
]

# Settings columns that are converted to the pandas 'category' dtype
categorical = [
    'Patient_range',
    'Ventilator_mode',
    'Powerstate',
    'Measuring_unit_pressure_set',
    'Flow_sensor_state',
    'Oxy_sensor_state',
    'Ventilation_stopped',
    'VG_state',
    'Ventilator_range',
    'Trigger_mode',
    'Pressure_rise_control',
]

In [None]:
# Cast the settings columns in place: float for the columns listed in `numeric`,
# 'category' for those listed in `categorical`; all other columns are left untouched
for settings_frame in data_pars_settings_ventilated_accelero_sel.values():
    for col in settings_frame.columns:
        if col in numeric:
            target_dtype = 'float'
        elif col in categorical:
            target_dtype = 'category'
        else:
            continue
        settings_frame[col] = settings_frame[col].astype(target_dtype)

In [None]:
# Column dtypes of the ventilator settings for the example pair, after conversion
data_pars_settings_ventilated_accelero_sel['default__393.txt', 'AL000628'].info()

In [None]:
# Column dtypes of the ventilator alarms for the example pair
data_pars_alarms_ventilated_accelero_sel['default__393.txt', 'AL000628'].info()

### 14. Export trimmed ventilator and accelerometer data to pickle archives

In [None]:
# Write the selected measurement, setting and alarm dictionaries to DATA_DUMP,
# one pickle archive each
archives = (
    ('data_pars_measurements_ventilated_accelero', data_pars_measurements_ventilated_accelero_sel),
    ('data_pars_settings_ventilated_accelero', data_pars_settings_ventilated_accelero_sel),
    ('data_pars_alarms_ventilated_accelero', data_pars_alarms_ventilated_accelero_sel),
)

for archive_name, content in archives:
    with open('%s/%s.pickle' % (DATA_DUMP, archive_name), 'wb') as handle:
        pickle.dump(content, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
%%time

with open('%s/%s.pickle' % (DATA_DUMP, 'accelero_ventilated_1'), 'wb') as handle:
    pickle.dump(accelero_ventilated_trimmed_sel, handle, protocol=pickle.HIGHEST_PROTOCOL)