![alt text](./pageheader_rose2_babies.jpg)

# Preprocessing HFOV-VG data

**Author: Dr Gusztav Belteki**

This Notebook imports the pickle archive produced by **HFOV_all.ipynb**: *slow_measurements_hfov*. It then selects only the recordings that contain at least 12 hours of HFOV-VG ventilation. After some preprocessing (adding VThf and Pmax data) it exports them as pickle archives: *slow_measurements_hfov_vg*, *slow_measurements_hfov_no_vg*, *vent_settings_selected_hfov_vg*, *vent_settings_selected_hfov_no_vg*, *clinical_details_hfov_vg*.

Processing steps:
* Keep only recordings which have HFOV-VG mode
* Remove recordings which are < 12 hours long
* Keep only one recording per patient
* Separate periods of VG and no VG in the recordings which have both
* Remove DG062 as its VG component is only a couple minutes
* Calculate Vthf / dP ratio and add it to the DataFrames
* Limit ventilator settings to the VG or noVG periods
* Retrieve relevant ventilator settings, normalize them to body weight and add them to HFOV data
* Calculate difference between the set and actual VThf in case of VG
* Calculate difference between the set and actual amplitude
* Limit clinical details to the selected HFOV-VG recordings


### Import the necessary libraries and setting options

In [None]:
import IPython
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

import os
import sys
import re
import pickle

from scipy import stats
from pandas import Series, DataFrame
from datetime import datetime, timedelta

# Render matplotlib figures inline in the notebook output (IPython magic)
%matplotlib inline

# Use matplotlib's 'classic' style sheet, but with a white figure background
matplotlib.style.use('classic')
matplotlib.rcParams['figure.facecolor'] = 'w'

# Let pandas display up to 100 rows and 100 columns
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)


In [None]:
# Report the interpreter and library versions used for this analysis,
# one line per package, in a fixed order.
version_report = [
    ("Python version: {}", sys.version),
    ("pandas version: {}", pd.__version__),
    ("matplotlib version: {}", matplotlib.__version__),
    ("NumPy version: {}", np.__version__),
    ("SciPy version: {}", sp.__version__),
    ("IPython version: {}", IPython.__version__),
    ("scikit-learn version: {}", sk.__version__),
]
for template, version in version_report:
    print(template.format(version))

### Import custom functions from own module

In [None]:
from gb_loader import *
from gb_stats import *
from gb_transform import *
from gb_visualizer import *

### List and set the working directory and the directory to write out data

In [None]:
# Topic of the Notebook which will also be the name of the subfolder containing results
TOPIC = 'HFOV_VG'

# Name of the external hard drive
DRIVE = 'GUSZTI'

# Directory containing clinical and blood gas data
CWD = '/Users/guszti/ventilation_draeger'

# Directory on external drive to read the ventilation data from
DIR_READ = '/Volumes/%s/ventilation_data' % DRIVE

# Directory on the external drive holding the pickle archives dumped by HFOV_all.ipynb
DIR_READ_2 = '/Volumes/%s/data_dump/draeger/%s' % (DRIVE, 'HFOV_all')

# Directory to write results and selected images to.
# exist_ok=True creates the folder only when it is missing, without a separate
# (race-prone) isdir() check; it still raises if the path exists but is not a directory.
DIR_WRITE = '%s/%s/%s' % (CWD, 'Analyses', TOPIC)
os.makedirs(DIR_WRITE, exist_ok=True)

# Images and raw data will be written on an external hard drive
DATA_DUMP = '/Volumes/%s/data_dump/draeger/%s' % (DRIVE, TOPIC)
os.makedirs(DATA_DUMP, exist_ok=True)

In [None]:
# Make CWD the working directory so that relative paths resolve against it
os.chdir(CWD)

In [None]:
# Display the current working directory
os.getcwd()

In [None]:
# Display the directory the ventilation data are read from
DIR_READ

In [None]:
# Display the directory the HFOV_all pickle archives are read from
DIR_READ_2

In [None]:
# Display the directory results are written to
DIR_WRITE

In [None]:
# Display the directory on the external drive that the pickle archives are written to
DATA_DUMP

### Import processed HFOV data from pickle archive

In [None]:
# The HFOV measurements are stored as two pickle archives in DIR_READ_2;
# load each part into its own variable.
# NOTE: pickle.load can execute arbitrary code - only open archives you created yourself.
archive_path = '%s/%s.pickle'

with open(archive_path % (DIR_READ_2, 'slow_measurements_hfov_1'), 'rb') as archive:
    slow_measurements_1 = pickle.load(archive)

with open(archive_path % (DIR_READ_2, 'slow_measurements_hfov_2'), 'rb') as archive:
    slow_measurements_2 = pickle.load(archive)

In [None]:
# Merge the two parts into a single dictionary; if a key occurs in both,
# the entry from the second part wins. Then drop the now redundant parts.
slow_measurements = dict(slow_measurements_1)
slow_measurements.update(slow_measurements_2)
del slow_measurements_1, slow_measurements_2

In [None]:
# Load the selected ventilator settings archive from DIR_READ_2
settings_path = '%s/%s.pickle' % (DIR_READ_2, 'vent_settings_selected_hfov')
with open(settings_path, 'rb') as archive:
    vent_settings_selected = pickle.load(archive)

In [None]:
# Load the clinical details archive from DIR_READ_2
clinical_path = '%s/%s.pickle' % (DIR_READ_2, 'clinical_details_hfov')
with open(clinical_path, 'rb') as archive:
    clinical_details = pickle.load(archive)

In [None]:
# Recording names (the keys of slow_measurements) in sorted order
recordings = sorted(slow_measurements)
print(recordings)

In [None]:
# Number of recordings loaded
len(slow_measurements)

### Keep only recordings which have HFOV-VG mode

In [None]:
# Keep only the recordings whose ventilator settings contain 'Ampl hf max'
# in the 'Name' column, then drop every other recording from slow_measurements.
# NOTE(review): later cells look for 'Ampl hf max' in the 'Id' column instead -
# confirm that both columns carry this label.
recordings = [
    recording for recording in recordings
    if 'Ampl hf max' in vent_settings_selected[recording]['Name'].values
]

selected = set(recordings)
for recording in [key for key in slow_measurements if key not in selected]:
    del slow_measurements[recording]

In [None]:
# Number of recordings kept by the 'Ampl hf max' filter
len(recordings)

In [None]:
# Names of the recordings kept by the 'Ampl hf max' filter
print(recordings)

### Remove recordings which are < 12 hours long

In [None]:
# Drop recordings with fewer than 12 * 3600 rows
# (12 hours, assuming one row per second - TODO confirm sampling rate)
too_short = [recording for recording in recordings
             if len(slow_measurements[recording]) < 12 * 3600]
for recording in too_short:
    del slow_measurements[recording]

recordings = sorted(slow_measurements)
print(recordings)

In [None]:
# Number of recordings left after removing those shorter than 12 hours
len(recordings)

### Keep only one recording per patient

In [None]:
# Hand-picked recordings to discard so that each patient contributes one recording;
# a KeyError here means a listed recording was already filtered out earlier.
to_remove = ['DG005_3', 'DG038_2', 'DG040_2']
for recording in to_remove:
    del slow_measurements[recording]

recordings = sorted(slow_measurements)
print(recordings)

In [None]:
# Number of recordings left after keeping one recording per patient
len(recordings)

### Which of the recordings also contain periods without volume guarantee?

In [None]:
# Collect the recordings whose settings contain an 'Ampl hf' entry in the 'Id' column
recordings_with_no_vg = []
for recording in recordings:
    setting_ids = vent_settings_selected[recording]['Id'].unique()
    if 'Ampl hf' in setting_ids:
        recordings_with_no_vg.append(recording)

print(recordings_with_no_vg)

### Keep only HFOV-VG periods

This is better done by manual inspection of files than by writing a complicated script

In [None]:
# Manually determined (start, end) timestamps of the period to keep in each
# recording; None leaves that side of the slice open.
vg_periods = {
    'DG005_1': ('2015-10-13 14:48:34', '2015-10-14 19:57:07'),
    'DG018_1': ('2015-12-13 01:22:29', None),
    'DG032_2': (None, '2016-03-24 00:57:00'),
    'DG038_1': ('2016-05-06 20:31:56', '2016-05-11 11:21:15'),
    'DG040_1': ('2016-06-09 17:12:57', '2016-06-11 17:45:14'),
    'DG049': ('2016-09-02 08:19:34', None),
    'DG050': ('2016-09-06 00:23:02', None),
    'DG053': ('2016-10-15 10:32:44', None),
    'DG062': ('2017-07-21 18:41:57', '2017-07-21 18:49:14'),
}

# Replace each listed recording with an independent copy of its selected period
for recording, (start, end) in vg_periods.items():
    slow_measurements[recording] = slow_measurements[recording][start:end].copy()

In [None]:
# Duration of each recording in hours (row count / 3600, rounded to 2 decimals),
# as a table indexed by recording name
recording_times = DataFrame(
    [(recording, round(len(slow_measurements[recording]) / 3600, 2))
     for recording in recordings],
    columns=['recording', 'duration (hours)'],
).set_index('recording')
recording_times

### Remove DG062 as its VG component is only a couple minutes

In [None]:
# Remove DG062 from the analysis (its VG period was trimmed to a window of a few minutes)
del slow_measurements['DG062']

In [None]:
# Refresh the sorted list of recording names after the removal
recordings = sorted(slow_measurements)

In [None]:
# Names of the recordings in the final selection
print(sorted(slow_measurements))

In [None]:
# Number of recordings in the final selection
len(slow_measurements)

### Calculate duration and length of the final HFOV-VG recordings

In [None]:
# For every recording store the first and last index value as strings: [start, end]
recording_periods = {}
for recording in recordings:
    timestamps = slow_measurements[recording].index
    recording_periods[recording] = [str(timestamps[0]), str(timestamps[-1])]

In [None]:
# Tabulate the periods: one column per recording, rows labelled 'start' and 'end'
recording_duration_frame = DataFrame(recording_periods, index = ['start', 'end'])
recording_duration_frame

In [None]:
# Duration of each selected recording in hours (row count / 3600, rounded to
# 2 decimals), as a table indexed by recording name
recording_times_VG = DataFrame(
    [(recording, round(len(slow_measurements[recording]) / 3600, 2))
     for recording in recordings],
    columns=['recording', 'duration (hours)'],
).set_index('recording')
recording_times_VG

In [None]:
# Total number of rows over all selected recordings, reported as seconds,
# whole hours and days (one row is treated as one second)
recording_time_total_VG = sum(len(slow_measurements[recording]) for recording in recordings)

print('Total recording time is %d seconds' % recording_time_total_VG)
print('Total recording time is %d hours' % (recording_time_total_VG / 3600))
print('Total recording time is %.2f days' % (recording_time_total_VG / 86400))

### Combine slow_measurement DataFrames into one 

In [None]:
# Stack the per-recording DataFrames (in the order of `recordings`) into one DataFrame
total = [slow_measurements[recording] for recording in recordings]
slow_measurements_all = pd.concat(total)

In [None]:
# Total number of rows in the combined DataFrame
len(slow_measurements_all)

In [None]:
# How many days of recording in total?
# Row count -> hours (/ 3600) -> days (/ 24); assumes one row per second

len(slow_measurements_all) / 3600 /24

In [None]:
# Summary of the columns, dtypes and non-null counts of the combined DataFrame
slow_measurements_all.info()

In [None]:
# First rows of the combined DataFrame
slow_measurements_all.head()

### Missing data

In [None]:
# Percentage of missing values per column (3 decimals), shown in ascending order
missing = slow_measurements_all.isna().sum()
missing_pc = ((missing / len(slow_measurements_all)) * 100).round(3)
missing_pc.sort_values()

This is a very low percentage. There is no need to remove any parameter (column). Instead, remove the rows with missing data.

In [None]:
# Remove every row that has at least one missing value from each recording,
# rebuild the combined DataFrame and report how many rows were lost.
rows_before = len(slow_measurements_all)
print('Before removal: %d rows' % rows_before)

for recording in recordings:
    slow_measurements[recording] = slow_measurements[recording].dropna(axis=0, how='any')

total = [slow_measurements[recording] for recording in recordings]
slow_measurements_all = pd.concat(total)

rows_after = len(slow_measurements_all)
rows_removed = rows_before - rows_after
print('After removal: %d rows' % rows_after)
print('Removed %d rows' % rows_removed)
print('Removed %.2f percent of the data' % (100 * rows_removed / rows_before))

### Limit ventilator settings to the recordings containing HFOV-VG

In [None]:
# Keep the ventilator settings only for the recordings still in `recordings`,
# preserving the original key order
settings_kept = {}
for name, settings in vent_settings_selected.items():
    if name in recordings:
        settings_kept[name] = settings
vent_settings_selected = settings_kept

### Retrieve relevant ventilator settings and add them to HFOV data

**dPmax**, and **VThf**

In [None]:
# For each recording take the setting changes with Id 'Ampl hf max', expose the
# new value as 'dPmax_set', and align it with the measurement timestamps.
dP_set_VG = {}
for recording in recordings:
    settings = vent_settings_selected[recording]
    ampl_max = settings.loc[settings.Id == 'Ampl hf max', ['Date_Time', 'Value New']]
    ampl_max = ampl_max.rename(columns={'Value New': 'dPmax_set'})
    # Reindexing on the slow_measurements index with forward fill gives one row
    # per measurement row carrying the most recent setting, so the two frames
    # can be concatenated column-wise afterwards.
    dP_set_VG[recording] = ampl_max.reindex(slow_measurements[recording].index, method='ffill')

In [None]:
# For each recording take the setting changes with Id 'VThf', expose the new
# value as 'VThf_set', and forward-fill it onto the measurement timestamps.
VThf_set = {}
for recording in recordings:
    settings = vent_settings_selected[recording]
    vthf_changes = settings.loc[settings.Id == 'VThf', ['Date_Time', 'Value New']]
    vthf_changes = vthf_changes.rename(columns={'Value New': 'VThf_set'})
    VThf_set[recording] = vthf_changes.reindex(slow_measurements[recording].index, method='ffill')

In [None]:
# Put the two set-value columns side by side for each recording
# (inner join on the index)
set_values = {
    recording: pd.concat(
        [dP_set_VG[recording]['dPmax_set'], VThf_set[recording]['VThf_set']],
        join='inner', axis=1,
    )
    for recording in recordings
}

In [None]:
# Append the set-value columns to the measurements of each recording
# (column-wise concatenation, inner join on the index)
for recording in recordings:
    measured_and_set = [slow_measurements[recording], set_values[recording]]
    slow_measurements[recording] = pd.concat(measured_and_set, join='inner', axis=1)

### Create additional features by normalizing parameters to the body weight or the square of the body weight and also calculate VThf2

In [None]:
# Add weight-normalized parameters to the 1/sec data:
# each new column is the source column divided by the 'weight' column.
per_kg_columns = {
    'VThf_kg': 'VThf',
    'VThf_set_kg': 'VThf_set',
    'MV_kg': 'MV',
    'MVi_kg': 'MVi',
    'MVe_kg': 'MVe',
    'MVleak_kg': 'MVleak',
}

for recording in recordings:
    frame = slow_measurements[recording]
    for normalized, source in per_kg_columns.items():
        frame[normalized] = frame[source] / frame['weight']

In [None]:
# DCO2 normalized to the square of the body weight
for recording in recordings:
    frame = slow_measurements[recording]
    frame['DCO2_kg2'] = frame['DCO2'] / (frame['weight'] ** 2)

### Calculate difference between the set and actual VThf in case of VG

In [None]:
# Absolute difference between the set and the measured weight-normalized VThf
for recording in recordings:
    frame = slow_measurements[recording]
    frame['VThf_diff_kg'] = (frame['VThf_set_kg'] - frame['VThf_kg']).abs()

### Calculate difference between the set and actual amplitude

In [None]:
# Absolute difference between the set maximum amplitude and the measured amplitude
for recording in recordings:
    frame = slow_measurements[recording]
    frame['dP_diff'] = (frame['dPmax_set'] - frame['amplitude']).abs()

### Check distribution of VThf data in detail

In [None]:
# Rebuild the combined DataFrame first: it was last assembled before the set
# values and the weight-normalized columns (e.g. 'VThf_kg') were added to the
# per-recording DataFrames, and pd.concat copies, so the stale combined frame
# would not contain those columns.
slow_measurements_all = pd.concat([slow_measurements[recording] for recording in recordings])

# Split VThf/kg into 10 quantile-based bins and count the rows per bin
qcats_VThf_kg = pd.qcut(slow_measurements_all['VThf_kg'], q=10)
qcats_VThf_kg.value_counts().sort_index()

In [None]:
# Count rows per fixed-width VThf/kg bin: [0, 2), [2, 4), ... [18, 20), plus a
# catch-all [20, 500) bin; intervals are closed on the left
bins = [*range(0, 22, 2), 500]
cats_VThf_kg = pd.cut(slow_measurements_all['VThf_kg'], bins, right=False)
cats_VThf_kg.value_counts().sort_index()

### Remove rows with VThf > 6 mL/kg 

These are clearly outliers probably reflecting an open ventilator circuit

In [None]:
# Keep only the rows with VThf_kg <= 6 in every recording, rebuild the combined
# DataFrame and report how many rows were removed.
rows_before = len(slow_measurements_all)
print('Before removal: %d rows' % rows_before)

for recording in recordings:
    frame = slow_measurements[recording]
    slow_measurements[recording] = frame.loc[frame['VThf_kg'] <= 6]

total = [slow_measurements[recording] for recording in recordings]
slow_measurements_all = pd.concat(total)

rows_after = len(slow_measurements_all)
rows_removed = rows_before - rows_after
print('After removal: %d rows' % rows_after)
print('Removed %d rows' % rows_removed)
print('Removed %.2f percent of rows' % (rows_removed / rows_before * 100))

In [None]:
# Number of remaining rows per recording
# NOTE(review): relies on a 'recording' column (or index level) being present
# in the measurement data - confirm against the upstream notebook.
recs = slow_measurements_all.groupby(by='recording')
recs.size()

### Limit clinical details to the selected HFOV-VG recordings

In [None]:
# Select the clinical_details rows whose index labels are the selected recordings,
# in the order of `recordings`
clinical_details = clinical_details.loc[recordings]

### Export processed data to pickle archive

In [None]:
# Write the processed per-recording measurements to DATA_DUMP as a pickle archive
measurements_path = '%s/%s.pickle' % (DATA_DUMP, 'slow_measurements_hfov_vg')
with open(measurements_path, 'wb') as archive:
    pickle.dump(slow_measurements, archive, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Write the ventilator settings of the selected recordings to DATA_DUMP as a pickle archive
settings_path = '%s/%s.pickle' % (DATA_DUMP, 'vent_settings_selected_hfov_vg')
with open(settings_path, 'wb') as archive:
    pickle.dump(vent_settings_selected, archive, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Write the clinical details of the selected recordings to DATA_DUMP as a pickle archive
clinical_path = '%s/%s.pickle' % (DATA_DUMP, 'clinical_details_hfov_vg')
with open(clinical_path, 'wb') as archive:
    pickle.dump(clinical_details, archive, protocol=pickle.HIGHEST_PROTOCOL)

### Export ventilation settings, clinical details and recording durations as Excel files

In [None]:
# Write the ventilator settings to one Excel workbook, one sheet per recording.
# The context manager saves and closes the workbook on exit; ExcelWriter.save()
# no longer exists in pandas >= 2.0, and sheet_name must be passed by keyword
# in recent pandas versions.
with pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'ventilator_settings_selected.xlsx')) as writer:
    for recording in recordings:
        vent_settings_selected[recording].to_excel(writer, sheet_name='%s' % recording)

In [None]:
# Write the clinical details to an Excel workbook with a single sheet.
# The context manager saves and closes the workbook on exit; ExcelWriter.save()
# no longer exists in pandas >= 2.0.
with pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'clinical_details.xlsx')) as writer:
    clinical_details.to_excel(writer, sheet_name='clinical_details')

In [None]:
# Write the start/end timestamps (transposed: one row per recording) to an
# Excel workbook. The context manager saves and closes the workbook on exit;
# ExcelWriter.save() no longer exists in pandas >= 2.0.
# NOTE(review): the sheet name 'rec_periods.xlsx' looks like a file name; it is
# kept unchanged in case downstream code reads the sheet by this name.
with pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'recording_periods.xlsx')) as writer:
    recording_duration_frame.T.to_excel(writer, sheet_name='rec_periods.xlsx')