![alt text](./pageheader_rose2_babies.jpg)

# Preprocessing HFOV data

#### Author: Dr Gusztav Belteki

Contact: gbelteki@aol.com

This notebook considers all the recordings from **both ventilator service evaluations** but only keeps the data which contain HFOV ventilation. It performs preprocessing (e.g. resampling to 1 second periods, normalising relevant parameters to birth weight, removing irrelevant parameters, adding some ventilator settings). It then exports the data as pickle archives: *slow_measurements_hfov_1*, *slow_measurements_hfov_2*, *vent_settings_selected_hfov*, *clinical_details_hfov*.

Data processing performed in this notebook:

* Resample data to 1/sec to remove half-empty rows
* Remove non-HFOV periods
* Normalise relevant parameters to the body weight
* Correct the problem with Phf column names
* Retrieve the set frequency (Hz) and add it to the DataFrames
* Add the recording's name to the DataFrames as a categorical variable
* Limit ventilation settings to the duration of the HFOV recordings
* Limit clinical details to the hfov recordings

### Import the necessary libraries and set options

In [None]:
import IPython
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

import os
import sys
import re
import pickle

from scipy import stats
from pandas import Series, DataFrame
from datetime import datetime, timedelta

# Render matplotlib figures inline in the notebook output
%matplotlib inline

# Use matplotlib's 'classic' style with a white figure background
matplotlib.style.use('classic')
matplotlib.rcParams['figure.facecolor'] = 'w'

# Show up to 100 rows and 100 columns when a DataFrame is displayed
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [None]:
# Report the versions of the interpreter and of the main libraries used
library_versions = [
    ("Python", sys.version),
    ("pandas", pd.__version__),
    ("matplotlib", matplotlib.__version__),
    ("NumPy", np.__version__),
    ("SciPy", sp.__version__),
    ("IPython", IPython.__version__),
]
for library, version in library_versions:
    print("{} version: {}".format(library, version))

### Import custom functions from own module

In [None]:
from gb_loader import *
from gb_stats import *
from gb_transform import *
from gb_visualizer import *

### List and set the working directory and the directory to write out data

In [None]:
# Topic of the notebook; it is also the name of the subfolders that receive the results
TOPIC = 'HFOV_all'

# Name of the external hard drive
DRIVE = 'GUSZTI'

# Directory containing clinical and blood gas data
CWD = '/Users/guszti/ventilation_draeger'

# Directories on the external drive to read the ventilation data from
DIR_READ_1 = '/Volumes/%s/Draeger/service_evaluation_old' % DRIVE
DIR_READ_2 = '/Volumes/%s/Draeger/service_evaluation_new' % DRIVE

# Directory to write results and selected images to.
# exist_ok=True creates the folder only when it is missing, without a separate
# (race-prone) os.path.isdir() check, so the cell can be re-run safely.
DIR_WRITE = '%s/%s/%s' % (CWD, 'Analyses', TOPIC)
os.makedirs(DIR_WRITE, exist_ok=True)

# Images and raw data will be written on the external hard drive
DATA_DUMP = '/Volumes/%s/data_dump/draeger/%s' % (DRIVE, TOPIC)
os.makedirs(DATA_DUMP, exist_ok=True)

In [None]:
# Make CWD the working directory and display it to confirm the change
os.chdir(CWD)
os.getcwd()

In [None]:
# Display the directory that results are written to
DIR_WRITE

In [None]:
# Display the directory on the external drive that the data dump is written to
DATA_DUMP

### List of the recordings

In [None]:
# Names of all recordings. They follow the pattern 'DGnnn'; where a case number has
# several recordings they are told apart by a '_<n>' suffix (e.g. 'DG005_1' .. 'DG005_3').

def _recording_names(first, last, parts):
    """Return the recording names for case numbers first..last (inclusive).

    `parts` maps a case number to its number of suffixed recordings; case numbers
    that are not listed contribute a single name without suffix.
    """
    names = []
    for number in range(first, last + 1):
        stem = 'DG%03d' % number
        if number in parts:
            names.extend('%s_%d' % (stem, part) for part in range(1, parts[number] + 1))
        else:
            names.append(stem)
    return names


# Recordings read from DIR_READ_1
recordings_1 = _recording_names(
    1, 60, {2: 2, 5: 3, 6: 3, 18: 2, 32: 2, 38: 2, 40: 2, 46: 2, 51: 2})

# Recordings read from DIR_READ_2
recordings_2 = _recording_names(61, 90, {67: 2, 69: 2})

recordings = recordings_1 + recordings_2

In [None]:
# Total number of recordings across both service evaluations
len(recordings)

### Import clinical details

In [None]:
# Clinical details of both service evaluations, indexed by recording name.
# drop=False keeps 'Recording' as an ordinary column in addition to the index.
clinical_details = pd.read_excel(
    '%s/data_grabber_patient_data_combined_all.xlsx' % CWD
).set_index('Recording', drop=False)

In [None]:
# Keep only the rows of the listed recordings, selected by index label
clinical_details = clinical_details.loc[recordings, :]

In [None]:
# The first 10 characters of 'Recording period' hold the start date (day first).
# The .str accessor leaves missing entries as NaN/NaT instead of raising.
clinical_details['Recording start'] = pd.to_datetime(
    clinical_details['Recording period'].str[:10], dayfirst=True)

# Postnatal age as a Timedelta
clinical_details['Postnatal age'] = (
    clinical_details['Recording start'] - clinical_details['Date of birth'])

# Corrected gestation = gestation + postnatal age expressed in (fractional) weeks.
# Dividing by a one-week Timedelta replaces the former cast of the Timedelta to integer
# nanoseconds, which breaks on missing values (NaT) and depends on the platform's int width.
clinical_details['Corrected gestation'] = (
    clinical_details['Gestation']
    + clinical_details['Postnatal age'] / pd.Timedelta(weeks=1))

In [None]:
# Summary of the columns, non-null counts and dtypes of the clinical details
clinical_details.info()

In [None]:
# 'Current weight' of every recording divided by 1000
# (presumably grams -> kilograms; TODO confirm the unit used in the spreadsheet)
current_weights = {
    recording: clinical_details.loc[recording, 'Current weight'] / 1000
    for recording in recordings
}

### Import ventilator modes 

In [None]:
# Load the ventilator mode files of the recordings stored under DIR_READ_1
vent_modes_1 = {}

for recording in recordings_1:
    folder = '%s/%s' % (DIR_READ_1, recording)
    # Hidden files (names starting with '.') on the hard drive must be ignored
    visible = [name for name in os.listdir(folder) if not name.startswith('.')]
    # slow_text_finder is a custom helper that picks the relevant file names
    paths = ['%s/%s/%s' % (DIR_READ_1, recording, name)
             for name in slow_text_finder(visible)]
    vent_modes_1[recording] = data_loader(paths)

In [None]:
# Load the ventilator mode files of the recordings stored under DIR_READ_2
vent_modes_2 = {}

for recording in recordings_2:
    folder = '%s/%s' % (DIR_READ_2, recording)
    # Hidden files (names starting with '.') on the hard drive must be ignored
    visible = [name for name in os.listdir(folder) if not name.startswith('.')]
    # slow_text_finder is a custom helper that picks the relevant file names
    paths = ['%s/%s/%s' % (DIR_READ_2, recording, name)
             for name in slow_text_finder(visible)]
    vent_modes_2[recording] = data_loader(paths)

In [None]:
# Combine both service evaluations into one dictionary keyed by recording name
vent_modes = dict(vent_modes_1)
vent_modes.update(vent_modes_2)

In [None]:
# Output of the custom vent_mode_cleaner helper for every recording
# (presumably only the important mode parameters are kept; see its definition)
vent_modes_selected = {
    recording: vent_mode_cleaner(vent_modes[recording])
    for recording in recordings
}

### Import ventilator settings

In [None]:
# Load the ventilator setting files of the recordings stored under DIR_READ_1
vent_settings_1 = {}

for recording in recordings_1:
    folder = '%s/%s' % (DIR_READ_1, recording)
    # Hidden files (names starting with '.') on the hard drive must be ignored
    visible = [name for name in os.listdir(folder) if not name.startswith('.')]
    # slow_setting_finder is a custom helper that picks the relevant file names
    paths = ['%s/%s/%s' % (DIR_READ_1, recording, name)
             for name in slow_setting_finder(visible)]
    vent_settings_1[recording] = data_loader(paths)

In [None]:
# Load the ventilator setting files of the recordings stored under DIR_READ_2
vent_settings_2 = {}

for recording in recordings_2:
    folder = '%s/%s' % (DIR_READ_2, recording)
    # Hidden files (names starting with '.') on the hard drive must be ignored
    visible = [name for name in os.listdir(folder) if not name.startswith('.')]
    # slow_setting_finder is a custom helper that picks the relevant file names
    paths = ['%s/%s/%s' % (DIR_READ_2, recording, name)
             for name in slow_setting_finder(visible)]
    vent_settings_2[recording] = data_loader(paths)

In [None]:
# Combine both service evaluations into one dictionary keyed by recording name
vent_settings = dict(vent_settings_1)
vent_settings.update(vent_settings_2)

In [None]:
# Output of the custom vent_settings_cleaner helper for every recording
# (presumably only the important setting parameters are kept; see its definition)
vent_settings_selected = {
    recording: vent_settings_cleaner(vent_settings[recording])
    for recording in recordings
}

### Identify recordings that have HFOV periods

In [None]:
# 'recordings_hfov' is the list of recording names that contain HFOV ventilation
# periods: a recording qualifies when the 'Text' column of its selected ventilator
# modes contains the entry ' Mode PC-HFO' (note the leading space).
recordings_hfov = [
    recording for recording in recordings
    if ' Mode PC-HFO' in vent_modes_selected[recording]['Text'].values
]

In [None]:
# Number of recordings that contain HFOV periods
len(recordings_hfov)

In [None]:
# Names of the recordings that contain HFOV periods
print(recordings_hfov)

In [None]:
# Split the HFOV recordings by the service evaluation they belong to
first_evaluation = set(recordings_1)
second_evaluation = set(recordings_2)
recordings_hfov_1 = [name for name in recordings_hfov if name in first_evaluation]
recordings_hfov_2 = [name for name in recordings_hfov if name in second_evaluation]

### Import ventilator parameters obtained with 1Hz sampling rate ("slow measurements")

In [None]:
# Load the slow measurement files of the HFOV recordings stored under DIR_READ_1
slow_measurements_1 = {}

for recording in recordings_hfov_1:
    folder = '%s/%s' % (DIR_READ_1, recording)
    # Hidden files (names starting with '.') on the hard drive must be ignored
    visible = [name for name in os.listdir(folder) if not name.startswith('.')]
    # slow_measurement_finder is a custom helper that picks the relevant file names
    files = slow_measurement_finder(visible)
    print('Loading recording %s' % recording)
    print(files)
    slow_measurements_1[recording] = data_loader(
        ['%s/%s/%s' % (DIR_READ_1, recording, name) for name in files])

In [None]:
# Load the slow measurement files of the HFOV recordings stored under DIR_READ_2
slow_measurements_2 = {}

for recording in recordings_hfov_2:
    folder = '%s/%s' % (DIR_READ_2, recording)
    # Hidden files (names starting with '.') on the hard drive must be ignored
    visible = [name for name in os.listdir(folder) if not name.startswith('.')]
    # slow_measurement_finder is a custom helper that picks the relevant file names
    files = slow_measurement_finder(visible)
    print('Loading recording %s' % recording)
    print(files)
    slow_measurements_2[recording] = data_loader(
        ['%s/%s/%s' % (DIR_READ_2, recording, name) for name in files])

In [None]:
# Combine both service evaluations into one dictionary keyed by recording name
slow_measurements = dict(slow_measurements_1)
slow_measurements.update(slow_measurements_2)

### Resample to remove half-empty rows

In [None]:
%%time

# Average the data into one-second bins so that half-empty rows collapse into one row.
# The lower-case alias '1s' is used because the upper-case 'S' alias is deprecated
# in recent pandas releases; both denote seconds.
for recording in recordings_hfov:
    slow_measurements[recording] = slow_measurements[recording].resample('1s').mean()

### Remove non-HFOV periods

In [None]:
# Rows without a DCO2 value are removed - this keeps HFOV periods only
# in 'slow_measurements'

for recording in recordings_hfov:
    frame = slow_measurements[recording]
    print(recording)
    print('before_removal: %d seconds' % len(frame))
    # Only the DCO2 column is inspected for missing values; the frame is changed in place
    frame.dropna(subset=['5001|DCO2 [10*mL^2/s]'], inplace=True)
    print('after_removal:  %d seconds' % len(frame), '\n')

### Add current body weight to the columns

In [None]:
# Store the recording's current weight as a constant 'weight' column
for recording in recordings_hfov:
    frame = slow_measurements[recording]
    frame['weight'] = current_weights[recording]

### Correct DCO2 data 

In [None]:
# In the downloaded data DCO2 is expressed as 1/10th of the DCO2 reading (see the
# original column label), so the values are multiplied by 10 and stored in a new column

for recording in recordings_hfov:
    frame = slow_measurements[recording]
    frame['5001|DCO2 [mL^2/s]'] = 10 * frame['5001|DCO2 [10*mL^2/s]']

### Correct the problem with Phf column names

The amplitude column is named differently in some recordings: *DG025* has only '5001|ΔPhf [mbar]', while *DG005_1* and *DG069_2* have both '5001|ΔPhf [mbar]' and '5001|?Phf [mbar]'.

In [None]:
# Recordings whose amplitude column is labelled '5001|?Phf [mbar]'
a = []
for recording in recordings_hfov:
    if '5001|?Phf [mbar]' in slow_measurements[recording].columns:
        a.append(recording)
print(a)

In [None]:
# Recordings whose amplitude column is labelled '5001|ΔPhf [mbar]'
b = []
for recording in recordings_hfov:
    if '5001|ΔPhf [mbar]' in slow_measurements[recording].columns:
        b.append(recording)
print(b)

##### Correct DG025

In [None]:
# DG025 only has the 'ΔPhf' label: copy it under the '?Phf' label used by the other recordings
dg025 = slow_measurements['DG025']
dg025['5001|?Phf [mbar]'] = dg025['5001|ΔPhf [mbar]']

##### Correct DG005_1

In [None]:
# Value counts (missing values included) of the 'ΔPhf' column; the trailing ';' hides the output
slow_measurements['DG005_1']['5001|ΔPhf [mbar]'].value_counts(dropna = False);

In [None]:
# Number of non-missing values in the 'ΔPhf' column
sum(slow_measurements['DG005_1']['5001|ΔPhf [mbar]'].notnull())

In [None]:
# Value counts (missing values included) of the '?Phf' column; the trailing ';' hides the output
slow_measurements['DG005_1']['5001|?Phf [mbar]'].value_counts(dropna = False);

In [None]:
# Number of non-missing values in the '?Phf' column before the two columns are merged
sum(slow_measurements['DG005_1']['5001|?Phf [mbar]'].notnull())

In [None]:
# Merge the two amplitude columns of DG005_1 into '5001|?Phf [mbar]':
# values already present under '?Phf' are kept and the gaps are filled from 'ΔPhf'.
# combine_first works row by row, so it also copes with a row that has a value in
# both columns (concatenating the columns and reindexing would raise on the
# duplicated index label in that case).
dg005_1 = slow_measurements['DG005_1']
dg005_1['5001|?Phf [mbar]'] = \
    dg005_1['5001|?Phf [mbar]'].combine_first(dg005_1['5001|ΔPhf [mbar]'])

In [None]:
# Number of non-missing values in the '?Phf' column after the merge
sum(slow_measurements['DG005_1']['5001|?Phf [mbar]'].notnull())

##### Correct DG069_2

In [None]:
# Value counts (missing values included) of the 'ΔPhf' column; the trailing ';' hides the output
slow_measurements['DG069_2']['5001|ΔPhf [mbar]'].value_counts(dropna = False);

In [None]:
# Number of non-missing values in the 'ΔPhf' column
sum(slow_measurements['DG069_2']['5001|ΔPhf [mbar]'].notnull())

In [None]:
# Value counts (missing values included) of the '?Phf' column; the trailing ';' hides the output
slow_measurements['DG069_2']['5001|?Phf [mbar]'].value_counts(dropna = False);

In [None]:
# Number of non-missing values in the '?Phf' column before the two columns are merged
sum(slow_measurements['DG069_2']['5001|?Phf [mbar]'].notnull())

In [None]:
# Merge the two amplitude columns of DG069_2 into '5001|?Phf [mbar]':
# values already present under '?Phf' are kept and the gaps are filled from 'ΔPhf'.
# combine_first works row by row, so it also copes with a row that has a value in
# both columns (concatenating the columns and reindexing would raise on the
# duplicated index label in that case).
dg069_2 = slow_measurements['DG069_2']
dg069_2['5001|?Phf [mbar]'] = \
    dg069_2['5001|?Phf [mbar]'].combine_first(dg069_2['5001|ΔPhf [mbar]'])

In [None]:
# Number of non-missing values in the '?Phf' column after the merge
sum(slow_measurements['DG069_2']['5001|?Phf [mbar]'].notnull())

### Rename columns and remove unimportant columns

In [None]:
# Mapping from the "clumsy" original column names to simple ones

rename_dict = {
    '5001|% leak [%]': 'leak%',
    '5001|?Phf [mbar]': 'amplitude',
    '5001|FiO2 [%]': 'fiO2',
    '5001|FlowDev [L/min]': 'flow',
    '5001|MV [L/min]': 'MV',
    '5001|MVe [L/min]': 'MVe',
    '5001|MVi [L/min]': 'MVi',
    '5001|MVleak [L/min]': 'MVleak',
    '5001|Pmean [mbar]': 'MAP',
    '5001|VThf [mL]': 'VThf',
    '5001|DCO2 [mL^2/s]': 'DCO2',
}

# The old and new names as parallel lists, in the order of the mapping
old = list(rename_dict.keys())
new = list(rename_dict.values())

In [None]:
# Give the selected columns their simple names, then remove every column whose
# name still starts with '5001' or '8272' (i.e. those that were not renamed)

for recording in recordings_hfov:
    slow_measurements[recording].rename(columns=rename_dict, inplace=True)
    to_delete = [par for par in slow_measurements[recording].columns
                 if par.startswith(('5001', '8272'))]
    slow_measurements[recording] = slow_measurements[recording].drop(columns=to_delete)

### Retrieve the set frequency (Hz) and add it to the DataFrames

In [None]:
freq = {}
for recording in recordings_hfov:
    settings = vent_settings[recording]
    # Keep only the setting rows whose Id is 'fhf' and expose their new value as 'frequency'
    fhf = settings[settings['Id'] == 'fhf'].copy()
    fhf['frequency'] = fhf['Value New']
    # reindex(..., method='ffill') requires a monotonic index without duplicate labels.
    # Sort with a stable algorithm and, where several rows share an index label, keep
    # the last one. Both steps leave an already sorted, unique index unchanged.
    fhf = fhf.sort_index(kind='mergesort')
    fhf = fhf[~fhf.index.duplicated(keep='last')]
    # Reindexing with the index of slow_measurements allows concatenation, with the
    # most recent frequency setting carried forward to every row of slow_measurements
    freq[recording] = fhf.reindex(slow_measurements[recording].index, method='ffill')

In [None]:
# Append the columns of the frequency frame to the measurements, matching rows on the index
for recording in recordings_hfov:
    combined = pd.concat(
        [slow_measurements[recording], freq[recording]], axis=1, join='inner')
    slow_measurements[recording] = combined

In [None]:
# Remove the columns listed below in place (presumably the remaining columns of the
# settings table that came along with 'frequency'; verify against the loader)
settings_columns = ['Time [ms]', 'Rel.Time [s]', 'Value Old',
                    'Value New', 'Date_Time', 'Date', 'Time',
                    'Id', 'Name', 'Unit']

for recording in recordings_hfov:
    slow_measurements[recording].drop(columns=settings_columns, inplace=True)

### Final lengths of the preprocessed recordings

In [None]:
# Print the number of rows (one per second after resampling) left in every recording
print('Length of the recordings in seconds: \n')
for recording in recordings_hfov:
    # recording name and row count, each left-aligned in a 10-character wide field
    print('%-10s %-10.d' % (recording, len(slow_measurements[recording])))

In [None]:
# First and last index label of every preprocessed recording, stored as strings
recording_periods = {
    recording: [str(slow_measurements[recording].index[0]),
                str(slow_measurements[recording].index[-1])]
    for recording in recordings_hfov
}

In [None]:
# One column per recording, with the rows 'start' and 'end'
recording_duration_frame = pd.DataFrame(
    recording_periods,
    index=['start', 'end'],
)
recording_duration_frame

### Add the recording's name to the DataFrames as a categorical variable

In [None]:
# Store the recording's name in a constant 'recording' column of its DataFrame
for recording in recordings_hfov:
    slow_measurements[recording]['recording'] = recording 

### Limit ventilation settings to HFOV recordings

In [None]:
# Keep the selected settings of the recordings containing HFOV only

vent_settings_selected = {
    recording: vent_settings_selected[recording]
    for recording in vent_settings_selected
    if recording in recordings_hfov
}

In [None]:
# Sort the selected settings of DG089 by index, in place.
# NOTE(review): only this recording is sorted - presumably its settings are not in
# index order in the source files; confirm. Raises KeyError if DG089 is not an HFOV recording.
vent_settings_selected['DG089'].sort_index(inplace = True)

### Limit clinical details to the hfov recordings

In [None]:
# Clinical details of the recordings containing HFOV, in the order of 'recordings_hfov'
clinical_details_hfov = clinical_details.loc[recordings_hfov]

### Export data to pickle files

In [None]:
# Split the HFOV recordings into two halves so that the data go into two pickle files
half = len(recordings_hfov) // 2
rec1, rec2 = recordings_hfov[:half], recordings_hfov[half:]

In [None]:
# First half of the preprocessed HFOV recordings.
# A dedicated name is used instead of rebinding 'slow_measurements_1', which holds the
# raw data of the first service evaluation earlier in the notebook; overwriting it
# would make re-running the earlier cells silently work on the wrong data.
slow_measurements_hfov_1 = {key: value for key, value in slow_measurements.items()
                            if key in rec1}
with open('%s/%s.pickle' % (DATA_DUMP, 'slow_measurements_hfov_1'), 'wb') as handle:
    pickle.dump(slow_measurements_hfov_1, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Second half of the preprocessed HFOV recordings.
# A dedicated name is used instead of rebinding 'slow_measurements_2', which holds the
# raw data of the second service evaluation earlier in the notebook; overwriting it
# would make re-running the earlier cells silently work on the wrong data.
slow_measurements_hfov_2 = {key: value for key, value in slow_measurements.items()
                            if key in rec2}
with open('%s/%s.pickle' % (DATA_DUMP, 'slow_measurements_hfov_2'), 'wb') as handle:
    pickle.dump(slow_measurements_hfov_2, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Selected ventilator settings of the HFOV recordings
settings_pickle = '%s/%s.pickle' % (DATA_DUMP, 'vent_settings_selected_hfov')
with open(settings_pickle, 'wb') as handle:
    pickle.dump(vent_settings_selected, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Clinical details of the HFOV recordings
clinical_pickle = '%s/%s.pickle' % (DATA_DUMP, 'clinical_details_hfov')
with open(clinical_pickle, 'wb') as handle:
    pickle.dump(clinical_details_hfov, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Export data as Excel files

In [None]:
# One sheet per HFOV recording with all its ventilator mode data.
# The context manager saves and closes the workbook, even if writing a sheet fails
# (ExcelWriter.save() is no longer available in current pandas releases).
with pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'ventilator_modes.xlsx')) as writer:
    for recording in recordings_hfov:
        vent_modes[recording].to_excel(writer, sheet_name='%s' % recording)

In [None]:
# One sheet per HFOV recording with its selected ventilator mode data.
# The context manager saves and closes the workbook, even if writing a sheet fails
# (ExcelWriter.save() is no longer available in current pandas releases).
with pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'ventilator_modes_selected.xlsx')) as writer:
    for recording in recordings_hfov:
        vent_modes_selected[recording].to_excel(writer, sheet_name='%s' % recording)

In [None]:
# One sheet per HFOV recording with all its ventilator settings.
# The context manager saves and closes the workbook, even if writing a sheet fails
# (ExcelWriter.save() is no longer available in current pandas releases).
with pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'ventilator_settings.xlsx')) as writer:
    for recording in recordings_hfov:
        vent_settings[recording].to_excel(writer, sheet_name='%s' % recording)

In [None]:
# One sheet per HFOV recording with its selected ventilator settings.
# The context manager saves and closes the workbook, even if writing a sheet fails
# (ExcelWriter.save() is no longer available in current pandas releases).
with pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'ventilator_settings_selected.xlsx')) as writer:
    for recording in recordings_hfov:
        vent_settings_selected[recording].to_excel(writer, sheet_name='%s' % recording)

In [None]:
# Clinical details of the HFOV recordings on a single sheet.
# The context manager saves and closes the workbook, even if writing fails
# (ExcelWriter.save() is no longer available in current pandas releases).
with pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'clinical_details.xlsx')) as writer:
    clinical_details_hfov.to_excel(writer, sheet_name='clinical_details')

In [None]:
# Start and end of every preprocessed recording, one row per recording.
# The context manager saves and closes the workbook, even if writing fails
# (ExcelWriter.save() is no longer available in current pandas releases).
# NOTE(review): the sheet name carries an '.xlsx' suffix, which looks unintended; it is
# kept unchanged so that anything reading the sheet by name keeps working.
with pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'recording_periods.xlsx')) as writer:
    recording_duration_frame.T.to_excel(writer, sheet_name='rec_periods.xlsx')