![alt text](./Cerny_logo_1.jpg)

# Analysis of Cerny ventilation recordings: `AL000001 - AL000150`

The data processed and analysed in this Notebook were collected by the **Neonatal Emergency and Transport Service of the Peter Cerny Foundation**, Budapest, Hungary.

**Author: Dr Gusztav Belteki**

____

This notebook imports all ventilator data of these recordings, including ventilator parameters, settings and alarms (0.5 Hz sampling rate) and waveform data (150 Hz sampling rate).

- Total: **150 cases**
- 21 cases were removed as they were less than 15 minutes long (empty or partial recordings) 
- **129 cases** remaining

A dictionary containing the processed ventilation data is exported as a pickle archive: **data_pars_1_150.pickle**

### Importing the necessary libraries and setting options

In [None]:
import IPython
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

import os
import sys
import re
import pickle
import datetime

from scipy import stats
from pandas import Series, DataFrame
from datetime import datetime, timedelta

%matplotlib inline

# Use matplotlib's 'classic' style sheet with a white figure background
matplotlib.style.use('classic')
matplotlib.rcParams['figure.facecolor'] = 'w'

# Allow pandas to display up to 300 rows and 300 columns
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 300)

In [None]:
# Report the interpreter and library versions this notebook is running with
version_report = (
    ("Python version: {}", sys.version),
    ("pandas version: {}", pd.__version__),
    ("matplotlib version: {}", matplotlib.__version__),
    ("NumPy version: {}", np.__version__),
    ("SciPy version: {}", sp.__version__),
    ("IPython version: {}", IPython.__version__),
    ("scikit-learn version: {}", sk.__version__),
)
for template, version in version_report:
    print(template.format(version))

### List and set the working directory and the directory to write out data

In [None]:
# Topic of the Notebook which will also be the name of the subfolder containing results
TOPIC = 'fabian'

# Name of the external hard drive
DRIVE = 'GUSZTI'

# Directory containing clinical and blood gas data
# (the notebook also changes into this directory further down)
CWD = '/Users/guszti/ventilation_fabian'

# Directory on external drive to read the ventilation data from
DIR_READ = '/Volumes/%s/Fabian/fabian_ventilator_data_1_150' % DRIVE

# Directory for text outputs (the logs of abnormal records and values go here)
DIR_WRITE = '%s/%s' % (CWD, 'Analyses')

# Images and raw data will be written on an external hard drive
DATA_DUMP = '/Volumes/%s/data_dump/%s' % (DRIVE, TOPIC)

# Create both output folders up front. exist_ok=True makes the calls no-ops when
# the folders are already there, so the cell can be re-run safely. DIR_WRITE is
# created as well because later cells open files inside it for writing.
os.makedirs(DATA_DUMP, exist_ok=True)
os.makedirs(DIR_WRITE, exist_ok=True)

In [None]:
# Make CWD the working directory so that relative paths resolve against it
os.chdir(CWD)

In [None]:
# Display the current working directory
os.getcwd()

In [None]:
# Display the directory the ventilation data are read from
DIR_READ

In [None]:
# Display the directory the text outputs are written to
DIR_WRITE

In [None]:
# Display the data dump directory on the external drive
DATA_DUMP

### Import ventilation data as text and create a dictionary of the different recordings

In [None]:
# List the entries of the read directory in sorted order. Only names starting
# with 'AL' are kept, which drops hidden and other unrelated files.
cases = sorted(entry for entry in os.listdir(DIR_READ) if entry.startswith('AL'))

In [None]:
# Number of cases found in the read directory
len(cases)

In [None]:
%%time

# Read the full text of every data file into the vent_dict dictionary.
# Keys have the form '<case>_<c>', where <c> is the fifth character from the end
# of the file name.
# NOTE(review): two files of one case sharing that character would overwrite
# each other - confirm the file naming scheme guarantees uniqueness.

vent_dict = {}
for case in cases:
    case_dir = os.path.join(DIR_READ, case)
    flist = os.listdir(case_dir)
    for fle in flist:
        if fle.startswith('.'):  # skip hidden files
            continue
        # The context manager closes the file even if reading it fails
        with open(os.path.join(case_dir, fle), 'r', encoding='latin1') as fle_handle:
            vent_dict['%s_%s' % (case, fle[-5])] = fle_handle.read()

In [None]:
# Number of recording files that were read
len(vent_dict)

### Split recordings into records

In [None]:
%%time

# data_dict maps every recording key to an inner dictionary whose keys are the
# time stamps and whose values are the ventilation data strings
data_dict = {}
for key, value in sorted(vent_dict.items()):
    records = {}
    # Individual records are separated by a newline character; the last element
    # of the split (whatever follows the final newline) is skipped
    for record in value.split('\n')[:-1]:
        try:
            # splitting record to time stamp and ventilation data
            time_stamp, data_str = record.split(';')
        except ValueError:
            # The record does not contain exactly one ';' and cannot be parsed.
            # Only the unpacking error is caught so that other problems
            # (including a keyboard interrupt) are no longer silenced.
            continue
        records[time_stamp] = data_str
    data_dict[key] = records

### Combine data dictionaries from the same cases

In [None]:
%%time

# data_dict_2 maps each case to one dictionary that merges the records of all
# recordings whose key starts with the case name

data_dict_2 = {}
for case in cases:
    merged = {}
    for recording in vent_dict:
        if recording.startswith(case):
            # for duplicate time stamps the recording visited later wins
            merged.update(data_dict[recording])
    data_dict_2[case] = merged

### Separate parameter data and waves data

In [None]:
%%time

# Records containing parameter data start with '<', records containing waves data always have a space (' ')
# after the first byte (two characters in hexadecimal notation).
# Anything else is written to 'abnormal_records.txt' (opened in append mode, so
# re-running the cell adds to the existing log).

with open('%s/%s' % (DIR_WRITE, 'abnormal_records.txt'), 'a') as fileout:

    data_dict_waves = {}
    data_dict_pars = {}
    for case in cases:
        waves = data_dict_waves[case] = {}
        pars = data_dict_pars[case] = {}
        for key, value in sorted(data_dict_2[case].items()):
            if value.startswith('<'):  # These are ventilator parameter slow data
                # removing 13 characters from the beginning and 12 characters
                # from the end (they are not parameters)
                pars[key] = value[13:-12]
            elif value[2:3] == ' ':
                # Waves data have a space character at the third position.
                # Slicing (instead of value[2]) keeps records shorter than three
                # characters from raising IndexError; they are logged below.
                waves[key] = value
            else:
                print('In case %s, record at %s cannot be parsed as waves or parameters:' % (case, key ),
                     file = fileout)
                print(value, '\n', file = fileout)
            

## Parameters data (sampling rate: 0.5Hz)

### Create embedded dictionary for the various parameters and their values

In [None]:
%%time

# Turn the '|'-separated 'code=value' strings of every parameter record into an
# embedded dictionary: data_dict_pars_2[case][time stamp][parameter code] = value.
# Everything that cannot be parsed is logged to 'abnormal_values.txt' and skipped.

with open('%s/%s' % (DIR_WRITE, 'abnormal_values.txt'), 'w') as fileout:

    data_dict_pars_2 = {}
    for case in cases:
        print('Working on %s' % case)
        data_dict_pars_2[case] = {}
        for time_str, values in data_dict_pars[case].items():
            # The last four characters of the time stamp are cut off before parsing
            try:
                time_stamp = datetime.strptime(time_str[:-4], '%Y. %m. %d. %H:%M:%S')
            except ValueError:
                # A malformed time stamp is logged and skipped instead of
                # aborting the import of all cases
                print('Time stamp cannot be parsed:\n%r\n' % time_str, file = fileout)
                continue

            # inner dictionary with the time stamps used as keys
            record = data_dict_pars_2[case][time_stamp] = {}

            for pair in values.split('|'):
                if ',' in pair: # ventilator software version field contains a comma
                    continue

                try:
                    code, value = pair.split('=') # split records into parameter keys and values
                except ValueError:
                    print('Record cannot be unpacked:\n %s, %s\n' % (time_stamp, pair), file = fileout)
                    continue

                if code.startswith('0'): # The codes <10 start with zeros (e.g. 00, 01, 02,...)
                                         # and the leading zeros need to be removed
                    code = code[1:]

                try:
                    parameter = int(code)
                except ValueError:
                    print('Error during coverting value to int:\n%r\n' % code, file = fileout)
                    continue

                if code == '145': # Device ID variant, hexadecimal number
                    record[parameter] = value

                elif code in ['125', '126']:
                    # convert Mode options 1 & 2 to binary number to retrieve bits
                    # NOTE(review): a negative value would give a string starting
                    # with 'b' here - confirm these fields are never negative.
                    try:
                        record[parameter] = bin(int(value))[2:].zfill(14)
                    except ValueError:
                        # previously unguarded: one bad value stopped the whole loop
                        print('Value cannot be converted to int\n:%r\n' % value, file = fileout)
                        continue

                elif '.' in value or value == '0':
                    try:
                        record[parameter] = float(value)
                    except ValueError:
                        print('Value cannot be converted to float\n:%r\n' % value, file = fileout)
                        continue

                else:
                    try:
                        record[parameter] = int(value)
                    except ValueError:
                        print('Value cannot be converted to int\n:%r\n' % value, file = fileout)
                        continue

In [None]:
%%time

# Parameter #125 is 'Mode option 1': its different bits encode different parameters
# Parameter #126 is 'Mode option 2': its different bits encode different parameters
# See Aculink protocol for more details
#
# The decoded bits are stored under the new parameter codes 270-277.

# Two-bit fields are only decoded for these patterns; '11' leaves the entry unset
two_bit_codes = {'00': 0, '01': 1, '10': 2}

for case in cases:
    print('Working on %s' % case)
    for time in sorted(data_dict_pars_2[case].keys()):
        record = data_dict_pars_2[case][time]
        try:
            mode_option_1 = record[125]

            # Ventilation_stopped; 0 = no, 1 = yes
            record[270] = int(mode_option_1[-1])
            # VG_state: 0 = off, 1 = on
            record[271] = int(mode_option_1[-2])
            # Volume limit state: 0 = off, 1 = on
            record[272] = int(mode_option_1[-3])
            # Ventilator_range: 0  = neonatal, 1 = paediatric
            record[273] = int(mode_option_1[-4])
            # trigger_mode: 0 = volumetrigger, 1 = flowtrigger
            record[274] = int(mode_option_1[-8])

            # I_E_HFOV (HFOV I:E ratio): 0=1:3, 1=1:2, 2=1:1
            ie_hfov = two_bit_codes.get(mode_option_1[-14:-12])
            if ie_hfov is not None:
                record[275] = ie_hfov

            mode_option_2 = record[126]

            # pressure_rise_control: 0=I-flow, 1=Ramp, 2=AutoIFlow
            pressure_rise = two_bit_codes.get(mode_option_2[-2:])
            if pressure_rise is not None:
                record[276] = pressure_rise

            # HFOV recruitment: 0 = off, 1 = on
            record[277] = int(mode_option_2[-3])

        except (KeyError, IndexError, ValueError):
            # KeyError: parameter 125 or 126 is missing from this record;
            # IndexError / ValueError: the bit string is too short or holds a
            # non-digit. Entries assigned before the error are kept.
            print('Error in %s, %s' % (case, time))

### Create DataFrame from Parameters Data

In [None]:
%%time

# One DataFrame per case; after transposing, the rows are the time stamps and
# the columns are the parameter codes
data_pars = {case: DataFrame(data_dict_pars_2[case]).T for case in cases}

In [None]:
%%time

# Replace the numeric sentinel codes with their text meaning (see Aculink protocol)

sentinel_labels = (
    (-32764, 'off'),
    (-32765, 'not valid'),
    (-32766, 'out of range'),
    (-32767, 'unused'),
)

for case in cases:
    frame = data_pars[case]
    for sentinel, label in sentinel_labels:
        # replace() returns a new DataFrame, the stored one is swapped at the end
        frame = frame.replace(sentinel, label)
    data_pars[case] = frame

In [None]:
# The sampling rate is 0.5 Hz (1 in 2 seconds), e.g. if the length of the DataFrame is 450, its duration
# is 15 minutes (900 seconds)
recording_duration = DataFrame([(case, 2 * len(data_pars[case])) for case in cases])
recording_duration.columns = ['case', 'seconds']

# Use the case names as index; set_index also removes the 'case' column
recording_duration = recording_duration.set_index('case')

In [None]:
# Summary statistics of the recording durations (in seconds)
recording_duration['seconds'].describe()

In [None]:
# The 25 shortest recordings
recording_duration.sort_values(by = 'seconds', ascending = True)[:25]

### Remove those recordings which are less than 15 minutes long

Recordings less than 15 minutes (900 seconds) long are very likely incomplete and sometimes completely empty.

In [None]:
# Number of cases before the short recordings are removed
len(data_pars)

In [None]:
# The sampling rate is 0.5 Hz (1 in 2 seconds): a DataFrame with 450 rows
# corresponds to 15 minutes (900 seconds). Shorter recordings are dropped.

too_short = [case for case in cases if len(data_pars[case]) < 450]
for case in too_short:
    print('Removing %s' % case)
    del data_pars[case]

# From here on 'cases' only lists the recordings that were kept
cases = sorted(data_pars.keys())

In [None]:
# Number of cases left after removing the short recordings
len(data_pars)

In [None]:
# Must agree with the number of DataFrames kept in data_pars
len(cases)

In [None]:
### Replace codes for categorical variables with informative category names

In [None]:
# Column 101: numeric ventilator mode codes -> mode names
mapping_vent_mode = {0: None, 1: 'IPPV', 2: 'SIPPV', 3: 'SIMV', 4: 'SIMVPSV', 5: 'PSV',
                     6: 'CPAP', 7: 'NCPAP', 8: 'DUOPAP', 9: 'HFO', 10: 'O2therapy', 15: 'service'}
for case in cases:
    frame = data_pars[case]
    # Assign the result back: an in-place replace on frame[101] is chained
    # assignment and does not update the frame under pandas Copy-on-Write
    frame[101] = frame[101].replace(mapping_vent_mode)

In [None]:
# Column 100: numeric patient range codes -> names
mapping_patient_range = {1: 'Neonatal', 2: 'Pediatric'}
for case in cases:
    frame = data_pars[case]
    # Assign the result back: an in-place replace on frame[100] is chained
    # assignment and does not update the frame under pandas Copy-on-Write
    frame[100] = frame[100].replace(mapping_patient_range)

In [None]:
# Column 273 (ventilator range decoded from 'Mode option 1'): 0/1 -> names
mapping_patient_range_2 = {0: 'Neonatal', 1: 'Pediatric'}
for case in cases:
    frame = data_pars[case]
    # Assign the result back: an in-place replace on frame[273] is chained
    # assignment and does not update the frame under pandas Copy-on-Write
    frame[273] = frame[273].replace(mapping_patient_range_2)

In [None]:
# Column 127: numeric power source codes -> names
mapping_power = {0: 'Network', 1: 'Battery'}
for case in cases:
    frame = data_pars[case]
    # Assign the result back: an in-place replace on frame[127] is chained
    # assignment and does not update the frame under pandas Copy-on-Write
    frame[127] = frame[127].replace(mapping_power)

In [None]:
# Columns holding an off/on flag: 157, 158, 271 (VG state), 272 (volume limit
# state) and 277 (HFOV recruitment)
mapping_off_on = {0: 'off', 1: 'on'}
for case in cases:
    frame = data_pars[case]
    for column in (157, 158, 271, 272, 277):
        # Assign the result back: an in-place replace on frame[column] is chained
        # assignment and does not update the frame under pandas Copy-on-Write
        frame[column] = frame[column].replace(mapping_off_on)

In [None]:
# Column 270 (ventilation stopped): 0/1 -> 'no'/'yes'
mapping_no_yes = {0: 'no', 1: 'yes'}
for case in cases:
    frame = data_pars[case]
    # Assign the result back: an in-place replace on frame[270] is chained
    # assignment and does not update the frame under pandas Copy-on-Write
    frame[270] = frame[270].replace(mapping_no_yes)

In [None]:
# Column 274 (trigger mode): 0/1 -> names
mapping_trigger = {0: 'Volumetrigger', 1: 'Flowtrigger'}
for case in cases:
    frame = data_pars[case]
    # Assign the result back: an in-place replace on frame[274] is chained
    # assignment and does not update the frame under pandas Copy-on-Write
    frame[274] = frame[274].replace(mapping_trigger)

In [None]:
# Column 275 (HFOV I:E ratio): codes -> ratios
mapping_IE_HFOV = {0: '1:3', 1: '1:2', 2: '1:1'}
for case in cases:
    frame = data_pars[case]
    # Assign the result back: an in-place replace on frame[275] is chained
    # assignment and does not update the frame under pandas Copy-on-Write
    frame[275] = frame[275].replace(mapping_IE_HFOV)

In [None]:
# Column 276 (pressure rise control): codes -> names
mapping_pressure_rise_ctrl = {0: 'I-flow', 1: 'Ramp', 2: 'AutoIFlow'}
for case in cases:
    frame = data_pars[case]
    # Assign the result back: an in-place replace on frame[276] is chained
    # assignment and does not update the frame under pandas Copy-on-Write
    frame[276] = frame[276].replace(mapping_pressure_rise_ctrl)

In [None]:
# Column 140: numeric pressure unit codes -> unit names
mapping_pressure_unit = {0: 'mbar', 1: 'cmH2O',}
for case in cases:
    frame = data_pars[case]
    # Assign the result back: an in-place replace on frame[140] is chained
    # assignment and does not update the frame under pandas Copy-on-Write
    frame[140] = frame[140].replace(mapping_pressure_unit)

In [None]:
# Column 141: numeric CO2 unit codes -> unit names
mapping_CO2 = {0: 'mmHg', 1: 'kPa', 2: 'Vol%'}
for case in cases:
    frame = data_pars[case]
    # Assign the result back: an in-place replace on frame[141] is chained
    # assignment and does not update the frame under pandas Copy-on-Write
    frame[141] = frame[141].replace(mapping_CO2)

### Parse the parameter values using Fabian parameter library

In [None]:
# Load the parameter library from the working directory; the cells below read
# its 'code' and 'name' columns
par_key_table = pd.read_excel('Fabian_parameters.xlsx')

In [None]:
# The trailing semicolon suppresses the notebook output; remove it to view the table
par_key_table;

In [None]:
# Look-up table from parameter code to parameter name, one entry per table row
par_key_dict = {
    par_key_table.code[row]: par_key_table.name[row]
    for row in par_key_table.index
}

In [None]:
# Swap the parameter codes in the column labels for the parameter names
for frame in (data_pars[case] for case in cases):
    frame.rename(columns=par_key_dict, inplace=True)

### Sort DataFrames according to time stamp index

In [None]:
# Put the rows of every DataFrame into index (time stamp) order
for frame in (data_pars[case] for case in cases):
    frame.sort_index(inplace=True)

### Write individual text files with the ventilator modes

##### Create a sub-directory for each case if it does not yet exist

In [None]:
# Images and raw data will be written on an external hard drive.
# exist_ok=True turns the calls into no-ops for folders that already exist.
os.makedirs('%s/%s' % (DATA_DUMP, 'fabian_cases'), exist_ok=True)

for case in cases:
    os.makedirs('%s/%s/%s' % (DATA_DUMP, 'fabian_cases', case), exist_ok=True)

In [None]:
# For every case write '<case>_vent_info.txt', listing for how many seconds each
# ventilator mode and volume guarantee were in use. Every DataFrame row stands
# for 2 seconds (0.5 Hz sampling), hence the factor of 2.

# (line template, value in the 'Ventilator_mode' column), in output order
mode_report = (
    ('O2 therapy: %d sec \n', 'O2therapy'),
    ('NCPAP:      %d sec \n', 'NCPAP'),
    ('DUOPAP:     %d sec \n', 'DUOPAP'),
    ('IPPV:       %d sec \n', 'IPPV'),
    ('SIPPV:      %d sec \n', 'SIPPV'),
    ('SIMV:       %d sec \n', 'SIMV'),
    ('SIMVPSV:    %d sec \n\n', 'SIMVPSV'),
)

for case in cases:
    a = data_pars[case]

    # Count the rows before the file is opened, so that a missing column does
    # not leave an empty report file behind
    mode_rows = [(template, len(a[a['Ventilator_mode'] == mode]))
                 for template, mode in mode_report]
    vg_on = len(a[a['VG_state'] == 'on'])

    report_path = '%s/%s/%s/%s_%s.%s' % (DATA_DUMP, 'fabian_cases', case, case, 'vent_info', 'txt')

    # The context manager closes the file even if one of the writes fails
    with open(report_path, 'w') as fileout:
        for template, rows in mode_rows:
            fileout.write(template % (rows * 2))
        fileout.write('VG on:      %d sec \n' % (vg_on * 2))

### Export processed data as pickle files

## Waves data (sampling rate = 150Hz)

In [None]:
# Number of cases with waves data before the selection below
len(data_dict_waves.keys())

In [None]:
# Keep the waves data only for the cases still listed in 'cases', i.e. drop the
# recordings that were removed for being < 15 minutes long
kept_cases = set(cases)
data_dict_waves_selected = {
    case: waves for case, waves in data_dict_waves.items() if case in kept_cases
}

In [None]:
# Number of cases with waves data after the selection
len(data_dict_waves_selected.keys())

### Export sample waves data to pickle files

In [None]:
# Pick two cases (positions 70 and 71 of the sorted case list) as a sample
rec1 = cases[70:72]