![alt text](./Cerny_logo_1.jpg)

# Analysis of Cerny ventilation recordings

## EDA and further processing of clinical details and ventilator data

This notebook imports the preprocessed **Fabian ventilator parameters** data from pickle archives, then performs exploratory data analysis and further preprocessing on the clinical details, ventilation modes and ventilator data.

### Importing the necessary libraries and setting options

In [None]:
import IPython
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

import os
import sys
import re
import pickle

from scipy import stats
from pandas import Series, DataFrame
from datetime import datetime, timedelta

# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Use matplotlib's 'classic' style sheet with a white figure background
matplotlib.style.use('classic')
matplotlib.rcParams['figure.facecolor'] = 'w'

# Show up to 100 rows and 100 columns when a DataFrame is displayed
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
# pd.set_option('mode.chained_assignment', None) 

In [None]:
# Report the interpreter and library versions used for this run, one per line,
# so that results can be tied to a specific environment.
version_report = (
    "Python version: {}".format(sys.version),
    "pandas version: {}".format(pd.__version__),
    "matplotlib version: {}".format(matplotlib.__version__),
    "NumPy version: {}".format(np.__version__),
    "SciPy version: {}".format(sp.__version__),
    "IPython version: {}".format(IPython.__version__),
    "scikit-learn version: {}".format(sk.__version__),
)
print("\n".join(version_report))

### List and set the working directory and the directory to write out data

In [None]:
# Topic of the notebook; also the name of the subfolder that receives the results
TOPIC = 'fabian'

# Name of the external hard drive
DRIVE = 'GUSZTI'

# Directory containing clinical and blood gas data
CWD = '/Users/guszti/ventilation_fabian'

# Directory on the external drive to read the ventilation data from
DIR_READ = '/Volumes/%s/Fabian/fabian_data' % DRIVE

# Directory the Excel exports of this notebook are written to
DIR_WRITE = '%s/%s/%s' % (CWD, 'Analyses', 'analysis_ventilated_301_600')

# Images and raw data are written to the external hard drive.
# The path is built once here instead of being re-formatted at every use.
DATA_DUMP = '/Volumes/%s/data_dump/%s' % (DRIVE, TOPIC)

# exist_ok=True makes the creation idempotent without a separate (racy)
# os.path.isdir() check; it still raises if the path exists but is not a directory.
for _output_dir in (DIR_WRITE, DATA_DUMP):
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
# Make CWD the working directory so that relative paths used later in the
# notebook resolve against it; the bare os.getcwd() echoes the result.
os.chdir(CWD)
os.getcwd()

In [None]:
# Echo the configured read, write and dump directories for a visual check
DIR_READ, DIR_WRITE, DATA_DUMP

### Import pickle archives

In [None]:
def _load_archive(name):
    """Return the object stored in ``<DATA_DUMP>/<name>.pickle``.

    Note: unpickling can execute arbitrary code, so only archives produced by
    this project's own notebooks should be loaded this way.
    """
    with open('%s/%s.pickle' % (DATA_DUMP, name), 'rb') as handle:
        return pickle.load(handle)


# Clinical details plus the three ventilator parameter archives
clin_df = _load_archive('clin_df_301_600')
data_pars_measurements = _load_archive('data_pars_measurements_301_600')
data_pars_settings = _load_archive('data_pars_settings_301_600')
data_pars_alarms = _load_archive('data_pars_alarms_301_600')

### Import table for interpreting ventilator parameters

In [None]:
# Lookup table for interpreting the ventilator parameters; the file name is
# relative, so it is read from the current working directory.
par_key_table = pd.read_excel('Fabian_parameters.xlsx')
# The trailing semicolon suppresses the cell output.
par_key_table;

### How many recordings have we got for analysis?


In [None]:
# Number of entries (one per recording) in the measurements archive
len(data_pars_measurements)

In [None]:
# Recording identifiers, taken from the keys of the measurements archive.
# `cases` is rebound to a sorted list further down once recordings are filtered.
cases = data_pars_measurements.keys()

### EDA on ventilation modes

##### How many cases of the different ventilation modes occur

In [None]:
# Seconds spent in each ventilator mode, per recording. Every settings row is
# counted as two seconds, hence the factor of 2.
mode_seconds = {
    case: data_pars_settings[case]['Ventilator_mode'].value_counts() * 2
    for case in cases
}

# One row per recording, one column per mode; a mode that never occurred in a
# recording is stored as 0 seconds instead of NaN.
vent_modes = DataFrame(mode_seconds).T.replace(np.nan, 0)

In [None]:
# Duration of each recording in seconds (two seconds per settings row)
recording_duration = {
    case: 2 * len(data_pars_settings[case])
    for case in cases
}

In [None]:
# Modes summed into the 'ventilation' column and into the 'noninvasive' column
INVASIVE_MODES = ['IPPV', 'SIMV', 'SIPPV', 'PSV', 'SIMVPSV']
NONINVASIVE_MODES = ['CPAP', 'DUOPAP', 'NCPAP', 'O2therapy']

# reindex(..., fill_value=0) supplies a zero column for any mode that never
# occurs in this batch of recordings, so the sums no longer raise a KeyError
# when a mode column is absent. When all columns exist the result is unchanged.
vent_modes['ventilation'] = vent_modes.reindex(columns=INVASIVE_MODES, fill_value=0).sum(axis=1)
vent_modes['noninvasive'] = vent_modes.reindex(columns=NONINVASIVE_MODES, fill_value=0).sum(axis=1)

# Total recording length in seconds, aligned on the recording identifier
vent_modes['total'] = Series(recording_duration)

In [None]:
# Preview the first rows of the per-recording ventilation mode table
vent_modes.head()

In [None]:
# For every mode column, report how many recordings contain that mode at all
# (i.e. have more than 0 seconds of it), followed by the number of recordings.
mode_labels = (
    ('SIMV:', 'SIMV'),
    ('SIPPV:', 'SIPPV'),
    ('SIMVPSV:', 'SIMVPSV'),
    ('PSV:', 'PSV'),
    ('IPPV:', 'IPPV'),
    ('NCPAP:', 'NCPAP'),
    ('CPAP:', 'CPAP'),
    ('DUOPAP:', 'DUOPAP'),
    ('O2therapy:', 'O2therapy'),
    ('ventilation:', 'ventilation'),
    ('noninvasive:', 'noninvasive'),
)
for label, column in mode_labels:
    print(label, sum(vent_modes[column] > 0))
print('total', len(vent_modes))

In [None]:
# Total number of seconds per column (each mode plus the summary columns),
# summed over all recordings.
seconds_per_mode = vent_modes.sum(axis = 0)
total_duration = DataFrame(seconds_per_mode, columns = ['duration (seconds)'])
total_duration

##### Export Dataframes containing ventilator modes to Excel files and pickle archives

In [None]:
# Write both tables as separate sheets of one workbook. The context manager
# saves and closes the file even if a to_excel call raises, and replaces
# writer.save(), which no longer exists in pandas 2.x. The sheet name is passed
# by keyword because newer pandas deprecates passing it positionally.
with pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'ventilation_modes_301_600.xlsx')) as writer:
    vent_modes.to_excel(writer, sheet_name='vent_modes')
    total_duration.to_excel(writer, sheet_name='total_duration')

In [None]:
# Persist the per-recording ventilation mode table to DATA_DUMP as a pickle archive
with open('%s/%s.pickle' % (DATA_DUMP, 'vent_modes_301_600'), 'wb') as handle:
    pickle.dump(vent_modes, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Only consider those recordings that have at least 15 minutes (900 seconds) of mechanical ventilation

In [None]:
# Keep the recordings whose summed 'ventilation' time exceeds 900 seconds.
# NOTE(review): the comparison is strict, so a recording with exactly 900 s is dropped.
ventilated_long_enough = vent_modes['ventilation'] > 900
vent_modes_ventilated = vent_modes[ventilated_long_enough]
len(vent_modes_ventilated)

In [None]:
# Preview the recordings that passed the ventilation-time filter
vent_modes_ventilated.head()

In [None]:
# From here on `cases` is a sorted list holding only the recordings that
# passed the filter (it replaces the earlier key view of all recordings).
cases = sorted(vent_modes_ventilated.index)
len(cases)

### Remove recordings that had no working flow sensor and hence no tidal volume measurements

In [None]:
# Recordings to exclude from the analysis
to_remove = [
    'AL000314', 'AL000350', 'AL000354', 'AL000401',
    'AL000449', 'AL000459', 'AL000492', 'AL000493',
]

# Drop the excluded recordings; identifiers that are not in `cases` are ignored
cases = [case_id for case_id in cases if case_id not in to_remove]

### Remove the periods from the beginning and the end of the recordings when the patient was not connected to the ventilator

This requires manual inspection of the tidal volume and pressure graphs.

The dictionary loaded below contains, for each recording, a tuple of the start and end points as strings.
It was obtained by manually inspecting VTmand and PIP in the recordings
and removing the periods at the start and at the end when the baby was not on the ventilator (e.g. no VTmand).

In [None]:
# Load the manually determined trimming limits; the cells below index this
# object by recording identifier and read positions 0 (start) and 1 (end).
with open('%s/%s.pickle' % (DATA_DUMP, 'limit_301_600_ventilated'), 'rb') as handle:
    limit = pickle.load(handle)

In [None]:
# Display the loaded trimming limits for inspection
limit

In [None]:
# Number of entries in the trimming limits
len(limit)

In [None]:
# Trim ventilator data using the manual filters

# One slice per recording, built from the (start, end) pair stored in `limit`
trim_window = {
    case: slice(limit[case][0], limit[case][1])
    for case in cases
}

# Apply the same window to the measurements, settings and alarms of a recording
data_pars_measurements_ventilated = {
    case: data_pars_measurements[case][trim_window[case]] for case in cases
}
data_pars_settings_ventilated = {
    case: data_pars_settings[case][trim_window[case]] for case in cases
}
data_pars_alarms_ventilated = {
    case: data_pars_alarms[case][trim_window[case]] for case in cases
}

### Now re-analyse the filtered data as above

##### How many cases of the different ventilation modes occur

In [None]:
# Seconds spent in each ventilator mode within the trimmed recordings. Every
# settings row is counted as two seconds, hence the factor of 2.
trimmed_mode_seconds = {
    case: data_pars_settings_ventilated[case]['Ventilator_mode'].value_counts() * 2
    for case in cases
}

# One row per recording, one column per mode; modes absent from a recording
# are stored as 0 seconds instead of NaN.
vent_modes_ventilated = DataFrame(trimmed_mode_seconds).T.replace(np.nan, 0)

In [None]:
# Duration of each remaining recording in seconds (two seconds per settings row).
# NOTE(review): this reads the untrimmed `data_pars_settings`, so the values are
# the full recording lengths; the trimmed durations are computed separately
# into `recording_duration_ventilated`. Confirm whether this cell is still needed.
recording_duration = {
    case: 2 * len(data_pars_settings[case])
    for case in cases
}

In [None]:
# Preview the mode durations recomputed from the trimmed recordings
vent_modes_ventilated.head()

### Add VG data

In [None]:
# Per recording: value counts of the 'VG_state' settings column, in seconds.
VG = {}
for case in cases:
    try:
        # Multiply by two to get the number of seconds
        VG[case] = data_pars_settings_ventilated[case]['VG_state'].value_counts() * 2
    except KeyError:
        # No 'VG_state' column in this recording: store a single zero instead
        VG[case] = np.zeros(1)
        # print('No VG_state for %s' % case)
        
# One row per recording after the transpose
VG = DataFrame(VG).T
# Assigning a single column name only works if the frame has exactly one column.
# NOTE(review): presumably 'VG_state' takes one distinct value (VG on) whenever
# it is present - confirm against the data.
VG.columns = ['VG_on']

In [None]:
# Append the VG_on column to the mode table, aligned on the recording identifier
vent_modes_ventilated = pd.concat([vent_modes_ventilated, VG], axis = 1)

In [None]:
# Duration of each trimmed recording in seconds (two seconds per settings row)
recording_duration_ventilated = {
    case: 2 * len(data_pars_settings_ventilated[case])
    for case in cases
}

In [None]:
# Total length of each trimmed recording in seconds, aligned on the recording identifier
vent_modes_ventilated['total'] = Series(recording_duration_ventilated)

In [None]:
# Preview the table now that the VG_on and total columns have been added
vent_modes_ventilated.head()

In [None]:
# Total number of seconds per column (each mode, VG_on and total), summed over
# all trimmed recordings.
trimmed_seconds_per_mode = vent_modes_ventilated.sum(axis = 0)
total_duration_ventilated = DataFrame(trimmed_seconds_per_mode, columns = ['duration (seconds)'])
total_duration_ventilated

### Only consider those recordings that have at least 15 minutes (900 seconds) mechanical ventilation

In [None]:
# After trimming, all but one recording remained longer than 15 minutes;
# that one is removed in the next cell.
# This cell only counts the recordings whose trimmed total exceeds 900 seconds.

len(vent_modes_ventilated[vent_modes_ventilated['total'] > 900])

In [None]:
# Keep the recordings whose trimmed total duration exceeds 900 seconds and
# rebuild the sorted list of recording identifiers from what remains.
# NOTE(review): the filter is strict (> 900) and uses 'total' rather than the
# summed invasive modes - confirm this matches the intended inclusion criterion.
long_enough = vent_modes_ventilated['total'] > 900
vent_modes_ventilated = vent_modes_ventilated[long_enough]
cases = sorted(vent_modes_ventilated.index)

In [None]:
# Sanity check: the table and the case list should have the same length
len(vent_modes_ventilated), len(cases)

In [None]:
# Display the filtered table (up to the display.max_rows limit set at the top)
vent_modes_ventilated

In [None]:
# For every listed column, report how many trimmed recordings contain it at all
# (more than 0 seconds), followed by the number of recordings.
# 'PSV' is deliberately left out here, as in the original cell.
ventilated_mode_labels = (
    ('SIMV:', 'SIMV'),
    ('SIPPV:', 'SIPPV'),
    ('SIMVPSV:', 'SIMVPSV'),
    ('IPPV:', 'IPPV'),
    ('VG_on:', 'VG_on'),
)
for label, column in ventilated_mode_labels:
    print(label, sum(vent_modes_ventilated[column] > 0))
print('total', len(vent_modes_ventilated))

In [None]:
# Number of recordings left in the filtered table
len(vent_modes_ventilated)

In [None]:
# Number of recording identifiers left in `cases`
len(cases)

In [None]:
# Drop from the trimmed archives every recording that is no longer in `cases`.
# A set is used for the membership test; the iteration order of each archive
# is preserved.
_retained_cases = set(cases)

data_pars_measurements_ventilated = {
    rec: frame
    for rec, frame in data_pars_measurements_ventilated.items()
    if rec in _retained_cases
}

data_pars_settings_ventilated = {
    rec: frame
    for rec, frame in data_pars_settings_ventilated.items()
    if rec in _retained_cases
}

data_pars_alarms_ventilated = {
    rec: frame
    for rec, frame in data_pars_alarms_ventilated.items()
    if rec in _retained_cases
}

### Export trimmed DataFrames

In [None]:
# Write each trimmed archive to DATA_DUMP as '<archive name>.pickle'
trimmed_archives = (
    ('data_pars_measurements_ventilated_301_600', data_pars_measurements_ventilated),
    ('data_pars_settings_ventilated_301_600', data_pars_settings_ventilated),
    ('data_pars_alarms_ventilated_301_600', data_pars_alarms_ventilated),
)
for archive_name, archive in trimmed_archives:
    with open('%s/%s.pickle' % (DATA_DUMP, archive_name), 'wb') as handle:
        pickle.dump(archive, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Export Dataframes containing ventilator modes to Excel files and pickle archives

In [None]:
# Write both tables as separate sheets of one workbook. The context manager
# saves and closes the file even if a to_excel call raises, and replaces
# writer.save(), which no longer exists in pandas 2.x. The sheet name is passed
# by keyword because newer pandas deprecates passing it positionally.
with pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'ventilation_modes_ventilated_301_600.xlsx')) as writer:
    vent_modes_ventilated.to_excel(writer, sheet_name='vent_modes_ventilated_301_600')
    total_duration_ventilated.to_excel(writer, sheet_name='total_duration_vent_301_600')

In [None]:
# Persist the filtered ventilation mode table to DATA_DUMP as a pickle archive
with open('%s/%s.pickle' % (DATA_DUMP, 'vent_modes_ventilated_301_600'), 'wb') as handle:
    pickle.dump(vent_modes_ventilated, handle, protocol=pickle.HIGHEST_PROTOCOL)