![alt text](./Cerny_logo_1.jpg)

# Analysis of Cerny ventilation recordings

## Processing clinical details

This notebook imports and processes clinical data and exports it into a pickle archive.

### Importing the necessary libraries and setting options

In [None]:
import IPython
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

import os
import sys
import re
import pickle

from scipy import stats
from pandas import Series, DataFrame
from datetime import datetime, timedelta

%matplotlib inline
matplotlib.style.use('classic')
matplotlib.rcParams['figure.facecolor'] = 'w'

pd.set_option('display.max_rows', 250)
pd.set_option('display.max_columns', 250)
# pd.set_option('mode.chained_assignment', None) 

In [None]:
# Report the interpreter and library versions used for this run
version_report = [
    ("Python version: {}", sys.version),
    ("pandas version: {}", pd.__version__),
    ("matplotlib version: {}", matplotlib.__version__),
    ("NumPy version: {}", np.__version__),
    ("SciPy version: {}", sp.__version__),
    ("IPython version: {}", IPython.__version__),
    ("scikit-learn version: {}", sk.__version__),
]
for template, version in version_report:
    print(template.format(version))

### List and set the working directory and the directory to write out data

In [None]:
# Topic of the Notebook which will also be the name of the subfolder containing results
TOPIC = 'fabian'

# Name of the external hard drive
DRIVE = 'GUSZTI'

# Directory containing clinical and blood gas data
CWD = '/Users/guszti/ventilation_fabian'

# Directory on external drive to read the ventilation data from
DIR_READ = '/Volumes/%s/Fabian/fabian_patient_data_all' % DRIVE

# Directory to write the analysis results (Excel sheets) to
DIR_WRITE = '%s/%s' % (CWD, 'Analyses')

# Images and raw data will be written on an external hard drive.
# The path is built once and reused; exist_ok=True creates the folder only
# when it is missing, without a separate check-then-create step.
DATA_DUMP = '/Volumes/%s/data_dump/%s' % (DRIVE, TOPIC)
os.makedirs(DATA_DUMP, exist_ok=True)

In [None]:
os.chdir(CWD)
os.getcwd()

In [None]:
DIR_READ, DIR_WRITE, DATA_DUMP

### Import ventilation data

This is needed to know the beginning and the end of the recordings

In [None]:
def _load_parameter_chunk(stem):
    """Unpickle the file `<DATA_DUMP>/<stem>.pickle` and return its content."""
    with open('%s/%s.pickle' % (DATA_DUMP, stem), 'rb') as handle:
        return pickle.load(handle)


# The ventilation data are stored in five separate pickle files
data_pars_1_150 = _load_parameter_chunk('data_pars_1_150')
data_pars_151_300 = _load_parameter_chunk('data_pars_151_300')
data_pars_301_450 = _load_parameter_chunk('data_pars_301_450')
data_pars_451_600 = _load_parameter_chunk('data_pars_451_600')
data_pars_601_750 = _load_parameter_chunk('data_pars_601_750')

# Merge the chunks in order; if a key occurs twice the later chunk wins
data_pars = {}
for chunk in (data_pars_1_150, data_pars_151_300, data_pars_301_450,
              data_pars_451_600, data_pars_601_750):
    data_pars.update(chunk)

In [None]:
len(data_pars)

### Import clinical data

In [None]:
# import text files in a dictionary
clin_dict = {}
for fname in os.listdir(DIR_READ):
    if fname.startswith('.'):  # disregard hidden files
        continue
    # The context manager closes the file even if reading or decoding fails
    with open(os.path.join(DIR_READ, fname), 'r', encoding = 'cp1252') as fhandle:
        # use the filenames without the last four characters (the .txt extension) as keys
        clin_dict[fname[:-4]] = fhandle.read()

In [None]:
# Turn each file's text into a list of lines; the piece after the final
# newline is discarded
for rec_id in sorted(clin_dict):
    lines = clin_dict[rec_id].split('\n')
    lines.pop()
    clin_dict[rec_id] = lines

In [None]:
# Replace each list of "label: value" lines with a {label: value} dictionary
for rec_id, lines in sorted(clin_dict.items()):
    fields = {}
    for line in lines:
        label, _, remainder = line.partition(':')
        # Any further colons are removed from the value, then its first
        # character (the one following the first colon) is dropped
        fields[label.strip()] = remainder.replace(':', '')[1:]
    clin_dict[rec_id] = fields

In [None]:
# One row per recording (outer dictionary keys), one column per clinical field,
# ordered by recording identifier
clin_df = DataFrame(clin_dict).T.rename_axis('Recording_ID').sort_index()

In [None]:
len(clin_df)

### Limit clinical data up to `AL000665`

In [None]:
clin_df.iloc[551]

In [None]:
# Slice by label instead of by position: a positional cut-off only matches
# AL000665 for one particular directory listing. Label slicing is inclusive
# and relies on the index having been sorted when the DataFrame was created.
LAST_RECORDING_ID = 'AL000665'
clin_df = clin_df.loc[:LAST_RECORDING_ID]

### Drop cases which have no clinical data

In [None]:
clin_df = clin_df.dropna(axis = 0, how = 'all')

In [None]:
len(clin_df)

### Drop cases for which there is no ventilation data

Ventilation recordings may have been excluded because they were too short (<15 minutes in total) or aberrant.

In [None]:
# Recording IDs that have both clinical details and ventilation data
combined = sorted(set(clin_df.index).intersection(data_pars))

In [None]:
clin_df = clin_df.loc[combined]
len(clin_df)

### Clean up clinical dataframe

In [None]:
# Curate the time of births of some recordings after manual inspection of case notes.
# A single .loc[row, column] assignment is used because chained indexing
# (.loc[row][column] = ...) may write to a temporary copy and leave clin_df unchanged.
curated_births = {
    'AL000360': '20180906 0707',
    'AL000638': '20190814 1114',
}
for rec_id, birth in curated_births.items():
    if rec_id not in clin_df.index:
        # .loc assignment would silently append a new row for an unknown label
        raise KeyError(rec_id)
    clin_df.loc[rec_id, 'Date of Birth'] = birth

In [None]:
# Keep the columns of interest in a fixed order and give the two
# non-English / misspelled ones English names
selected_columns = ['Esetlap id', 'Date of Birth', 'Gestation Age', 'Birth Weight',
                    'Actual Weight', 'Pathology', 'Start', 'End']
english_names = {'Esetlap id': 'Case ID', 'Gestation Age': 'Gestational Age'}

clin_df = clin_df[selected_columns].rename(columns = english_names)

In [None]:
# Convert the first two characters of the gestational age text to an integer;
# anything after them is discarded.
# NOTE(review): presumably these are completed weeks - confirm against the raw files.
clin_df['Gestational Age'] = clin_df['Gestational Age'].map(lambda x: int(x[:2]))
# Drop the last six characters before converting to an integer.
# NOTE(review): assumes every value ends with a six-character ' grams' suffix - confirm.
clin_df['Birth Weight'] = clin_df['Birth Weight'].map(lambda x: int(x[:-6]))
# str.strip() takes a SET of characters, not a suffix: any leading or trailing
# ' ', 'g', 'r', 'a', 'm', 's' characters are removed. The result is still a string
# and is '' when nothing else is left.
clin_df['Actual Weight'] = clin_df['Actual Weight'].str.strip(' grams')

In [None]:
# Use the actual weight where one was recorded, otherwise fall back to the birth weight
actual_weight = [
    birth if actual == '' else int(actual)
    for actual, birth in zip(clin_df['Actual Weight'], clin_df['Birth Weight'])
]

clin_df['Weight'] = actual_weight

#### Start and end from ventilation data
This shows the time points when the ventilator was turned on and off. At the beginning and the end of the recordings the baby was usually not attached to the ventilator. The ventilator recordings have been manually inspected and trimmed accordingly.

In [None]:
starts, ends = {}, {}
for rec in sorted(clin_df.index):
    # Recordings without ventilation data are skipped
    try:
        recording = data_pars[rec]
    except KeyError:
        continue

    # First and last index entries of the recording
    starts[rec] = recording.index[0]
    ends[rec] = recording.index[-1]

start_end = DataFrame([starts, ends]).T
start_end.columns = ['Recording start', 'Recording end']

In [None]:
clin_df = pd.concat([clin_df, start_end], axis = 1, join = 'outer')

In [None]:
# Parse every date of birth on its own, so each value goes through
# pd.to_datetime independently of the others.
clin_df['Date of Birth'] = clin_df['Date of Birth'].map(lambda x: pd.to_datetime(x))
# Split the pathology text on ';' and discard the last piece.
# NOTE(review): presumably each entry ends with ';' so the last piece is empty - confirm.
clin_df['Pathology'] = clin_df['Pathology'].map(lambda x: x.split(';')[:-1])

In [None]:
clin_df['Duration'] = clin_df['Recording end'] - clin_df['Recording start']

In [None]:
clin_df['Postnatal Age']   = clin_df['Recording end'] - clin_df['Date of Birth']

In [None]:
clin_df.info()

In [None]:
clin_df['Gestational Age'] = pd.to_timedelta((clin_df['Gestational Age']), unit='W', errors='raise')

In [None]:
# 'Gestational Age' was converted to a timedelta in the previous cell, so it
# can be added to the postnatal age directly
gestation_at_birth = clin_df['Gestational Age']
clin_df['Corrected gestational Age'] = gestation_at_birth + clin_df['Postnatal Age']

In [None]:
SECONDS_PER_WEEK = 60 * 60 * 24 * 7


def _in_weeks(delta):
    """Return the length of a timedelta expressed in weeks (float)."""
    return delta.total_seconds() / SECONDS_PER_WEEK


clin_df['Gestational Age (weeks)'] = clin_df['Gestational Age'].apply(_in_weeks)

# Corrected gestational age is rounded to one decimal place
clin_df['Corrected gestational Age (weeks)'] = \
    clin_df['Corrected gestational Age'].apply(lambda delta: round(_in_weeks(delta), 1))

In [None]:
clin_df.sort_index(axis = 1).head()

### EDA on clinical details

In [None]:
clin_df.describe()

#### For some recordings the age at the time of transfer is "negative"  - these need to be corrected

In [None]:
clin_df[clin_df['Postnatal Age'] < pd.to_timedelta(0)]

#### For some recordings the duration of the recording is "negative"  - these need to be corrected

In [None]:
clin_df[clin_df['Duration'] < pd.to_timedelta(0)]

#### Babies born at less than 23 weeks' gestation

In [None]:
clin_df[clin_df['Gestational Age (weeks)'] < 23]

#### Babies born with less than 500 g birth weight

In [None]:
clin_df[clin_df['Birth Weight'] < 500]

In [None]:
len(clin_df[clin_df['Birth Weight'] < 500])

#### Babies transferred at a corrected gestational age of > 46 weeks - we need to discuss whether to include them in the data analysis

In [None]:
a = clin_df[clin_df['Corrected gestational Age (weeks)'] > 46]
a.sort_values('Corrected gestational Age (weeks)')

In [None]:
len(clin_df[clin_df['Corrected gestational Age (weeks)'] > 46])

### Import the curated `icd_codes_curated.xlsx` file, which now contains all relevant diagnoses including new ones

In [None]:
# Read the first two columns of the curated ICD workbook; the first column
# becomes the index. The path is built from CWD instead of repeating the
# absolute project path.
icd_codes = pd.read_excel('%s/%s' % (CWD, 'icd_codes_curated.xlsx'),
                          usecols = [0,1], index_col = 0)

### Create Pathology column with English names

In [None]:
# Lookup table: index of icd_codes -> value of its 'name' column
icd_dictionary = icd_codes['name'].to_dict()

In [None]:
def icd_replace(lst):
    """Return the `icd_dictionary` entries for the items of `lst`, in order.

    Raises KeyError if an item is not a key of `icd_dictionary`.
    """
    return [icd_dictionary[code] for code in lst]

In [None]:
# No earlier cell creates an 'ICD' column, so a fresh Restart & Run All fails
# here with a KeyError. When the column is missing it is derived from
# 'Pathology', which holds the per-recording list of codes.
# NOTE(review): assumes the 'Pathology' entries are the ICD codes used as keys
# of icd_dictionary - confirm.
if 'ICD' not in clin_df.columns:
    clin_df['ICD'] = clin_df['Pathology']

clin_df['Pathology_English'] = clin_df['ICD'].apply(icd_replace)

### Final cleanup of the DataFrame

In [None]:
clin_df.columns

In [None]:
column_list = ['Case ID', 'Date of Birth', 'Gestational Age (weeks)', 'Birth Weight',
              'Postnatal Age', 'Corrected gestational Age (weeks)',  'Weight',
              'ICD', 'Pathology_English', 'Recording start', 'Recording end', 'Duration',] 
      
clin_df = clin_df[column_list]

In [None]:
clin_df.head()

### Statistics on clinical data

In [None]:
clinical_stats = round(clin_df.describe(percentiles = [0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]), 1)
clinical_stats

### Export clinical information and statistics as Excel sheets

In [None]:
# The context manager saves and closes the workbook on exit (writer.save() is
# no longer available in pandas 2.x); sheet_name is passed by keyword.
with pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'clinical_data_all_1_665.xlsx')) as writer:
    clin_df.to_excel(writer, sheet_name = 'clin_df')

In [None]:
# The context manager saves and closes the workbook on exit (writer.save() is
# no longer available in pandas 2.x); sheet_name is passed by keyword.
with pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'clinical_stats_1_665.xlsx')) as writer:
    clinical_stats.to_excel(writer, sheet_name = 'stats')

### Export processed data as pickle files

In [None]:
with open('%s/%s.pickle' % (DATA_DUMP, 'clin_df_1_665'), 'wb') as handle:
    pickle.dump(clin_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Create patient lists for various disease groups

### RDS

In [None]:
RDS_dg = {'P22', 'P220'}

In [None]:
# Recordings whose ICD list shares at least one code with RDS_dg
RDS_cases = [
    rec_id
    for rec_id, codes in clin_df['ICD'].items()
    if not set(codes).isdisjoint(RDS_dg)
]

In [None]:
print(RDS_cases)

In [None]:
clin_df_RDS = clin_df.loc[RDS_cases]
clin_df_RDS;

In [None]:
len(clin_df_RDS)

### HIE

In [None]:
HIE_dg = ['P219', 'Z518', 'Z548',]  

In [None]:
# Recordings whose ICD list shares at least one code with HIE_dg
HIE_cases = [
    rec_id
    for rec_id, codes in clin_df['ICD'].items()
    if not set(codes).isdisjoint(HIE_dg)
]

In [None]:
clin_df_HIE = clin_df.loc[HIE_cases]
clin_df_HIE;

In [None]:
len(clin_df_HIE)

### Meconium aspiration

In [None]:
MAS_dg = ['P240',]

In [None]:
# Recordings whose ICD list shares at least one code with MAS_dg
MAS_cases = [
    rec_id
    for rec_id, codes in clin_df['ICD'].items()
    if not set(codes).isdisjoint(MAS_dg)
]

In [None]:
clin_df_MAS = clin_df.loc[MAS_cases]
clin_df_MAS;

In [None]:
len(clin_df_MAS)

### PPHN

In [None]:
PPHN_dg = ['P293', ]

In [None]:
# Recordings whose ICD list shares at least one code with PPHN_dg
PPHN_cases = [
    rec_id
    for rec_id, codes in clin_df['ICD'].items()
    if not set(codes).isdisjoint(PPHN_dg)
]

In [None]:
clin_df_PPHN = clin_df.loc[PPHN_cases]
clin_df_PPHN;

In [None]:
len(clin_df_PPHN)

### Congenital diaphragmatic hernia

In [None]:
CDH_dg = ['Q790', 'Q791']

In [None]:
# Recordings whose ICD list shares at least one code with CDH_dg
CDH_cases = [
    rec_id
    for rec_id, codes in clin_df['ICD'].items()
    if not set(codes).isdisjoint(CDH_dg)
]

In [None]:
clin_df_CDH = clin_df.loc[CDH_cases]
clin_df_CDH;

In [None]:
len(clin_df_CDH)

### Necrotizing enterocolitis (NEC)

In [None]:
NEC_dg = ['P77',]

In [None]:
# Recordings whose ICD list shares at least one code with NEC_dg
NEC_cases = [
    rec_id
    for rec_id, codes in clin_df['ICD'].items()
    if not set(codes).isdisjoint(NEC_dg)
]

In [None]:
clin_df_NEC = clin_df.loc[NEC_cases]
clin_df_NEC;

In [None]:
len(clin_df_NEC)

### Surgical cases (except NEC and CDH)

In [None]:
surgical_dg = ['K409', 'K562', 'K566', 'K631', 'Q391', 'Q392', 'Q423' , 'Q431',
               'Q438', 'Q556', 'Q641', 'Q792', 'Q793', 'R1000']

In [None]:
# Recordings whose ICD list shares at least one code with surgical_dg
surgical_cases = [
    rec_id
    for rec_id, codes in clin_df['ICD'].items()
    if not set(codes).isdisjoint(surgical_dg)
]

In [None]:
clin_df_surgical = clin_df.loc[surgical_cases]
clin_df_surgical;

In [None]:
len(clin_df_surgical)

### Cardiac cases (except PFO / ASD)

In [None]:
cardiac_dg = ['Q201', 'Q203', 'Q205' ,'Q210', 'Q212', 'Q213', 'Q220', 'Q221', 'Q224', 
              'Q228', 'Q232', 'Q234', 'Q240', 'Q244', 'Q245', 'Q251', 'Q253', 'Q254', 'Q262',]

In [None]:
# Recordings whose ICD list shares at least one code with cardiac_dg
cardiac_cases = [
    rec_id
    for rec_id, codes in clin_df['ICD'].items()
    if not set(codes).isdisjoint(cardiac_dg)
]

In [None]:
clin_df_cardiac = clin_df.loc[cardiac_cases]
clin_df_cardiac;

In [None]:
# Number of cardiac recordings (not the number of cardiac diagnosis codes),
# matching the count shown for every other disease group
len(clin_df_cardiac)

### Export clinical dataframes into a multisheet Excel file

In [None]:
# One sheet per disease group, written in this order
disease_sheets = {
    'all': clin_df,
    'cardiac': clin_df_cardiac,
    'CDH': clin_df_CDH,
    'HIE': clin_df_HIE,
    'MAS': clin_df_MAS,
    'NEC': clin_df_NEC,
    'PPHN': clin_df_PPHN,
    'RDS': clin_df_RDS,
    'surgical': clin_df_surgical,
}

# The context manager saves and closes the workbook on exit (writer.save() is
# no longer available in pandas 2.x); sheet_name is passed by keyword.
with pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'clinical_data_diseases_1_665.xlsx')) as writer:
    for sheet_name, frame in disease_sheets.items():
        frame.to_excel(writer, sheet_name = sheet_name)