![alt text](./Cerny_logo_1.jpg)

# Analysis of Cerny ventilation recordings

## Processing clinical details

This notebook imports and processes clinical data and exports it into a pickle archive.

### Importing the necessary libraries and setting options

In [1]:
import IPython
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

import os
import sys
import re
import pickle

from scipy import stats
from pandas import Series, DataFrame
from datetime import datetime, timedelta

%matplotlib inline
matplotlib.style.use('classic')
matplotlib.rcParams['figure.facecolor'] = 'w'

pd.set_option('display.max_rows', 250)
pd.set_option('display.max_columns', 250)
# pd.set_option('mode.chained_assignment', None) 

In [2]:
print("Python version: {}".format(sys.version))
print("pandas version: {}".format(pd.__version__))
print("matplotlib version: {}".format(matplotlib.__version__))
print("NumPy version: {}".format(np.__version__))
print("SciPy version: {}".format(sp.__version__))
print("IPython version: {}".format(IPython.__version__))
print("scikit-learn version: {}".format(sk.__version__))

Python version: 3.7.9 (default, Aug 31 2020, 07:22:35) 
[Clang 10.0.0 ]
pandas version: 1.1.3
matplotlib version: 3.3.2
NumPy version: 1.19.2
SciPy version: 1.5.2
IPython version: 7.19.0
scikit-learn version: 0.23.2


### List and set the working directory and the directory to write out data

In [3]:
# Topic of the Notebook which will also be the name of the subfolder containing results
TOPIC = 'fabian'

# Name of the external hard drive
DRIVE = 'GUSZTI'

# Directory containing clinical and blood gas data
CWD = '/Users/guszti/ventilation_fabian'

# Directory on external drive to read the ventilation data from
DIR_READ = '/Volumes/%s/Fabian/fabian_patient_data_all' % DRIVE

DIR_WRITE = '%s/%s' % (CWD, 'Analyses')

# Images and raw data will be written on an external hard drive
if not os.path.isdir('/Volumes/%s/data_dump/%s' % (DRIVE, TOPIC)):
    os.makedirs('/Volumes/%s/data_dump/%s' % (DRIVE, TOPIC))
DATA_DUMP = '/Volumes/%s/data_dump/%s' % (DRIVE, TOPIC)

In [4]:
os.chdir(CWD)
os.getcwd()

'/Users/guszti/ventilation_fabian'

In [5]:
DIR_READ, DIR_WRITE, DATA_DUMP

('/Volumes/GUSZTI/Fabian/fabian_patient_data_all',
 '/Users/guszti/ventilation_fabian/Analyses',
 '/Volumes/GUSZTI/data_dump/fabian')

### Import ventilation data

This is needed to know the beginning and the end of the recordings

In [6]:
with open('%s/%s.pickle' % (DATA_DUMP, 'data_pars_1_150'), 'rb') as handle:
    data_pars_1_150 = pickle.load(handle)

with open('%s/%s.pickle' % (DATA_DUMP, 'data_pars_151_300'), 'rb') as handle:
    data_pars_151_300 = pickle.load(handle)
    
with open('%s/%s.pickle' % (DATA_DUMP, 'data_pars_301_450'), 'rb') as handle:
    data_pars_301_450 = pickle.load(handle)
    
with open('%s/%s.pickle' % (DATA_DUMP, 'data_pars_451_600'), 'rb') as handle:
    data_pars_451_600 = pickle.load(handle)
    
with open('%s/%s.pickle' % (DATA_DUMP, 'data_pars_601_750'), 'rb') as handle:
    data_pars_601_750 = pickle.load(handle)
    
with open('%s/%s.pickle' % (DATA_DUMP, 'data_pars_751_900'), 'rb') as handle:
    data_pars_751_900 = pickle.load(handle)
    
with open('%s/%s.pickle' % (DATA_DUMP, 'data_pars_901_1050'), 'rb') as handle:
    data_pars_901_1050 = pickle.load(handle)
    
with open('%s/%s.pickle' % (DATA_DUMP, 'data_pars_1051_1100'), 'rb') as handle:
    data_pars_1051_1100 = pickle.load(handle)
    
data_pars = {**data_pars_1_150, **data_pars_151_300, **data_pars_301_450,
             **data_pars_451_600, **data_pars_601_750, **data_pars_751_900,
             **data_pars_901_1050, **data_pars_1051_1100}

In [7]:
len(data_pars)

874

### Import clinical data

In [8]:
# import text files in a dictionary
clin_dict = {}
for fname in os.listdir(DIR_READ):
    if not fname.startswith('.'): # disregard hidden files
        fhandle = open(os.path.join('%s' % DIR_READ, fname), 'r', encoding = 'cp1252')
        clin_dict[fname[:-4]] = fhandle.read() # use the filenames without the .txt extension as keys
        fhandle.close()

In [9]:
# split the clinical data into a list
for key in sorted(clin_dict.keys()):
    clin_dict[key] = clin_dict[key].split('\n')[:-1]

In [10]:
# Create an inner dictionary for the different clinical data
for key, value in sorted(clin_dict.items()):
    temp_dict = {}
    for item in value:
        td_key, *td_value = item.split(':')
        td_key = td_key.strip()
        temp_dict[td_key] = ''.join(td_value)[1:]
    clin_dict[key] = temp_dict

In [11]:
# Create a DataFrame from the dictionary of dictionaries
clin_df = DataFrame(clin_dict).T
clin_df.index.name = 'Recording_ID'
clin_df.sort_index(inplace = True)

In [12]:
len(clin_df)

925

### Drop cases which have no clinical data

In [14]:
clin_df.dropna(axis = 0, how = 'all', inplace = True)

In [15]:
len(clin_df)

850

In [66]:
writer = pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'clinical_data_all_1_1100_unfiltered.xlsx'))
clin_df.to_excel(writer, 'clin_df')
writer.save()

### Drop cases for which there is no ventilation data

Ventilation recordings may have been excluded because they were two short (<15 mintes total) or aberrant

In [None]:
combined = sorted(set(list(clin_df.index)) & set(data_pars.keys()))

In [None]:
clin_df = clin_df.loc[combined]
len(clin_df)

### Clean up clinical dataframe

In [17]:
# Curate the time of births of some recordings after manual inspection of case notes
clin_df.loc['AL000360']['Date of Birth'] = '20180906 0707'
clin_df.loc['AL000638']['Date of Birth'] = '20190814 1114'

In [18]:
# Change order of columns and create English names

clin_df = clin_df[['Esetlap id', 'Date of Birth', 'Gestation Age', 'Birth Weight', 
                   'Actual Weight', 'Pathology', 'Start', 'End']]
clin_df.columns = ['Case ID', 'Date of Birth', 'Gestational Age',
                   'Birth Weight', 'Actual Weight', 'Pathology', 'Start', 'End']

In [19]:
clin_df['Gestational Age'] = clin_df['Gestational Age'].map(lambda x: int(x[:2]))
clin_df['Birth Weight'] = clin_df['Birth Weight'].map(lambda x: int(x[:-6]))
clin_df['Actual Weight'] = clin_df['Actual Weight'].str.strip(' grams')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
actual_weight = []
for i in range(len(clin_df)):
    if clin_df.iloc[i]['Actual Weight'] == '':
        actual_weight.append(clin_df.iloc[i]['Birth Weight'])
    else:
        actual_weight.append(int(clin_df.iloc[i]['Actual Weight']))

clin_df['Weight'] = actual_weight

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


#### Start and end from ventilation data
This shows the time points when ventilator was turned on and off. At the beginning and the end of the recoridngs the baby was usually not attached to the ventilator. The ventilator recordings have been manually inspected and have been trimmed accordingly.

In [21]:
starts = {}; ends = {}
for rec in sorted(clin_df.index):
    try:
        starts[rec] = data_pars[rec].index[0]
    except KeyError:
        continue
        
    try:
        ends[rec] = data_pars[rec].index[-1]
    except KeyError:
        continue
        
start_end = DataFrame([starts, ends]).T
start_end.columns = ['Recording start', 'Recording end']

In [22]:
clin_df = pd.concat([clin_df, start_end], axis = 1, join = 'outer')

In [23]:
clin_df['Date of Birth'] = clin_df['Date of Birth'].map(lambda x: pd.to_datetime(x))
clin_df['Pathology'] = clin_df['Pathology'].map(lambda x: x.split(';')[:-1])

In [24]:
clin_df['Duration'] = clin_df['Recording end'] - clin_df['Recording start']

In [25]:
clin_df['Postnatal Age']   = clin_df['Recording end'] - clin_df['Date of Birth']

In [26]:
clin_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 850 entries, AL000003 to AL001099
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype          
---  ------           --------------  -----          
 0   Case ID          850 non-null    object         
 1   Date of Birth    850 non-null    datetime64[ns] 
 2   Gestational Age  850 non-null    int64          
 3   Birth Weight     850 non-null    int64          
 4   Actual Weight    850 non-null    object         
 5   Pathology        850 non-null    object         
 6   Start            850 non-null    object         
 7   End              850 non-null    object         
 8   Weight           850 non-null    int64          
 9   Recording start  792 non-null    datetime64[ns] 
 10  Recording end    792 non-null    datetime64[ns] 
 11  Duration         792 non-null    timedelta64[ns]
 12  Postnatal Age    792 non-null    timedelta64[ns]
dtypes: datetime64[ns](3), int64(3), object(5), timedelta64[ns](2)
memory usag

In [27]:
clin_df['Gestational Age'] = pd.to_timedelta((clin_df['Gestational Age']), unit='W', errors='raise')

In [28]:
clin_df['Corrected gestational Age'] = pd.to_timedelta((clin_df['Gestational Age']), unit='D', 
                                                       errors='raise') + clin_df['Postnatal Age']

In [29]:
clin_df['Gestational Age (weeks)'] = \
    clin_df['Gestational Age'].apply(lambda x: x.total_seconds() / (60 * 60 * 24 * 7))

clin_df['Corrected gestational Age (weeks)'] = \
    clin_df['Corrected gestational Age'].apply(lambda x: round(x.total_seconds() / (60 * 60 * 24 * 7), 1))

In [30]:
clin_df.sort_index(axis = 1).head()

Unnamed: 0,Actual Weight,Birth Weight,Case ID,Corrected gestational Age,Corrected gestational Age (weeks),Date of Birth,Duration,End,Gestational Age,Gestational Age (weeks),Pathology,Postnatal Age,Recording end,Recording start,Start,Weight
AL000003,,990,42543,196 days 02:09:04,28.0,2017-03-24 17:41:00,0 days 01:44:57,"Fri, 24 Mar 2017 195004 +0100",196 days,28.0,"[Neonat praemat gr s 28 (P07.3) , IRDS (P22....",0 days 02:09:04,2017-03-24 19:50:04,2017-03-24 18:05:07,"Fri, 24 Mar 2017 180507 +0100",990
AL000005,,3530,42552,259 days 02:07:19,37.0,2017-03-26 17:50:00,0 days 00:42:26,"Sun, 26 Mar 2017 195719 +0200",259 days,37.0,"[Neonat mat gr s 37 (P96.4) , Infectio (P39....",0 days 02:07:19,2017-03-26 19:57:19,2017-03-26 19:14:53,"Sun, 26 Mar 2017 191453 +0200",3530
AL000006,,1470,42554,217 days 01:18:07,31.0,2017-03-26 23:37:00,0 days 00:50:46,"Mon, 27 Mar 2017 005507 +0200",217 days,31.0,"[Neonat praemat gr s 31 (P07.3) , Gemini A (...",0 days 01:18:07,2017-03-27 00:55:07,2017-03-27 00:04:21,"Mon, 27 Mar 2017 000421 +0200",1470
AL000007,4800.0,3200,42578,339 days 02:48:57,48.4,2017-01-29 00:00:00,0 days 02:00:28,"Wed, 29 Mar 2017 024857 +0200",280 days,40.0,"[Exsiccatio (E86) , Légzési elégtelenség (P2...",59 days 02:48:57,2017-03-29 02:48:57,2017-03-29 00:48:29,"Wed, 29 Mar 2017 004829 +0200",4800
AL000008,,3230,42596,266 days 04:22:48,38.0,2017-03-29 13:20:00,0 days 01:58:22,"Wed, 29 Mar 2017 174248 +0200",266 days,38.0,"[Neonat mat gr s 38 (P96.4) , Intézeten kívü...",0 days 04:22:48,2017-03-29 17:42:48,2017-03-29 15:44:26,"Wed, 29 Mar 2017 154426 +0200",3230


### EDA on clinical details

In [31]:
clin_df.describe()

Unnamed: 0,Gestational Age,Birth Weight,Weight,Duration,Postnatal Age,Corrected gestational Age,Gestational Age (weeks),Corrected gestational Age (weeks)
count,850,850.0,850.0,792,792,792,850.0,792.0
mean,242 days 18:49:58.588235296,2435.655294,2609.732941,0 days 01:17:24.023989898,12 days 19:41:55.128787878,255 days 21:52:49.674242444,34.683529,36.54697
std,33 days 22:12:49.083953691,1021.366885,1032.950271,0 days 00:54:51.966850579,30 days 19:50:27.660454673,41 days 09:38:00.784496699,4.84651,5.915318
min,147 days 00:00:00,300.0,300.0,0 days 00:16:08,0 days 00:01:09,147 days 01:41:03,21.0,21.0
25%,224 days 00:00:00,1692.5,1850.0,0 days 00:46:45.750000,0 days 03:07:50.250000,231 days 10:18:43.250000,32.0,33.1
50%,252 days 00:00:00,2600.0,2715.0,0 days 01:07:44,0 days 06:57:55,259 days 02:24:49,36.0,37.0
75%,266 days 00:00:00,3200.0,3347.5,0 days 01:38:41,5 days 18:09:21,274 days 05:09:33.750000,38.0,39.2
max,294 days 00:00:00,5400.0,5920.0,0 days 19:03:15,231 days 10:33:32,503 days 13:40:44,42.0,71.9


#### For some recordings the age at the time of transfer is "negative"  - these need to be corrected

In [32]:
clin_df[clin_df['Postnatal Age'] < pd.to_timedelta(0)]

Unnamed: 0,Case ID,Date of Birth,Gestational Age,Birth Weight,Actual Weight,Pathology,Start,End,Weight,Recording start,Recording end,Duration,Postnatal Age,Corrected gestational Age,Gestational Age (weeks),Corrected gestational Age (weeks)


#### For some recordings the duration of the recording is "negative"  - these need to be corrected

In [33]:
clin_df[clin_df['Duration'] < pd.to_timedelta(0)]

Unnamed: 0,Case ID,Date of Birth,Gestational Age,Birth Weight,Actual Weight,Pathology,Start,End,Weight,Recording start,Recording end,Duration,Postnatal Age,Corrected gestational Age,Gestational Age (weeks),Corrected gestational Age (weeks)


#### Babies was at less than 23 weeks gestation

In [34]:
clin_df[clin_df['Gestational Age (weeks)'] < 23]

Unnamed: 0,Case ID,Date of Birth,Gestational Age,Birth Weight,Actual Weight,Pathology,Start,End,Weight,Recording start,Recording end,Duration,Postnatal Age,Corrected gestational Age,Gestational Age (weeks),Corrected gestational Age (weeks)
AL000294,47588,2018-07-10 11:30:00,147 days,300,,"[Neonat immat gr s 21 (P07.2) , Resuscitatio...","Tue, 10 Jul 2018 122935 +0200","Tue, 10 Jul 2018 131103 +0200",300,2018-07-10 12:29:35,2018-07-10 13:11:03,0 days 00:41:28,0 days 01:41:03,147 days 01:41:03,21.0,21.0
AL000561,50979,2019-06-01 22:08:00,154 days,580,,"[Neonat immat gr s 22 (P07.2) , Gemini A (P0...","Sat, 01 Jun 2019 224754 +0200","Sun, 02 Jun 2019 011426 +0200",580,2019-06-01 22:47:54,2019-06-02 01:14:26,0 days 02:26:32,0 days 03:06:26,154 days 03:06:26,22.0,22.0


#### Babies born with less than 500 g birth weight

In [35]:
clin_df[clin_df['Birth Weight'] < 500]

Unnamed: 0,Case ID,Date of Birth,Gestational Age,Birth Weight,Actual Weight,Pathology,Start,End,Weight,Recording start,Recording end,Duration,Postnatal Age,Corrected gestational Age,Gestational Age (weeks),Corrected gestational Age (weeks)
AL000038,42889,2017-04-01 00:00:00,161 days,490,690.0,"[Neonat immat gr s 22 (P07.2) 22-23 , Ileus (...","Sat, 29 Apr 2017 091252 +0200","Sat, 29 Apr 2017 105944 +0200",690,2017-04-29 09:12:52,2017-04-29 10:59:44,0 days 01:46:52,28 days 10:59:44,189 days 10:59:44,23.0,27.1
AL000066,43365,2017-05-18 14:24:00,168 days,450,650.0,"[PDA (Q25.0) , Neonat immat gr s 24 (P07.2) ...","Wed, 14 Jun 2017 105336 +0200","Wed, 14 Jun 2017 121541 +0200",650,2017-06-14 10:53:36,2017-06-14 12:15:41,0 days 01:22:05,26 days 21:51:41,194 days 21:51:41,24.0,27.8
AL000099,43778,2017-06-10 10:44:00,182 days,490,830.0,"[Neonat immat gr s 26 (P07.2) , NEC (P77) ,...","Sun, 23 Jul 2017 151033 +0200","Sun, 23 Jul 2017 155506 +0200",830,2017-07-23 15:10:33,2017-07-23 15:55:06,0 days 00:44:33,43 days 05:11:06,225 days 05:11:06,26.0,32.2
AL000100,43779,2017-04-01 00:00:00,175 days,490,1890.0,"[ROP (H35.1) , BPD (P27.1) ]","Mon, 24 Jul 2017 083540 +0200","Mon, 24 Jul 2017 091856 +0200",1890,2017-07-24 08:35:40,2017-07-24 09:18:56,0 days 00:43:16,114 days 09:18:56,289 days 09:18:56,25.0,41.3
AL000101,43781,2017-04-01 00:00:00,175 days,490,1890.0,"[ROP (H35.1) st.p. vitrectomian l.u. , BPD (P...","Mon, 24 Jul 2017 110810 +0200","Mon, 24 Jul 2017 114954 +0200",1890,2017-07-24 11:08:10,2017-07-24 11:49:54,0 days 00:41:44,114 days 11:49:54,289 days 11:49:54,25.0,41.4
AL000140,44200,2017-08-08 10:20:00,161 days,410,640.0,"[Neonat immat gr s 23 (P07.2) , NEC (P77) ,...","Fri, 25 Aug 2017 132348 +0200","Fri, 25 Aug 2017 151422 +0200",640,2017-08-25 13:23:48,2017-08-25 15:14:22,0 days 01:50:34,17 days 04:54:22,178 days 04:54:22,23.0,25.5
AL000163,45238,2017-07-13 18:07:00,168 days,360,2880.0,"[BPD (P27.1) , Légzési elégtelenség (P28.5) ...","Sat, 25 Nov 2017 144108 +0100","Sat, 25 Nov 2017 164239 +0100",2880,2017-11-25 14:41:08,2017-11-25 16:42:39,0 days 02:01:31,134 days 22:35:39,302 days 22:35:39,24.0,43.3
AL000182,45454,2017-07-13 23:05:00,168 days,360,3300.0,"[Neonat immat gr s 24 (P07.2) , Hernia ingui...","Wed, 13 Dec 2017 124334 +0100","Wed, 13 Dec 2017 144535 +0100",3300,2017-12-13 12:43:34,2017-12-13 14:45:35,0 days 02:02:01,152 days 15:40:35,320 days 15:40:35,24.0,45.8
AL000294,47588,2018-07-10 11:30:00,147 days,300,,"[Neonat immat gr s 21 (P07.2) , Resuscitatio...","Tue, 10 Jul 2018 122935 +0200","Tue, 10 Jul 2018 131103 +0200",300,2018-07-10 12:29:35,2018-07-10 13:11:03,0 days 00:41:28,0 days 01:41:03,147 days 01:41:03,21.0,21.0
AL000348,48173,2018-07-27 00:00:00,175 days,490,,"[Neonat immat gr s 25 (P07.2) , Hypothermia ...","Tue, 28 Aug 2018 153653 +0200","Tue, 28 Aug 2018 162146 +0200",490,2018-08-28 15:36:53,2018-08-28 16:21:46,0 days 00:44:53,32 days 16:21:46,207 days 16:21:46,25.0,29.7


In [36]:
len(clin_df[clin_df['Birth Weight'] < 500])

13

#### Babies transferred with the postnatal age of > 46 weeks we need to discuss whether to include them in the data analysis

In [37]:
a = clin_df[clin_df['Corrected gestational Age (weeks)'] > 46]
a.sort_values('Corrected gestational Age (weeks)')

Unnamed: 0,Case ID,Date of Birth,Gestational Age,Birth Weight,Actual Weight,Pathology,Start,End,Weight,Recording start,Recording end,Duration,Postnatal Age,Corrected gestational Age,Gestational Age (weeks),Corrected gestational Age (weeks)
AL000451,49876,2019-01-02 00:00:00,287 days,3000,3720.0,"[Stenosis tracheae (Q32.1) , Infectio (P39.9...","Thu, 07 Feb 2019 063812 +0100","Thu, 07 Feb 2019 070516 +0100",3720,2019-02-07 06:38:12,2019-02-07 07:05:16,0 days 00:27:04,36 days 07:05:16,323 days 07:05:16,41.0,46.2
AL000782,53788,2019-12-15 10:07:00,266 days,4020,4230.0,"[Neonat mat gr s 38 (P96.4) , Vitium cordis ...","Tue, 11 Feb 2020 102153 +0100","Tue, 11 Feb 2020 113111 +0100",4230,2020-02-11 10:21:53,2020-02-11 11:31:11,0 days 01:09:18,58 days 01:24:11,324 days 01:24:11,38.0,46.3
AL000823,54127,2020-01-14 00:00:00,266 days,3580,,"[Neonat mat gr s 38 (P96.4) , ASD (Q21.1) ,...","Fri, 13 Mar 2020 155505 +0100","Fri, 13 Mar 2020 171936 +0100",3580,2020-03-13 15:55:05,2020-03-13 17:19:36,0 days 01:24:31,59 days 17:19:36,325 days 17:19:36,38.0,46.5
AL000459,49912,2019-01-02 00:00:00,287 days,3000,3720.0,"[Foramen ovale apertum (Q21.1) , Stenosis tr...","Mon, 11 Feb 2019 120638 +0100","Mon, 11 Feb 2019 124448 +0100",3720,2019-02-11 12:06:38,2019-02-11 12:44:48,0 days 00:38:10,40 days 12:44:48,327 days 12:44:48,41.0,46.8
AL000688,52223,2019-07-07 09:05:00,259 days,2265,4245.0,"[Neonat mat gr s 37 (P96.4) , Postasphyxias ...","Wed, 18 Sep 2019 090008 +0200","Wed, 18 Sep 2019 092730 +0200",4245,2019-09-18 09:00:08,2019-09-18 09:27:30,0 days 00:27:22,73 days 00:22:30,332 days 00:22:30,37.0,47.4
AL000819,54105,2019-10-14 00:00:00,182 days,950,3250.0,"[Pneumonia (J18.9) , Légzési elégtelenség (P...","Thu, 12 Mar 2020 103614 +0100","Thu, 12 Mar 2020 112302 +0100",3250,2020-03-12 10:36:14,2020-03-12 11:23:02,0 days 00:46:48,150 days 11:23:02,332 days 11:23:02,26.0,47.5
AL000042,42922,2017-02-23 00:00:00,266 days,3000,4900.0,"[Neonat mat gr s 39 (P96.4) , Convulsio (P90...","Tue, 02 May 2017 193707 +0200","Tue, 02 May 2017 202154 +0200",4900,2017-05-02 19:37:07,2017-05-02 20:21:54,0 days 00:44:47,68 days 20:21:54,334 days 20:21:54,38.0,47.8
AL000043,42923,2017-02-23 00:00:00,266 days,3000,4900.0,"[Neonat mat gr s 38 (P96.4) , Convulsio (P90...","Tue, 02 May 2017 204804 +0200","Tue, 02 May 2017 211132 +0200",4900,2017-05-02 20:48:04,2017-05-02 21:11:32,0 days 00:23:28,68 days 21:11:32,334 days 21:11:32,38.0,47.8
AL000478,50349,2019-02-01 00:00:00,280 days,3550,4310.0,"[Hypotonia musculorum (P94.2) , Légzészavar ...","Wed, 27 Mar 2019 123644 +0100","Wed, 27 Mar 2019 131024 +0100",4310,2019-03-27 12:36:44,2019-03-27 13:10:24,0 days 00:33:40,54 days 13:10:24,334 days 13:10:24,40.0,47.8
AL000690,52261,2019-07-07 09:05:00,259 days,2265,4200.0,"[Encephalopathia (G93.4) postasphyxias , Gast...","Sun, 22 Sep 2019 051336 +0200","Sun, 22 Sep 2019 063040 +0200",4200,2019-09-22 05:13:36,2019-09-22 06:30:40,0 days 01:17:04,76 days 21:25:40,335 days 21:25:40,37.0,48.0


In [38]:
len(clin_df[clin_df['Corrected gestational Age (weeks)'] > 46])

37

### Identify ICD codes

In [39]:
icd = re.compile(r'\(\S\d+\.?\d*\)')

In [40]:
def icd_finder(lst):
    icd_list = []
    for item in lst:
        if icd.findall(item):
            icd_list.append(icd.findall(item)[0])
    
    return icd_list

In [41]:
def icd_cleanup(lst):
    icd_list = []
    for item in lst:
        if '.' in item:
            new_item = item[0 : item.index('.')] + item[item.index('.') + 1 : ]
        else:
            new_item = item
        icd_list.append(new_item[1:-1])
    return icd_list

In [42]:
clin_df['ICD'] = clin_df['Pathology'].apply(icd_finder)

In [43]:
clin_df['ICD'] = clin_df['ICD'].apply(icd_cleanup)

In [44]:
clin_df['ICD'];

### Identify all icd codes

In [45]:
icd_all = []

for _, item in clin_df.iterrows():
    icd_all.extend(item['ICD'])
    
icd_all = sorted(set(icd_all))

In [46]:
len(icd_all)

195

In [47]:
icd_all_frme = DataFrame(icd_all)
icd_all_frme.columns = ['code']
icd_all_frme.sort_values(by = 'code', inplace = True)

In [48]:
icd_all_frme.head()

Unnamed: 0,code
0,A419
1,D180
2,D181
3,D379
4,D4890


In [49]:
len(icd_all_frme)

195

In [50]:
writer = pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'icd_codes.xlsx'))
icd_all_frme.to_excel(writer, 'icd_codes')
writer.save()

### Import the file that has been manually curated from the exported `icd_codes.xlsx` files to contain all relevant diagnosis previously identified

In [53]:
icd_codes = pd.read_excel('/Users/guszti/ventilation_fabian/icd_codes_curated.xlsx', 
                          usecols = [0,1], index_col = 0)

In [54]:
icd_codes.head()

Unnamed: 0_level_0,name
code,Unnamed: 1_level_1
A419,"Sepsis, unspecified organism"
D180,Hemangioma unspecified site
D181,"Lymphangioma, any site"
D379,Neoplasm of uncertain behavior of digestive or...
D4890,"Neoplasm of uncertain behavior, unspecified"


In [55]:
icd_all_frme.head()

Unnamed: 0,code
0,A419
1,D180
2,D181
3,D379
4,D4890


Identify new codes not in the dataset so far

In [56]:
new_diagnoses = sorted(set(icd_all_frme['code']) - set(icd_codes.index))
new_diagnoses

['G039', 'Q393', 'R635']

At this point the `icd_codes_curated.xlsx` file needs to be manually curated with these entries.

### Import the now curated `icd_codes.xlsx` files to contain now all relevant diagnosis including new ones

In [57]:
icd_codes = pd.read_excel('/Users/guszti/ventilation_fabian/icd_codes_curated.xlsx', 
                          usecols = [0,1], index_col = 0)

### Create Pathology column with English names

In [58]:
icd_dictionary = dict(zip(icd_codes.index, icd_codes['name']))

In [59]:
def icd_replace(lst):
    icd_list = []
    for item in lst:
        new_item = icd_dictionary[item]
        icd_list.append(new_item)
    return icd_list

In [60]:
clin_df['Pathology_English'] = clin_df['ICD'].apply(icd_replace)

KeyError: 'Q393'

In [61]:
clin_df

Unnamed: 0,Case ID,Date of Birth,Gestational Age,Birth Weight,Actual Weight,Pathology,Start,End,Weight,Recording start,Recording end,Duration,Postnatal Age,Corrected gestational Age,Gestational Age (weeks),Corrected gestational Age (weeks),ICD
AL000003,42543,2017-03-24 17:41:00,196 days,990,,"[Neonat praemat gr s 28 (P07.3) , IRDS (P22....","Fri, 24 Mar 2017 180507 +0100","Fri, 24 Mar 2017 195004 +0100",990,2017-03-24 18:05:07,2017-03-24 19:50:04,0 days 01:44:57,0 days 02:09:04,196 days 02:09:04,28.0,28.0,"[P073, P220, P704]"
AL000005,42552,2017-03-26 17:50:00,259 days,3530,,"[Neonat mat gr s 37 (P96.4) , Infectio (P39....","Sun, 26 Mar 2017 191453 +0200","Sun, 26 Mar 2017 195719 +0200",3530,2017-03-26 19:14:53,2017-03-26 19:57:19,0 days 00:42:26,0 days 02:07:19,259 days 02:07:19,37.0,37.0,"[P964, P399, P228]"
AL000006,42554,2017-03-26 23:37:00,217 days,1470,,"[Neonat praemat gr s 31 (P07.3) , Gemini A (...","Mon, 27 Mar 2017 000421 +0200","Mon, 27 Mar 2017 005507 +0200",1470,2017-03-27 00:04:21,2017-03-27 00:55:07,0 days 00:50:46,0 days 01:18:07,217 days 01:18:07,31.0,31.0,"[P073, P015, Q792, Q205]"
AL000007,42578,2017-01-29 00:00:00,280 days,3200,4800,"[Exsiccatio (E86) , Légzési elégtelenség (P2...","Wed, 29 Mar 2017 004829 +0200","Wed, 29 Mar 2017 024857 +0200",4800,2017-03-29 00:48:29,2017-03-29 02:48:57,0 days 02:00:28,59 days 02:48:57,339 days 02:48:57,40.0,48.4,"[E86, P285, I500, Z518, P964, E141]"
AL000008,42596,2017-03-29 13:20:00,266 days,3230,,"[Neonat mat gr s 38 (P96.4) , Intézeten kívü...","Wed, 29 Mar 2017 154426 +0200","Wed, 29 Mar 2017 174248 +0200",3230,2017-03-29 15:44:26,2017-03-29 17:42:48,0 days 01:58:22,0 days 04:22:48,266 days 04:22:48,38.0,38.0,"[P964, Z381, P219, P90, P708, P809]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AL001054,55383,2020-08-26 09:45:00,273 days,3650,,"[Neonat mat gr s 39 (P96.4) , Infectio (P39....","Wed, 26 Aug 2020 121824 +0200","Wed, 26 Aug 2020 130420 +0200",3650,2020-08-26 12:18:24,2020-08-26 13:04:20,0 days 00:45:56,0 days 03:19:20,273 days 03:19:20,39.0,39.0,"[P964, P399, _0000]"
AL001055,55393,2020-06-17 15:50:00,196 days,1246,3250,"[Neonat praemat gr s 28 (P07.3) , BPD (P27.1...","Thu, 27 Aug 2020 110758 +0200","Thu, 27 Aug 2020 131956 +0200",3250,2020-08-27 11:07:58,2020-08-27 13:19:56,0 days 02:11:58,70 days 21:29:56,266 days 21:29:56,28.0,38.1,"[P073, P271, I270, K409, _0000, P399]"
AL001086,55607,2020-09-04 17:56:00,252 days,2860,3160,"[Neonat praemat gr s 36 (P07.3) , Légzési el...","Mon, 21 Sep 2020 143058 +0200","Mon, 21 Sep 2020 151920 +0200",3160,2020-09-21 14:30:58,2020-09-21 15:19:20,0 days 00:48:22,16 days 21:23:20,268 days 21:23:20,36.0,38.4,"[P073, P285, _0000, Z930, Q288, D4890]"
AL001095,55736,2020-10-05 18:22:00,287 days,3040,,"[Neonat mat gr s 41 (P96.4) , Pneumonia (J18...","Tue, 06 Oct 2020 012642 +0200","Tue, 06 Oct 2020 021318 +0200",3040,NaT,NaT,NaT,NaT,NaT,41.0,,"[P964, J189, P228, _0000]"


### Final cleanup of the DataFrame

In [62]:
clin_df.columns

Index(['Case ID', 'Date of Birth', 'Gestational Age', 'Birth Weight',
       'Actual Weight', 'Pathology', 'Start', 'End', 'Weight',
       'Recording start', 'Recording end', 'Duration', 'Postnatal Age',
       'Corrected gestational Age', 'Gestational Age (weeks)',
       'Corrected gestational Age (weeks)', 'ICD'],
      dtype='object')

In [63]:
column_list = ['Case ID', 'Date of Birth', 'Gestational Age (weeks)', 'Birth Weight',
              'Postnatal Age', 'Corrected gestational Age (weeks)',  'Weight',
              'ICD', 'Pathology_English', 'Recording start', 'Recording end', 'Duration',] 
      
clin_df = clin_df[column_list]

KeyError: "['Pathology_English'] not in index"

In [64]:
clin_df.head()

Unnamed: 0,Case ID,Date of Birth,Gestational Age,Birth Weight,Actual Weight,Pathology,Start,End,Weight,Recording start,Recording end,Duration,Postnatal Age,Corrected gestational Age,Gestational Age (weeks),Corrected gestational Age (weeks),ICD
AL000003,42543,2017-03-24 17:41:00,196 days,990,,"[Neonat praemat gr s 28 (P07.3) , IRDS (P22....","Fri, 24 Mar 2017 180507 +0100","Fri, 24 Mar 2017 195004 +0100",990,2017-03-24 18:05:07,2017-03-24 19:50:04,0 days 01:44:57,0 days 02:09:04,196 days 02:09:04,28.0,28.0,"[P073, P220, P704]"
AL000005,42552,2017-03-26 17:50:00,259 days,3530,,"[Neonat mat gr s 37 (P96.4) , Infectio (P39....","Sun, 26 Mar 2017 191453 +0200","Sun, 26 Mar 2017 195719 +0200",3530,2017-03-26 19:14:53,2017-03-26 19:57:19,0 days 00:42:26,0 days 02:07:19,259 days 02:07:19,37.0,37.0,"[P964, P399, P228]"
AL000006,42554,2017-03-26 23:37:00,217 days,1470,,"[Neonat praemat gr s 31 (P07.3) , Gemini A (...","Mon, 27 Mar 2017 000421 +0200","Mon, 27 Mar 2017 005507 +0200",1470,2017-03-27 00:04:21,2017-03-27 00:55:07,0 days 00:50:46,0 days 01:18:07,217 days 01:18:07,31.0,31.0,"[P073, P015, Q792, Q205]"
AL000007,42578,2017-01-29 00:00:00,280 days,3200,4800.0,"[Exsiccatio (E86) , Légzési elégtelenség (P2...","Wed, 29 Mar 2017 004829 +0200","Wed, 29 Mar 2017 024857 +0200",4800,2017-03-29 00:48:29,2017-03-29 02:48:57,0 days 02:00:28,59 days 02:48:57,339 days 02:48:57,40.0,48.4,"[E86, P285, I500, Z518, P964, E141]"
AL000008,42596,2017-03-29 13:20:00,266 days,3230,,"[Neonat mat gr s 38 (P96.4) , Intézeten kívü...","Wed, 29 Mar 2017 154426 +0200","Wed, 29 Mar 2017 174248 +0200",3230,2017-03-29 15:44:26,2017-03-29 17:42:48,0 days 01:58:22,0 days 04:22:48,266 days 04:22:48,38.0,38.0,"[P964, Z381, P219, P90, P708, P809]"


### Statistics on clinical data

In [65]:
clinical_stats = round(clin_df.describe(percentiles = [0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]), 1)
clinical_stats

Unnamed: 0,Gestational Age,Birth Weight,Weight,Duration,Postnatal Age,Corrected gestational Age,Gestational Age (weeks),Corrected gestational Age (weeks)
count,850,850.0,850.0,792,792,792,850.0,792.0
mean,242 days 18:49:58.588235296,2435.7,2609.7,0 days 01:17:24.023989898,12 days 19:41:55.128787878,255 days 21:52:49.674242444,34.7,36.5
std,33 days 22:12:49.083953691,1021.4,1033.0,0 days 00:54:51.966850579,30 days 19:50:27.660454673,41 days 09:38:00.784496699,4.8,5.9
min,147 days 00:00:00,300.0,300.0,0 days 00:16:08,0 days 00:01:09,147 days 01:41:03,21.0,21.0
1%,161 days 00:00:00,490.0,569.8,0 days 00:20:34.200000,0 days 01:08:43.910000,167 days 11:07:08.880000,23.0,23.9
5%,175 days 00:00:00,700.0,850.0,0 days 00:30:05.100000,0 days 01:38:34.850000,189 days 14:49:21.500000,25.0,27.1
25%,224 days 00:00:00,1692.5,1850.0,0 days 00:46:45.750000,0 days 03:07:50.250000,231 days 10:18:43.250000,32.0,33.1
50%,252 days 00:00:00,2600.0,2715.0,0 days 01:07:44,0 days 06:57:55,259 days 02:24:49,36.0,37.0
75%,266 days 00:00:00,3200.0,3347.5,0 days 01:38:41,5 days 18:09:21,274 days 05:09:33.750000,38.0,39.2
95%,280 days 00:00:00,3900.0,4205.5,0 days 02:23:14.499999999,74 days 04:35:51.449999985,320 days 21:34:31.850000,40.0,45.8


### Export clinical information data in tables and individual text files

##### Create sub-directories for each case if it does not yet exist

In [None]:
# Images and raw data will be written on an external hard drive
if not os.path.isdir('%s/%s' % (DATA_DUMP, 'fabian_cases')):
    os.makedirs('%s/%s' % (DATA_DUMP, 'fabian_cases'))

for case in sorted(clin_df.index): 
    if not os.path.isdir('%s/%s/%s' % (DATA_DUMP, 'fabian_cases', case)):
        os.makedirs('%s/%s/%s' % (DATA_DUMP, 'fabian_cases', case))

##### Export clinical data about individual cases as text files

In [None]:
# Clinical info about all recordings which clinical data are available and are over 15 minutes long

for case in sorted(clin_df.index):
    
    fileout = open('%s/%s/%s/%s_%s.%s' % (DATA_DUMP, 'fabian_cases', case, case, 'clin_info', 'txt'), 'w')
    
    fileout.write('Case ID:             %-50s\n' % case)
    fileout.write('Start:               %-50s\n' % datetime.strftime(clin_df.loc[case]['Recording start'], 
                                                          '%d/%m/%Y %H:%M:%S', ))
    fileout.write('End:                 %-50s\n' % datetime.strftime(clin_df.loc[case]['Recording end'], 
                                                          '%d/%m/%Y %H:%M:%S', ))
    fileout.write('Duration:            %-50s\n' % clin_df.loc[case]['Duration'])
    fileout.write('Gestational age:     %-50s\n' % clin_df.loc[case]['Gestational Age (weeks)'])
    fileout.write('Postmenstrual age:   %-50s\n' % clin_df.loc[case]['Corrected gestational Age (weeks)'])
    fileout.write('Birth Weight:        %-50s\n' % clin_df.loc[case]['Birth Weight'])
    fileout.write('Weight:              %-50s\n' % clin_df.loc[case]['Weight'])
    fileout.write('ICD:                 %-50s\n' % ', '.join(clin_df.loc[case]['ICD']))
    fileout.write('Diagnoses:           %-50s\n' % ', '.join(clin_df.loc[case]['Pathology_English']))
    
    fileout.close()

### Export clinical information as an Excel sheet

In [None]:
writer = pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'clinical_data_all_1_1100.xlsx'))
clin_df.to_excel(writer, 'clin_df')
writer.save()

### Export statistics on clinical data

In [None]:
writer = pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'clinical_stats_1_1100.xlsx'))
clinical_stats.to_excel(writer, 'stats')
writer.save()

### Export processed data as pickle files

In [None]:
with open('%s/%s.pickle' % (DATA_DUMP, 'clin_df_1_1100'), 'wb') as handle:
    pickle.dump(clin_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Create patient lists for various disease groups

### RDS

In [None]:
RDS_dg = {'P22', 'P220'}

In [None]:
RDS_cases = []
for case, dgs in clin_df['ICD'].items():
    if set(dgs).intersection(RDS_dg):
        RDS_cases.append(case)

In [None]:
print(RDS_cases)

In [None]:
clin_df_RDS = clin_df.loc[RDS_cases]
clin_df_RDS;

In [None]:
len(clin_df_RDS)

### HIE

In [None]:
HIE_dg = ['P219', 'Z518', 'Z548',]  

In [None]:
HIE_cases = []
for case, dgs in clin_df['ICD'].items():
    if set(dgs).intersection(HIE_dg):
        HIE_cases.append(case)

In [None]:
clin_df_HIE = clin_df.loc[HIE_cases]
clin_df_HIE;

In [None]:
len(clin_df_HIE)

### Meconium aspiration

In [None]:
MAS_dg = ['P240',]

In [None]:
MAS_cases = []
for case, dgs in clin_df['ICD'].items():
    if set(dgs).intersection(MAS_dg):
        MAS_cases.append(case)

In [None]:
clin_df_MAS = clin_df.loc[MAS_cases]
clin_df_MAS;

In [None]:
len(clin_df_MAS)

### PPHN

In [None]:
PPHN_dg = ['P293', ]

In [None]:
PPHN_cases = []
for case, dgs in clin_df['ICD'].items():
    if set(dgs).intersection(PPHN_dg):
        PPHN_cases.append(case)

In [None]:
clin_df_PPHN = clin_df.loc[PPHN_cases]
clin_df_PPHN;

In [None]:
len(clin_df_PPHN)

### Congenital diaphragmatic hernia

In [None]:
CDH_dg = ['Q790', 'Q791']

In [None]:
CDH_cases = []
for case, dgs in clin_df['ICD'].items():
    if set(dgs).intersection(CDH_dg):
        CDH_cases.append(case)

In [None]:
clin_df_CDH = clin_df.loc[CDH_cases]
clin_df_CDH;

In [None]:
len(clin_df_CDH)

### Necrotizing enterocolitis (NEC)

In [None]:
NEC_dg = ['P77',]

In [None]:
NEC_cases = []
for case, dgs in clin_df['ICD'].items():
    if set(dgs).intersection(NEC_dg):
        NEC_cases.append(case)

In [None]:
clin_df_NEC = clin_df.loc[NEC_cases]
clin_df_NEC;

In [None]:
len(clin_df_NEC)

### Surgical cases (except NEC, CDH and cardiac)

In [None]:
surgical_dg = ['K409', 'K562', 'K566', 'K631', 'K9210', 'Q059', 'Q321', 'Q391', 'Q392', 'Q423' , 'Q431',
               'Q438', 'Q4380', 'Q549', 'Q556', 'Q641', 'Q792', 'Q793', 'R1000']

In [None]:
surgical_cases = []
for case, dgs in clin_df['ICD'].items():
    if set(dgs).intersection(surgical_dg):
        surgical_cases.append(case)

In [None]:
clin_df_surgical = clin_df.loc[surgical_cases]
clin_df_surgical;

In [None]:
len(clin_df_surgical)

### Cardiac cases (except PFO / ASD)

In [None]:
cardiac_dg = ['Q201', 'Q203', 'Q205' ,'Q210', 'Q212', 'Q213', 'Q220', 'Q221', 'Q224', 
              'Q228', 'Q232', 'Q234', 'Q240', 'Q244', 'Q245', 'Q251', 'Q252', 'Q253', 'Q254', 'Q262',]

In [None]:
cardiac_cases = []
for case, dgs in clin_df['ICD'].items():
    if set(dgs).intersection(cardiac_dg):
        cardiac_cases.append(case)

In [None]:
clin_df_cardiac = clin_df.loc[cardiac_cases]
clin_df_cardiac;

In [None]:
len(cardiac_dg)

### Export clinical dataframes into a multisheet Excel file

In [None]:
writer = pd.ExcelWriter('%s/%s' % (DIR_WRITE, 'clinical_data_diseases_1_1100.xlsx'))
clin_df.to_excel(writer, 'all')
clin_df_cardiac.to_excel(writer, 'cardiac')
clin_df_CDH.to_excel(writer, 'CDH')
clin_df_HIE.to_excel(writer, 'HIE')
clin_df_MAS.to_excel(writer, 'MAS')
clin_df_NEC.to_excel(writer, 'NEC')
clin_df_PPHN.to_excel(writer, 'PPHN')
clin_df_RDS.to_excel(writer, 'RDS')
clin_df_surgical.to_excel(writer, 'surgical')
writer.save()

### Export selected clinical data as pickle archive

In [None]:
with open('%s/%s.pickle' % (DATA_DUMP, 'clin_df_cardiac'), 'wb') as handle:
    pickle.dump(clin_df_cardiac, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('%s/%s.pickle' % (DATA_DUMP, 'clin_df_CDH'), 'wb') as handle:
    pickle.dump(clin_df_CDH, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('%s/%s.pickle' % (DATA_DUMP, 'clin_df_HIE'), 'wb') as handle:
    pickle.dump(clin_df_HIE, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('%s/%s.pickle' % (DATA_DUMP, 'clin_df_MAS'), 'wb') as handle:
    pickle.dump(clin_df_MAS, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('%s/%s.pickle' % (DATA_DUMP, 'clin_df_NEC'), 'wb') as handle:
    pickle.dump(clin_df_NEC, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('%s/%s.pickle' % (DATA_DUMP, 'clin_df_PPHN'), 'wb') as handle:
    pickle.dump(clin_df_PPHN, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('%s/%s.pickle' % (DATA_DUMP, 'clin_df_RDS'), 'wb') as handle:
    pickle.dump(clin_df_RDS, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('%s/%s.pickle' % (DATA_DUMP, 'clin_df_surgical'), 'wb') as handle:
    pickle.dump(clin_df_surgical, handle, protocol=pickle.HIGHEST_PROTOCOL)
       