![alt text](./Cerny_logo_1.jpg)

# Analysis of Cerny ventilation recordings

#### Processing clinical details

This notebook imports and processes clinical data and exports it into a pickle archive.

The data processed and analysed in this Notebook were collected by the **Neonatal Emergency and Transport Service of the Peter Cerny Foundation**, Budapest, Hungary

**Author: Dr Gusztav Belteki**

### 1. Import the required libraries and set options

In [1]:
import IPython
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt

import os
import sys
import re
import pickle

from scipy import stats
from pandas import Series, DataFrame
from datetime import datetime, timedelta

%matplotlib inline
matplotlib.style.use('classic')
matplotlib.rcParams['figure.facecolor'] = 'w'

pd.set_option('display.max_rows', 250)
pd.set_option('display.max_columns', 250)
pd.set_option('mode.chained_assignment', None) 

# This is to turn off a warning message which is given when read_Excel() imports '.xlsx' files
import warnings
warnings.simplefilter("ignore")

In [2]:
print("Python version: {}".format(sys.version))
print("pandas version: {}".format(pd.__version__))
print("matplotlib version: {}".format(matplotlib.__version__))
print("NumPy version: {}".format(np.__version__))
print("SciPy version: {}".format(sp.__version__))
print("IPython version: {}".format(IPython.__version__))

Python version: 3.11.7 (main, Dec 15 2023, 12:09:04) [Clang 14.0.6 ]
pandas version: 2.1.4
matplotlib version: 3.8.0
NumPy version: 1.26.3
SciPy version: 1.11.4
IPython version: 8.20.0


### 2. List and set the working directory and the directory to write out data

In [3]:
# Name of the external hard drive
DRIVE = 'GUSZTI'

# Directory on external drive to read the clinical from
DIR_READ = os.path.join(os.sep, 'Volumes', DRIVE, 'Fabian_new', 'fabian_patient_data_all_new')

# Path to project folder containing ventilation research results
PATH = os.path.join(os.sep, 'Users', 'guszti', 'Library', 'Mobile Documents', 'com~apple~CloudDocs', 
                            'Documents', 'Research', 'Ventilation')

# Folder to export the result of analysis
DIR_WRITE = os.path.join(PATH, 'ventilation_fabian_new', 'Analyses')
os.makedirs(DIR_WRITE, exist_ok = True)

# Folder on a USB stick to export data to and to import processed data exported by other Notebooks
DATA_DUMP = os.path.join(os.sep, '/Volumes', DRIVE, 'data_dump', 'fabian_new',)
os.makedirs(DATA_DUMP, exist_ok = True)

In [4]:
DIR_READ, DIR_WRITE, DATA_DUMP

('/Volumes/GUSZTI/Fabian_new/fabian_patient_data_all_new',
 '/Users/guszti/Library/Mobile Documents/com~apple~CloudDocs/Documents/Research/Ventilation/ventilation_fabian_new/Analyses',
 '/Volumes/GUSZTI/data_dump/fabian_new')

### 3. Import ventilation data

This is needed to know the beginning and the end of the recordings

In [5]:
%%time

with open(os.path.join(DATA_DUMP, 'data_pars_1_150.pickle'), 'rb') as handle:
    data_pars_1_150 = pickle.load(handle)

with open(os.path.join(DATA_DUMP, 'data_pars_151_300.pickle'), 'rb') as handle:
    data_pars_151_300 = pickle.load(handle)

with open(os.path.join(DATA_DUMP, 'data_pars_301_450.pickle'), 'rb') as handle:
    data_pars_301_450 = pickle.load(handle)
    
with open(os.path.join(DATA_DUMP, 'data_pars_451_600.pickle'), 'rb') as handle:
    data_pars_451_600 = pickle.load(handle)
    
with open(os.path.join(DATA_DUMP, 'data_pars_601_750.pickle'), 'rb') as handle:
    data_pars_601_750 = pickle.load(handle)
    
with open(os.path.join(DATA_DUMP, 'data_pars_751_900.pickle'), 'rb') as handle:
    data_pars_751_900 = pickle.load(handle)
    
with open(os.path.join(DATA_DUMP, 'data_pars_901_1050.pickle'), 'rb') as handle:
    data_pars_901_1050 = pickle.load(handle)
    
with open(os.path.join(DATA_DUMP, 'data_pars_1051_1200.pickle'), 'rb') as handle:
    data_pars_1051_1200 = pickle.load(handle)
    
with open(os.path.join(DATA_DUMP, 'data_pars_1201_1350.pickle'), 'rb') as handle:
    data_pars_1201_1350 = pickle.load(handle)

with open(os.path.join(DATA_DUMP, 'data_pars_1351_1500.pickle'), 'rb') as handle:
    data_pars_1351_1500 = pickle.load(handle)
    
data_pars = {**data_pars_1_150, **data_pars_151_300, **data_pars_301_450, **data_pars_451_600,
             **data_pars_601_750, **data_pars_751_900, **data_pars_901_1050, **data_pars_1051_1200,
             **data_pars_1201_1350, **data_pars_1351_1500}

CPU times: user 7.89 s, sys: 1.85 s, total: 9.74 s
Wall time: 11.6 s


Shift the time stamp of ventilator recordings recorded with incorrect time stamps

Az eltérés valóban az AT000110-estől kezdődik és az AT000216-ig tart. Sajnos volt közben egy téli/nyári váltás is. 
Március 28 után a plusz 9 órával kell korrigálni, előtte 10 órával. 

In [6]:
for case in data_pars:
    if 110 <= int(case[2:].lstrip('0')) < 195:
        data_pars[case].index = data_pars[case].index.shift(10, freq='H')
        
    elif 195 <= int(case[2:].lstrip('0')) <= 216:
        data_pars[case].index = data_pars[case].index.shift(9, freq='H')

In [7]:
len(data_pars)

1154

### 4. Import clinical data

In [8]:
# import text files in a dictionary
clin_dict = {}
for fname in os.listdir(DIR_READ):
    if not fname.startswith('.'): # disregard hidden files
        fhandle = open(os.path.join(DIR_READ, fname), 'r', encoding = 'cp1252', errors='replace')
        clin_dict[fname[:-4]] = fhandle.read() # use the filenames without the .txt extension as keys
        fhandle.close()

In [9]:
# split the clinical data into a list
for key in sorted(clin_dict.keys()):
    clin_dict[key] = clin_dict[key].split('\n')[:-1]

In [10]:
# Create an inner dictionary for the different clinical data
for key, value in sorted(clin_dict.items()):
    temp_dict = {}
    for item in value:
        td_key, *td_value = item.split(':')
        td_key = td_key.strip()
        temp_dict[td_key] = ''.join(td_value)[1:]
    clin_dict[key] = temp_dict

In [11]:
# Create a DataFrame from the dictionary of dictionaries
clin_df = DataFrame(clin_dict).T
clin_df.index.name = 'Recording_ID'
clin_df.sort_index(inplace = True)
# Drop column containing confidential data (name)
clin_df = clin_df.drop('Name', axis=1)

In [12]:
len(clin_df)

1199

### 5. Drop cases which have no clinical data

In [13]:
clin_df.dropna(axis = 0, how = 'all', inplace = True)

In [14]:
len(clin_df)

1150

### 6. Drop cases for which there is no ventilation data

Ventilation recordings may have been excluded because they were too short (<15 mintes total) or aberrant

In [15]:
combined = sorted(set(list(clin_df.index)) & set(data_pars.keys()))

In [16]:
clin_df = clin_df.loc[combined]
len(clin_df)

1079

### 7. Clean up clinical dataframe

In [17]:
clin_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1079 entries, AT000005 to AT001397
Data columns (total 40 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Esetlap id     1079 non-null   object
 1   Date of Birth  1079 non-null   object
 2   Gestation Age  1079 non-null   object
 3   Birth Weight   1079 non-null   object
 4   Actual Weight  1079 non-null   object
 5   Start          1079 non-null   object
 6   End            1079 non-null   object
 7   Pathology      1079 non-null   object
 8   Astrup 1       987 non-null    object
 9   Time           1076 non-null   object
 10  pH             1075 non-null   object
 11  pCO2           1075 non-null   object
 12  pO2            1075 non-null   object
 13  HCO3           1075 non-null   object
 14  ABE            987 non-null    object
 15  Saturatio      987 non-null    object
 16  FiO2           987 non-null    object
 17  Type           987 non-null    object
 18  Astrup 2       987 non

In [18]:
# Change order of columns and create English names

clin_df = clin_df[['Esetlap id', 'Date of Birth', 'Gestation Age', 'Birth Weight', 
                   'Actual Weight', 'Pathology', 'Start', 'End']]
clin_df.columns = ['Case ID', 'Date of Birth', 'Gestational Age',
                   'Birth Weight', 'Actual Weight', 'Pathology', 'Start', 'End']

In [19]:
clin_df.loc[:, 'Gestational Age'] = clin_df['Gestational Age'].map(lambda x: int(x[:2]))
clin_df.loc[:, 'Birth Weight'] = clin_df['Birth Weight'].map(lambda x: int(x[:-6]))
clin_df.loc[:, 'Actual Weight'] = clin_df['Actual Weight'].str.strip(' grams')

#### Start and end of ventilation data
This shows the time points when ventilator was turned on and off. At the beginning and the end of the recoridngs the baby was usually not attached to the ventilator. The ventilator recordings have been manually inspected and have been trimmed accordingly.

In [20]:
starts = {}; ends = {}
for rec in sorted(clin_df.index):
    try:
        starts[rec] = data_pars[rec].index[0]
    except KeyError:
        continue
        
    try:
        ends[rec] = data_pars[rec].index[-1]
    except KeyError:
        continue
        
start_end = DataFrame([starts, ends]).T
start_end.columns = ['Recording start', 'Recording end']

In [21]:
clin_df = pd.concat([clin_df, start_end], axis = 1, join = 'outer')

In [22]:
clin_df['Date of Birth'] = clin_df['Date of Birth'].map(lambda x: pd.to_datetime(x))
clin_df['Pathology'] = clin_df['Pathology'].map(lambda x: x.split(';')[:-1])

In [23]:
clin_df['Duration'] = clin_df['Recording end'] - clin_df['Recording start']

In [24]:
clin_df['Postnatal Age']   = clin_df['Recording end'] - clin_df['Date of Birth']

In [25]:
clin_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1079 entries, AT000005 to AT001397
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype          
---  ------           --------------  -----          
 0   Case ID          1079 non-null   object         
 1   Date of Birth    1079 non-null   datetime64[ns] 
 2   Gestational Age  1079 non-null   object         
 3   Birth Weight     1079 non-null   object         
 4   Actual Weight    1079 non-null   object         
 5   Pathology        1079 non-null   object         
 6   Start            1079 non-null   object         
 7   End              1079 non-null   object         
 8   Recording start  1079 non-null   datetime64[ns] 
 9   Recording end    1079 non-null   datetime64[ns] 
 10  Duration         1079 non-null   timedelta64[ns]
 11  Postnatal Age    1079 non-null   timedelta64[ns]
dtypes: datetime64[ns](3), object(7), timedelta64[ns](2)
memory usage: 109.6+ KB


In [26]:
clin_df['Gestational Age'] = pd.to_timedelta((clin_df['Gestational Age']), unit='W', errors='raise')

In [27]:
clin_df['Corrected gestational Age'] = pd.to_timedelta((clin_df['Gestational Age']), unit='D', 
                                                       errors='raise') + clin_df['Postnatal Age']

In [29]:
clin_df['Gestational Age (weeks)'] = clin_df['Gestational Age'].apply(lambda x: x.total_seconds() / (60 * 60 * 24 * 7))

clin_df['Corrected gestational Age (weeks)'] = \
    clin_df['Corrected gestational Age'].apply(lambda x: round(x.total_seconds() / (60 * 60 * 24 * 7), 1))

In [30]:
actual_weight = []
for i in range(len(clin_df)):
    if clin_df.iloc[i]['Actual Weight'] != '':
        actual_weight.append(int(clin_df.iloc[i]['Actual Weight']))
    #If actual weight is not available and postnatal age is <7 days, use birth weight, otherwise put np.nan
    elif clin_df.iloc[i]['Postnatal Age'] <= pd.to_timedelta('7D'):
        actual_weight.append(clin_df.iloc[i]['Birth Weight'])
    else:
        actual_weight.append(np.nan)

clin_df['Weight'] = actual_weight

In [31]:
clin_df.sort_index(axis = 1).head(2)

Unnamed: 0,Actual Weight,Birth Weight,Case ID,Corrected gestational Age,Corrected gestational Age (weeks),Date of Birth,Duration,End,Gestational Age,Gestational Age (weeks),Pathology,Postnatal Age,Recording end,Recording start,Start,Weight
AT000005,,1150,55871,203 days 03:53:22,29.0,2020-10-21 08:31:00,0 days 02:12:55,"Wed, 21 Oct 2020 122422 +0200",203 days,29.0,"[Neonat praemat gr s 29 (P07.3) , RDS (P22) ...",0 days 03:53:22,2020-10-21 12:24:22,2020-10-21 10:11:27,"Wed, 21 Oct 2020 101127 +0200",1150.0
AT000006,940.0,820,55875,212 days 23:53:36,30.4,2020-09-27 15:45:00,0 days 00:41:31,"Wed, 21 Oct 2020 153836 +0200",189 days,27.0,"[Neonat immat gr s 27 (P07.2) , RDS (P22) ,...",23 days 23:53:36,2020-10-21 15:38:36,2020-10-21 14:57:05,"Wed, 21 Oct 2020 145705 +0200",940.0


### 8. Exploratory analysis on clinical details

In [32]:
clin_df.describe()

Unnamed: 0,Date of Birth,Gestational Age,Recording start,Recording end,Duration,Postnatal Age,Corrected gestational Age,Gestational Age (weeks),Corrected gestational Age (weeks),Weight
count,1079,1079,1079,1079,1079,1079,1079,1079.0,1079.0,1048.0
mean,2022-05-13 06:45:52.159407104,248 days 06:47:02.613531048,2022-05-23 16:52:03.202965760,2022-05-23 18:05:40.691380736,0 days 01:13:37.488415199,10 days 11:19:48.531974050,258 days 18:06:51.145505096,35.468953,36.954124,2740.655534
min,2020-09-27 15:45:00,154 days 00:00:00,2020-10-21 10:11:27,2020-10-21 12:24:22,0 days 00:15:52,0 days 00:04:33,154 days 02:58:33,22.0,22.0,280.0
25%,2021-07-12 02:23:30,231 days 00:00:00,2021-07-29 06:11:53,2021-07-29 06:54:30,0 days 00:44:24.500000,0 days 03:11:39.500000,238 days 04:56:13,33.0,34.0,2097.5
50%,2022-03-11 00:00:00,259 days 00:00:00,2022-03-31 08:06:29,2022-03-31 09:38:13,0 days 01:06:31,0 days 06:12:49,262 days 23:09:40,37.0,37.6,2850.0
75%,2023-02-27 15:37:30,273 days 00:00:00,2023-03-08 20:56:46.500000,2023-03-08 21:39:21,0 days 01:33:47,3 days 06:21:34,280 days 00:33:24.500000,39.0,40.0,3450.0
max,2024-06-29 16:00:00,392 days 00:00:00,2024-06-29 22:14:21,2024-06-29 23:34:39,0 days 06:31:10,365 days 03:01:02,589 days 03:01:02,56.0,84.2,5700.0
std,,32 days 07:28:32.579460072,,,0 days 00:40:04.118948388,27 days 23:18:24.548710621,38 days 08:21:34.525736990,4.615927,5.48061,994.876894


#### For some recordings the age at the time of transfer is "negative"  - these need to be corrected

In [33]:
clin_df[clin_df['Postnatal Age'] < pd.to_timedelta(0)].sort_values('Postnatal Age')

Unnamed: 0,Case ID,Date of Birth,Gestational Age,Birth Weight,Actual Weight,Pathology,Start,End,Recording start,Recording end,Duration,Postnatal Age,Corrected gestational Age,Gestational Age (weeks),Corrected gestational Age (weeks),Weight


#### For some recordings the duration of the recording is "negative"  - these need to be corrected

In [34]:
clin_df[clin_df['Duration'] < pd.to_timedelta(0)]

Unnamed: 0,Case ID,Date of Birth,Gestational Age,Birth Weight,Actual Weight,Pathology,Start,End,Recording start,Recording end,Duration,Postnatal Age,Corrected gestational Age,Gestational Age (weeks),Corrected gestational Age (weeks),Weight


#### Duration longer than 6 hours - this would be unusual for neonatal transport

In [35]:
clin_df[clin_df['Duration'] > pd.to_timedelta('6H')]

Unnamed: 0,Case ID,Date of Birth,Gestational Age,Birth Weight,Actual Weight,Pathology,Start,End,Recording start,Recording end,Duration,Postnatal Age,Corrected gestational Age,Gestational Age (weeks),Corrected gestational Age (weeks),Weight
AT000842,61060,2022-08-28 01:55:00,238 days,2350,,"[Neonat praemat gr s 34 (P07.3) , RDS (P22) ...","Sun, 28 Aug 2022 084958 +0200","Sun, 28 Aug 2022 152108 +0200",2022-08-28 08:49:58,2022-08-28 15:21:08,0 days 06:31:10,0 days 13:26:08,238 days 13:26:08,34.0,34.1,2350.0
AT001272,65673,2024-04-10 00:00:00,287 days,3620,3500.0,[],"Sat, 13 Apr 2024 210219 +0200","Sat, 13 Apr 2024 221249 +0200",2024-04-13 21:02:19,2024-04-14 03:20:38,0 days 06:18:19,4 days 03:20:38,291 days 03:20:38,41.0,41.6,3500.0


#### Babies was at less than 23 weeks gestation

In [36]:
clin_df[clin_df['Gestational Age (weeks)'] < 23]

Unnamed: 0,Case ID,Date of Birth,Gestational Age,Birth Weight,Actual Weight,Pathology,Start,End,Recording start,Recording end,Duration,Postnatal Age,Corrected gestational Age,Gestational Age (weeks),Corrected gestational Age (weeks),Weight
AT000017,56003,2020-11-10 16:00:00,154 days,520,,"[Neonat immat gr s 22 (P07.2) , Hypothermia ...","Tue, 10 Nov 2020 175907 +0100","Tue, 10 Nov 2020 193447 +0100",2020-11-10 17:59:07,2020-11-10 19:34:47,0 days 01:35:40,0 days 03:34:47,154 days 03:34:47,22.0,22.0,520.0
AT001235,63907,2023-08-30 12:41:00,154 days,280,,"[Neonat immat gr s 22 (P07.2) , RDS (P22) ,...","Wed, 30 Aug 2023 141835 +0200","Wed, 30 Aug 2023 153933 +0200",2023-08-30 14:18:35,2023-08-30 15:39:33,0 days 01:20:58,0 days 02:58:33,154 days 02:58:33,22.0,22.0,280.0


#### Babies born with less than 500 g birth weight

In [37]:
len(clin_df[clin_df['Birth Weight'] < 1000])

110

In [38]:
clin_df[clin_df['Birth Weight'] < 500]

Unnamed: 0,Case ID,Date of Birth,Gestational Age,Birth Weight,Actual Weight,Pathology,Start,End,Recording start,Recording end,Duration,Postnatal Age,Corrected gestational Age,Gestational Age (weeks),Corrected gestational Age (weeks),Weight
AT000129,56646,2021-01-23 00:00:00,203 days,460,620.0,"[Neonat praemat gr s 29 (P07.3) , Dysmaturit...","Thu, 11 Feb 2021 114454 +0100","Thu, 11 Feb 2021 123425 +0100",2021-02-11 12:44:54,2021-02-11 13:34:25,0 days 00:49:31,19 days 13:34:25,222 days 13:34:25,29.0,31.8,620.0
AT000357,58015,2021-08-03 00:00:00,280 days,320,,"[Neonat mat gr s 40 (P96.4) , Asphyxia perin...","Wed, 04 Aug 2021 001908 +0200","Wed, 04 Aug 2021 010710 +0200",2021-08-04 00:19:08,2021-08-04 01:07:10,0 days 00:48:02,1 days 01:07:10,281 days 01:07:10,40.0,40.1,320.0
AT000686,59909,2022-04-16 05:07:00,168 days,450,,"[Neonat immat gr s 24 (P07.2) , RDS (P22) ,...","Sat, 16 Apr 2022 062815 +0200","Sat, 16 Apr 2022 090723 +0200",2022-04-16 06:28:15,2022-04-16 09:07:23,0 days 02:39:08,0 days 04:00:23,168 days 04:00:23,24.0,24.0,450.0
AT000916,62119,2022-12-17 12:19:00,182 days,490,490.0,"[Neonat immat gr s 26 (P07.2) , RDS (P22) ,...","Tue, 27 Dec 2022 155336 +0100","Tue, 27 Dec 2022 162456 +0100",2022-12-27 15:53:36,2022-12-27 16:24:56,0 days 00:31:20,10 days 04:05:56,192 days 04:05:56,26.0,27.5,490.0
AT001123,63199,2023-03-24 00:00:00,168 days,410,1340.0,"[Neonat immat gr s 24 (P07.2) , BPD (P27.1) ...","Wed, 31 May 2023 105159 +0200","Wed, 31 May 2023 113043 +0200",2023-05-31 10:51:59,2023-05-31 11:30:43,0 days 00:38:44,68 days 11:30:43,236 days 11:30:43,24.0,33.8,1340.0
AT001126,63217,2023-03-24 00:00:00,168 days,410,1380.0,"[Sepsis (P36.9) , Légzészavar (P22.8) , Ne...","Fri, 02 Jun 2023 104943 +0200","Fri, 02 Jun 2023 113505 +0200",2023-06-02 10:49:43,2023-06-02 11:35:05,0 days 00:45:22,70 days 11:35:05,238 days 11:35:05,24.0,34.1,1380.0
AT001235,63907,2023-08-30 12:41:00,154 days,280,,"[Neonat immat gr s 22 (P07.2) , RDS (P22) ,...","Wed, 30 Aug 2023 141835 +0200","Wed, 30 Aug 2023 153933 +0200",2023-08-30 14:18:35,2023-08-30 15:39:33,0 days 01:20:58,0 days 02:58:33,154 days 02:58:33,22.0,22.0,280.0



#### Babies transferred with the postnatal age of > 46 weeks we need to discuss whether to include them in the data analysis

In [39]:
a = clin_df[clin_df['Corrected gestational Age (weeks)'] > 46]
a.sort_values('Corrected gestational Age (weeks)', ascending=False);

In [40]:
len(clin_df[clin_df['Corrected gestational Age (weeks)'] > 46])

42

### 9. Identify ICD codes

In [41]:
icd = re.compile(r'\(\S\d+\.?\d*\)')

In [42]:
def icd_finder(lst):
    icd_list = []
    for item in lst:
        if icd.findall(item):
            icd_list.append(icd.findall(item)[0])
    
    return icd_list

In [43]:
def icd_cleanup(lst):
    icd_list = []
    for item in lst:
        if '.' in item:
            new_item = item[0 : item.index('.')] + item[item.index('.') + 1 : ]
        else:
            new_item = item
        icd_list.append(new_item[1:-1])
    return icd_list

In [44]:
clin_df['ICD'] = clin_df['Pathology'].apply(icd_finder)

In [45]:
clin_df['ICD'] = clin_df['ICD'].apply(icd_cleanup)

In [46]:
clin_df['ICD'];

### 10. Identify all icd codes

In [47]:
icd_all = []

for _, item in clin_df.iterrows():
    icd_all.extend(item['ICD'])
    
icd_all = sorted(set(icd_all))

In [48]:
len(icd_all)

193

In [49]:
icd_all_frme = DataFrame(icd_all)
icd_all_frme.columns = ['code']
icd_all_frme.sort_values(by = 'code', inplace = True)

In [50]:
icd_all_frme.head()

Unnamed: 0,code
0,A419
1,A500
2,B3420
3,C383
4,D4890


In [51]:
len(icd_all_frme)

193

In [52]:
writer = pd.ExcelWriter(os.path.join(DIR_WRITE, 'icd_codes.xlsx'))
icd_all_frme.to_excel(writer, 'icd_codes')
writer.close()

### 11. Import the file that has been manually curated from the exported `icd_codes.xlsx` files to contain all relevant diagnosis previously identified

In [53]:
icd_codes = pd.read_excel(os.path.join(PATH, 'ventilation_fabian_new', 'icd_codes_curated_new.xlsx'), 
    usecols = [0,1], index_col = 0)

In [54]:
icd_codes.head()

Unnamed: 0_level_0,name
code,Unnamed: 1_level_1
A419,"Sepsis, unspecified organism"
B3420,"Coronavirus infection, unspecified"
A500,Congenital syphilis
C383,"Malignant neoplasm of mediastinum, part unspec..."
D180,Hemangioma unspecified site


In [55]:
icd_all_frme.head()

Unnamed: 0,code
0,A419
1,A500
2,B3420
3,C383
4,D4890


Identify new codes not in the dataset so far

In [56]:
new_diagnoses = sorted(set(icd_all_frme['code']) - set(icd_codes.index))
new_diagnoses

[]

At this point the `icd_codes_curated_new.xlsx` file needs to be manually curated with these entries.

### 12. Import the now curated `icd_codes.xlsx` files to contain now all relevant diagnosis including new ones

In [57]:
icd_codes = pd.read_excel(os.path.join(PATH, 'ventilation_fabian_new', 'icd_codes_curated_new.xlsx'),
    usecols = [0,1], index_col = 0)

In [58]:
# Now there are no new codes
sorted(set(icd_all_frme['code']) - set(icd_codes.index))

[]

### 13. Create Pathology column with English names

In [59]:
icd_dictionary = dict(zip(icd_codes.index, icd_codes['name']))

In [60]:
def icd_replace(lst):
    icd_list = []
    for item in lst:
        new_item = icd_dictionary[item]
        icd_list.append(new_item)
    return icd_list

In [61]:
clin_df['Pathology_English'] = clin_df['ICD'].apply(icd_replace)

In [62]:
clin_df.head()

Unnamed: 0,Case ID,Date of Birth,Gestational Age,Birth Weight,Actual Weight,Pathology,Start,End,Recording start,Recording end,Duration,Postnatal Age,Corrected gestational Age,Gestational Age (weeks),Corrected gestational Age (weeks),Weight,ICD,Pathology_English
AT000005,55871,2020-10-21 08:31:00,203 days,1150,,"[Neonat praemat gr s 29 (P07.3) , RDS (P22) ...","Wed, 21 Oct 2020 101127 +0200","Wed, 21 Oct 2020 122422 +0200",2020-10-21 10:11:27,2020-10-21 12:24:22,0 days 02:12:55,0 days 03:53:22,203 days 03:53:22,29.0,29.0,1150.0,"[P073, P22, _0000, I959]","[Preterm newborn, unspecified weeks of gestati..."
AT000006,55875,2020-09-27 15:45:00,189 days,820,940.0,"[Neonat immat gr s 27 (P07.2) , RDS (P22) ,...","Wed, 21 Oct 2020 145705 +0200","Wed, 21 Oct 2020 153836 +0200",2020-10-21 14:57:05,2020-10-21 15:38:36,0 days 00:41:31,23 days 23:53:36,212 days 23:53:36,27.0,30.4,940.0,"[P072, P22, _0000, A500]","[Extreme immaturity of newborn, unspecified we..."
AT000007,55881,2020-10-21 07:44:00,252 days,3250,,"[Neonat praemat gr s 36 (P07.3) , Gemini A (...","Wed, 21 Oct 2020 185027 +0200","Wed, 21 Oct 2020 201025 +0200",2020-10-21 18:50:27,2020-10-21 20:10:25,0 days 01:19:58,0 days 12:26:25,252 days 12:26:25,36.0,36.1,3250.0,"[P073, P015, _0000]","[Preterm newborn, unspecified weeks of gestati..."
AT000008,55883,2020-10-17 16:29:00,245 days,2280,,"[Resuscitatio (Z51.8) st.p. , Hypothermia (P8...","Thu, 22 Oct 2020 104332 +0200","Thu, 22 Oct 2020 121641 +0200",2020-10-22 10:43:32,2020-10-22 12:16:41,0 days 01:33:09,4 days 19:47:41,249 days 19:47:41,35.0,35.7,2280.0,"[Z518, P809, P073, _0000, P285, R571]","[Resuscitation, Hypothermia of newborn, unspec..."
AT000009,55891,2020-10-22 11:25:00,196 days,800,,"[Neonat praemat gr s 28 (P07.3) , RDS (P22) ...","Thu, 22 Oct 2020 142312 +0200","Thu, 22 Oct 2020 161231 +0200",2020-10-22 14:23:12,2020-10-22 16:12:31,0 days 01:49:19,0 days 04:47:31,196 days 04:47:31,28.0,28.0,800.0,"[P073, P22, P809, _0000]","[Preterm newborn, unspecified weeks of gestati..."


### 14. Final cleanup of the DataFrame

In [63]:
clin_df.columns

Index(['Case ID', 'Date of Birth', 'Gestational Age', 'Birth Weight',
       'Actual Weight', 'Pathology', 'Start', 'End', 'Recording start',
       'Recording end', 'Duration', 'Postnatal Age',
       'Corrected gestational Age', 'Gestational Age (weeks)',
       'Corrected gestational Age (weeks)', 'Weight', 'ICD',
       'Pathology_English'],
      dtype='object')

In [64]:
column_list = ['Case ID', 'Date of Birth', 'Gestational Age (weeks)', 'Birth Weight',
              'Postnatal Age', 'Corrected gestational Age (weeks)',  'Weight',
              'ICD', 'Pathology_English', 'Recording start', 'Recording end', 'Duration',] 
      
clin_df = clin_df[column_list]

In [65]:
clin_df.tail()

Unnamed: 0,Case ID,Date of Birth,Gestational Age (weeks),Birth Weight,Postnatal Age,Corrected gestational Age (weeks),Weight,ICD,Pathology_English,Recording start,Recording end,Duration
AT001389,66300,2024-06-14 00:00:00,28.0,990,13 days 14:36:39,29.9,1000.0,[],[],2024-06-27 13:45:59,2024-06-27 14:36:39,0 days 00:50:40
AT001391,66308,2024-06-28 08:32:00,24.0,630,0 days 02:27:22,24.0,630.0,[],[],2024-06-28 09:20:00,2024-06-28 10:59:22,0 days 01:39:22
AT001395,66321,2024-06-28 21:50:00,26.0,900,0 days 02:05:01,26.0,900.0,[],[],2024-06-28 22:13:13,2024-06-28 23:55:01,0 days 01:41:48
AT001396,66322,2024-06-28 13:35:00,36.0,3310,0 days 13:29:03,36.1,3310.0,[],[],2024-06-29 01:52:49,2024-06-29 03:04:03,0 days 01:11:14
AT001397,66326,2024-06-29 16:00:00,38.0,3300,0 days 07:34:39,38.0,3300.0,[],[],2024-06-29 22:14:21,2024-06-29 23:34:39,0 days 01:20:18


### 15. Statistics on clinical data

In [66]:
clinical_stats = round(clin_df.describe(percentiles = [0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]), 1)
clinical_stats

Unnamed: 0,Date of Birth,Gestational Age (weeks),Postnatal Age,Corrected gestational Age (weeks),Weight,Recording start,Recording end,Duration
count,1079,1079.0,1079,1079.0,1048.0,1079,1079,1079
mean,2022-05-13 06:45:52.159407104,35.5,10 days 11:19:48.531974050,37.0,2740.7,2022-05-23 16:52:03.202965760,2022-05-23 18:05:40.691380736,0 days 01:13:37.488415199
min,2020-09-27 15:45:00,22.0,0 days 00:04:33,22.0,280.0,2020-10-21 10:11:27,2020-10-21 12:24:22,0 days 00:15:52
1%,2020-11-07 10:03:40.800000,24.0,0 days 01:09:49.600000,24.8,584.1,2020-11-10 10:59:24.260000,2020-11-10 12:25:10.700000,0 days 00:18:51.560000
5%,2020-12-20 09:26:12,25.0,0 days 01:42:09.800000,27.7,990.0,2020-12-23 14:03:50.800000,2020-12-23 15:25:06.100000,0 days 00:27:55.600000
25%,2021-07-12 02:23:30,33.0,0 days 03:11:39.500000,34.0,2097.5,2021-07-29 06:11:53,2021-07-29 06:54:30,0 days 00:44:24.500000
50%,2022-03-11 00:00:00,37.0,0 days 06:12:49,37.6,2850.0,2022-03-31 08:06:29,2022-03-31 09:38:13,0 days 01:06:31
75%,2023-02-27 15:37:30,39.0,3 days 06:21:34,40.0,3450.0,2023-03-08 20:56:46.500000,2023-03-08 21:39:21,0 days 01:33:47
95%,2024-05-09 22:23:36,40.0,65 days 18:13:30.499999997,44.8,4216.5,2024-05-14 23:57:21.900000,2024-05-15 01:16:26.900000,0 days 02:25:17.899999999
99%,2024-06-21 15:55:04.800000,41.0,128 days 01:18:32.400000014,55.9,4917.1,2024-06-22 21:35:38.920000,2024-06-22 23:11:00.300000,0 days 03:24:38.920000


### 16. Export clinical information data in tables and individual text files

##### Create sub-directories for each case if it does not yet exist

In [67]:
# Images and raw data will be written on an external hard drive
os.makedirs(os.path.join(DATA_DUMP, 'fabian_cases_new'), exist_ok = True)

for case in sorted(clin_df.index): 
    os.makedirs(os.path.join(DATA_DUMP, 'fabian_cases_new', case), exist_ok = True)

##### Export clinical data about individual cases as text files

In [68]:
# Clinical info about all recordings which clinical data are available and are over 15 minutes long

for case in sorted(clin_df.index):
    
    fileout = open(os.path.join(DATA_DUMP, 'fabian_cases_new', case, f'{case}_clin_info.txt'), 'w')
    
    fileout.write('Case ID:             %-50s\n' % case)
    fileout.write('Start:               %-50s\n' % datetime.strftime(clin_df.loc[case]['Recording start'], 
                                                          '%d/%m/%Y %H:%M:%S', ))
    fileout.write('End:                 %-50s\n' % datetime.strftime(clin_df.loc[case]['Recording end'], 
                                                          '%d/%m/%Y %H:%M:%S', ))
    fileout.write('Duration:            %-50s\n' % clin_df.loc[case]['Duration'])
    fileout.write('Gestational age:     %-50s\n' % clin_df.loc[case]['Gestational Age (weeks)'])
    fileout.write('Postmenstrual age:   %-50s\n' % clin_df.loc[case]['Corrected gestational Age (weeks)'])
    fileout.write('Birth Weight:        %-50s\n' % clin_df.loc[case]['Birth Weight'])
    fileout.write('Weight:              %-50s\n' % clin_df.loc[case]['Weight'])
    fileout.write('ICD:                 %-50s\n' % ', '.join(clin_df.loc[case]['ICD']))
    fileout.write('Diagnoses:           %-50s\n' % ', '.join(clin_df.loc[case]['Pathology_English']))
    
    fileout.close()

#### Export clinical information as an Excel sheet

In [69]:
writer = pd.ExcelWriter(os.path.join(DIR_WRITE, 'clinical_data_all_new.xlsx'))
clin_df.to_excel(writer, 'clin_df')
writer.close()

### 17. Export statistics on clinical data

In [72]:
writer = pd.ExcelWriter(os.path.join(DIR_WRITE, 'clinical_stats_new_1_1397.xlsx'))
clinical_stats.to_excel(writer, 'stats')
writer.close()

### 18. Export processed data as pickle files

In [73]:
with open(os.path.join(DATA_DUMP, 'clin_df_new.pickle'), 'wb') as handle:
    pickle.dump(clin_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 19. Create patient lists for various disease groups

#### A. RDS

In [74]:
RDS_dg = {'P22', 'P220'}

In [75]:
RDS_cases = []
for case, dgs in clin_df['ICD'].items():
    if set(dgs).intersection(RDS_dg):
        RDS_cases.append(case)

In [76]:
clin_df_RDS = clin_df.loc[RDS_cases]
clin_df_RDS;

In [77]:
len(clin_df_RDS)

309

#### B. HIE

In [78]:
HIE_dg = ['P219', 'Z518', 'Z548',]  

In [79]:
HIE_cases = []
for case, dgs in clin_df['ICD'].items():
    if set(dgs).intersection(HIE_dg):
        HIE_cases.append(case)

In [80]:
clin_df_HIE = clin_df.loc[HIE_cases]
clin_df_HIE;

In [81]:
len(clin_df_HIE)

178

#### C. Meconium aspiration

In [82]:
MAS_dg = ['P240',]

In [83]:
MAS_cases = []
for case, dgs in clin_df['ICD'].items():
    if set(dgs).intersection(MAS_dg):
        MAS_cases.append(case)

In [84]:
clin_df_MAS = clin_df.loc[MAS_cases]
clin_df_MAS;

In [85]:
len(clin_df_MAS)

90

#### D. PPHN

In [86]:
PPHN_dg = ['P293', ]

In [87]:
PPHN_cases = []
for case, dgs in clin_df['ICD'].items():
    if set(dgs).intersection(PPHN_dg):
        PPHN_cases.append(case)

In [88]:
clin_df_PPHN = clin_df.loc[PPHN_cases]
clin_df_PPHN;

In [89]:
len(clin_df_PPHN)

37

#### E. Congenital diaphragmatic hernia

In [90]:
CDH_dg = ['Q790', 'Q791']

In [91]:
CDH_cases = []
for case, dgs in clin_df['ICD'].items():
    if set(dgs).intersection(CDH_dg):
        CDH_cases.append(case)

In [92]:
clin_df_CDH = clin_df.loc[CDH_cases]
clin_df_CDH;

In [93]:
len(clin_df_CDH)

13

#### F. Necrotizing enterocolitis (NEC)

In [94]:
NEC_dg = ['P77',]

In [95]:
NEC_cases = []
for case, dgs in clin_df['ICD'].items():
    if set(dgs).intersection(NEC_dg):
        NEC_cases.append(case)

In [96]:
clin_df_NEC = clin_df.loc[NEC_cases]
clin_df_NEC;

In [97]:
len(clin_df_NEC)

32

#### G. Surgical cases (except NEC, CDH and cardiac)

In [98]:
surgical_dg = ['K409', 'K562', 'K566', 'K631', 'K921', 'K9210',  'Q019',
               'Q059',  'Q300', 'Q319', 'Q321', 'Q330',  'Q391', 'Q392', 'Q410', 'Q423' , 'Q428', 
               'Q431', 'Q433', 'Q438', 'Q4380', 'Q445', 'Q512', 'Q549', 'Q556', 'Q620', 'Q621', 'Q639',
               'Q641', 'Q642', 'Q791', 'Q792', 'Q793', 'Q848', 'R1000', 'Z432', 'Z433', 'Z921', 'Z930', 
               'Z931', 'Z9320', 'Z933' ]

In [99]:
surgical_cases = []
for case, dgs in clin_df['ICD'].items():
    if set(dgs).intersection(surgical_dg):
        surgical_cases.append(case)

In [100]:
clin_df_surgical = clin_df.loc[surgical_cases]
clin_df_surgical;

In [101]:
len(clin_df_surgical)

67

### Cardiac cases (except PFO / ASD)

In [102]:
cardiac_dg = ['Q201', 'Q203', 'Q205', 'Q210', 'Q211', 'Q212', 'Q213', 'Q220', 'Q221', 'Q223', 'Q224', 'Q225', 
              'Q228', 'Q230', 'Q232', 'Q234', 'Q240', 'Q244', 'Q245', 'Q251', 'Q252', 'Q253', 'Q254', 'Q262',]

In [103]:
cardiac_cases = []
for case, dgs in clin_df['ICD'].items():
    if set(dgs).intersection(cardiac_dg):
        cardiac_cases.append(case)

In [104]:
clin_df_cardiac = clin_df.loc[cardiac_cases]
clin_df_cardiac;

In [105]:
len(cardiac_dg)

24

### Export clinical dataframes into a multisheet Excel file

In [106]:
writer = pd.ExcelWriter(os.path.join(DIR_WRITE, 'clinical_data_diseases_new.xlsx'))

clin_df.to_excel(writer, 'all')
clin_df_cardiac.to_excel(writer, 'cardiac')
clin_df_CDH.to_excel(writer, 'CDH')
clin_df_HIE.to_excel(writer, 'HIE')
clin_df_MAS.to_excel(writer, 'MAS')
clin_df_NEC.to_excel(writer, 'NEC')
clin_df_PPHN.to_excel(writer, 'PPHN')
clin_df_RDS.to_excel(writer, 'RDS')
clin_df_surgical.to_excel(writer, 'surgical')

writer.close()

### Export selected clinical data as pickle archive

In [107]:
with open(os.path.join(DATA_DUMP, 'clin_df_new_cardiac.pickle'), 'wb') as handle:
    pickle.dump(clin_df_cardiac, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(os.path.join(DATA_DUMP, 'clin_df_new_CDH.pickle'), 'wb') as handle:
    pickle.dump(clin_df_CDH, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(os.path.join(DATA_DUMP, 'clin_df_new_HIE.pickle'), 'wb') as handle:
    pickle.dump(clin_df_HIE, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(os.path.join(DATA_DUMP, 'clin_df_new_MAS.pickle'), 'wb') as handle:
    pickle.dump(clin_df_MAS, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(os.path.join(DATA_DUMP, 'clin_df_new_NEC.pickle'), 'wb') as handle:
    pickle.dump(clin_df_NEC, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(os.path.join(DATA_DUMP, 'clin_df_new_PPHN.pickle'), 'wb') as handle:
    pickle.dump(clin_df_PPHN, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(os.path.join(DATA_DUMP, 'clin_df_new_RDS.pickle'), 'wb') as handle:
    pickle.dump(clin_df_RDS, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(os.path.join(DATA_DUMP, 'clin_df_new_surgical.pickle'), 'wb') as handle:
    pickle.dump(clin_df_surgical, handle, protocol=pickle.HIGHEST_PROTOCOL)
       