# Install Required Dependencies

In [None]:
!pip install vitaldb

In [None]:
import os
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

import vitaldb

# Set Random Seed for Reproducibility

In [None]:
# Seed every random number generator used in this notebook so that runs are repeatable.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# Make torch raise an error if an operation without a deterministic implementation is used.
torch.use_deterministic_algorithms(True)
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so this assignment only affects
# Python processes launched after this point (e.g. subprocess workers), not the running kernel.
os.environ['PYTHONHASHSEED'] = str(seed)

# Set Up Local Data Caches

Since the VitalDB data is static, local copies are stored and reused to avoid expensive downloads and to speed up data processing.

The default directory defined below is already in the project `.gitignore` file. If later modified, it should also be added to the project `.gitignore`.

In [None]:
# Root directory of the local VitalDB cache.
VITALDB_CACHE = './vitaldb_cache'
# Sub-directory holding the full vital file for each case.
VITAL_ALL = 'vital_all'
# Sub-directory holding the reduced ("mini") vital files.
VITAL_MINI = 'vital_mini'
# Sub-directory holding the cached metadata csv files.
VITAL_METADATA = 'metadata'

In [None]:
!mkdir -p $VITALDB_CACHE
!mkdir -p $VITALDB_CACHE/$VITAL_ALL
!mkdir -p $VITALDB_CACHE/$VITAL_MINI
!mkdir -p $VITALDB_CACHE/$VITAL_METADATA
!ls -l $VITALDB_CACHE

# OSFS Bulk Data Download

**This step is not required, but will significantly speed up downstream processing and avoid a high volume of API requests to the VitalDB web site.**

The cache population code checks if OSFS bulk download data of VitalDB vital files is locally available.

- Manually download the OSF Storage archives from the following site: https://osf.io/dtc45/
    - `Vital Files 0001-2000`
    - `Vital Files 2001-4000`
    - `Vital Files 4001-6388`
- Once the `OSF Storage (United States)` link is clicked, a `Download as zip` link will appear.
- Once downloaded, extract each of the 3 zip archives.
- Move all files from each of the unzipped directories into the `${VITALDB_CACHE}/${VITAL_ALL}` directory.

In [None]:
def vitaldb_dataframe_loader(dataset_name):
    """Return a VitalDB metadata table as a pandas DataFrame, using the local cache.

    If ``<VITALDB_CACHE>/<VITAL_METADATA>/<dataset_name>.csv`` exists it is read
    from disk. Otherwise the table is downloaded from the VitalDB API, written
    to that path for future reuse, and returned.

    Args:
        dataset_name: One of 'cases', 'labs', or 'trks'.

    Returns:
        The DataFrame for the requested dataset.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported names.
    """
    if dataset_name not in ('cases', 'labs', 'trks'):
        raise ValueError(f'Invalid dataset name: {dataset_name}')

    cache_dir = f'{VITALDB_CACHE}/{VITAL_METADATA}'
    file_path = f'{cache_dir}/{dataset_name}.csv'
    if os.path.isfile(file_path):
        print(f'{dataset_name}.csv exists locally.')
        return pd.read_csv(file_path)

    print(f'downloading {dataset_name} and storing in the local cache for future reuse.')
    df = pd.read_csv(f'https://api.vitaldb.net/{dataset_name}')
    # Create the cache directory on demand so the save cannot fail after a
    # successful download just because the directory was never created.
    os.makedirs(cache_dir, exist_ok=True)
    df.to_csv(file_path, index=False)
    return df

# Cases

In [None]:
# Load the per-case metadata table, key it by case id, and show its dimensions.
cases = vitaldb_dataframe_loader('cases').set_index('caseid')
cases.shape

In [None]:
# Number of distinct case ids in the index.
cases.index.nunique()

In [None]:
# Preview the first rows of the cases table.
cases.head()

In [None]:
# Number of cases for each value of the 'sex' column.
cases['sex'].value_counts()

# Tracks

In [None]:
# Load the track metadata table, key it by case id, and show its dimensions.
# A case id appears once per track, so this index is not unique.
trks = vitaldb_dataframe_loader('trks').set_index('caseid')
trks.shape

In [None]:
# Number of distinct case ids that have at least one track row.
trks.index.nunique()

In [None]:
# Number of track rows per case, plotted against case id.
trks.groupby('caseid')[['tid']].count().plot();

In [None]:
# Histogram of the number of track rows per case.
trks.groupby('caseid')[['tid']].count().hist();

In [None]:
# Row counts per track name, most frequent track first.
trks.groupby('tname').count().sort_values(by='tid', ascending=False)

## Parameters of Interest

### Hemodynamic Parameters Reference
https://vitaldb.net/dataset/?query=overview#h.f7d712ycdpk2

**Solar8000/ART_MBP**

mean blood pressure

Parameter, Description, Type/Hz, Unit

Solar8000/ART_MBP, Mean arterial pressure, N, mmHg

In [None]:
# Count the track rows named exactly 'Solar8000/ART_MBP'.
# An exact comparison is used instead of a regex substring match so that a track whose
# name merely contains this text can never be counted.
trks[trks['tname'] == 'Solar8000/ART_MBP'].shape

**SNUADC/ART**

arterial blood pressure waveform

Parameter, Description, Type/Hz, Unit

SNUADC/ART, Arterial pressure wave, W/500, mmHg

In [None]:
# Count the track rows named exactly 'SNUADC/ART'.
# An exact comparison is used instead of a regex substring match so that a track whose
# name merely contains this text can never be counted.
trks[trks['tname'] == 'SNUADC/ART'].shape

**SNUADC/ECG_II**

electrocardiogram waveform

Parameter, Description, Type/Hz, Unit

SNUADC/ECG_II, ECG lead II wave, W/500, mV

In [None]:
# Count the track rows named exactly 'SNUADC/ECG_II'.
# An exact comparison is used instead of a regex substring match so that a track whose
# name merely contains this text can never be counted.
trks[trks['tname'] == 'SNUADC/ECG_II'].shape

**BIS/EEG1_WAV**

electroencephalogram waveform

Parameter, Description, Type/Hz, Unit

BIS/EEG1_WAV, EEG wave from channel 1, W/128, uV

In [None]:
# Count the track rows named exactly 'BIS/EEG1_WAV'.
# An exact comparison is used instead of a regex substring match so that a track whose
# name merely contains this text can never be counted.
trks[trks['tname'] == 'BIS/EEG1_WAV'].shape

# Cases of Interest

This is the subset of case ids for which modelling and analysis will be performed, based on the inclusion criteria and waveform data availability.

In [None]:
# Waveform tracks required for every case of interest:
# arterial pressure wave, ECG lead II wave, and EEG wave from channel 1.
TRACK_NAMES = ['SNUADC/ART', 'SNUADC/ECG_II', 'BIS/EEG1_WAV']

In [None]:
# As in the paper, select cases which meet the following criteria:
#
# For patients, the inclusion criteria were as follows:
# (1) adults (age >= 18)
# (2) administered general anaesthesia
# (3) undergone non-cardiac surgery.
#
# For waveform data, the inclusion criteria were as follows:
# (1) no missing monitoring for ABP, ECG, and EEG waveforms
# (2) no cases containing false events or non-events due to poor signal quality
#     (checked in second stage of data preprocessing)
#
# Each criterion is evaluated independently against the full set of cases, so every
# "excluded" count printed below is relative to all cases, not to the previous step.

# adult
inclusion_1 = cases.loc[cases['age'] >= 18].index
print(f'{len(cases)-len(inclusion_1)} cases excluded, {len(inclusion_1)} remaining due to age criteria')

# general anesthesia
inclusion_2 = cases.loc[cases['ane_type'] == 'General'].index
print(f'{len(cases)-len(inclusion_2)} cases excluded, {len(inclusion_2)} remaining due to anesthesia criteria')

# non-cardiac surgery
# na=False treats a missing operation name as "no match", which keeps both masks strictly
# boolean so that they can be negated with ~.
inclusion_3 = cases.loc[
    ~cases['opname'].str.contains("cardiac", case=False, na=False)
    & ~cases['opname'].str.contains("aneurysmal", case=False, na=False)
].index
print(f'{len(cases)-len(inclusion_3)} cases excluded, {len(inclusion_3)} remaining due to non-cardiac surgery criteria')

# ABP, ECG, EEG waveforms (TRACK_NAMES is defined in the preceding cell)
# Count the distinct required track names per case, rather than matching rows, so that a
# repeated track can neither stand in for a missing one nor disqualify a complete case.
required_tracks_per_case = (
    trks.loc[trks['tname'].isin(TRACK_NAMES)]
    .groupby(level=0)['tname']
    .nunique()
)
inclusion_4 = required_tracks_per_case[required_tracks_per_case == len(TRACK_NAMES)].index
print(f'{len(cases)-len(inclusion_4)} cases excluded, {len(inclusion_4)} remaining due to missing waveform data')

# A case is of interest only if it satisfies all four criteria.
cases_of_interest_idx = (
    inclusion_1
    .intersection(inclusion_2)
    .intersection(inclusion_3)
    .intersection(inclusion_4)
)

cases_of_interest = cases.loc[cases_of_interest_idx]

print()
print(f'{cases_of_interest_idx.shape[0]} out of {cases.shape[0]} total cases remaining after exclusions applied')

In [None]:
# Preview the first five cases of interest.
cases_of_interest.head(n=5)

# Tracks of Interest

This is the subset of tracks (waveforms) belonging to the cases of interest identified above.

In [None]:
# A single case maps to one or more waveform tracks. Select only the tracks required for analysis.
# The rows for the cases of interest are looked up once, then filtered by track name.
trks_of_interest = trks.loc[cases_of_interest_idx].loc[lambda df: df['tname'].isin(TRACK_NAMES)]
trks_of_interest.shape

In [None]:
# Preview the first five tracks of interest.
trks_of_interest.head(n=5)

In [None]:
# Index of the track ids (tid) of the selected tracks, and how many there are.
trks_of_interest_idx = pd.Index(trks_of_interest['tid'])
trks_of_interest_idx.shape

## Build Tracks Cache for Local Processing

Track data are large and therefore expensive to download every time they are used.
By default, a vital file stores all tracks for its case. Since only certain tracks per case are required, each vital file can be reduced to a "mini" version that stores only the tracks for the needed waveforms.

In [None]:
# Maximum number of cases of interest to process in the cache-building cells below
# (both the download of full vital files and the creation of mini vital files).
# Set to a small value for demo purposes, else set to None to disable the limit and process all.
MAX_CASES = None
#MAX_CASES = 10

In [None]:
# Ensure the full vital file dataset is available for cases of interest, downloading any
# file that is not already in the local cache.
# To cache every case instead, iterate over cases.index rather than cases_of_interest_idx.
# NOTE: This cell can be interrupted and re-run. Each file is written to a temporary path and
# renamed only once complete, so a partially written file is never mistaken for a cached one.
count_downloaded = 0
count_present = 0

for i, idx in enumerate(cases_of_interest_idx):
    # A MAX_CASES of None disables the limit.
    if MAX_CASES and i >= MAX_CASES:
        break

    full_path = f'{VITALDB_CACHE}/{VITAL_ALL}/{idx:04d}.vital'
    if os.path.isfile(full_path):
        count_present += 1
        continue

    print(f'Missing vital file: {full_path}')
    # Download and save the file.
    vf = vitaldb.VitalFile(idx)
    tmp_path = f'{full_path}.tmp'
    vf.to_vital(tmp_path)
    os.replace(tmp_path, full_path)
    count_downloaded += 1

print()
print(f'Count of cases of interest:           {cases_of_interest_idx.shape[0]}')
print(f'Count of vital files downloaded:      {count_downloaded}')
print(f'Count of vital files already present: {count_present}')

In [None]:
# Convert vital files to "mini" versions including only the subset of tracks based on TRACK_NAMES defined above.
# Only perform conversion for the cases of interest.
# NOTE: If this cell is interrupted, it can be restarted and will continue where it left off.
#       Each mini file is written to a temporary path and renamed only once complete, so an
#       interrupted write never leaves a truncated file that would be skipped on the next run.
count_minified = 0
count_present = 0

for i, idx in enumerate(cases_of_interest_idx):
    # A MAX_CASES of None disables the limit.
    if MAX_CASES and i >= MAX_CASES:
        break

    full_path = f'{VITALDB_CACHE}/{VITAL_ALL}/{idx:04d}.vital'
    mini_path = f'{VITALDB_CACHE}/{VITAL_MINI}/{idx:04d}_mini.vital'
    if os.path.isfile(mini_path):
        count_present += 1
        continue

    print(f'Creating mini vital file: {idx}')
    # Load only the required tracks from the full file, then save them as the mini file.
    vf = vitaldb.VitalFile(full_path, TRACK_NAMES)
    tmp_path = f'{mini_path}.tmp'
    vf.to_vital(tmp_path)
    os.replace(tmp_path, mini_path)
    count_minified += 1

print()
print(f'Count of cases of interest:           {cases_of_interest_idx.shape[0]}')
print(f'Count of vital files minified:        {count_minified}')
print(f'Count of vital files already present: {count_present}')

# Track Plotting Examples

These examples show multiple ways of accessing the same track data.

In [None]:
# Path of the mini vital file used by the examples below (case id 1).
# NOTE(review): this file exists only if case 1 was processed by the mini-file cell above — confirm.
tmp_vf_path = f'{VITALDB_CACHE}/{VITAL_MINI}/0001_mini.vital'

In [None]:
# Open the example vital file and show the names of the tracks it contains.
tmp_vf = vitaldb.VitalFile(tmp_vf_path)
tmp_vf.get_track_names()

**`vitaldb.VitalFile.get_track_samples()`**

In [None]:
# Read the first track in TRACK_NAMES ('SNUADC/ART') from the opened file.
# The second argument is presumably the sampling interval in seconds (1/100 -> 100 Hz) — confirm against the vitaldb docs.
tmp_art_00 = tmp_vf.get_track_samples(TRACK_NAMES[0], 1/100)
tmp_art_00.shape

In [None]:
# Plot the samples on a wide figure.
plt.figure(figsize=(20, 5))
plt.plot(tmp_art_00)
plt.show()

**`vitaldb.VitalFile.to_numpy()`**

In [None]:
# Read the same track ('SNUADC/ART') with to_numpy(), using the same second argument (1/100).
tmp_art_01 = tmp_vf.to_numpy(TRACK_NAMES[0], 1/100)
tmp_art_01.shape

In [None]:
# Plot the samples on a wide figure.
plt.figure(figsize=(20, 5))
plt.plot(tmp_art_01)
plt.show()

**`vitaldb.VitalFile.get_track_samples()`**

In [None]:
# Read the 'SNUADC/ART' track with get_track_samples().
# NOTE(review): this call is identical to the earlier get_track_samples() example; presumably a
# different accessor was meant to be demonstrated here — confirm.
tmp_art_02 = tmp_vf.get_track_samples(TRACK_NAMES[0], 1/100)
tmp_art_02.shape

In [None]:
# Plot the samples on a wide figure.
plt.figure(figsize=(20, 5))
plt.plot(tmp_art_02)
plt.show()

**`vitaldb.vital_recs()`**

In [None]:
# Read the same track straight from the file path with the module-level helper,
# without going through the VitalFile object opened above.
tmp_art_03 = vitaldb.vital_recs(tmp_vf_path, TRACK_NAMES[0], 1/100)
tmp_art_03.shape

In [None]:
# Plot the samples on a wide figure.
plt.figure(figsize=(20, 5))
plt.plot(tmp_art_03)
plt.show()

**`vitaldb.dataset.load_trk()`**

NOTE: This downloads a track based on raw id. Should not be needed, but showing how to do it.

In [None]:
# Switch for the optional example that downloads a single track by its raw track id.
SHOW_TRAK_DOWNLOAD = False

if not SHOW_TRAK_DOWNLOAD:
    print('Manual track download example skipped.')
else:
    # Track id of the arterial pressure track used in this example.
    case_0001_trk_art_id = '724cdd7184d7886b8f7de091c5b135bd01949959'
    tmp_art_04 = vitaldb.dataset.load_trk(case_0001_trk_art_id, 1/100)
    print(tmp_art_04.shape)

    plt.figure(figsize=(20, 5))
    plt.plot(tmp_art_04)
    plt.show()

# One Minute ABP Extraction