In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import os, sys

# Data

In [43]:
kidney_df = pd.read_csv('kidney_allFrames.csv')
kidney_df.head()

Unnamed: 0,timestamp,stay_id,anchor_age,patientweight,los,gender,alb,aniongap,bun,crp,...,paralytics,sedation,tpnutrition,vasoactive,vasopressors,vent,hours-k-iv,hours-mg-noniv,hours-p-iv,hours-mg-iv
0,2144-02-22 21:00:00,31055329,48,74.4,2.774931,0,0.0,21.0,130.0,249.7,...,0,2,0,0,0,0,,,,
1,2144-02-23 03:00:00,31055329,48,74.4,2.774931,0,0.0,21.0,130.0,249.7,...,0,1,0,0,0,0,,,,
2,2144-02-23 09:00:00,31055329,48,74.4,2.774931,0,0.0,23.0,130.0,258.2,...,0,1,0,0,0,0,,,,
3,2144-02-23 15:00:00,31055329,48,74.4,2.774931,0,0.0,23.0,130.0,258.2,...,0,1,0,0,0,0,,,,
4,2144-02-23 21:00:00,31055329,48,74.4,2.774931,0,0.0,23.0,132.0,258.2,...,0,0,0,0,0,0,,,,


In [56]:
cohort = pd.read_csv("data/mimic-iv-1.0/icu/icustays.csv.gz", compression='gzip')
diagnoses_icd = pd.read_csv("data/mimic-iv-1.0/hosp/diagnoses_icd.csv.gz", compression='gzip')

TRANSPLANT_CODES = ['0091', '0092', '0093', '02Y', '02YA0Z0', '02YA0Z1', 
                    '02YA0Z2', '0794', '07Y', '07YM0Z0', '07YM0Z1', '07YM0Z2', 
                    '07YP0Z0', '07YP0Z1', '07YP0Z2', '0BYM0Z0', '0BYM0Z1', 
                    '0BY', '0BYC0Z0', '0BYC0Z1', '0BYC0Z2', '0BYD0Z0', '0BYD0Z1', 
                    '0BYD0Z2', '0BYF0Z0', '0BYF0Z1', '0BYF0Z2', '0BYG0Z0', '0BYG0Z1', 
                    '0BYG0Z2', '0BYH0Z0', '0BYH0Z1', '0BYH0Z2', '0BYJ0Z0', '0BYJ0Z1',
                    '0BYJ0Z2', '0BYK0Z0', '0BYK0Z1', '0BYK0Z2', '0BYL0Z0', '0BYL0Z1', 
                    '0BYL0Z2', '0BYM0Z2', '0DY', '0DY50Z0', '0DY50Z1', '0DY50Z2', 
                    '0DY60Z0', '0DY60Z1', '0DY60Z2', '0DY80Z0', '0DY80Z1', '0DY80Z2',
                    '0DYE0Z0', '0DYE0Z1', '0DYE0Z2', '0FY', '0FY00Z0', '0FY00Z1', 
                    '0FY00Z2', '0FYG0Z0', '0FYG0Z1', '0FYG0Z2', '0TY', '0TY00Z0',
                    '0TY00Z1', '0TY00Z2', '0TY10Z0', '0TY10Z1', '0TY10Z2', '0UY',
                    '0UY00Z0', '0UY00Z1', '0UY00Z2', '0UY10Z0', '0UY10Z1', '0UY10Z2', 
                    '0UY90Z0', '0UY90Z1', '0UY90Z2', '0WY', '0WY20Z0', '0WY20Z1', '0XY', 
                    '0XYJ0Z0', '0XYJ0Z1', '0XYK0Z0', '0XYK0Z1', '10Y', '10Y03ZE', 
                    '10Y03ZF', '10Y03ZG', '10Y03ZH', '10Y03ZJ', '10Y03ZK', '10Y03ZL', 
                    '10Y03ZM', '10Y03ZN', '10Y03ZP', '10Y03ZQ', '10Y03ZR', '10Y03ZS', 
                    '10Y03ZT', '10Y03ZV', '10Y03ZY', '10Y04ZE', '10Y04ZF', '10Y04ZG', 
                    '10Y04ZH', '10Y04ZJ', '10Y04ZK', '10Y04ZL', '10Y04ZM', '10Y04ZN', 
                    '10Y04ZP', '10Y04ZQ', '10Y04ZR', '10Y04ZS', '10Y04ZT', '10Y04ZV', 
                    '10Y04ZY', '10Y07ZE', '10Y07ZF', '10Y07ZG', '10Y07ZH', '10Y07ZJ', 
                    '10Y07ZK', '10Y07ZL', '10Y07ZM', '10Y07ZN', '10Y07ZP', '10Y07ZQ', 
                    '10Y07ZR', '10Y07ZS', '10Y07ZT', '10Y07ZV', '10Y07ZY', '1160', 
                    '1169', '3350', '3351', '3352', '336', '3751', '4100', '4101', 
                    '4102', '4103', '4104', '4106', '4107', '4108', '4109', '4191', 
                    '4194', '4697', '4974', '5051', '5059', '5280', '5282', '5283', 
                    '5284', '5285', '5286', '5553', '5561', '5569', '6353', '6592',
                    '8256', '8258', '8375', '8377', '8664', '5855']

hadm_ids_w_transplants = diagnoses_icd[diagnoses_icd.icd_code.isin(TRANSPLANT_CODES)].hadm_id.unique()
stay_ids_w_transplants = cohort[cohort.hadm_id.isin(hadm_ids_w_transplants)].stay_id.unique()
kidney_df['transplant'] = 0
transplant_indices = kidney_df[kidney_df.stay_id.isin(stay_ids_w_transplants)].index
kidney_df.loc[transplant_indices, 'transplant'] = 1

## Flu/Pneumonia

In [3]:
resp_df = pd.read_csv('flu_pneum_allFrames.csv')
procs = pd.read_csv("data/mimic-iv-1.0/icu/procedureevents.csv.gz", compression='gzip')
resp_df = resp_df.drop(columns=['vent'])
resp_df.head()

Unnamed: 0,timestamp,stay_id,anchor_age,patientweight,los,gender,alb,aniongap,bun,crp,...,pnutrition,ponutrition,packedrbc,paralytics,sedation,tpnutrition,vasoactive,vasopressors,hours-p-iv,hours-mg-iv
0,2143-07-29 10:00:00,39439439,48,69.0,2.051157,0,0.0,25.0,136.0,0.0,...,0,0,0,0,0,0,0,0,,
1,2143-07-29 16:00:00,39439439,48,69.0,2.051157,0,0.0,24.0,121.0,0.0,...,0,0,0,0,0,0,0,0,,
2,2143-07-29 22:00:00,39439439,48,69.0,2.051157,0,2.9,21.0,101.0,0.0,...,0,0,0,0,0,0,0,0,,
3,2143-07-30 04:00:00,39439439,48,69.0,2.051157,0,2.9,21.0,101.0,0.0,...,0,0,0,0,0,0,0,0,,
4,2143-07-30 10:00:00,39439439,48,69.0,2.051157,0,2.9,21.0,75.0,0.0,...,0,0,0,0,0,0,0,0,,


In [4]:
inputevents = pd.read_csv("data/mimic-iv-1.0/icu/inputevents.csv.gz", compression='gzip')

vent = procs[procs['itemid'] == 225792]
ventilated_stay_ids = vent.stay_id.unique()
resp_df['ventilated'] = 0

ventilated_indices = resp_df[resp_df.stay_id.isin(ventilated_stay_ids)].index
resp_df.loc[ventilated_indices, 'ventilated'] = 1

# RNN with LSTM layers

We will treat this as a text classification problem, where each vital is a word

### Classification

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

TRAIN_COLS = ['anchor_age', 'patientweight', 'los', 'gender',
       'alb', 'aniongap', 'bun', 'crp', 'ca', 'chloride', 'creatinine',
       'fibrinogen', 'glucose', 'hgb', 'k', 'mg', 'na', 'p', 'platelets',
       'troponin', 'wbc', 'apneainterval', 'artco2p', 'arto2p', 'expratio',
       'hr', 'inspratio', 'insptime', 'nibpd', 'nibpm', 'nibps', 'pip', 'rr',
       'spo2', 'temp', 'urine', 'vm', 'vt', 'betablockers', 'ca-iv',
       'ca-noniv', 'cablockers', 'dextrose', 'fluids', 'insulin', 'k-iv',
       'hours-k-iv', 'loopdiuretics', 'mg-iv', 'mg-noniv', 'hours-mg-noniv',
       'p-iv', 'p-noniv', 'pnutrition', 'ponutrition', 'packedrbc',
       'paralytics', 'sedation', 'tpnutrition', 'vasoactive', 'vasopressors',
       'hours-p-iv']

DISEASE = 'resp'

if DISEASE == 'kidney':
    pred_var = 'transplant'
    X = kidney_df[TRAIN_COLS]
    y = kidney_df[pred_var]
else:
    pred_var = 'ventilated'
    X = resp_df[TRAIN_COLS]
    y = resp_df[pred_var]
    
X = X.fillna(-1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
x_t, X_val, y_t, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=0)


In [6]:
def get_compiled_model():

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(x_t.shape[1], activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1)
    ])

    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                  optimizer=tf.keras.optimizers.Adam(1e-4),
                  metrics=['accuracy'])
    return model
model = get_compiled_model()

In [7]:
train_dataset = tf.data.Dataset.from_tensor_slices((x_t, y_t))
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))

BATCH_SIZE = 128
SHUFFLE_BUFFER_SIZE = 100

train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
val_dataset = val_dataset.batch(BATCH_SIZE)

model.fit(train_dataset, 
          epochs=20,
          validation_data=val_dataset,
          validation_steps=30)

Epoch 1/20


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x162cb63c8>

In [8]:
preds = model.predict(X_test)
preds = preds.flatten()

# Binarize preds
preds[preds <= 0] = 0
preds[preds > 0] = 1

correct_idx, = np.where(preds == y_test.values) 
incorrect_idx, = np.where(preds != y_test.values) 
correct = preds[correct_idx] 
incorrect = preds[incorrect_idx]
tp = correct[correct == 1] # True positives
fp = incorrect[incorrect == 1] # False positives

tn = correct[correct == 0] # True negatives
fn =  incorrect[incorrect == 0]# False negatives


print("Precision: ", len(tp)/(len(tp) + len(fp)))
print("Recall: ", len(tp)/(len(tp) + len(fn)))
print("Accuracy", len(correct) / (len(correct) + len(incorrect)))

Precision:  0.5798737325425675
Recall:  0.9739717223650386
Accuracy 0.7316757011548433


### Regression

In [9]:
from sklearn.ensemble import GradientBoostingRegressor

TRAIN_COLS = ['anchor_age', 'patientweight', 'gender',
       'alb', 'aniongap', 'bun', 'crp', 'ca', 'chloride', 'creatinine',
       'fibrinogen', 'glucose', 'hgb', 'k', 'mg', 'na', 'p', 'platelets',
       'troponin', 'wbc', 'apneainterval', 'artco2p', 'arto2p', 'expratio',
       'hr', 'inspratio', 'insptime', 'nibpd', 'nibpm', 'nibps', 'pip', 'rr',
       'spo2', 'temp', 'urine', 'vm', 'vt', 'betablockers', 'ca-iv',
       'ca-noniv', 'cablockers', 'dextrose', 'fluids', 'insulin', 'k-iv',
       'hours-k-iv', 'loopdiuretics', 'mg-iv', 'mg-noniv', 'hours-mg-noniv',
       'p-iv', 'p-noniv', 'pnutrition', 'ponutrition', 'packedrbc',
       'paralytics', 'sedation', 'tpnutrition', 'vasoactive', 'vasopressors',
       'hours-p-iv']

pred_var = 'los'

if DISEASE == 'kidney':
    X = kidney_df[TRAIN_COLS]
else:
    X = resp_df[TRAIN_COLS]
    
X = X.fillna(-1)
y = resp_df[pred_var]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
x_t, X_val, y_t, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=0)

In [10]:
def get_compiled_model():

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(len(x_t.columns), activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1)
    ])

    model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.1),
                  loss='mean_absolute_error')
    return model

model = get_compiled_model()

In [11]:
train_dataset = tf.data.Dataset.from_tensor_slices((x_t, y_t))
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))

BATCH_SIZE = 128
SHUFFLE_BUFFER_SIZE = 100

train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
val_dataset = val_dataset.batch(BATCH_SIZE)

model.fit(train_dataset, 
          epochs=20,
          validation_data=val_dataset,
          validation_steps=30)


Epoch 1/20


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1d78127b8>