In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import sys
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import logit

from sepsis_utils import sepsis_utils as su
from sepsis_utils import roc_utils as ru

from sklearn.pipeline import Pipeline

# used for train/test splits
from sklearn.cross_validation import train_test_split

# used to impute mean for data
from sklearn.preprocessing import Imputer

# normalize the data
from sklearn import preprocessing

# logistic regression is our model of choice
from sklearn.linear_model import LogisticRegression

# used to create confusion matrix
from sklearn.metrics import confusion_matrix

from sklearn.cross_validation import cross_val_score

# used to calculate AUROC/accuracy
from sklearn import metrics

# for calibration curve of severity scores
from sklearn.calibration import calibration_curve

# Shared plot styling: colour triplets (presumably RGB in the 0-1 range),
# marker codes and line-style codes, indexed per plotted series.
col = [
    [0.9047, 0.1918, 0.1988],
    [0.2941, 0.5447, 0.7494],
    [0.3718, 0.7176, 0.3612],
    [1.0000, 0.5482, 0.1000],
    [0.4550, 0.4946, 0.4722],
    [0.6859, 0.4035, 0.2412],
    [0.9718, 0.5553, 0.7741],
    [0.5313, 0.3359, 0.6523],
]
# NOTE(review): 7 markers versus 8 colours / line styles - confirm intended.
marker = ['v', 'o', 'd', '^', 's', 'o', '+']
# NOTE(review): 's' is a marker code, not a matplotlib line style - confirm
# whether the sixth entry was meant to be '-'.
ls = ['-', '-', '-', '-', '-', 's', '--', '--']
%matplotlib inline

from __future__ import print_function

In [None]:
# load data
# NOTE(review): su.get_data() is defined in sepsis_utils (not shown here);
# presumably it queries the database and returns one row per ICU stay - confirm.
df = su.get_data()

Let's examine the columns in our dataframe:

In [None]:
# display the column names available in the dataframe
df.columns

We have: ICU intime/outtime, suspected infection time, whether the microbiology culture was positive, some demographics, comorbidities, outcomes, and the severity scores. 

The severity scores appear twice. With no suffix, the score is extracted from a [0, 24] hour window relative to ICU admission - except labs, which use an extended [-6, 24] hour window (e.g. 'sofa' is extracted in this way).

The second set of scores, with the suffix 'si' (suspected infection), is extracted from a [-48, 24] hour window around the suspected_infection_time (e.g. 'sofa_si' is extracted in this window).

# Time of suspected infection

Suspected infection is defined as:

* Antibiotics within 72 hours of a culture
* A culture within 24 hours of antibiotics

We can extract antibiotic usage from the PRESCRIPTIONS, INPUTEVENTS_MV and INPUTEVENTS_CV tables. We can extract the time of blood cultures from the MICROBIOLOGYEVENTS table. Details are given in defining-suspected-infection.ipynb.

In [None]:
# distribution of time of suspected infection, relative to ICU admission
# histogram bin edges: 145 evenly spaced points from -72 to +72 (1-hour bins)
xi = np.linspace(-72, 72, 72*2+1)

# only stays with a recorded suspected infection time contribute
idxKeep = ~df.suspected_infection_time.isnull()
# offset of suspicion from ICU admission, converted from a timedelta to hours
tmp = (df.loc[idxKeep,'suspected_infection_time'] - df.loc[idxKeep,'intime']).values / np.timedelta64(1, 'h')

# np.sum counts the matches in one vectorised pass; the builtin sum() would
# iterate over the array element by element in Python
N_firstday = np.sum( (tmp>-24) & (tmp<24) )
plt.figure(figsize=[6,6])
plt.hist( tmp, bins=xi )
plt.title('{} patients suspected between [-24,24] ({:2.2f}%).'.format(
        N_firstday, N_firstday*100.0 / tmp.shape[0]))
plt.show()

We can see that most patients are suspected of infection either before or at the time of their ICU admission. This motivates the decision to evaluate the performance of the scores at ICU admission.

# Cohort

The below code creates our cohort of interest. This cohort is used to apply inclusion criteria by means of an inner join. Inclusion criteria are:

* Adult patient, i.e. age >= 16
* First ICU stay for the patient
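* Suspected of infection, with the time of suspicion less than 24 hours after ICU admission (this includes suspicion before admission)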

In [None]:
# Build the cohort by applying each inclusion criterion in turn.
#   idx    - running boolean mask of the ICU stays still included
#   idxRem - mask of stays satisfying the criterion currently being applied
#   N_rem  - number of stays dropped by the most recent criterion
print('{:5g} - total number of ICU stays in MIMIC.'.format(df.shape[0]))

# Adults are defined as age >= 16. The previous threshold (`age > 1`) only
# excluded infants and would have admitted any patient aged between 1 and 16.
idx = df.age >= 16
N_rem = df.shape[0] - np.sum(idx)
print('{:5g}   include only adult ICU stays (removed {}).'.format(
        np.sum(idx), N_rem))

# first ICU stay for the patient
idxRem = df['icustay_num'] == 1
N_rem = np.sum(idx) - np.sum(idx & idxRem)
idx = idx & idxRem
print('{:5g}   ... on their first ICU stay  (removed {}).'.format(
        np.sum(idx), N_rem))

# a suspected infection time was recorded
idxRem = ~df['suspected_infection_time'].isnull()
N_rem = np.sum(idx) - np.sum(idx & idxRem)
idx = idx & idxRem
print('{:5g}   ... suspected of infection   (removed {}).'.format(
        np.sum(idx), N_rem))

# suspicion occurred less than one day after ICU admission (this includes
# suspicion before admission, where the difference is negative)
idxRem = (df['suspected_infection_time']-df['intime'])<np.timedelta64(1,'D')
N_rem = np.sum(idx) - np.sum(idx & idxRem)
idx = idx & idxRem
print('{:5g}   ... suspected before 1st day (removed {}).'.format(
        np.sum(idx), N_rem))

# restrict the dataframe to the cohort
df = df.loc[idx,:]

In [None]:
# Summarise the cohort, then report how many stays meet each score criterion.
su.print_demographics(df)

print('')

# Each entry pairs a message template with an indicator of the flagged stays;
# the printed line gives the flagged count and its percentage of the cohort.
flag_summaries = [
    ('{:5g} have qSOFA >= 2 ({:2.2f}%).', df.qsofa.values >= 2),
    ('{:5g} have SOFA >= 2 ({:2.2f}%).', df.sofa.values >= 2),
    # sepsis3 is summed and averaged directly, with no threshold applied
    ('{:5g} have Sepsis-3 ({:2.2f}%).', df.sepsis3),
    ('{:5g} have SIRS >= 2 ({:2.2f}%).', df.sirs.values >= 2),
    ('{:5g} have LODS >= 2 ({:2.2f}%).', df.lods.values >= 2),
]
for template, flagged in flag_summaries:
    print(template.format(flagged.sum(), 100.0*flagged.mean()))

# Baseline model + scores

The original paper evaluates a *baseline model* with the addition of the various severity scores. 

> To measure predictive validity, a baseline risk model was created for in-hospital mortality based on preinfection criteria using multivariable logistic regression. The baseline model included age (as a fractional polynomial), sex, race/ethnicity (black, white, or other), and the weighted Charlson comorbidity score (as fractional polynomial) as a measure of chronic comorbidities.

This baseline model includes:

* age (fractional polynomial)
* sex
* ethnicity
* Charlson comorbidities (fractional polynomial)

We will reproduce this model, with the following caveats:

1. We will build and evaluate the model on the same dataset, so our estimates are "apparent"
2. We will use Elixhauser comorbidities, not Charlson comorbidities
3. We may not have identical fractional polynomial terms (as we are rebuilding the model on our dataset)

The following code block extracts the covariates for the baseline model.

In [None]:
# Columns exported to the design matrix, in the order they are written.
X_header = [
    'age', 'elixhauser_hospital', 'hospital_expire_flag', 'angus',
    'is_male', 'race_black', 'race_other',
    'qsofa', 'sofa', 'sepsis3', 'sirs', 'lods',
]

X = df.loc[:, X_header].values

# Write the design matrix for the MFP model to CSV; this file is used by the
# R code.
np.savetxt(
    'sepsis3-design-matrix.csv',
    X,
    fmt='%4.4f',
    delimiter=',',
    header=','.join(X_header),
    comments='',  # emit the header row without a comment prefix
)

# Save the data to file

The dataframes will be loaded directly from a file, rather than the database.

In [None]:
# write the cohort dataframe to CSV, omitting the row index
df.to_csv('sepsis3-df.csv',sep=',',index=False)

The `X` data which was written to 'sepsis3-design-matrix.csv' will be used by the `print_auc_table_baseline` function to evaluate the AUROC of the scores when incorporated with the baseline model.

Here is an example:

In [None]:
print('Table of AUROCs of scores on their own, with p-values.')
# define outcome
target_header = "hospital_expire_flag"
y = df[target_header].values == 1

# define the covariates to be added in the MFP model (used for table of AUROCs)
preds_header = ['sirs','sofa','lods','qsofa']

su.print_auc_table(df, preds_header, target_header)


# model development - baseline covariates
mdl_formula = target_header + " ~ age + elixhauser_hospital + race_black + race_other + is_male"
print('\nBaseline model development...')
model = logit(formula=mdl_formula, data=df).fit()
print(model.summary())


print('\nAUROC of the baseline, and models built using baseline covariates + score listed..')

# refit the baseline model, then the baseline plus each score in turn, and
# report the apparent (in-sample) AUROC of each
for score_added in ['Baseline','sirs','qsofa','sofa','lods']:
    frm = mdl_formula
    if score_added != 'Baseline':
        frm = frm + ' + ' + score_added

    model = logit(formula=frm, data=df).fit(disp=0)
    # The formula interface drops rows with missing values before fitting, so
    # the in-sample predictions can have fewer rows than df. Score against the
    # outcome vector the model was fitted on so labels and predictions align.
    auc_mdl = metrics.roc_auc_score(model.model.endog, model.predict())
    print('{:10s} {:0.3f}'.format(score_added, auc_mdl))