In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import sys
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import logit

from sepsis_utils import sepsis_utils as su
from sepsis_utils import roc_utils as ru

from sklearn.pipeline import Pipeline

# used for train/test splits
from sklearn.cross_validation import train_test_split

# used to impute mean for data
from sklearn.preprocessing import Imputer

# normalize the data
from sklearn import preprocessing

# logistic regression is our model of choice
from sklearn.linear_model import LogisticRegression

# used to create confusion matrix
from sklearn.metrics import confusion_matrix

from sklearn.cross_validation import cross_val_score

# used to calculate AUROC/accuracy
from sklearn import metrics

# for calibration curve of severity scores
from sklearn.calibration import calibration_curve

# Plot styling defaults: a palette of RGB triplets plus marker / line-style cycles.
col = [
    [0.9047, 0.1918, 0.1988],
    [0.2941, 0.5447, 0.7494],
    [0.3718, 0.7176, 0.3612],
    [1.0000, 0.5482, 0.1000],
    [0.4550, 0.4946, 0.4722],
    [0.6859, 0.4035, 0.2412],
    [0.9718, 0.5553, 0.7741],
    [0.5313, 0.3359, 0.6523],
]
marker = ['v', 'o', 'd', '^', 's', 'o', '+']
# NOTE(review): 's' is a matplotlib marker code rather than a line style, and
# `marker` has 7 entries against 8 in `col` and `ls` -- confirm the intended
# values before cycling through these lists together.
ls = ['-', '-', '-', '-', '-', 's', '--', '--']
%matplotlib inline

from __future__ import print_function

In [None]:
# Connection settings for the local PostgreSQL copy of MIMIC.
# These are the values that were used on the machine "pc70".
sqluser = 'alistairewj'
dbname = 'mimic'
schema_name = 'mimiciii'

# Open the connection; only the user and database name are passed to psycopg2.
con = psycopg2.connect(user=sqluser, dbname=dbname)

Each function extracts the data from a materialized view. See the *mimic-code* repository for instructions on how to create the materialized views.

In [None]:
# Pull every severity score over the open connection, one sepsis_utils call per
# score, in the same order as the names on the left-hand side.
score_getters = (
    su.get_qsofa,
    su.get_sofa,
    su.get_oasis,
    su.get_lods,
    su.get_sirs,
    su.get_angus,
)
qsofa, sofa, oasis, lods, sirs, angus = (getter(con) for getter in score_getters)

# Time of suspected infection

Suspected infection is defined as:

* Antibiotics within 72 hours of a culture
* A culture within 24 hours of antibiotics

We can extract antibiotic usage from the PRESCRIPTIONS, INPUTEVENTS_MV and INPUTEVENTS_CV tables. We can extract the time of blood cultures from the MICROBIOLOGYEVENTS table. Details are given in defining-suspected-infection.ipynb.

In [None]:
# Suspected-infection data fetched through sepsis_utils over the open connection;
# a later cell left-joins `ab` onto the cohort using icustay_id.
ab = su.get_suspected_infection_time(con)

# Other data

This query extracts other data of interest:

* Age
* Gender
* Immunosuppression
* BMI
* Metastatic cancer (Elixhauser comorbidity)
* Diabetes (Elixhauser comorbidity)


In [None]:
# Fetch the remaining covariates, then report how often the body-size fields
# are recorded among adult stays.
misc = su.get_other_data(con)
print('{} ICU stays.'.format(misc.shape[0]))

# NOTE(review): "adult" is taken as age > 1 here, while the cohort description
# states age >= 16 -- confirm the two select the same stays for this data.
idx = misc.age > 1
n_adult = np.sum(idx)
print('{} adult ICU stays.'.format(n_adult))

demog_col = ['height','weight','bmi']
for c in demog_col:
    # number of adult stays where this field is not null
    n_recorded = n_adult - misc.loc[idx, c].isnull().sum()
    print('\t{:2.2f}% have {}.'.format(n_recorded * 100.0 / n_adult, c))

# Cohort

The below code creates our cohort of interest. This cohort is used to apply inclusion criteria by means of an inner join. Inclusion criteria are:

* Adult patient, i.e. age >= 16
* First ICU stay for the patient

In [None]:
# Cohort table fetched through sepsis_utils; a later cell uses it as the
# left-hand side onto which every other extracted table is merged.
cohort = su.get_cohort(con)

# close the database connection as we are finished extracting data
con.close()

# Create dataframe with *all patients*

We can ask some pretty sensible questions of this data.

* What percentage of patients had antibiotics with a culture?
* What percentage of these cultures were positive?

The Sepsis-3 guidelines exclusively evaluated patients with suspected infection, so we subselect to this population. We then report demographics, etc.

In [None]:
# initialize our dataframe to the cohort
df_all_pt = cohort

# Left-join every extracted table onto the cohort so that no cohort row is lost.
# Each entry is (table, join key, suffix given to clashing right-hand columns).
tables_to_merge = [
    (qsofa, 'icustay_id', '_qsofa'),
    (sofa, 'icustay_id', '_sofa'),
    (sirs, 'icustay_id', '_sirs'),
    (ab, 'icustay_id', '_ab'),
    (misc, 'icustay_id', '_misc'),
    (oasis, 'icustay_id', '_oasis'),
    (lods, 'icustay_id', '_lods'),
    (angus, 'hadm_id', '_angus'),
]
for table, key, suffix in tables_to_merge:
    df_all_pt = df_all_pt.merge(table, how='left', on=key, suffixes=('', suffix))

# define sepsis-3 as: qSOFA >= 2 and SOFA >= 2
df_all_pt['sepsis3'] = (df_all_pt.qsofa >= 2) & (df_all_pt.sofa >= 2)

# Counts reused by the summary lines below. np.float64 forces true division and
# works whether the reduction returns a numpy scalar or a plain Python number;
# it also keeps a zero denominator as nan/inf rather than raising.
n_all = df_all_pt.shape[0]
n_suspected = np.float64(df_all_pt['suspected_infection_time'].count())
n_positive = np.float64(df_all_pt['positiveculture'].sum())

print('{:5g} adult ICU stays (excluding subsequent ICU stays for the same patient).'.format(
    n_all))

print('{:2.2f}% of patients with antibiotics/culture'.format(
    n_suspected / n_all * 100))

print('{:2.2f}% of patients with positive cultures'.format(
    n_positive / n_all * 100))

print('{:2.2f}% of patients with antibiotics/culture had a positive culture'.format(
    n_positive / n_suspected * 100))

# keep only stays that have a suspected infection time
df = df_all_pt.loc[df_all_pt['suspected_infection_time'].notnull().values]

# Now we subselect to only patients who were *in the ICU* when the suspicion of infection occurred,
# i.e. the suspicion time is strictly after `intime`.
# pd.Timedelta(0) is used for the zero offset: combining a string value with a
# unit, as in pd.to_timedelta('0', 'h'), is rejected by recent pandas releases.
time_since_intime = pd.to_datetime(df['suspected_infection_time']) - df['intime']
idx = time_since_intime > pd.Timedelta(0)
print('Of these, {:2.2f}% of patients with suspected infection were in ICU at time of suspicion.'.format(100.0*np.mean(idx)))
df = df.loc[idx, :]


su.print_demographics(df)

print('')
# (label, per-stay flag) pairs; a missing score compares as False against the threshold.
flag_summaries = [
    ('qSOFA >= 2', df.qsofa.values >= 2),
    ('SOFA >= 2', df.sofa.values >= 2),
    ('Sepsis-3', df.sepsis3),
    ('SIRS >= 2', df.sirs.values >= 2),
    ('LODS >= 2', df.lods.values >= 2),
]
for label, flags in flag_summaries:
    print('{:5g} have {} ({:2.2f}%).'.format(flags.sum(), label, 100.0*flags.mean()))

    # blank line between summaries, except after the last one
    if label != flag_summaries[-1][0]:
        print('', end='')

# Baseline model + scores

The original paper evaluates a *baseline model* with the addition of the various severity scores. 

> To measure predictive validity, a baseline risk model was created for in-hospital mortality based on preinfection criteria using multivariable logistic regression. The baseline model included age (as a fractional polynomial), sex, race/ethnicity (black, white, or other), and the weighted Charlson comorbidity score (as fractional polynomial) as a measure of chronic comorbidities.

This baseline model includes:

* age (fractional polynomial)
* sex
* ethnicity
* Charlson comorbidities (fractional polynomial)

We will reproduce this model, with the following caveats:

1. We will build and evaluate the model on the same dataset, so our estimates are "apparent"
2. We will use Elixhauser comorbidities, not Charlson comorbidities
3. We may not have identical fractional polynomial terms (as we are rebuilding the model on our dataset)

The following code block extracts the covariates for the baseline model.

In [None]:
# Columns copied as-is from `df` into the design matrix (all cast to float below).
numeric_cols = ['age','elixhauser_hospital','hospital_expire_flag','angus',
                'qsofa','sofa','sepsis3','sirs','lods']

# gender indicator - yes/no "is male?"
is_male = df['gender'].isin(['M']).values

# code ethnicity as black/white/other - white is reference
race_black = df['ethnicity'].isin(['BLACK/AFRICAN AMERICAN','BLACK/CAPE VERDEAN',
                                   'BLACK/HAITIAN','BLACK/AFRICAN']).values
race_white = df['ethnicity'].isin(['WHITE','WHITE - RUSSIAN','WHITE - OTHER EUROPEAN',
                                   'WHITE - BRAZILIAN','WHITE - EASTERN EUROPEAN']).values
# "other" is everything that is neither black nor white (including missing ethnicity)
race_other = ~(race_black | race_white)

# assemble the design matrix; the string columns (ethnicity, gender) are
# represented only through their indicator columns
X = np.column_stack([df[numeric_cols].values, is_male, race_black, race_other]).astype(float)
X_header = numeric_cols + ['is_male', 'race_black', 'race_other']

# remove those with NaN outcome
# The outcome column is looked up by name: column 0 of X is age, so testing
# X[:,0] would check for a missing age rather than a missing outcome.
idxOutcome = X_header.index('hospital_expire_flag')
idxBad = np.isnan(X[:, idxOutcome])
X = X[~idxBad,:]
print('Removed {} patients with no outcome ({:2.2f}%).'.format(np.sum(idxBad), np.mean(idxBad)*100.0))

df_mdl = pd.DataFrame(X, columns=X_header)
df_mdl.head()

# Save the data to file

The dataframes will be loaded directly from a file, rather than the database.

In [None]:
# Persist the patient-level dataframe (no index column).
df.to_csv('sepsis3-df.csv', index=False, sep=',')

# we'll also write out the design matrix for the MFP model here:
# comma-separated, 4 decimal places, with X_header as an uncommented header row
np.savetxt(
    'sepsis3-design-matrix.csv',
    X,
    delimiter=',',
    fmt='%4.4f',
    header=','.join(X_header),
    comments=''
)

The `X` data which was written to 'sepsis3-design-matrix.csv' will be used by the `print_auc_table_baseline` function to evaluate the AUROC of the scores when incorporated with the baseline model.

Here is an example:

In [None]:
print('Table of AUROCs of scores on their own, with p-values.')
preds_header = ['sirs','sofa','lods','qsofa']
target_header = 'hospital_expire_flag'
su.print_auc_table(df, preds_header, target_header)

# model development
print('\nBaseline model development...')
# formula shared by the baseline model and every "baseline + score" model
baseline_formula = target_header + " ~ age + elixhauser_hospital + race_black + race_other + is_male"
model = logit(formula=baseline_formula, data=df_mdl).fit()
print(model.summary())

print('\nAUROC of the baseline, and models built using baseline covariates + score listed..')

# printing AUROC for models with each score
# The formula interface drops rows with a missing value in any model term, so
# model.predict() can be shorter than df_mdl. Scoring against model.model.endog
# (the outcome actually used in the fit) keeps labels and predictions aligned;
# when no rows are dropped it equals df_mdl[target_header].
print('{:10s} {:0.3f}'.format('Baseline', metrics.roc_auc_score(model.model.endog, model.predict())))
for score_added in ['sirs','qsofa','sofa','lods']:
    model = logit(formula=baseline_formula + " + " + score_added,
                  data=df_mdl).fit(disp=0)
    print('{:10s} {:0.3f}'.format(score_added, metrics.roc_auc_score(model.model.endog, model.predict())))