# Sepsis-3 in MIMIC-III

This is the primary notebook for analyzing Sepsis-3 in the MIMIC-III database. Before running this notebook, you'll need the `sepsis3-df.csv` file in the local directory. You can obtain it either by downloading it directly from PhysioNet or by running the enclosed SQL scripts against the MIMIC-III database. See `sepsis-3-get-data.ipynb` for more detail.

In [None]:
from __future__ import print_function

# Import libraries
import os
import subprocess
from collections import OrderedDict

try:
    # Python 3: reload() is no longer a builtin
    from importlib import reload
except ImportError:
    # Python 2: reload() is a builtin, nothing to import
    pass

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tableone
from IPython.display import display, HTML # used to print out pretty pandas dataframes
from statsmodels.formula.api import logit

# used to calculate AUROC/accuracy
from sklearn import metrics

from sepsis_utils import sepsis_utils as su
from sepsis_utils import roc_utils as ru
from sepsis_utils import venn

# venn diagram with matplotlib-venn
#from matplotlib_venn import venn3

# default colours for prettier plots
# each row is one RGB triple with components between 0 and 1
col = [[0.9047, 0.1918, 0.1988],
    [0.2941, 0.5447, 0.7494],
    [0.3718, 0.7176, 0.3612],
    [1.0000, 0.5482, 0.1000],
    [0.4550, 0.4946, 0.4722],
    [0.6859, 0.4035, 0.2412],
    [0.9718, 0.5553, 0.7741],
    [0.5313, 0.3359, 0.6523]];
# marker codes for line plots
# NOTE(review): 7 markers are listed against 8 colours and 8 line styles -
# confirm whether these lists are meant to be indexed together
marker = ['v','o','d','^','s','o','+']
# line styles for line plots
# NOTE(review): 's' is a marker code, not a matplotlib line style - this looks
# like a typo; confirm the intended value
ls = ['-','-','-','-','-','s','--','--']

import colorsys
def gg_color_hue(n):
    """Return `n` distinct, evenly spaced colours around the hue wheel.

    Hues start at 15 degrees and are spread over one full turn of the
    colour wheel, with fixed saturation (0.5) and value (0.8).

    Parameters
    ----------
    n : int
        Number of colours to generate.

    Returns
    -------
    list of tuple
        `n` (r, g, b) tuples with components between 0 and 1.
    """
    # 15 and 375 degrees are the same hue, so the end point is excluded;
    # including it would make the first and last colours identical.
    hues = np.linspace(15, 375, n, endpoint=False)
    # return a real list (not a one-shot map iterator) so the palette can be
    # indexed and reused on Python 3 as well as Python 2
    return [colorsys.hsv_to_rgb(hue / 360.0, 0.5, 0.8) for hue in hues]


# IPython magic: select the inline matplotlib backend for this notebook
%matplotlib inline
# apply the 'ggplot' style sheet to all figures
plt.style.use('ggplot')

# default font settings, passed to matplotlib.rc below
font = {'family' : 'DejaVu Sans',
        'size'   : 20}

matplotlib.rc('font', **font)

In [None]:
# load the cohort extracted by the get-data notebook
df = pd.read_csv('sepsis3-df.csv')

# composite outcome: flagged as an in-hospital death, or icu_los of at least 3
df['composite_outcome'] = (
    (df['hospital_expire_flag'] == 1) | (df['icu_los'] >= 3)
).astype(int)

# column name -> display label
labels = OrderedDict([('suspicion_poe', 'BC + ABX (Prescribed)')])

# boolean flags: True wherever the corresponding time is recorded
df['blood culture'] = df['blood_culture_time'].notnull()
df['suspicion_poe'] = df['suspected_infection_time_poe_days'].notnull()

df['abx_poe'] = df['antibiotic_time_poe'].notnull()

# sepsis-3: suspicion of infection together with SOFA >= 2
df['sepsis-3'] = df['suspicion_poe'] & (df['sofa'] >= 2)
df['sofa>=2'] = df['sofa'] >= 2

# parse the timestamp columns which are present and still stored as objects
time_columns = ['intime', 'outtime',
                'suspected_infection_time_mv', 'suspected_infection_time',
                'suspected_infection_time_poe', 'blood_culture_time']
for c in time_columns:
    if c in df.columns and df[c].dtype == 'object':
        df[c] = pd.to_datetime(df[c])

# list of the sepsis definitions
sepsis_list = [
    'sepsis_angus', 'sepsis_martin', 'sepsis_explicit',
    'sepsis_cdc', 'sepsis_nqf',
    'sepsis-3',
]

# Results for Abstract

In [None]:
# see get-data for the exclusions
n_patients = df.shape[0]
print('{:5g} patients.'.format(n_patients))

n_suspected = df['suspicion_poe'].sum()
print('{:5g} ({:2.0f}%) suspected of infection.'.format(
    n_suspected, n_suspected*100.0/n_patients))

# count and percentage for each boolean flag
for flag, template in [
        ('sofa>=2', '{:5g} ({:2.2f}%) have SOFA >= 2.'),
        ('sepsis-3', '{:5g} ({:2.2f}%) have Sepsis-3 criteria (intersection of above two).')]:
    print(template.format(df[flag].sum(), 100.0*df[flag].mean()))

n_suspected_low_sofa = ((df['sofa>=2']==0) & (df['suspicion_poe']==1)).sum()
print('{:5g} ({:2.2f}%) have suspicion and SOFA < 2.'.format(
    n_suspected_low_sofa, n_suspected_low_sofa*100.0/n_patients))

# frequency of every sepsis definition
for c in sepsis_list:
    n_positive = df[c].sum()
    print('{:5g} ({:3.2f}%) - {}'.format(
        n_positive, n_positive*100.0/n_patients, c))

n_sepsis3_not_angus = ((df['sepsis_angus']==0) & (df['sepsis-3']==1)).sum()
print('{:5g} ({:2.2f}%) have Sepsis-3 criteria but not Angus.'.format(
    n_sepsis3_not_angus, n_sepsis3_not_angus*100.0/n_patients))


# fix the seed before calling the agreement tables
np.random.seed(21381)
# pick up any edits made to the utility module
reload(su)

# cronbach alpha for construct validity
print('\n === Cronbach Alpha ===')
su.cronbach_alpha_table(df, sepsis_list)

# correlation coefficients
# TODO: tetrachoric correlation coefficient
print('\n === Correlation Coefficient === (currently pearson, should be tetra)')
su.corrcoef_table(df, sepsis_list)

# Results section

We now print out the results in the same order as they are in the paper.

# Demographics

In [None]:
# pick up any edits made to the utility module
reload(su)

# formatted demographics table for the whole cohort
su.print_demographics(df)

# same table, passing the in-hospital mortality flag as the index
print('\nAlive vs. dead')
died_in_hospital = df['hospital_expire_flag'].values == 1
su.print_demographics(df, idx=died_in_hospital)

print('')

# number and percentage of patients with each score at or above 2
threshold_reports = [
    ('sirs', '{:5g} have SIRS >= 2 ({:2.2f}%) on admission.'),
    ('qsofa', '{:5g} have qSOFA >= 2 ({:2.2f}%) on admission.'),
    ('sofa', '{:5g} have SOFA >= 2 ({:2.2f}%).'),
    ('lods', '{:5g} have LODS >= 2 ({:2.2f}%).'),
]
for position, (column, template) in enumerate(threshold_reports):
    if position:
        # blank line between consecutive reports
        print('', end='')
    at_or_above = df[column].values >= 2
    print(template.format(at_or_above.sum(), 100.0*at_or_above.mean()))

In [None]:
# list probability of outcome for each score
scores = ['suspicion_poe','sofa>=2',
          'sepsis-3',
          'sepsis_angus','sepsis_martin','sepsis_explicit',
          'sepsis_cdc','sepsis_nqf']

# column name -> display name
scores_dict = {
    'suspicion_poe': 'Suspected infection',
    'sofa>=2': 'SOFA >= 2',
    'sepsis-3': 'Sepsis-3',
    'sepsis_angus': 'Angus et al. criteria',
    'sepsis_martin': 'Martin et al. criteria',
    'sepsis_explicit': 'Explicit',
    'sepsis_cdc': 'CDC',
    'sepsis_nqf': 'CMS'
}

row_format = '{:15s}\t{:4d}, {:2.1f}%\t{:2.1f}%\t\t{:2.1f}%'

# One table per outcome. The short outcome name goes into the column headers,
# so the composite-outcome table is labelled p(comp|c), not p(death|c).
for target_header, outcome in [("hospital_expire_flag", 'death'),
                               ("composite_outcome", 'comp')]:
    idx = df[target_header]==1

    print()
    print('=== {} ==='.format(target_header))
    print()
    print('{:15s}\t{:8s}\t{:5s}\t{:5s}'.format(
        'Criteria','N',
        'p({}|c)'.format(outcome), 'p({}|~c)'.format(outcome)))
    for c in scores:
        met = df[c]==1
        not_met = df[c]!=1
        print(row_format.format(
                c,
                # number and percentage of patients meeting the criterion
                np.sum( met ),
                np.sum( met )*100.0/df.shape[0],
                # outcome rate among those meeting / not meeting the criterion
                np.sum( met&idx )*100.0 / np.sum( met ),
                np.sum( not_met&idx )*100.0 / np.sum( not_met )
            ))

In [None]:

# print the frequencies
# list probability of outcome for each score
scores = [
    'suspicion_poe', 'sofa>=2',
    'sepsis-3',
    'sepsis_angus', 'sepsis_martin', 'sepsis_explicit',
    'sepsis_cdc', 'sepsis_nqf',
]

# column name -> label shown on the plot and in the csv output
scores_dict = {
    'suspicion_poe': 'Suspected infection',
    'sofa>=2': 'SOFA >= 2',
    'sepsis-3': 'Sepsis-3',
    'sepsis_angus': 'Angus et al. criteria',
    'sepsis_martin': 'Martin et al. criteria',
    'sepsis_explicit': 'Explicit',
    'sepsis_cdc': 'CDC',
    'sepsis_nqf': 'CMS',
}

idx = df['hospital_expire_flag']==1
idxComp = df['composite_outcome']==1

S = len(scores)

# columns: % of patients, % of those with mortality, % of those with composite outcome
score_plot = np.zeros([S, 3])
for i, c in enumerate(scores):
    has_c = df[c]==1
    n_c = np.sum(has_c)
    score_plot[i, :] = [
        n_c*100.0/df.shape[0],
        np.sum(has_c&idx)*100.0 / n_c,
        np.sum(has_c&idxComp)*100.0 / n_c,
    ]

# sort by proportion of patients, ascending
idxSort = np.argsort(score_plot[:,0])
prevalence = score_plot[idxSort, 0]

# nested horizontal bars: all patients, then composite outcome, then mortality
plt.figure()
plt.barh(range(S), prevalence, color=col[1], align='center')
plt.barh(range(S), prevalence*score_plot[idxSort,2]/100.0,
         color=col[3], align='center', height=0.6)
plt.barh(range(S), prevalence*score_plot[idxSort,1]/100.0,
         color=col[0], align='center', height=0.4)

plt.yticks(range(S), [scores_dict[scores[x]] for x in idxSort])
plt.xlabel('Percentage of patients')
plt.xlim([0,100])
plt.show()

# gather the statistics once, most frequent criterion first
rows = []
for i in idxSort[::-1]:
    c = scores[i]
    has_c = df[c]==1
    lacks_c = df[c]!=1
    n_c = np.sum(has_c)
    n_not_c = np.sum(lacks_c)
    rows.append((
        c,
        n_c,
        n_c*100.0/df.shape[0],
        np.sum(has_c&idx)*100.0 / n_c,
        np.sum(lacks_c&idx)*100.0 / n_not_c,
        np.sum(has_c&idxComp)*100.0 / n_c,
        np.sum(lacks_c&idxComp)*100.0 / n_not_c,
    ))

column_titles = ('Criteria','N','p(death|c)', 'p(death|~c)', 'p(comp|c)', 'p(comp|~c)')

# tab separated table
print('')
print('{:15s}\t{:8s}\t{:5s}\t{:5s}\t{:5s}\t{:5s}'.format(*column_titles))
for row in rows:
    print('{:15s}\t{:4d}, {:2.1f}%\t{:2.1f}%\t\t{:2.1f}%\t\t{:2.1f}%\t\t{:2.1f}%'.format(*row))

# same print but to csv
print('')
print('{},{},{},{},{},{}'.format(*column_titles))
for row in rows:
    print('{},{:4d} {:2.1f}%,{:2.1f}%,{:2.1f}%,{:2.1f}%,{:2.1f}%'.format(
        scores_dict[row[0]], *row[1:]))

In [None]:
# fix the seed before the AUROC confidence interval is computed
np.random.seed(7891)

# AUROC of sofa for the composite outcome
target_header = 'composite_outcome'
pred_header = 'sofa'

# test model on its own
auc, ci = ru.calc_auc(df[pred_header].values, df[target_header].values,
                      with_ci=True)
ci_lower, ci_upper = ci[0], ci[1]
print('{} - univariable against {}'.format(pred_header,target_header))
print('AUROC = {:0.3f} [{:0.2f} - {:0.2f}]'.format(auc, ci_lower, ci_upper))

# operating point statistics for the binary SOFA >= 2 flag
yhat_dict = OrderedDict()
yhat_dict['SOFA>=2'] = df['sofa>=2']

stats_all = su.get_op_stats(yhat_dict, df[target_header].values)

su.print_op_stats(stats_all)

# Sepsis

Create the sepsis-3 criteria: SOFA >= 2 and suspicion of infection.

In [None]:
# cohort size (non-null icustay_ids); by definition 100% of the cohort
print('{:5g} ({:3.2f}%)  first ICU stay for adults.'.format(
    df['icustay_id'].count(), 100))

n_suspected = np.sum(df['suspicion_poe'])
print('{:5g} ({:3.2f}%)  suspected of infection'.format(
    n_suspected, n_suspected*100.0/df.shape[0]))

n_positive_culture = df['blood_culture_positive'].sum()
print('{:5g} ({:3.2f}%)  with a positive blood culture'.format(
    n_positive_culture, n_positive_culture*100.0/df.shape[0]))

print()
# one line per column listed in `labels`
for c in labels:
    n_flagged = df[c].sum()
    print('{:5g} ({:3.2f}%) - {}'.format(
        n_flagged, n_flagged*100.0/df.shape[0], c))

In [None]:
# venn diagram with matplotlib-venn
from matplotlib_venn import venn3

# column name -> label shown on the diagram
venn_labels = OrderedDict([
    ('sepsis_martin', 'Martin et al. criteria'),
    ('sepsis_angus', 'Angus et al. criteria'),
    ('sepsis-3', 'Sepsis-3 criteria'),
])

# icustay_ids satisfying each criterion, in the order given above
sets = [set(df.loc[df[c] == 1, 'icustay_id']) for c in venn_labels]
set_names = list(venn_labels.values())

plt.figure(figsize=[16,16])
plt.rcParams.update({'font.size': 15})
# label each region with its count and its share of the whole cohort
venn3(sets, set_names,
      subset_label_formatter=lambda x: '{:,}\n{:2.1f}%'.format(x, x*100.0/df.shape[0]))
# NOTE(review): other Venn cells in this notebook save to this same filename,
# so whichever runs last overwrites the image - confirm intended filenames
plt.savefig('sepsis3-venn-blood-culture.png')
plt.show()

# Other numbers for above venn diagram
set_all = set.intersection(*sets)
set_other = set(df['icustay_id'].values).difference(*sets)
print('{} patients ({:2.1f}%) satisfied all criteria.'.format(
    len(set_all), len(set_all)*100.0 / df.shape[0]))
print('{} patients ({:2.1f}%) satisfied no criteria.'.format(
    len(set_other), len(set_other)*100.0 / df.shape[0]))

# pair-wise counts
criteria = list(venn_labels)
for i, c1 in enumerate(criteria):
    for j, c2 in enumerate(criteria[:i]):
        set_both = sets[i] & sets[j]
        print('{:2.1f}% ({}) - {} & {}'.format(
            len(set_both)*100.0 / df.shape[0], len(set_both),c1, c2))

In [None]:
# column name -> label shown on the diagram
venn_labels = OrderedDict([
    ('sepsis_martin', 'Martin criteria'),
    ('sepsis_angus', 'Angus criteria'),
    ('suspicion_poe', 'Suspicion of infection'),
])

# icustay_ids satisfying each criterion, in the order given above
sets = [set(df.loc[df[c] == 1, 'icustay_id']) for c in venn_labels]
set_names = list(venn_labels.values())

plt.figure(figsize=[16,16])
plt.rcParams.update({'font.size': 15})
# label each region with its count and its share of the whole cohort
venn3(sets, set_names,
      subset_label_formatter=lambda x: '{:,}\n{:2.1f}%'.format(x, x*100.0/df.shape[0]))
# NOTE(review): other Venn cells in this notebook save to this same filename,
# so whichever runs last overwrites the image - confirm intended filenames
plt.savefig('sepsis3-venn-blood-culture.png')
plt.show()

# Other numbers for above venn diagram
set_all = set.intersection(*sets)
set_other = set(df['icustay_id'].values).difference(*sets)
print('{} patients ({:2.1f}%) satisfied all criteria.'.format(
    len(set_all), len(set_all)*100.0 / df.shape[0]))
print('{} patients ({:2.1f}%) satisfied no criteria.'.format(
    len(set_other), len(set_other)*100.0 / df.shape[0]))

# pair-wise overlap between every two criteria
criteria = list(venn_labels)
for i, c1 in enumerate(criteria):
    for j, c2 in enumerate(criteria[:i]):
        set_both = sets[i] & sets[j]
        print('{:2.1f}% ({}) - {} & {}'.format(
            len(set_both)*100.0 / df.shape[0], len(set_both),c1, c2))

In [None]:
# pick up any edits made to the venn module
reload(venn)

# column name -> label shown on the diagram
venn_labels = OrderedDict([
    ('sofa>=2', 'SOFA >= 2'),
    ('sepsis_angus', 'Angus criteria'),
    ('suspicion_poe', 'Suspicion of infection'),
    ('sepsis_martin', 'Martin criteria'),
])

# icustay_ids satisfying each criterion, in the order given above
sets = [set(df.loc[df[c] == 1, 'icustay_id']) for c in venn_labels]
set_names = list(venn_labels.values())

fontdict = {'fontsize': 15, 'fontweight': 'normal'}
# draw twice: fill='percent' on a 12x12 figure, then also with just percent
for fill_options in ({'fill': 'percent', 'figsize': [12,12]},
                     {'fill': 'percent_only'}):
    venn.venn4(sets, set_names, show_plot=False, fontdict=fontdict, **fill_options)
    # create a legend handle only to remove it from the figure
    plt.legend('off').remove()
    plt.show()

set_all = set.intersection(*sets)
set_none = set(df['icustay_id'].values).difference(set.union(*sets))
print('{} patients ({:2.1f}%) satisfied all criteria.'.format(
    len(set_all), len(set_all)*100.0 / df.shape[0]))
print('{} patients ({:2.1f}%) satisfied no criteria.'.format(
    len(set_none), len(set_none)*100.0 / df.shape[0]))

# pair-wise overlap between every two criteria
criteria = list(venn_labels)
for i, c1 in enumerate(criteria):
    for j, c2 in enumerate(criteria[:i]):
        set_both = sets[i] & sets[j]
        print('{:2.1f}% ({}) - {} & {}'.format(
            len(set_both)*100.0 / df.shape[0], len(set_both),c1, c2))

In [None]:
# pick up any edits made to the venn module
reload(venn)

# column name -> label shown on the diagram
venn_labels = OrderedDict([
    ('sepsis-3', 'Sepsis-3 criteria'),
    ('sepsis_angus', 'Angus et al. criteria'),
    ('sepsis_explicit', 'Explicit criteria'),
    ('sepsis_martin', 'Martin et al. criteria'),
])

# icustay_ids satisfying each criterion, in the order given above
sets = [set(df.loc[df[c] == 1, 'icustay_id']) for c in venn_labels]
set_names = list(venn_labels.values())

fontdict = {'fontsize': 15, 'fontweight': 'normal'}
# draw twice: fill='percent' on a 12x12 figure, then also with just percent
for fill_options in ({'fill': 'percent', 'figsize': [12,12]},
                     {'fill': 'percent_only'}):
    venn.venn4(sets, set_names, show_plot=False, fontdict=fontdict, **fill_options)
    # create a legend handle only to remove it from the figure
    plt.legend('off').remove()
    plt.show()

set_all = set.intersection(*sets)
set_none = set(df['icustay_id'].values).difference(set.union(*sets))
print('{} patients ({:2.1f}%) satisfied all criteria.'.format(
    len(set_all), len(set_all)*100.0 / df.shape[0]))
print('{} patients ({:2.1f}%) satisfied no criteria.'.format(
    len(set_none), len(set_none)*100.0 / df.shape[0]))

# pair-wise overlap between every two criteria
criteria = list(venn_labels)
for i, c1 in enumerate(criteria):
    for j, c2 in enumerate(criteria[:i]):
        set_both = sets[i] & sets[j]
        print('{:2.1f}% ({}) - {} & {}'.format(
            len(set_both)*100.0 / df.shape[0], len(set_both),c1, c2))

# Venn with CDC/NQF

In [None]:
# pick up any edits made to the venn module
reload(venn)

# column name -> label shown on the diagram
venn_labels = OrderedDict([
    ('sepsis-3', 'Sepsis-3'),
    ('sepsis_cdc', 'CDC'),
    ('sepsis_nqf', 'NQF'),
])

# icustay_ids satisfying each criterion, in the order given above
sets = [set(df.loc[df[c] == 1, 'icustay_id']) for c in venn_labels]
set_names = list(venn_labels.values())

fontdict = {'fontsize': 15, 'fontweight': 'normal'}
venn.venn3(sets, set_names, show_plot=False, fontdict=fontdict,
           fill='percent', figsize=[8,8])
# create a legend handle only to remove it from the figure
plt.legend('off').remove()
plt.show()

set_all = set.intersection(*sets)
set_none = set(df['icustay_id'].values).difference(set.union(*sets))
print('{} patients ({:2.1f}%) satisfied all criteria.'.format(
    len(set_all), len(set_all)*100.0 / df.shape[0]))
print('{} patients ({:2.1f}%) satisfied no criteria.'.format(
    len(set_none), len(set_none)*100.0 / df.shape[0]))

# print out overlaps
criteria = list(venn_labels)
for i, c1 in enumerate(criteria):
    for j, c2 in enumerate(criteria[:i]):
        set_both = sets[i] & sets[j]
        print('{:2.1f}% ({}) - {} & {}'.format(
            len(set_both)*100.0 / df.shape[0], len(set_both),c1, c2))

In [None]:
# pick up any edits made to the venn module
reload(venn)

# column name -> label shown on the diagram
venn_labels = OrderedDict([
    ('sepsis_angus', 'Angus'),
    ('sepsis_cdc', 'CDC'),
    ('sepsis_nqf', 'NQF'),
])

# icustay_ids satisfying each criterion, in the order given above
sets = [set(df.loc[df[c] == 1, 'icustay_id']) for c in venn_labels]
set_names = list(venn_labels.values())

fontdict = {'fontsize': 15, 'fontweight': 'normal'}
venn.venn3(sets, set_names, show_plot=False, fontdict=fontdict,
           fill='percent', figsize=[9,9])
# create a legend handle only to remove it from the figure
plt.legend('off').remove()
plt.show()

set_all = set.intersection(*sets)
set_none = set(df['icustay_id'].values).difference(set.union(*sets))
print('{} patients ({:2.1f}%) satisfied all criteria.'.format(
    len(set_all), len(set_all)*100.0 / df.shape[0]))
print('{} patients ({:2.1f}%) satisfied no criteria.'.format(
    len(set_none), len(set_none)*100.0 / df.shape[0]))

# print out overlaps
criteria = list(venn_labels)
for i, c1 in enumerate(criteria):
    for j, c2 in enumerate(criteria[:i]):
        set_both = sets[i] & sets[j]
        print('{:2.1f}% ({}) - {} & {}'.format(
            len(set_both)*100.0 / df.shape[0], len(set_both),c1, c2))

In [None]:
# pick up any edits made to the venn module
reload(venn)

# column name -> label shown on the diagram
venn_labels = OrderedDict([
    ('sepsis_martin', 'Martin'),
    ('sepsis_angus', 'Angus criteria'),
    ('sepsis_cdc', 'CDC'),
    ('sepsis_nqf', 'NQF'),
])

# icustay_ids satisfying each criterion, in the order given above
sets = [set(df.loc[df[c] == 1, 'icustay_id']) for c in venn_labels]
set_names = list(venn_labels.values())

fontdict = {'fontsize': 15, 'fontweight': 'normal'}
# draw twice: fill='percent' on an 8x8 figure, then also with just percent
for fill_options in ({'fill': 'percent', 'figsize': [8,8]},
                     {'fill': 'percent_only'}):
    venn.venn4(sets, set_names, show_plot=False, fontdict=fontdict, **fill_options)
    # create a legend handle only to remove it from the figure
    plt.legend('off').remove()
    plt.show()

set_all = set.intersection(*sets)
set_none = set(df['icustay_id'].values).difference(set.union(*sets))
print('{} patients ({:2.1f}%) satisfied all criteria.'.format(
    len(set_all), len(set_all)*100.0 / df.shape[0]))
print('{} patients ({:2.1f}%) satisfied no criteria.'.format(
    len(set_none), len(set_none)*100.0 / df.shape[0]))

# pair-wise overlap between every two criteria
criteria = list(venn_labels)
for i, c1 in enumerate(criteria):
    for j, c2 in enumerate(criteria[:i]):
        set_both = sets[i] & sets[j]
        print('{:2.1f}% ({}) - {} & {}'.format(
            len(set_both)*100.0 / df.shape[0], len(set_both),c1, c2))

In [None]:
# pick up any edits made to the venn module
reload(venn)

# column name -> label shown on the diagram
venn_labels = OrderedDict([
    ('sepsis-3', 'Sepsis-3'),
    ('sepsis_angus', 'Angus criteria'),
    ('sepsis_cdc', 'CDC'),
    ('sepsis_nqf', 'NQF'),
])

# icustay_ids satisfying each criterion, in the order given above
sets = [set(df.loc[df[c] == 1, 'icustay_id']) for c in venn_labels]
set_names = list(venn_labels.values())

fontdict = {'fontsize': 15, 'fontweight': 'normal'}
# draw twice: fill='percent' on an 8x8 figure, then also with just percent
for fill_options in ({'fill': 'percent', 'figsize': [8,8]},
                     {'fill': 'percent_only'}):
    venn.venn4(sets, set_names, show_plot=False, fontdict=fontdict, **fill_options)
    # create a legend handle only to remove it from the figure
    plt.legend('off').remove()
    plt.show()

set_all = set.intersection(*sets)
set_none = set(df['icustay_id'].values).difference(set.union(*sets))
print('{} patients ({:2.1f}%) satisfied all criteria.'.format(
    len(set_all), len(set_all)*100.0 / df.shape[0]))
print('{} patients ({:2.1f}%) satisfied no criteria.'.format(
    len(set_none), len(set_none)*100.0 / df.shape[0]))

# pair-wise overlap between every two criteria
criteria = list(venn_labels)
for i, c1 in enumerate(criteria):
    for j, c2 in enumerate(criteria[:i]):
        set_both = sets[i] & sets[j]
        print('{:2.1f}% ({}) - {} & {}'.format(
            len(set_both)*100.0 / df.shape[0], len(set_both),c1, c2))

# Venn diagram with mortality

In [None]:
# column name -> label shown on the diagram
venn_labels = OrderedDict([
    ('hospital_expire_flag', 'In-hospital mortality'),
    ('sepsis_angus', 'Angus criteria'),
    ('sepsis-3', 'Sepsis-3 criteria'),
])

# icustay_ids flagged in each column, in the order given above
sets = [set(df.loc[df[c] == 1, 'icustay_id']) for c in venn_labels]
set_names = list(venn_labels.values())

plt.figure(figsize=[16,16])
plt.rcParams.update({'font.size': 15})
# label each region with its count and its share of the whole cohort
venn3(sets, set_names,
      subset_label_formatter=lambda x: '{:,}\n{:2.1f}%'.format(x, x*100.0/df.shape[0]))
# NOTE(review): other Venn cells in this notebook save to this same filename,
# so whichever runs last overwrites the image - confirm intended filenames
plt.savefig('sepsis3-venn-blood-culture.png')
plt.show()

# Other numbers for above venn diagram
set_all = set.intersection(*sets)
set_other = set(df['icustay_id'].values).difference(*sets)
print('{} patients ({:2.1f}%) satisfied all criteria.'.format(
    len(set_all), len(set_all)*100.0 / df.shape[0]))
print('{} patients ({:2.1f}%) satisfied no criteria.'.format(
    len(set_other), len(set_other)*100.0 / df.shape[0]))

# pair-wise overlap between every two sets
criteria = list(venn_labels)
for i, c1 in enumerate(criteria):
    for j, c2 in enumerate(criteria[:i]):
        set_both = sets[i] & sets[j]
        print('{:2.1f}% ({}) - {} & {}'.format(
            len(set_both)*100.0 / df.shape[0], len(set_both), c1, c2))

In [None]:
# pick up any edits made to the venn module
reload(venn)

# column name -> label shown on the diagram
venn_labels = OrderedDict([
    ('sofa>=2', 'SOFA >= 2'),
    ('sepsis_angus', 'Angus criteria'),
    ('suspicion_poe', 'Suspicion of infection'),
    ('hospital_expire_flag', 'In-hospital mortality'),
])

# icustay_ids flagged in each column, in the order given above
sets = [set(df.loc[df[c] == 1, 'icustay_id']) for c in venn_labels]
set_names = list(venn_labels.values())

fontdict = {'fontsize': 15, 'fontweight': 'normal'}
# draw twice: fill='percent' on a 12x12 figure, then also with just percent
for fill_options in ({'fill': 'percent', 'figsize': [12,12]},
                     {'fill': 'percent_only'}):
    venn.venn4(sets, set_names, show_plot=False, fontdict=fontdict, **fill_options)
    # create a legend handle only to remove it from the figure
    plt.legend('off').remove()
    plt.show()

set_all = set.intersection(*sets)
set_none = set(df['icustay_id'].values).difference(set.union(*sets))
print('{} patients ({:2.1f}%) satisfied all criteria.'.format(
    len(set_all), len(set_all)*100.0 / df.shape[0]))
print('{} patients ({:2.1f}%) satisfied no criteria.'.format(
    len(set_none), len(set_none)*100.0 / df.shape[0]))


# pair-wise overlap between every two sets
criteria = list(venn_labels)
for i, c1 in enumerate(criteria):
    for j, c2 in enumerate(criteria[:i]):
        set_both = sets[i] & sets[j]
        print('{:2.1f}% ({}) - {} & {}'.format(
            len(set_both)*100.0 / df.shape[0], len(set_both),c1, c2))

## Add in year, if available

In [None]:
# Plot and tabulate the percentage of patients meeting each criterion per
# fiscal year. Requires the optional file mimiciii_fiscal_year.csv.
if not os.path.isfile('mimiciii_fiscal_year.csv'):
    print('Cannot create plot as years are not available.')
else:
    yr = pd.read_csv('mimiciii_fiscal_year.csv')
    # set columns to lower case, renaming FISCALYEAR to 'year'
    yr.columns = [x.lower() if x != 'FISCALYEAR' else 'year' for x in yr.columns]

    # attach the fiscal year to the cohort by hospital admission
    yr = yr.merge(df, how='inner', left_on='hadm_id', right_on='hadm_id')

    # fiscal years shown on the plot and in the table
    yr_keep = np.linspace(2009,2012,4)

    # columns of the cohort which are plotted over time
    sepsis = ['sepsis_angus','sepsis_martin','sepsis_explicit',
              'sepsis_nqf','sepsis_cdc',
              'sepsis-3',
              #'septic_shock_explicit', 'severe_sepsis_explicit', 'suspicion_mv',
              'blood culture', 'abx_poe', 'suspicion_poe']

    # np.isin is the supported replacement for the deprecated np.in1d
    grouped = yr.loc[np.isin(yr['year'],yr_keep),:].groupby('year')

    plt.figure(figsize=[12,8])

    # get the x-axis from the original data - right now it's plotting against 0:NUMBER_OF_YEARS
    # (only referenced by the commented-out tick code below)
    year = np.unique(yr['year'].values)
    Y = year.size

    # set x-axis labels to years
    #ax.set_xticks(range(Y))
    #ax.set_xticklabels(year,fontsize=14)

    pretty_labels = {'sepsis_angus': 'Angus et al. criteria',
                    'sepsis_martin': 'Martin et al. criteria',
                    'sepsis_explicit': 'Explicit ICD-9 coded sepsis',
                    'sepsis_nqf': 'CMS NQF #0500 criteria',
                    'sepsis_cdc': 'CDC Epicenters surveillance criteria',
                    'septic_shock_explicit': 'ICD-9 code 785.52 (septic shock)',
                    'severe_sepsis_explicit': 'ICD-9 code 995.92 (severe sepsis)',
                    'sofa>=2': 'SOFA >= 2',
                    'sepsis-3': 'Sepsis-3 criteria',
                    'blood culture': 'Blood culture',
                    'abx_poe': 'Antibiotic ordered',
                    'suspicion_mv': 'Blood culture + Antibiotics (IV)',
                    'suspicion_poe': 'Suspicion (Blood culture + Antibiotics)',
                    'suspicion_piv': 'Blood culture + Antibiotics (POE IV)'}

    # if listed here, we use a special marker
    # otherwise we just use 'o'
    marker_special = {'severe_sepsis_explicit': '^',
                      'sepsis_martin': '^',
                      'sepsis_explicit': 's',
                      'sepsis_angus': 'd',
                      'suspicion_poe': 'd'
                     }

    # prevent overlapping labels by changing the y_pos
    y_pos_fix = {'suspicion_poe': -1,
                'sepsis_martin': 0.4,
                'sepsis_explicit': -1.5,
                'sepsis_cdc': 1.3,
                'sepsis_angus': -1.3}
    # ===================== #
    # === PLOT THE DATA === #
    # ===================== #
    for s in sepsis:
        # use a local name so the module-level `marker` list is not overwritten
        line_marker = marker_special.get(s, 'o')

        # percentage of patients meeting the criterion in each kept year
        p = plt.plot(grouped[s].mean()*100.0, label=pretty_labels[s],
                linewidth=2,linestyle='-',marker=line_marker,markersize=12)
        line_color = p[0].get_color()

        # Add a text label to the right end of every line
        y_pos = yr.loc[ yr['year'] == yr_keep[-1],s ].mean()*100.0
        x_pos = 2012.85

        y_pos = y_pos + y_pos_fix.get(s, 0)

        # plot the symbol
        plt.plot(x_pos-0.1, y_pos+1.0, linestyle='',
                 color=line_color, clip_on=False,
                 marker=line_marker, markersize=12)
        plt.text(x_pos, y_pos, pretty_labels[s],
                 fontsize=18, fontweight='bold', color=line_color)


    # pretty the plot
    plt.ylabel('Percentage of Patients', fontsize=24)
    #plt.legend(loc=[0.6,0.45])
    plt.xticks(yr_keep, ['{:g}'.format(x) for x in yr_keep])
    plt.ylim([0,100])
    plt.xlim([yr_keep[0]-.5,yr_keep[-1]+.5])
    plt.xlabel('Fiscal Year of Admission', fontsize=24)
    plt.savefig('SepsisOverTime.png')

    plt.show()

    # print table: one column per year, one row per criterion
    print('{:15s}'.format('Score'),end='')
    for y in yr_keep:
        print('\t{:g}'.format(y),end='')
    print('')

    for s in sepsis:
        print('{:15s}'.format(s),end='')
        for y in yr_keep:
            in_year = yr['year']==y
            print('\t{:2.1f}'.format( np.sum(yr.loc[in_year,s])*100.0/np.sum(in_year)),end='')
        print('')

In [None]:
# IPython help lookup for plt.xticks (not valid plain Python);
# NOTE(review): appears to be left over from interactive development
plt.xticks?

# Mortality rates for each group

In [None]:
target_header = "hospital_expire_flag"
idx = df[target_header]==1

# make a confusion matrix with multiple scores in each square
scores = ['sepsis_angus','sepsis_martin','sepsis_explicit','suspicion_poe','sepsis-3','sofa>=2']

# columns: criterion, N and % of cohort with the outcome, N and % of cohort
# without the outcome, outcome rate within the group
row_format = '{:15s} {:4d} {:1.1f}%\t{:5d} {:1.1f}%  {:1.1f}%'

# The header needs four placeholders; with only three, the 'outcome %' title
# was silently dropped.
print('{:15s} {:15s} {:15s} {:s}'.format('0','dead','alive','outcome %'))

# patients who do not meet each criterion
for c in scores:
    group = df[c]!=1
    n_outcome = np.sum( group&idx )
    n_no_outcome = np.sum( group&~idx )
    print(row_format.format(
            c,
            n_outcome, n_outcome*100.0/df.shape[0],
            n_no_outcome, n_no_outcome*100.0/df.shape[0],
            n_outcome*100.0/np.sum(group)
        ))
print()
print('1')
# patients who meet each criterion
for c in scores:
    group = df[c]==1
    n_outcome = np.sum( group&idx )
    n_no_outcome = np.sum( group&~idx )
    print(row_format.format(
            c,
            n_outcome, n_outcome*100.0/df.shape[0],
            n_no_outcome, n_no_outcome*100.0/df.shape[0],
            n_outcome*100.0/np.sum(group)
        ))

# Composite outcome for each group

In [None]:
target_header = "composite_outcome"
# composite outcome: in-hospital death or icu_los of at least 3; this is the
# same condition that defines df['composite_outcome'], so target_header is used
idx = df[target_header]==1

# make a confusion matrix with multiple scores in each square
scores = ['sepsis_angus','sepsis_martin','sepsis_explicit','suspicion_poe','sepsis-3','sofa>=2']

# columns: criterion, N and % of cohort with the outcome, N and % of cohort
# without the outcome, outcome rate within the group
row_format = '{:15s} {:4d} {:1.1f}%   {:5d} {:1.1f}%  {:1.1f}%'

# The header needs four placeholders; with only three, the 'outcome %' title
# was silently dropped.
print('{:15s} {:12s} {:15s} {:s}'.format('0','dead/hi-LOS','alive/lo-LOS','outcome %'))

# patients who do not meet each criterion
for c in scores:
    group = df[c]!=1
    n_outcome = np.sum( group&idx )
    n_no_outcome = np.sum( group&~idx )
    print(row_format.format(
            c,
            n_outcome, n_outcome*100.0/df.shape[0],
            n_no_outcome, n_no_outcome*100.0/df.shape[0],
            n_outcome*100.0/np.sum(group)
        ))
print()
print('1')
# patients who meet each criterion
for c in scores:
    group = df[c]==1
    n_outcome = np.sum( group&idx )
    n_no_outcome = np.sum( group&~idx )
    print(row_format.format(
            c,
            n_outcome, n_outcome*100.0/df.shape[0],
            n_no_outcome, n_no_outcome*100.0/df.shape[0],
            n_outcome*100.0/np.sum(group)
        ))

# Mortality: operating point statistics

In [None]:
target_header = "hospital_expire_flag"

# binary predictions: each severity score thresholded at >= 2
score_columns = [('SOFA', 'sofa'), ('SIRS', 'sirs'), ('qSOFA', 'qsofa')]
yhat_dict = OrderedDict(
    (name, df[column].values >= 2) for name, column in score_columns)

# operating point statistics of each prediction against the outcome
stats_all = su.get_op_stats(yhat_dict, df[target_header].values)

su.print_op_stats(stats_all)

# Composite outcome: operating point statistics

In [None]:
target_header = "composite_outcome"

# binary predictions: each severity score thresholded at >= 2
score_columns = [('SOFA', 'sofa'), ('SIRS', 'sirs'), ('qSOFA', 'qsofa')]
yhat_dict = OrderedDict(
    (name, df[column].values >= 2) for name, column in score_columns)

# operating point statistics of each prediction against the outcome
stats_all = su.get_op_stats(yhat_dict, df[target_header].values)

su.print_op_stats(stats_all)

## Cronbach alpha and Kuder-Richardson Formula 20 agreement

In [None]:
# fix the seed before calling the agreement tables
np.random.seed(21381)
# pick up any edits made to the utility module
reload(su)

# the definitions compared in both tables
agreement_columns = ['sepsis_angus','sepsis_martin', 'sepsis_explicit','sepsis-3']

# cronbach alpha for construct validity
print(' === Cronbach Alpha ===')
su.cronbach_alpha_table(df, list(agreement_columns))

print('')
print(' === KR20 ===')
su.kr20_table(df, list(agreement_columns))

print('Cronbach alpha is a generalization of KR20 to ordinal items.')

# Severity of illness stats

In [None]:
# cohort size (non-null icustay_ids); by definition 100% of the cohort
print('{:5g} ({:3.1f}%) first ICU stay for adults.'.format(
    df['icustay_id'].count(), 100))

severity_columns = ['sirs','qsofa','sofa','sepsis-3',
                    'sepsis_angus','sepsis_martin','sepsis_explicit']
for c in severity_columns:
    # columns whose maximum is 1 are counted where equal to 1;
    # every other column is counted at a threshold of >= 2
    if df[c].max() == 1:
        n_met = (df[c]==1).sum()
        template = '{:5g} ({:3.1f}%)  with {}'
    else:
        n_met = (df[c]>=2).sum()
        template = '{:5g} ({:3.1f}%)  with {} >= 2'
    print(template.format(n_met, n_met*100.0/df.shape[0], c))

In [None]:
# cohort size (non-null icustay_ids); by definition 100% of the cohort
print('{:5g} ({:3.1f}%) first ICU stay for adults.'.format(
    df['icustay_id'].count(), 100))

# component scores which are 0 in the *_norx column but 1 in the regular column
resp_increase = (df['qsofa_resprate_score_norx']==0) & (df['qsofa_resprate_score']==1)
sysbp_increase = (df['qsofa_sysbp_score_norx']==0) & (df['qsofa_sysbp_score']==1)

# (mask, report template) pairs, printed in order
reports = [
    (df['qsofa_norx']<2,
     '{:5g} ({:3.1f}%)  with qSOFA < 2'),
    (resp_increase,
     '{:5g} ({:3.1f}%)  with mech vent increase'),
    (sysbp_increase,
     '{:5g} ({:3.1f}%)  with vasopressor increase'),
    (resp_increase & sysbp_increase,
     '{:5g} ({:3.1f}%)  with both increased'),
    ((df['qsofa_norx']<2) & (df['qsofa']>=2),
     '{:5g} ({:3.1f}%)  with qSOFA increased to >= 2'),
    (df['qsofa']>=2,
     '{:5g} ({:3.1f}%)  with qSOFA>=2 (incl treatment flags)'),
]
for mask, template in reports:
    N = mask.sum()
    print(template.format(N, N*100.0/df.shape[0]))

## AUROC of severity of illness scores

In [None]:
"""
# ensure bootstrap always results in the same confidence intervals
np.random.seed(978236412)

# define outcome
target_header = "hospital_expire_flag"

# define the covariates to be added in the model (used for table of AUROCs)
preds_header = ['sirs','qsofa','sofa']
preds = su.calc_predictions(df, preds_header, target_header, model=None)

print('AUROC table for unadjusted scores:')
su.print_auc_table(preds, df[target_header].values, preds_header)
print('\n')
"""

# Mortality: build models with different covariates

In [None]:
# define outcome
target_header = "hospital_expire_flag"

# define the covariates to be added in the model (used for table of AUROCs)
preds_header = ['sirs','qsofa','qsofa_norx','sofa']

# predictions from the raw scores (model=None) and from logistic regression
preds = su.calc_predictions(df, preds_header, target_header, model=None)
preds_logreg = su.calc_predictions(df, preds_header, target_header, model='logreg')

# one AUROC table per set of predictions
auc_tables = [
    ('AUROC table for unadjusted scores:', preds),
    ('AUROC table for adjusted scores (using logistic regression):', preds_logreg),
]
for title, predictions in auc_tables:
    print(title)
    su.print_auc_table(predictions, df[target_header].values, preds_header)
    print('\n')

In [None]:
from statsmodels.formula.api import logit

# Outcome and the single severity score examined in this cell.
target_header = "hospital_expire_flag"
pred_header = 'sofa'

# Baseline: AUROC of the raw score on its own (no model, no interval).
auc = ru.calc_auc(df[pred_header].values, df[target_header].values, with_ci=False)
print('{} - univariable'.format(pred_header))
print('AUROC={:0.3f}'.format(auc))

# Lactate with missing values replaced by 0. Assign the filled Series back
# rather than calling fillna(inplace=True) on a column selection: that chained
# in-place form warns on recent pandas and does not modify df under
# Copy-on-Write.
# NOTE(review): `lactate_missing` in the formulas is presumably the matching
# missingness indicator created earlier in the notebook - confirm.
df['lactate_max_imputed'] = df['lactate_max'].fillna(0)

# Covariate sets added alongside the score, fitted in this order:
#   1. lactate only, 2. age + comorbid burden, 3. all of the above.
# The full set is last so that `model` is left holding the full model for
# later cells that read `model`.
covariate_sets = [
    "lactate_missing + lactate_max_imputed",
    "age + elixhauser_hospital",
    "age + elixhauser_hospital + lactate_missing + lactate_max_imputed",
]

for covariates in covariate_sets:
    formula = target_header + " ~ " + covariates + " + " + pred_header
    print('{}'.format(formula))

    # Fit quietly (disp=0) and score the model's own predictions against the
    # observed outcome; with_ci=True yields the AUROC plus a two-element
    # interval at alpha=0.05.
    model = logit(formula=formula,data=df).fit(disp=0)
    auc, ci = ru.calc_auc(model.predict(), df[target_header].values, with_ci=True, alpha=0.05)

    print('AUROC={:0.3f} [{:0.3f} - {:0.3f}]'.format(auc,ci[0],ci[1]))
    print(model.summary2())

# Composite outcome - prediction performance

In [None]:
# Outcome column the scores are evaluated against.
target_header = 'composite_outcome'

# Scores compared in the AUROC tables.
preds_header = ['sirs','qsofa','qsofa_norx', 'sofa']

# Predictions from the raw scores (model=None) and from the 'logreg' option
# of su.calc_predictions; both are computed before anything is printed.
preds = su.calc_predictions(df, preds_header, target_header, model=None)
preds_logreg = su.calc_predictions(df, preds_header, target_header, model='logreg')

# Emit one titled AUROC table per prediction set, each followed by a spacer.
auc_tables = (
    ('AUROC table for unadjusted scores:', preds),
    ('AUROC table for adjusted scores (using logistic regression):', preds_logreg),
)
for table_title, table_preds in auc_tables:
    print(table_title)
    su.print_auc_table(table_preds, df[target_header].values, preds_header)
    print('\n')

In [None]:
from statsmodels.formula.api import logit

# Outcome and the single severity score examined in this cell.
target_header = 'composite_outcome'
pred_header = 'sofa'

# Baseline: AUROC of the raw score on its own (no model, no interval).
auc = ru.calc_auc(df[pred_header].values, df[target_header].values, with_ci=False)
print('{} - univariable'.format(pred_header))
print('AUROC={:0.3f}'.format(auc))

# Lactate with missing values replaced by 0. Assign the filled Series back
# rather than calling fillna(inplace=True) on a column selection: that chained
# in-place form warns on recent pandas and does not modify df under
# Copy-on-Write.
# NOTE(review): `lactate_missing` in the formulas is presumably the matching
# missingness indicator created earlier in the notebook - confirm.
df['lactate_max_imputed'] = df['lactate_max'].fillna(0)

# Covariate sets added alongside the score, fitted in this order:
#   1. lactate only, 2. age + comorbid burden, 3. all of the above.
# The full set is last so that `model` is left holding the full model, which
# the odds-ratio printout in the following cell reads.
covariate_sets = [
    "lactate_missing + lactate_max_imputed",
    "age + elixhauser_hospital",
    "age + elixhauser_hospital + lactate_missing + lactate_max_imputed",
]

for covariates in covariate_sets:
    formula = target_header + " ~ " + covariates + " + " + pred_header
    print('{}'.format(formula))

    # Fit quietly (disp=0) and score the model's own predictions against the
    # observed outcome; with_ci=True yields the AUROC plus a two-element
    # interval at alpha=0.05.
    model = logit(formula=formula,data=df).fit(disp=0)
    auc, ci = ru.calc_auc(model.predict(), df[target_header].values, with_ci=True, alpha=0.05)

    print('AUROC={:0.3f} [{:0.3f} - {:0.3f}]'.format(auc,ci[0],ci[1]))
    print(model.summary2())

In [None]:
# Exponentiate the fitted coefficients of the most recently fitted `model`
# and list them one per line, aligned under a 20-character name column.
print('Odds ratios for final model w/ lactate')
term_names = model.params.index
odds_ratios = np.exp(model.params.values)
for term, odds in zip(term_names, odds_ratios):
    print('{:20s} {:1.2f}'.format(term, odds))

## Lactate groups

In [None]:
# Boolean row masks splitting patients by maximum lactate; insertion order
# fixes the group order passed to su.create_grouped_hist.
lactate = df['lactate_max']
groups = OrderedDict()
groups['No lactate'] = lactate.isnull()
groups['0-2'] = (lactate>=0) & (lactate<=2)
groups['2-4'] = (lactate> 2) & (lactate<=4)
groups['>4'] = (lactate> 4) & (lactate<=200)

# One grouped histogram per outcome. The SOFA split and its labels are
# rebuilt for each call so every call receives fresh inputs.
hist_by_outcome = {}
for outcome in ('hospital_expire_flag', 'composite_outcome'):
    idxA = (df['sofa']>=2).values
    strAdd = ['SOFA< 2','SOFA>=2']
    hist_by_outcome[outcome] = su.create_grouped_hist(
        df, groups, idxA, strAdd=strAdd, targetStr=outcome)

x_sofa, lbl_sofa = hist_by_outcome['hospital_expire_flag']
x_sofa_composite, lbl_sofa_composite = hist_by_outcome['composite_outcome']

# Tick labels: drop the 'SOFA' prefix and abbreviate the missing-lactate group.
tick_labels = []
for lbl in lbl_sofa:
    tick_labels.append(lbl.replace('SOFA','').replace('No lactate','N/A'))

# Eight bar slots (positions 0..7), two side-by-side bars per slot.
xi = np.linspace(0,7,8)
bar_width = 0.4

plt.figure(figsize=[10,6])
plt.bar(xi+0.1, x_sofa, facecolor='b', label='SOFA', width=bar_width)
plt.bar(xi+0.5, x_sofa_composite, facecolor='r', alpha=0.5,
        label='SOFA composite', width=bar_width)
plt.xticks(xi+0.3, tick_labels)
plt.legend(loc='upper left')
plt.ylabel("Hospital mortality / composite")
# Legend for the tick labels, placed just past the last bar.
plt.text(7.8,-0.09,'Lactate\nSOFA')
plt.show()

In [None]:
# Boolean row masks splitting patients by maximum lactate; the OrderedDict
# keeps the groups in the order listed here.
groups = OrderedDict( [['No lactate', df['lactate_max'].isnull()],
         ['0-2', (df['lactate_max']>=0) & (df['lactate_max']<=2)],
         ['2-4', (df['lactate_max']> 2) & (df['lactate_max']<=4)],
         ['>4', (df['lactate_max']> 4) & (df['lactate_max']<=200)]] )


# Grouped values for hospital mortality, split on SOFA >= 2.
idxA = (df['sofa']>=2).values
strAdd = ['SOFA< 2','SOFA>=2']
x_sofa, lbl_sofa = su.create_grouped_hist(df, groups, idxA, strAdd=strAdd, targetStr='hospital_expire_flag')


# Same split for the composite outcome.
idxA = (df['sofa']>=2).values
strAdd = ['SOFA< 2','SOFA>=2']
x_sofa_composite, lbl_sofa_composite = su.create_grouped_hist(df, groups, idxA, strAdd=strAdd, targetStr='composite_outcome')

# Header row: a blank 10-character label column, then the SOFA< 2 / SOFA>=2
# column pair twice (mortality values first, composite values second).
print('{:10s}\tSOFA< 2\tSOFA>=2'.format(''),end='')
print('\t\tSOFA< 2\tSOFA>=2')

# Entries are consumed in consecutive pairs (index 2*i and 2*i+1), one pair
# per printed row. Floor division keeps the pair count an int; plain `/`
# yields a float on Python 3 and range() rejects it.
n_pairs = len(lbl_sofa) // 2
for i in range(n_pairs):
    i1 = 2*i
    i2 = 2*i+1

    # The row label is the first line of the pair's first label; values are
    # scaled by 100 for display.
    print('{:10s}\t{:2.1f}\t{:2.1f}'.format( lbl_sofa[i1].split('\n')[0], 100.0*x_sofa[i1], 100.0*x_sofa[i2] ), end='')
    print('\t\t{:2.1f}\t{:2.1f}'.format( 100.0*x_sofa_composite[i1], 100.0*x_sofa_composite[i2] ))