In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import logit


import sys
import os
# the following line adds the parent folder to the path
# it lets us import the sepsis_utils package
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from sepsis_utils import sepsis_utils as su
from sepsis_utils import roc_utils as ru

from sklearn.pipeline import Pipeline

# used for train/test splits
from sklearn.cross_validation import train_test_split

# used to impute mean for data
from sklearn.preprocessing import Imputer

# normalize the data
from sklearn import preprocessing

# logistic regression is our model of choice
from sklearn.linear_model import LogisticRegression

# used to create confusion matrix
from sklearn.metrics import confusion_matrix

from sklearn.cross_validation import cross_val_score

# used to calculate AUROC/accuracy
from sklearn import metrics

# for calibration curve of severity scores
from sklearn.calibration import calibration_curve

# Default plot styling for the figures in this notebook.
# `col` holds eight RGB triples (each component in [0, 1]).
col = [
    [0.9047, 0.1918, 0.1988],
    [0.2941, 0.5447, 0.7494],
    [0.3718, 0.7176, 0.3612],
    [1.0000, 0.5482, 0.1000],
    [0.4550, 0.4946, 0.4722],
    [0.6859, 0.4035, 0.2412],
    [0.9718, 0.5553, 0.7741],
    [0.5313, 0.3359, 0.6523],
]

# matplotlib marker codes
marker = ['v', 'o', 'd', '^', 's', 'o', '+']

# matplotlib line styles (same length as `col`).
# The sixth entry used to be 's', which is a marker code rather than a line
# style and is rejected by matplotlib when passed as `linestyle`; it is now a
# solid line like the entries before it.
ls = ['-', '-', '-', '-', '-', '-', '--', '--']
%matplotlib inline

from __future__ import print_function

# Load the data from SQL database

In [None]:
# create a database connection

# below config used on pc70
sqluser = 'alistairewj'
dbname = 'mimic'
schema_name = 'mimiciii'

# cohort filter passed to su.get_data: adult = 1, icustay_num = 1 and a
# suspected infection time earlier than one day after `intime`
excl = "adult = 1 and icustay_num = 1 and (suspected_infection_time < intime + interval '1' day)"

# Connect to local postgres version of mimic
con = psycopg2.connect(dbname=dbname, user=sqluser)
try:
    # NOTE(review): get_data is called without the connection; presumably it
    # manages its own data source - confirm against sepsis_utils.
    df = su.get_data(exclusions=excl)
    dd = su.get_physiologic_data(con)
finally:
    # close the database connection once extraction is finished, and also when
    # one of the extraction calls raises, so the connection is never leaked
    con.close()

## What factors would improve Sepsis-3 guidelines?

In [None]:
# initialize our dataframe to the cohort: inner join of the cohort (df) with the
# physiologic data (dd) on icustay_id, then index the result by icustay_id
dm = df.merge(dd, how='inner', on='icustay_id', suffixes=('', '_dd'))
dm.set_index('icustay_id', inplace=True)

# only look at icustay_ids with suspected infection
# (filter with a boolean row mask instead of re-selecting by index label, so
#  rows cannot be duplicated if an icustay_id occurs more than once)
dm = dm.loc[dm['suspected_infection_time'].notnull().values, :]
iid_suspected = dm.index

# we subselect to patients classified as positive by sepsis-3
idxData = (dm.qsofa.values >= 2) & (dm.sofa.values >= 2)

# define targets using angus criteria
y = dm.angus.values == 1

#TODO: REMOVE OTHER UNDESIRABLE COLUMNS !

# positions of the columns of dm whose names also appear in dd
idx = [i for i, name in enumerate(dm.columns) if name in dd.columns]

# idx holds integer positions, not column labels, so select with iloc
# (dm[idx] would look the integers up as labels)
X_data = dm.iloc[:, idx].values

# create the header from the column index we made earlier
X_header = [dm.columns[i] for i in idx]

# restrict features and targets to the sepsis-3 positive patients
X = X_data[idxData, :]
y = y[idxData]

# Columns without a single observed value in this cohort cannot be mean-imputed:
# Imputer discards them on transform, which would leave X_tr with fewer columns
# than X_header and sigma. Remove them here so all three stay aligned.
# (idx and X_data are left untouched and still describe all candidate columns.)
has_data = ~np.all(np.isnan(X), axis=0)
X = X[:, has_data]
X_header = [name for name, keep in zip(X_header, has_data) if keep]

# A missing-value indicator variant (extra '<column>_NaN' features) was sketched
# here previously but is not used.

# impute mean for missing values
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X)

# custom scaling of data to avoid normalizing to missing data:
# sigma[i] is the standard deviation of the observed (non-NaN) values of
# column i, and stays at 1 when fewer than two values are observed
sigma = np.ones(X.shape[1])

for i in range(X.shape[1]):
    observed = X[~np.isnan(X[:, i]), i]
    if observed.size > 1:
        sigma[i] = np.sqrt(np.var(observed))

X_tr = imp.transform(X)

# the odds-ratio labels further down rely on this alignment
assert X_tr.shape[1] == len(X_header) == sigma.size, 'feature matrix, header and sigma are misaligned'

# Logit Model (statsmodels), fit on the imputed feature matrix
# NOTE(review): sm.Logit does not add an intercept by itself and no constant
# column is added to X_tr in the visible code, so this model appears to be fit
# without an intercept - confirm that this is intended (the sklearn variant
# sketched at the end of this section uses fit_intercept=True).
model = sm.Logit(y, X_tr)
results = model.fit()

# in-sample predicted probabilities, and class labels obtained by rounding them
y_hat = results.predict(exog=X_tr, transform=False)
y_pred = np.round(y_hat)

# generate evaluation metrics
accuracy = metrics.accuracy_score(y, y_pred)
print('Accuracy = {}'.format(accuracy))
auroc = metrics.roc_auc_score(y, y_hat)
print('AUROC = {}'.format(auroc))

# print confusion matrix
su.print_cm(y, y_pred)

# Alternative kept for reference: train the logit model using scikit instead
# (note that it refers to X_train, which is not defined in this cell)
#model = LogisticRegression(fit_intercept=True)
#results = model.fit(X_train, y)
# predict class labels for the test set
#y_pred = results.predict(X_train)
#y_hat = results.predict_proba(X_train)
#y_hat = y_hat[:,1]


# Adjusted odds ratios in standard deviation units: each fitted coefficient is
# multiplied by the sigma of its feature before being exponentiated.
# N.B. if using sklearn, change to results.coef_.flatten()
oddsratio = np.exp(results.params * sigma)

# order odds ratios, p-values and feature labels by ascending odds ratio
sort_indices = np.argsort(oddsratio, axis=0)
oddsratio = oddsratio[sort_indices]
pvalue = results.pvalues[sort_indices]
lbls = [X_header[k] for k in sort_indices]

# y-axis position of every feature in the sorted order
ytick = np.arange(oddsratio.size)

# split into two groups:
#   (i)  significant at p<=0.05
#   (ii) insignificant (everything else)
idxSignificant = pvalue <= 0.05
idxInsignificant = ~idxSignificant

or_sig, ytick_sig = oddsratio[idxSignificant], ytick[idxSignificant]
lbl_sig = [name for name, is_sig in zip(lbls, idxSignificant) if is_sig]

or_insig, ytick_insig = oddsratio[idxInsignificant], ytick[idxInsignificant]
lbl_insig = [name for name, is_sig in zip(lbls, idxSignificant) if not is_sig]

# now plot these odds ratios: one row per feature, ordered by odds ratio
# (formatter imported here so this cell does not depend on edits to other cells)
from matplotlib.ticker import FormatStrFormatter

plt.figure(figsize=[12,20])

plt.plot(or_insig, ytick_insig, 's', markersize=8, color=col[0],label='p >  0.05') # insignificant
plt.plot(or_sig, ytick_sig, 'o', markersize=8, color=col[1],label='p <= 0.05') # significant
plt.legend(loc='lower right')
# reference line at an odds ratio of 1; it carries no label, so it is not
# listed in the legend
plt.plot([1.,1.],[0,oddsratio.size],'k--')

ax = plt.gca()
ax.set_yticks(range(oddsratio.size))
ax.set_yticklabels(lbls,fontsize=14,fontweight='bold')
ax.set_ylim([-1,oddsratio.size])

# Format the x tick labels through a formatter instead of freezing strings
# taken from ax.get_xticks(): fixed label text is not tied to the tick
# locations, so it goes stale whenever the ticks change.
ax.xaxis.set_major_formatter(FormatStrFormatter('%2.2f'))
ax.tick_params(axis='x', labelsize=14)

plt.xlabel('Odds ratios (exponentiated coefficient)')
plt.grid()
plt.show()

# Subsequent analyses

It would be interesting to evaluate if SIRS/Sepsis-3 differ in performance after subgrouping the data into categories based upon:

* WBC
* Temperature
* Age
* Gender
* Immunosuppression
    * Prednisone, Prednisolone (Orapred), Methylprednisolone (Medrol), Dexamethasone (Decadron), Hydrocortisone (Cortef), Cortisone
    * Cyclophosphamide (Cytoxan)
    * Cisplatin (Platinol), Carboplatin (Paraplatin)
    * Azathioprine  (Imuran)
    * Mercaptopurine (Purinethol)
    * Methotrexate/MTX (Trexall, Rasuvo)
    * Rituximab (Rituxan, MabThera, Zytux)
    * Basiliximab (Simulect)
    * Daclizumab (Zenapax)
    * Cyclosporin/Ciclosporin (Neoral, Sandimmune)
    * Tacrolimus (Prograf, Advagraf, Protopic)
    * Sirolimus (Rapamune)
    * Infliximab (Remicade)
    * Etanercept (Enbrel)
    * Adalimumab (Humira)
    * Mycophenolate (CellCept, Myfortic)
* BMI
    * < 18.5
    * 18.5 - 24.9
    * 25 - 29.9
    * 30 - 49.9
    * \> 50
* Metastatic cancer (Elixhauser comorbidity)
* Diabetes (Elixhauser comorbidity)

In [None]:
# test age - have to add age to the dataframe from misc
# NOTE(review): `misc` is not created in any cell visible in this notebook, so
# it must already exist in the kernel when this cell runs - confirm where it
# is loaded and add that step above.
dm_tmp = dm.merge(misc, how='left', left_index=True, right_on='icustay_id', suffixes=('','_misc'))

# define "targets", angus criteria
y = dm_tmp.angus.values == 1

qsofa_vals = dm_tmp.qsofa.values
age_vals = dm_tmp.age.values

# define "predictions": every group is scored with the same rule, qSOFA >= 2
# (the SOFA >= 2 part of the Sepsis-3 definition is not applied in this cell)
yhat_names = ['qSOFA','old','young']
yhat_all = [qsofa_vals >= 2 for _ in yhat_names]

# the below filters each group to a subset of patients:
# all patients with a qSOFA value, patients aged 70 or more, patients under 70
idx_group = [~np.isnan(qsofa_vals), age_vals >= 70, age_vals < 70]

results = su.print_op_stats(yhat_all, y, yhat_names=yhat_names, idx=idx_group)

# Appendix

In [None]:
# debug plot for outliers: histogram of the non-missing pao2fio2_min values

plt.figure(figsize=[12,9])

# bin edges 0, 10, 20, ..., 1990
bin_edges = np.arange(200) * 10
pf_values = dm.pao2fio2_min.dropna().values

# the histogram of the data
# NOTE(review): `normed` is deprecated in newer matplotlib releases in favour of
# `density` - confirm which matplotlib version this notebook targets.
n, bins, patches = plt.hist(pf_values, bins=bin_edges, normed=True,
                            facecolor='green', alpha=0.75)

plt.xlabel('pao2fio2_min')
plt.ylabel('Probability')
plt.grid(True)

plt.show()