In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import sklearn
from sklearn.metrics import roc_curve, auc
from sklearn.externals import joblib
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from sklearn import preprocessing

pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', -1)

import matplotlib.pyplot as plt
import datetime

import seaborn as sns
sns.set(style='ticks')

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Set up SQL Alchemy engine and session
Base = automap_base()

# Doing basic probes on data locally
engine = create_engine("postgresql://mimicuser:TokyoP%40rkDr!pgolf@localhost/mimic")

# Reflect the tables
Base.prepare(engine, reflect=True, schema='mimiciii')

# mapped classes are now created with names by default
# matching that of the table name.
Admission = Base.classes.admissions
Patient = Base.classes.patients
LabEvent = Base.classes.labevents
BioEvent = Base.classes.microbiologyevents

session = Session(engine)

%env DATABASE_URL=postgresql://mimicuser:TokyoP%40rkDr!pgolf@localhost/mimic
        
import os
import psycopg2
import pandas as pd
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', -1)
def get_connection():
    dsn = os.environ.get('DATABASE_URL')
    return psycopg2.connect(dsn)


conn = get_connection()

In [2]:
# Load the pneumonia admissions and the complete patients table into DataFrames.
# NOTE(review): LIKE without % wildcards only matches rows whose diagnosis is
# exactly 'PNEUMONIA' — confirm that this is the intended cohort.
admission_pne = pd.read_sql(
    "select * from mimiciii.admissions where diagnosis like 'PNEUMONIA';",
    conn,
)
patients = pd.read_sql("select * from mimiciii.patients;", conn)

In [3]:
# Narrow both frames to the columns that the cohort table is built from.
patients = patients.loc[:, ['subject_id', 'gender', 'dod', 'dob']]
colum = ['subject_id', 'hadm_id', 'admittime']
admission = admission_pne.loc[:, colum]

In [5]:
# Left-join the patient columns onto every admission row, matching on subject_id.
patient_info = admission.join(
    other=patients.set_index(keys='subject_id'),
    on='subject_id',
    how='left',
)
patient_info.shape

(1566, 6)

In [6]:
def period(row, period):
    """Return 1 if the patient died within `period` of admission, otherwise 0.

    Args:
        row: mapping-like record providing 'dod' and 'admittime'.
        period: longest dod - admittime gap that still counts as a death
            (compared with ``>``, so a gap exactly equal to `period` counts).

    Returns:
        0 when 'dod' is null or the gap exceeds `period`; 1 otherwise.
    """
    died_on = row['dod']
    # No recorded date of death: not a death inside the window.
    if pd.isnull(died_on):
        return 0
    gap = died_on - row['admittime']
    return 0 if gap > period else 1
# Label every admission row with period(row, 40 days): 1 = died within the window, 0 = otherwise.
patient_info['death_period'] = patient_info.apply(period, axis=1, args=(pd.Timedelta('40 days'),))
# Drop overlapping patients: only the first row for each subject_id is kept.
patient_info = patient_info.drop_duplicates(subset='subject_id')

In [8]:
# Preview the first five rows of the labelled cohort table.
patient_info.head(5)

Unnamed: 0,subject_id,hadm_id,admittime,gender,dod,dob,death_period
0,357,101651,2199-10-20 12:05:00,M,2201-08-02,2135-03-22,0
1,368,105889,2137-07-11 17:56:00,M,2141-01-28,1837-07-11,0
3,370,123421,2153-03-04 01:17:00,F,2153-03-12,2069-08-26,1
4,68,170467,2173-12-15 16:16:00,F,2174-02-11,2132-02-29,0
5,85,112077,2167-07-25 18:49:00,M,2167-09-12,2090-09-18,0


In [9]:
def itemid_to_value(target, itemid, label, dataframe):
    """Return the `label` value of the first row whose `itemid` column equals `target`.

    Args:
        target: value to look for in column `itemid`.
        itemid: name of the column that is compared against `target`.
        label: name of the column whose value is returned.
        dataframe: DataFrame containing both columns.

    Raises:
        IndexError: if no row matches `target`.
    """
    is_match = dataframe[itemid] == target
    matching_rows = dataframe.loc[is_match]
    # Only the first match is used; any further matches are ignored.
    return matching_rows[label].values[0]

In [12]:
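# NOTE: legacy single-model versions of plotroc/plot_auc, kept commented out for
# reference only; the active three-model versions are defined in the next cell.
# def plotroc(fpr_, tpr_, filename):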
#     roc_auc = auc(fpr_, tpr_)

#     plt.figure(figsize=(7,7))
#     plt.plot(fpr_, tpr_, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
#     plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
#     plt.xlim([0.0, 1.0])
#     plt.ylim([0.0, 1.05])
#     plt.xlabel('False Positive Rate', fontsize=15)
#     plt.ylabel('True Positive Rate', fontsize=15)
#     plt.title('Receiver operating characteristic', fontsize=15)
#     plt.legend(loc="lower right", fontsize=15)
#     plt.savefig(filename)
#     plt.show()
# def plot_auc(model, test_x, test_y, filename):
#     pred_model = model.predict_proba(test_x)
#     #pred_model = model.predict(test_x)
#     pred_model = pred_model.tolist()
#     pred_model_list = []
#     for i in range(len(pred_model)):
#         pred_model_list.append(pred_model[i][1])
#         #pred_model_list.append(pred_model[i])
#     pred_model_list = np.asarray(pred_model_list)
#     fpr1, tpr1, thresholds1 = roc_curve(test_y, pred_model_list)

#     plotroc(fpr1, tpr1, filename)

In [13]:
def plotroc(fpr_, tpr_, filename):
    """Draw three ROC curves on one figure, save it to `filename`, then show it.

    Args:
        fpr_: indexable holding the false-positive-rate arrays at positions 0, 1, 2.
        tpr_: indexable holding the matching true-positive-rate arrays.
        filename: destination handed to ``plt.savefig``.

    The curves at positions 0, 1 and 2 are labelled LR, SVM and RF in the
    legend, each with its AUC.
    """
    # (line colour, legend template) for the curves at positions 0, 1 and 2.
    curve_specs = (
        ('sienna', 'LR AUC = %0.2f'),
        ('forestgreen', 'SVM AUC = %0.2f'),
        ('crimson', 'RF AUC = %0.2f'),
    )
    # All three areas are computed before the figure is opened.
    areas = [auc(fpr_[k], tpr_[k]) for k in range(len(curve_specs))]

    plt.figure(figsize=(7, 7))
    for k, (colour, legend_fmt) in enumerate(curve_specs):
        plt.plot(fpr_[k], tpr_[k], color=colour, lw=2, label=legend_fmt % areas[k])

    # Dashed diagonal for reference.
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=17)
    plt.ylabel('True Positive Rate', fontsize=17)
    plt.title('Receiver operating characteristic', fontsize=17)
    plt.legend(loc="lower right", fontsize=18)
    plt.savefig(filename)
    plt.show()
def _positive_class_roc(model, test_x, test_y):
    """Return (fpr, tpr) for `model` on the test set, scored by predict_proba column 1."""
    # Column 1 of predict_proba is the score handed to roc_curve.
    scores = np.asarray(model.predict_proba(test_x))[:, 1]
    fpr, tpr, _thresholds = roc_curve(test_y, scores)
    return fpr, tpr


def plot_auc(model1, model2, model3, test_x, test_y, filename):
    """Plot the ROC curves of three fitted classifiers on the same test set.

    Args:
        model1, model2, model3: fitted models exposing ``predict_proba``. They
            are passed to ``plotroc`` in this order, which labels them LR, SVM
            and RF respectively.
        test_x: feature matrix given to each model's ``predict_proba``.
        test_y: true labels given to ``roc_curve``.
        filename: output path forwarded to ``plotroc``.

    Returns:
        None. The figure is saved and displayed by ``plotroc``.
    """
    curves = [
        _positive_class_roc(model, test_x, test_y)
        for model in (model1, model2, model3)
    ]
    fprs = [fpr for fpr, _tpr in curves]
    tprs = [tpr for _fpr, tpr in curves]
    plotroc(fprs, tprs, filename)