In [37]:
import os
import sys
import re
from tempfile import mkdtemp

print(sys.version_info)

home_dir = os.getenv("HOME")
print(os.getenv("PYTHONPATH"))

import numpy as np
import pickle
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import metrics, tree
from sklearn.preprocessing import StandardScaler, Imputer, LabelEncoder
from sklearn.metrics import precision_recall_curve, average_precision_score, brier_score_loss, make_scorer
from sklearn.linear_model import LogisticRegression, ElasticNet, ElasticNetCV, ARDRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier

from fancyimpute import SoftImpute
from ehr_utils import *
from xgboost import XGBClassifier

sys.version_info(major=3, minor=6, micro=7, releaselevel='final', serial=0)
/Users/bhizzle/codepy:/Users/bhizzle/cgen:/usr/local:/Users/bhizzle/usr/local:/usr/local/caffe2/python:


ModuleNotFoundError: No module named 'fancyimpute'

In [None]:
# set to False to hide IDs    
verbose = False

## this variable is the column that we will use as the target variable for the model
TARGET_VARIABLE = 'INPT_DEATH_YN'

MIN_ASA_STATUS=1
MAX_ASA_STATUS=5
MIN_AGE=18
MAX_AGE=89

# read data into dataframe 

In [None]:
main_filtered_f = "main_Nov21_2017_Feb_13_2018.filtered.main.txt"

exp_prefix = "preop_asa"
dir_to_save_files = os.path.join("paper/", exp_prefix)

if not os.path.exists(dir_to_save_files):
    os.makedirs(dir_to_save_files)

In [38]:
df = pd.read_csv(main_filtered_f,sep="|")
print df.shape
print len(df.columns)
if verbose:
    display(df.iloc[0:20, :])

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(df.shape)? (<ipython-input-38-1a42f2af2ea1>, line 2)

# drop any rows that are exact copies of another row

In [39]:
print df.shape
df.drop_duplicates(inplace=True)
print df.shape

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(df.shape)? (<ipython-input-39-580e934a7e09>, line 1)

# read in features file

In [40]:
FEATURES_PATH = os.path.join(dir_to_save_files, 'EHR_MAIN_FEATURES.csv')
features_df = pd.read_csv(FEATURES_PATH)

features_dict = {name:list(col.dropna()) for name,col in features_df.items()}
print(features_dict.keys())

final_features = features_dict['final_features']
cat_to_drop = features_dict['cat_to_drop']
outcome_vars = features_dict['outcome_vars']
feat_to_drop = features_dict['feat_to_drop']
cat_vars = features_dict['cat_vars']
contin_vars = features_dict['contin_vars']
bool_outcome_vars = features_dict['bool_outcome_vars']

NameError: name 'dir_to_save_files' is not defined

# Filter out surgeries that don't occur in RR or SM operating rooms

In [41]:
print(df.LOCATION_GROUP.unique())
#df = df[df['LOCATION_GROUP'].isin(['RR OR', 'SM OR','SM SC','SM OB OR','RR OB OR'])]
print(df.shape)
df = df[df['LOCATION_GROUP'].isin(['RR OR', 'SM OR', 'SM SC'])]
print(df.shape)

NameError: name 'df' is not defined

# Filter out surgeries that were not INPATIENT, SAME DAY ADMIT, EMERGENCY, or OVERNIGHT RECOVERY

In [42]:
print "Shape before filtering out outpatient surgeries:", df.shape
#df = df[df['PATIENT_CLASS'].isin(['INPATIENT', 'SAME DAY ADMIT', 'EMERGENCY', 'OVERNIGHT RECOVERY'])]
df = df[df['PAT_CLASS'].isin(['INPATIENT', 'SAME DAY ADMIT', 'EMERGENCY', 'OVERNIGHT RECOVERY'])]
print "Shape after filtering out outpatient surgeries:", df.shape

SyntaxError: Missing parentheses in call to 'print'. Did you mean print("Shape before filtering out outpatient surgeries:", df.shape)? (<ipython-input-42-22291916d42d>, line 1)

# Filter based on ASA status, age

In [43]:
try:
    print "Shape before filtering out based on ASA_STATUS:", df.shape
    print "ASA_STATUS mean:", df.ASA_STATUS.mean()
    df = df[(df["ASA_STATUS"] <= MAX_ASA_STATUS) & (df["ASA_STATUS"] >= MIN_ASA_STATUS)]
    print "Shape after filtering out based on ASA_STATUS:", df.shape
    print "ASA_STATUS mean:", df.ASA_STATUS.mean()
except AttributeError:
    pass
print "==================================="
print "Mean age:", df.AGE_LT_90.mean()
print "STD age:", df.AGE_LT_90.std()
df = df[(df["AGE_LT_90"] <= MAX_AGE) & (df["AGE_LT_90"] >= MIN_AGE)]
print "Mean age:", df.AGE_LT_90.mean()
print "STD age:", df.AGE_LT_90.std()
print "Shape after filtering out based on AGE_LT_90:", df.shape

SyntaxError: Missing parentheses in call to 'print'. Did you mean print("Shape before filtering out based on ASA_STATUS:", df.shape)? (<ipython-input-43-9508c88373ba>, line 2)

# check demographic distributions

In [44]:
print "Number of Patients:", df.shape[0]
print "Patients with in-hospital mortality: {} ({}%)".format(df.INPT_DEATH_YN.value_counts()[1], df.INPT_DEATH_YN.value_counts(normalize="True")[1]*100)
print "Patients with kidney failure: {} ({}%)".format(df.AKIN_EVENT.value_counts()[1], df.AKIN_EVENT.value_counts(normalize="True")[1]*100)
print "Mean age:", df.AGE_LT_90.mean(), " std:", df.AGE_LT_90.std()
print "Number of female patients: {} ({}%)".format(df[df["GENDER"] == "F"].shape[0], df[df["GENDER"] == "F"].shape[0]/float(df.shape[0])*100)

try:
    print "Number of patients in RR OR: {} ({}%)".format(df.LOCATION_GROUP.value_counts()["RR OR"], df.LOCATION_GROUP.value_counts(normalize="True")["RR OR"]*100)
    print "Number of patients in SM OR: {} ({}%)".format(df.LOCATION_GROUP.value_counts()["SM OR"], df.LOCATION_GROUP.value_counts(normalize="True")["SM OR"]*100)
    print "Number of patients in SM SC: {} ({}%)".format(df.LOCATION_GROUP.value_counts()["SM SC"], df.LOCATION_GROUP.value_counts(normalize="True")["SM SC"]*100)
except AttributeError:
    pass
print("="*40)
try:
    print "ASA Status:", df.ASA_STATUS.value_counts()
    print "ASA Status (%):", (df.ASA_STATUS.value_counts()/df.shape[0])*100
except AttributeError:
    pass

print("="*40)
print "Mean age of mortalities:", df[df["INPT_DEATH_YN"] == 1].AGE_LT_90.mean(), " std:", df[df["INPT_DEATH_YN"] == 1].AGE_LT_90.std()
print("Number of female mortalities: {} ({}%)".format(df[df["GENDER"] == "F"]["INPT_DEATH_YN"].sum(), 
                                                     df[df["GENDER"] == "F"]["INPT_DEATH_YN"].sum()/float(df["INPT_DEATH_YN"].sum())*100))
print("Number of male mortalities: {} ({}%)".format(df[df["GENDER"] == "M"]["INPT_DEATH_YN"].sum(), 
                                                     df[df["GENDER"] == "M"]["INPT_DEATH_YN"].sum()/float(df["INPT_DEATH_YN"].sum())*100))
try:
    print("="*40)
    print("Number of mortalities stratified by location")
    print(df.groupby("LOCATION_GROUP")["INPT_DEATH_YN"].sum())                                
    print(df.groupby("LOCATION_GROUP")["INPT_DEATH_YN"].sum()/float(df["INPT_DEATH_YN"].sum())*100)
except AttributeError:
    pass
print("="*40)
print("Number of mortalities stratified by ASA status")
print(df.groupby("ASA_STATUS")["INPT_DEATH_YN"].sum())
print(df.groupby("ASA_STATUS")["INPT_DEATH_YN"].sum()/float(df["INPT_DEATH_YN"].sum())*100)

if verbose:
    print(df["CASE_SRV_NAME"].value_counts())
    print(df["CASE_SRV_NAME"].value_counts()/df.shape[0])*100
    print("="*40)
    print(df[df["INPT_DEATH_YN"] == 1]["CASE_SRV_NAME"].value_counts())
    print(df[df["INPT_DEATH_YN"] == 1]["CASE_SRV_NAME"].value_counts()/df["INPT_DEATH_YN"].sum())*100

SyntaxError: Missing parentheses in call to 'print'. Did you mean print("Number of Patients:", df.shape[0])? (<ipython-input-44-7a9b31f35a0d>, line 1)

# Remove outlier values

In [45]:
string_cols = ['PRE_SURG_LOCATION', 'CASE_SRV_NAME_GROUP', 'CASE_SRV_NAME', 'PRIMARY_CPT',
                                  'GENDER', 'HCUP_DESC', 'CPT_DESC', 'PAT_CLASS', 'OR_CASE_ID', 'ADMSN_ID']
dff = df.drop(['PRE_SURG_LOCATION', 'CASE_SRV_NAME_GROUP', 'CASE_SRV_NAME', 'PRIMARY_CPT',
                                  'GENDER', 'HCUP_DESC', 'CPT_DESC', 'PAT_CLASS', 'OR_CASE_ID', 'ADMSN_ID'], axis=1)
dff = df.select_dtypes(include=['float64'])
df_string_cols = df[df.columns.difference(dff.columns.values)]
print df_string_cols.columns.values
#display(dff.describe())
print (np.abs(st.zscore(dff, axis=1)) > 3)
#print dff.sub(dff.mean()).div(dff.std()).abs().lt(3)
df_no_outliers = dff[dff.sub(dff.mean()).div(dff.std()).abs().lt(4)]
df_no_outliers[df_string_cols.columns.values] = df_string_cols
if verbose:
    display(df_no_outliers.describe(include="all"))
df = df_no_outliers

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(df_string_cols.columns.values)? (<ipython-input-45-6a0f0d2cd52b>, line 7)

# Remove variables related to lab times (i.e. *.HRS_2_SURGERY)

In [46]:
#remove variables that have to do with time
# cols_to_keep_no_hrs2surgery = [c for c in df.columns if not c.endswith(".HRS_2_SURGERY")]
# print cols_to_keep_no_hrs2surgery
# print len(cols_to_keep_no_hrs2surgery)
# df=df[cols_to_keep_no_hrs2surgery]

# Remove unnecessary features 

In [47]:
# save this for checking predictions over time
admsn_surgery_number = df["ADMSN_SURGERY_NUMBER"]
print admsn_surgery_number.shape
or_case_id_number = df["OR_CASE_ID"]
admsn_ids = df['ADMSN_ID']

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(admsn_surgery_number.shape)? (<ipython-input-47-9e38ddde2288>, line 3)

In [None]:
df = df[final_features + [TARGET_VARIABLE]]

# One-hot encode categorical variables

In [None]:
for var in cat_vars:
    try:
        # drop_first uses k-1 dummies out of k categories
        print var
        df = pd.get_dummies(df, columns=[var], drop_first=True)
        pass
    except ValueError:
        pass
# remove categorical variables (string values)
for var in cat_vars:
    try:
        df.drop(var, axis=1, inplace=True)
        pass
    except ValueError:
        print var, 'already dropped'

# Remove features we don't want to include

In [None]:
if len(feature_whitelist) == 0:
    for cat in cat_to_drop:
        try:
            df.drop(cat, axis=1, inplace=True)
        except ValueError:
            print cat, 'already dropped'
#print df.columns.values
for col in sorted(df.columns.values):
    print col, "\t\t", df[col].dtype

# Remove target variables from data frame

In [None]:
print "Column names:", df.columns.values
try:
    y = np.ravel(df[TARGET_VARIABLE])
    #asa_status = df["ASA_STATUS"]
    df.drop(TARGET_VARIABLE, axis=1, inplace=True, errors='ignore')
    df.drop(outcome_vars, axis=1, inplace=True, errors='ignore')
    input_death_yn = df['INPT_DEATH_YN']
except KeyError:
    print TARGET_VARIABLE, "already dropped"
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html
# default stragegy: mean
# if len(feature_whitelist) > 0:
#     feature_whitelist = [c for c in feature_whitelist if not c.endswith(".HRS_2_SURGERY")]
#     df = df[feature_whitelist]
print df.isnull().sum()

# Standardize training, testing data

In [None]:
class StandardizeWithNaN(TransformerMixin, BaseEstimator):
    '''This estimator is for standardizing a dataset that has missing data'''
    def __init__(self):
        self.X_mean = []
        self.X_std = []
        pass

    def fit(self, X, y=None):
        # get mean and standard deviation of columns
        self.X_mean = np.nanmean(X, axis=0)
        self.X_std  = np.nanstd(X, axis=0) 
        return self

    def transform(self, X):
        # subtract mean and divide by standard deviation
        return (X - self.X_mean)/self.X_std

In [None]:
scaler = StandardizeWithNaN()
scaler.fit(X_test)
X_test = scaler.transform(X_test) 

# Impute missing values

In [None]:
class SoftImputeEstimator(TransformerMixin, BaseEstimator):
    '''This estimator is for wrapping the SoftImpute algorithm'''
    def __init__(self, max_iters=200, verbose=True):
        self.max_iters = max_iters
        self.verbose = verbose
        self.fit_count = 0
        self.transform_count = 0
        pass

    def fit(self, X, y=None):
        self.fit_count += 1
        print("SoftImputeEstimator fit count: {}".format(self.fit_count))
        return self

    def transform(self, X):
        self.transform_count += 1
        print("SoftImputeEstimator transform count: {}".format(self.transform_count))
        try:
            # subtract mean and divide by standard deviation
            return SoftImpute(max_iters=self.max_iters, verbose=self.verbose).complete(X.replace(np.inf, np.nan))
        # ValueError raised if no values need to be imputed
        except ValueError:
            return np.array(X)

In [48]:
print("imputing X_test")
#print(np.isnan(X_test).any())
X_test = si.transform(X_test.replace(np.inf, np.nan))

imputing X_test


NameError: name 'si' is not defined

# Load model 

# Predict classes and get probability of labels

In [49]:
model_predictions = model.predict(np.array(X_test)
model_probs = model.predict_proba(X_test)

SyntaxError: invalid syntax (<ipython-input-49-5af6e5cb60d8>, line 2)

# If we have true labels, see how well the model did 

In [50]:
plot_accuracy_roc_auc(models, model_names, model_predictions, model_probs, y_train, y_test, "accuracy_roc_auc.tif")

NameError: name 'plot_accuracy_roc_auc' is not defined

In [None]:
plot_roc_curve(models, model_names, model_probs, y_test, os.path.join(dir_to_save_files,"roc_curve.tif"))

In [None]:
plot_precision_recall_curve(models, model_names, model_probs, y_test, os.path.join(dir_to_save_files, "precision_recall_curve.tif"))