In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import GradientBoostingRegressor
from customTransformers import (ColumnSelectTransformer, DiagnosisFrameTransformer, 
                                ColumnMergeTransformer
                               )

In [2]:
# Load the training split; the pickle unpacks into two objects bound to X_train and y_train.
X_train, y_train = pd.read_pickle('./data/pickle/preproc/df_patient_admit_icu_notes__20210206_singleICUSTAY_TRAIN_final.pkl')

In [4]:
# Peek at one example of the TEXT column (row at position 67).
X_train['TEXT'].iloc[67]

'  cta chest ww crecons non coronary cta abd pelvis clip clip number radiology reason eval known aortic dissection changes progression pr admitting diagnosis thoracic dissection contrast omnipaque amt hospital medical condition year old woman aortic dissection reason examination eval known aortic dissection changes progression prior imaging please perform contraindications iv contrast wet read mlhh sun stable appearance type dissection extending past sca takeoff prox iliac unchanged intramural hematoma level aortic arch false lumen primarily supplies renal subtle hypoperfusion kidney infarct new tiny nonhemorrhagic pleural pericardial effusions inc active long segment acute inflammation rlq distal ileum chronic ibc changes ti segments colon wet read version wet read version mlhh sun stable appearance type dissection extending past sca takeoff prox iliac unchanged intramural hematoma level aortic arch false lumen primarily supplies renal subtle hypoperfusion kidney infarct inc active lo

In [3]:
# Load the pickled model bundle. Later cells read model_data['estimator'], which
# exposes best_estimator_ / best_params_ (presumably a fitted scikit-learn
# hyper-parameter search object -- confirm against the training notebook).
# NOTE(security): pickle.load can execute arbitrary code; only load files from a trusted source.
file = './data/pickle/models/gradientboost__LOS__20210215.pkl'
# Use a context manager so the file handle is closed as soon as loading finishes.
with open(file, 'rb') as fh:
    model_data = pickle.load(fh)

In [9]:
# Read the n_features_ attribute of the 'reg' step of the best estimator
# (presumably the width of the feature matrix produced by the 'features' step -- confirm).
model_data['estimator'].best_estimator_.named_steps.reg.n_features_

10123

In [10]:
# Keep a short handle on the best estimator stored in the loaded bundle.
best = model_data['estimator'].best_estimator_

In [15]:
# Take the 'dft' step from the second entry of the 'features' union's transformer_list
# (presumably the fitted DiagnosisFrameTransformer of the text branch -- confirm).
los_dft = best.named_steps.features.transformer_list[1][1].named_steps.dft

In [16]:
# Persist the (los_dft, best) pair as one tuple in a separate pickle file.
file = './data/pickle/models/gradientboost__LOS__20210305_forFlask_from_20210215.pkl'
# Use a context manager so the data is flushed and the handle closed deterministically;
# an unclosed write handle can leave a truncated file on disk.
with open(file, 'wb') as fh:
    pickle.dump((los_dft, best), fh)

In [15]:
# Show the best_params_ recorded on the loaded estimator.
model_data['estimator'].best_params_

{'features__text_feats__count__max_features': 10000,
 'features__text_feats__count__min_df': 0.0001,
 'features__text_feats__count__ngram_range': (1, 2),
 'features__text_feats__tfid__norm': 'l2',
 'reg__criterion': 'mse',
 'reg__loss': 'ls'}

In [16]:
# Unfitted bag-of-words vectorizer, TF-IDF weighting and regressor, configured with
# the same values that best_params_ reports for the loaded model.
count = CountVectorizer(
    ngram_range=(1, 2),
    min_df=0.0001,
    max_features=10000,
)
tfidf = TfidfTransformer(norm='l2')
reg = GradientBoostingRegressor(loss='ls', criterion='mse')

In [17]:
# Column groups consumed by the demographic branch of the feature union.
cols = [
    'GENDER', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'INSURANCE',
    'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
]
num_cols = ['ADMIT_AGE']

# Demographic branch: standard-scaled numeric column(s) next to one-hot encoded
# categoricals; categories unseen during fit are ignored at transform time.
ohe = ColumnTransformer(
    [('categorical', OneHotEncoder(handle_unknown='ignore'), cols)]
)
demog_feats = FeatureUnion(transformer_list=[
    ('stdscl', ColumnTransformer([('numerical', StandardScaler(), num_cols)])),
    ('ohe', ohe),
])

# Text branch. The first three steps come from customTransformers; presumably they
# clean DIAGNOSIS, merge DIAGNOSIS and TEXT into DIAGNOSIS_TEXT and select that
# column -- confirm against that module. The result is vectorized and TF-IDF weighted.
text_feats = Pipeline(steps=[
    ('dft', DiagnosisFrameTransformer(['DIAGNOSIS'])),
    ('cmt', ColumnMergeTransformer(['DIAGNOSIS', 'TEXT'])),
    ('cst', ColumnSelectTransformer('DIAGNOSIS_TEXT')),
    ('count', count),
    ('tfidf', tfidf),
])

# Both branches side by side, followed by the regressor.
feats_union = FeatureUnion(transformer_list=[
    ('demog_feats', demog_feats),
    ('text_feats', text_feats),
])
lin_pipe = Pipeline(steps=[
    ('features', feats_union),
    ('reg', reg),
])

# The regression target is the LOS column of X_train (y_train is not used here).
lin_est = lin_pipe.fit(X_train, y=X_train['LOS'])

In [17]:
# Load the test split; the pickle unpacks into two objects bound to X_test and y_test.
X_test, y_test = pd.read_pickle('./data/pickle/preproc/df_patient_admit_icu_notes__20210206_singleICUSTAY_TEST_final.pkl')

In [18]:
# First row of the test set (by position), used below as a single-sample example.
x_test = X_test.iloc[0]

In [31]:
# x_test is a single row; to_frame().transpose() turns it back into a one-row
# DataFrame for the loaded pipeline, and [0] extracts the single prediction.
best.predict(x_test.to_frame().transpose())[0]

3.9693878474247333

In [29]:
# Display the one-row frame that was passed to best.predict.
x_test.to_frame().transpose()

Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ADMITTIME,CHARTTIME,INTIME,GENDER,ADMIT_AGE,ADMISSION_TYPE,ADMISSION_LOCATION,...,RELIGION,MARITAL_STATUS,ETHNICITY,DIAGNOSIS,TEXT,DBSOURCE,LOS,DAYS_EVENT_TO_ICU,ICU_URGENCY,SAMEDAY_EVENT_TO_ICU
67659,41738,184992,249970,2171-01-07 13:56:59,2171-01-07 20:19:00,2171-01-09 11:03:58,M,84,EMERGENCY,CLINIC REFERRAL/PREMATURE,...,JEWISH,WIDOWED,WHITE,WORSENING DYSPNEA ON EXERTION;AORTIC STENOSIS\...,,metavision,3.0567,1.87984,questionable,0


In [19]:
# List the columns available in the test frame.
X_test.columns

Index(['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'ADMITTIME', 'CHARTTIME',
       'INTIME', 'GENDER', 'ADMIT_AGE', 'ADMISSION_TYPE', 'ADMISSION_LOCATION',
       'INSURANCE', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
       'DIAGNOSIS', 'TEXT', 'DBSOURCE', 'LOS', 'DAYS_EVENT_TO_ICU',
       'ICU_URGENCY', 'SAMEDAY_EVENT_TO_ICU'],
      dtype='object')

In [20]:
# Predict for the whole test set with the pipeline refit in this notebook (lin_est),
# not with the unpickled `best` estimator.
y_pred = lin_est.predict(X_test)

In [56]:
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import ColumnDataSource, Legend, HoverTool
from bokeh.palettes import Spectral4
from bokeh.layouts import row

# Configure Bokeh to render figures inline in the notebook.
output_notebook()

In [66]:
from bokeh.transform import factor_cmap
from bokeh.palettes import Spectral4
from sklearn.metrics import mean_squared_error

# Fixed display order of the urgency levels. The colour map, the RMSE values and the
# bar-chart x-axis all follow this one ordering so a bar can never be paired with the
# wrong label or colour.
labels = ['stable','questionable','urgent','immediate']
factors = ['stable','questionable','urgent','immediate']

# --- Left panel: predicted vs. true LOS, one circle per test row, coloured by urgency ---
p1 = figure(title="LOS Model Performance", tooltips='', plot_height=400, plot_width=500)

source = ColumnDataSource({
    'x': X_test['LOS'],
    'y': y_pred,
    'groups': X_test['ICU_URGENCY']
})

p1.circle('x', 'y', fill_color=None,
          line_color=factor_cmap('groups', Spectral4, labels),
          size=5, source=source, legend_field='groups',
          alpha=0.2, selection_alpha=0.8)

p1.legend.location = "top_left"
p1.legend.click_policy = "hide"
p1.xaxis.axis_label = 'True LOS'
p1.yaxis.axis_label = 'Predicted LOS'
p1.title.text_font_size = '16pt'
p1.yaxis.axis_label_text_font_size = '14pt'
p1.xaxis.axis_label_text_font_size = '14pt'
p1.yaxis.major_label_text_font_size = '11pt'
p1.xaxis.major_label_text_font_size = '11pt'

# --- Right panel: RMSE per urgency level ---
# Iterate in the same order as `factors` (not in unique() order) so rmse[i] always
# belongs to factors[i].
rmse = []
for urg in factors:
    mask = X_test['ICU_URGENCY']==urg
    if mask.any():
        # RMSE is the square root of the MSE. `x**1/2` would parse as (x**1)/2,
        # i.e. half the MSE, so the exponent is written as 0.5.
        rmse.append(mean_squared_error(X_test.LOS[mask], y_pred[mask]) ** 0.5)
    else:
        # No test rows for this level: keep the column length aligned with `factors`.
        rmse.append(float('nan'))

source = ColumnDataSource(data=dict(factors=factors, rmse=rmse, color=Spectral4))

hover = HoverTool(
  tooltips=[
      ("Level", "@factors"),
      ("RMSE", "@rmse"),
  ]
)

p2 = figure(x_range=factors, plot_height=400,
      plot_width=500, tools=[hover], title="RMSE by Urgency Level")

p2.vbar(x='factors', top='rmse', color='color', width=0.4, source=source)

p2.title.text_font_size = '16pt'
p2.yaxis.axis_label_text_font_size = '14pt'
p2.yaxis.major_label_text_font_size = '11pt'
p2.xaxis.major_label_text_font_size = '11pt'
p2.xgrid.grid_line_color = None
p2.y_range.start = 0
p2.yaxis.axis_label = 'RMSE'

# Show both panels side by side.
layout = row(p1, p2)
show(layout)

In [99]:
from bokeh.transform import factor_cmap
from bokeh.palettes import Spectral4
from sklearn.metrics import mean_squared_error
from bokeh.models import BoxSelectTool, BoxZoomTool, ResetTool
from bokeh.models import CheckboxGroup, CustomJS

# Fixed display order of the urgency levels; position i in this list is also the
# index of the matching checkbox and of its colour in Spectral4.
labels = ['stable','questionable','urgent','immediate']

hover1 = HoverTool(
  tooltips=[
      ("True", "@x"),
      ("Predicted", "@y"),
  ]
)

hover2 = HoverTool(
  tooltips=[
      ("Level", "@x"),
      ("RMSE", "@y"),
  ]
)

p1 = figure(title="LOS Model Performance", tools=[hover1, BoxSelectTool(), BoxZoomTool(), ResetTool()], plot_height=400, plot_width=500)
# The categorical x-axis uses this cell's own `labels`, so the cell does not depend on
# a `factors` variable left behind by another cell.
p2 = figure(x_range=labels, plot_height=400,
      plot_width=500, tools=[hover2, BoxSelectTool(), ResetTool()], title="RMSE by Urgency Level")

# One data source and one glyph renderer per urgency level, so each level can be
# shown or hidden independently from the checkbox callback.
rmse = []
sources1 = {}
sources2 = {}
circles = {}
bars = {}
for ix, urg in enumerate(labels):
    mask = X_test['ICU_URGENCY']==urg
    if mask.any():
        # RMSE is the square root of the MSE. `x**1/2` would parse as (x**1)/2,
        # i.e. half the MSE, so the exponent is written as 0.5.
        level_rmse = mean_squared_error(X_test.LOS[mask], y_pred[mask]) ** 0.5
    else:
        # No test rows for this level: draw no bar instead of raising.
        level_rmse = float('nan')
    rmse.append(level_rmse)

    sources1[urg] = ColumnDataSource({
        'x': X_test.LOS[mask],
        'y': y_pred[mask],
    })
    sources2[urg] = ColumnDataSource({
        'x': [urg],
        'y': [level_rmse]
    })
    circles[urg] = p1.circle('x', 'y', fill_color=None, alpha=0.4, selection_alpha=0.8,
         selection_fill_color=Spectral4[ix], line_color=Spectral4[ix], size=5, source=sources1[urg], legend_label=urg)
    bars[urg] = p2.vbar(x='x', top='y', color=Spectral4[ix], width=0.4, source=sources2[urg])


checkbox = CheckboxGroup(labels=labels,
                         active=[0, 1, 2, 3], width=100)

# `checkbox.active` is an array of the ticked indices. JavaScript's `k in array` tests
# whether index k exists (array length), not whether the value k is present, so
# membership is checked with Array.prototype.includes instead.
callback = CustomJS(args=dict(circles=circles, bars=bars, checkbox=checkbox), code="""
const active = checkbox.active;
checkbox.labels.forEach((name, ix) => {
    const shown = active.includes(ix);
    circles[name].visible = shown;
    bars[name].visible = shown;
});
""")
checkbox.js_on_change('active', callback)

p1.legend.location = "top_left"
p1.xaxis.axis_label = 'True LOS (days)'
p1.yaxis.axis_label = 'Predicted LOS (days)'
p1.title.text_font_size = '16pt'
p1.yaxis.axis_label_text_font_size = '14pt'
p1.xaxis.axis_label_text_font_size = '14pt'
p1.yaxis.major_label_text_font_size = '11pt'
p1.xaxis.major_label_text_font_size = '11pt'

p2.title.text_font_size = '16pt'
p2.yaxis.axis_label_text_font_size = '14pt'
p2.yaxis.major_label_text_font_size = '11pt'
p2.xaxis.major_label_text_font_size = '11pt'
p2.xgrid.grid_line_color = None
p2.y_range.start = 0
p2.yaxis.axis_label = 'RMSE'

# Checkbox column on the left, then the scatter plot and the RMSE bars.
layout = row(checkbox, p1, p2)
show(layout)

In [21]:
# One hand-picked example per urgency level; the values are matched against the
# original index labels of X_test in the lookups below.
ex_dict = dict(
    immediate=8065,
    questionable=10545,
    stable=68757,
    urgent=10547,
)

In [27]:
# reset_index() moves the original index labels into an 'index' column, so the new
# row labels are positions that line up with the entries of y_pred.
X_test_reset = X_test.reset_index()

In [31]:
# Look up the 'stable' example row by its original index label.
X_test_reset[X_test_reset['index']==ex_dict['stable']]

Unnamed: 0,index,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ADMITTIME,CHARTTIME,INTIME,GENDER,ADMIT_AGE,ADMISSION_TYPE,...,MARITAL_STATUS,ETHNICITY,DIAGNOSIS,TEXT,DBSOURCE,LOS,DAYS_EVENT_TO_ICU,ICU_URGENCY,SAMEDAY_EVENT_TO_ICU,DIAGNOSIS_TEXT
110,68757,43320,154851.0,233289,2128-08-10 07:15:00,2128-08-14 10:08:00,2128-09-07 21:56:52,M,74,ELECTIVE,...,MARRIED,WHITE,bladder cancer sda,portable abdomen clip clip number radiology ...,metavision,0.8666,24.492269,stable,0,bladder cancer sda portable abdomen clip clip...


In [30]:
# Look up the 'questionable' example row by its original index label.
X_test_reset[X_test_reset['index']==ex_dict['questionable']]

Unnamed: 0,index,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ADMITTIME,CHARTTIME,INTIME,GENDER,ADMIT_AGE,ADMISSION_TYPE,...,MARITAL_STATUS,ETHNICITY,DIAGNOSIS,TEXT,DBSOURCE,LOS,DAYS_EVENT_TO_ICU,ICU_URGENCY,SAMEDAY_EVENT_TO_ICU,DIAGNOSIS_TEXT
8423,10545,5285,127903.0,259613,2202-09-22 07:15:00,2202-09-22 15:53:00,2202-09-24 12:02:26,F,60,ELECTIVE,...,MARRIED,WHITE,coronary artery disease sda aortic valve repla...,carotid series complete clip clip number rad...,carevue,4.2838,1.839884,questionable,0,coronary artery disease sda aortic valve repla...


In [32]:
# Look up the 'urgent' example row by its original index label.
X_test_reset[X_test_reset['index']==ex_dict['urgent']]

Unnamed: 0,index,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ADMITTIME,CHARTTIME,INTIME,GENDER,ADMIT_AGE,ADMISSION_TYPE,...,MARITAL_STATUS,ETHNICITY,DIAGNOSIS,TEXT,DBSOURCE,LOS,DAYS_EVENT_TO_ICU,ICU_URGENCY,SAMEDAY_EVENT_TO_ICU,DIAGNOSIS_TEXT
17523,10547,5285,127903.0,259613,2202-09-22 07:15:00,2202-09-23 14:45:00,2202-09-24 12:02:26,F,60,ELECTIVE,...,MARRIED,WHITE,coronary artery disease sda aortic valve repla...,carotid series complete clip clip number rad...,carevue,4.2838,0.887106,urgent,1,coronary artery disease sda aortic valve repla...


In [33]:
# Look up the 'immediate' example row by its original index label.
X_test_reset[X_test_reset['index']==ex_dict['immediate']]

Unnamed: 0,index,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ADMITTIME,CHARTTIME,INTIME,GENDER,ADMIT_AGE,ADMISSION_TYPE,...,MARITAL_STATUS,ETHNICITY,DIAGNOSIS,TEXT,DBSOURCE,LOS,DAYS_EVENT_TO_ICU,ICU_URGENCY,SAMEDAY_EVENT_TO_ICU,DIAGNOSIS_TEXT
165,8065,3986,103534.0,200718,2146-05-12 18:28:59,2146-05-12 09:50:00,2146-05-12 18:30:10,M,66,EMERGENCY,...,MARRIED,WHITE,aneurysm,,carevue,5.0752,0.00081,immediate,1,aneurysm


In [34]:
# Predictions for the four example rows, in the order stable, questionable, urgent,
# immediate. The row positions are derived from ex_dict rather than hard-coded, so
# the cell stays correct if the test split or the chosen examples change.
# Index.get_loc maps an original index label to its position in X_test, which is the
# position of the matching entry in y_pred (assumes these labels are unique in the
# index -- the lookups above each return a single row).
tuple(
    y_pred[X_test.index.get_loc(ex_dict[urg])]
    for urg in ('stable', 'questionable', 'urgent', 'immediate')
)

(4.877959077894241, 4.022171011115424, 4.037218922113228, 5.18970092460429)