In [5]:
import pandas as pd
import numpy as np
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import GradientBoostingRegressor
from customTransformers import (ColumnSelectTransformer, DiagnosisFrameTransformer, 
                                ColumnMergeTransformer
                               )

In [2]:
# Training split: the pickle is unpacked into a feature frame and a second
# object bound to y_train.
train_pickle_path = './data/pickle/preproc/df_patient_admit_icu_notes__20210206_singleICUSTAY_TRAIN_final.pkl'
X_train, y_train = pd.read_pickle(train_pickle_path)

In [3]:
# Load the previously saved model artefact (a dict; its 'estimator' entry is
# read in the next cell).
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
file = './data/pickle/models/gradientboost__LOS__20210215.pkl'
# Use a context manager so the file handle is closed deterministically instead
# of being left open until garbage collection.
with open(file, 'rb') as model_fh:
    model_data = pickle.load(model_fh)

In [4]:
# Show the hyper-parameters selected by the saved search object
# (presumably a GridSearchCV/RandomizedSearchCV - confirm against the training notebook).
saved_search = model_data['estimator']
saved_search.best_params_

{'features__text_feats__count__max_features': 10000,
 'features__text_feats__count__min_df': 0.0001,
 'features__text_feats__count__ngram_range': (1, 2),
 'features__text_feats__tfid__norm': 'l2',
 'reg__criterion': 'mse',
 'reg__loss': 'ls'}

In [6]:
# Re-create the estimators using the hyper-parameter values printed by
# best_params_ above.
# NOTE(review): criterion='mse' and loss='ls' are the pre-1.0 scikit-learn
# spellings (later renamed 'squared_error'); they are kept as-is to match the
# environment this notebook was run in.
count = CountVectorizer(
    ngram_range=(1, 2),
    min_df=0.0001,
    max_features=10000,
)
tfidf = TfidfTransformer(norm='l2')
reg = GradientBoostingRegressor(loss='ls', criterion='mse')

In [7]:
# Demographic branch: the categorical columns are one-hot encoded and the
# numeric column is standardised; FeatureUnion concatenates both outputs.
cols = [
    'GENDER', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'INSURANCE',
    'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
]
num_cols = ['ADMIT_AGE']

# handle_unknown='ignore' keeps transform() from raising on categories that
# were not present when the encoder was fitted.
ohe = ColumnTransformer(
    transformers=[('categorical', OneHotEncoder(handle_unknown='ignore'), cols)]
)
demog_feats = FeatureUnion(
    transformer_list=[
        ('stdscl', ColumnTransformer(transformers=[('numerical', StandardScaler(), num_cols)])),
        ('ohe', ohe),
    ]
)

# Text branch: three project transformers (imported from customTransformers)
# run first, then the result is vectorised into n-gram counts and TF-IDF weighted.
# NOTE(review): presumably the custom steps clean DIAGNOSIS, merge it with TEXT
# and select the merged 'DIAGNOSIS_TEXT' column - confirm in customTransformers.
text_feats = Pipeline(
    steps=[
        ('dft', DiagnosisFrameTransformer(['DIAGNOSIS'])),
        ('cmt', ColumnMergeTransformer(['DIAGNOSIS', 'TEXT'])),
        ('cst', ColumnSelectTransformer('DIAGNOSIS_TEXT')),
        ('count', count),
        ('tfidf', tfidf),
    ]
)

# Combine the demographic and text branches into one feature matrix and feed
# it to the gradient boosting regressor.
feats_union = FeatureUnion(
    transformer_list=[
        ('demog_feats', demog_feats),
        ('text_feats', text_feats),
    ]
)

lin_pipe = Pipeline(
    steps=[
        ('features', feats_union),
        ('reg', reg),
    ]
)

# The regression target is the 'LOS' column of the training frame.
# NOTE(review): y_train is loaded above but not used here - confirm it is the
# same series as X_train['LOS'].
los_target = X_train['LOS']
lin_est = lin_pipe.fit(X_train, los_target)

In [8]:
# Held-out split: the pickle is unpacked into a feature frame and a second
# object bound to y_test.
test_pickle_path = './data/pickle/preproc/df_patient_admit_icu_notes__20210206_singleICUSTAY_TEST_final.pkl'
X_test, y_test = pd.read_pickle(test_pickle_path)

In [9]:
# Predict the target (the 'LOS' column the pipeline was fitted on) for every
# row of X_test; the result is indexed by position further down.
y_pred = lin_est.predict(X_test)

In [13]:
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral4

# Configure Bokeh so that show() renders plots inline in the notebook.
output_notebook()

In [15]:
# List the columns available in the test frame (used to pick plot fields below).
X_test.columns

Index(['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'ADMITTIME', 'CHARTTIME',
       'INTIME', 'GENDER', 'ADMIT_AGE', 'ADMISSION_TYPE', 'ADMISSION_LOCATION',
       'INSURANCE', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
       'DIAGNOSIS', 'TEXT', 'DBSOURCE', 'LOS', 'DAYS_EVENT_TO_ICU',
       'ICU_URGENCY', 'SAMEDAY_EVENT_TO_ICU', 'DIAGNOSIS_TEXT'],
      dtype='object')

In [20]:
from bokeh.transform import factor_cmap
from bokeh.palettes import Spectral4

# Scatter of actual vs. predicted LOS on the test set, coloured by ICU_URGENCY.
# Axis labels, hover fields and a legend are set so the figure can be read on
# its own (the original had unlabeled axes, an empty tooltip and no colour key).
urgency_levels = ['stable', 'questionable', 'urgent', 'immediate']

p = figure(
    title="Model Performance",
    x_axis_label='Actual LOS',
    y_axis_label='Predicted LOS',
    tooltips=[
        ('actual LOS', '@x'),
        ('predicted LOS', '@y'),
        ('ICU urgency', '@groups'),
    ],
    plot_height=500,
    plot_width=600,
)

source = ColumnDataSource({
    'x': X_test['LOS'],
    'y': y_pred,
    'groups': X_test['ICU_URGENCY']
})

# Hollow markers: only the outline carries the urgency colour.
p.circle('x', 'y', fill_color=None,
         line_color=factor_cmap('groups', Spectral4, urgency_levels),
         legend_field='groups',
         size=5, source=source)
show(p)

In [21]:
# One example row per ICU_URGENCY group, identified by its original X_test
# index label (matched against the 'index' column after reset_index below).
ex_dict = dict(
    immediate=8065,
    questionable=10545,
    stable=68757,
    urgent=10547,
)

In [27]:
# reset_index() moves the original index labels into an 'index' column and
# relabels rows 0..n-1, so the new row labels match positions in y_pred.
X_test_reset = X_test.reset_index()

In [31]:
# Show the 'stable' example row (matched on its original index label).
X_test_reset.loc[X_test_reset['index'].eq(ex_dict['stable'])]

Unnamed: 0,index,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ADMITTIME,CHARTTIME,INTIME,GENDER,ADMIT_AGE,ADMISSION_TYPE,...,MARITAL_STATUS,ETHNICITY,DIAGNOSIS,TEXT,DBSOURCE,LOS,DAYS_EVENT_TO_ICU,ICU_URGENCY,SAMEDAY_EVENT_TO_ICU,DIAGNOSIS_TEXT
110,68757,43320,154851.0,233289,2128-08-10 07:15:00,2128-08-14 10:08:00,2128-09-07 21:56:52,M,74,ELECTIVE,...,MARRIED,WHITE,bladder cancer sda,portable abdomen clip clip number radiology ...,metavision,0.8666,24.492269,stable,0,bladder cancer sda portable abdomen clip clip...


In [30]:
# Show the 'questionable' example row (matched on its original index label).
X_test_reset.loc[X_test_reset['index'].eq(ex_dict['questionable'])]

Unnamed: 0,index,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ADMITTIME,CHARTTIME,INTIME,GENDER,ADMIT_AGE,ADMISSION_TYPE,...,MARITAL_STATUS,ETHNICITY,DIAGNOSIS,TEXT,DBSOURCE,LOS,DAYS_EVENT_TO_ICU,ICU_URGENCY,SAMEDAY_EVENT_TO_ICU,DIAGNOSIS_TEXT
8423,10545,5285,127903.0,259613,2202-09-22 07:15:00,2202-09-22 15:53:00,2202-09-24 12:02:26,F,60,ELECTIVE,...,MARRIED,WHITE,coronary artery disease sda aortic valve repla...,carotid series complete clip clip number rad...,carevue,4.2838,1.839884,questionable,0,coronary artery disease sda aortic valve repla...


In [32]:
# Show the 'urgent' example row (matched on its original index label).
X_test_reset.loc[X_test_reset['index'].eq(ex_dict['urgent'])]

Unnamed: 0,index,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ADMITTIME,CHARTTIME,INTIME,GENDER,ADMIT_AGE,ADMISSION_TYPE,...,MARITAL_STATUS,ETHNICITY,DIAGNOSIS,TEXT,DBSOURCE,LOS,DAYS_EVENT_TO_ICU,ICU_URGENCY,SAMEDAY_EVENT_TO_ICU,DIAGNOSIS_TEXT
17523,10547,5285,127903.0,259613,2202-09-22 07:15:00,2202-09-23 14:45:00,2202-09-24 12:02:26,F,60,ELECTIVE,...,MARRIED,WHITE,coronary artery disease sda aortic valve repla...,carotid series complete clip clip number rad...,carevue,4.2838,0.887106,urgent,1,coronary artery disease sda aortic valve repla...


In [33]:
# Show the 'immediate' example row (matched on its original index label).
X_test_reset.loc[X_test_reset['index'].eq(ex_dict['immediate'])]

Unnamed: 0,index,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ADMITTIME,CHARTTIME,INTIME,GENDER,ADMIT_AGE,ADMISSION_TYPE,...,MARITAL_STATUS,ETHNICITY,DIAGNOSIS,TEXT,DBSOURCE,LOS,DAYS_EVENT_TO_ICU,ICU_URGENCY,SAMEDAY_EVENT_TO_ICU,DIAGNOSIS_TEXT
165,8065,3986,103534.0,200718,2146-05-12 18:28:59,2146-05-12 09:50:00,2146-05-12 18:30:10,M,66,EMERGENCY,...,MARRIED,WHITE,aneurysm,,carevue,5.0752,0.00081,immediate,1,aneurysm


In [34]:
# Predictions for the four example rows, in the order
# stable, questionable, urgent, immediate.
# The row position of each example is looked up from its original index label
# instead of being hard-coded from the outputs above, so the cell stays correct
# if the test split changes. np.flatnonzero(...)[0] is the first matching row;
# an IndexError here means the label is not in the test set.
tuple(
    y_pred[np.flatnonzero(X_test_reset['index'].to_numpy() == ex_dict[group])[0]]
    for group in ('stable', 'questionable', 'urgent', 'immediate')
)

(4.877959077894241, 4.022171011115424, 4.037218922113228, 5.18970092460429)