In [1]:
import pandas as pd
import numpy as np
import pickle

In [4]:
# Read the pickled training split; the stored object is unpacked into two
# names, X_train and y_train.
train_pickle_path = './data/pickle/preproc/df_patient_admit_icu_notes__20210206_singleICUSTAY_TRAIN_final.pkl'
X_train, y_train = pd.read_pickle(train_pickle_path)

# LOS

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from customTransformers import (ColumnSelectTransformer, DiagnosisFrameTransformer, 
                                EstimatorTransformer, LinearNonlinear, ColumnMergeTransformer
                               )
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import plot_roc_curve, plot_confusion_matrix
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import plot_confusion_matrix
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

## modeling separately

In [None]:
# Demographics-only model: Ridge regression on the standardised numeric
# column plus one-hot encoded categorical columns, tuned with a 5-fold
# grid search. The regression target is the LOS column of X_train.
cols = ['GENDER','ADMISSION_TYPE','ADMISSION_LOCATION','INSURANCE','LANGUAGE',
       'RELIGION','MARITAL_STATUS','ETHNICITY']
num_cols = ['ADMIT_AGE']

demog_feats2 = FeatureUnion([
    ('stdscl', ColumnTransformer([('numerical', StandardScaler(), num_cols)])),
    # handle_unknown='ignore': categories unseen during fit encode as all zeros
    ('ohe',  ColumnTransformer([('categorical', OneHotEncoder(handle_unknown='ignore'), cols)]))
])
demog_pipe2 = Pipeline([
    ('features', demog_feats2),
    ('ridge', Ridge())
])
demog_params2 = {
    'ridge__alpha': (0.01, 0.1, 1, 10, 100),
    # `normalize` is a boolean flag. The previous values ('uniform', 'distance')
    # are KNN `weights` options; both are truthy strings, so the grid fitted
    # every candidate twice and never evaluated normalize=False.
    'ridge__normalize': (True, False),
    'ridge__fit_intercept': (True, False)
}
# Fill missing LOS values with the median; the imputer returns a 2-D
# single-column array, which Ridge accepts as a target.
simp = SimpleImputer(strategy='median')
y_train_imp = simp.fit_transform(X_train[['LOS']])
lin_gs_regressor2 = GridSearchCV(demog_pipe2, demog_params2, cv=KFold(n_splits=5, shuffle=True),verbose=3,n_jobs=-1)
lin_est2 = lin_gs_regressor2.fit(X_train,y_train_imp)
# Display the winning parameter combination and its mean cross-validated score.
lin_est2.best_params_,lin_est2.best_score_

In [None]:
# Diagnosis-text model: the DIAGNOSIS column is turned into token counts
# (vocabulary capped at 3000), re-weighted with TF-IDF and fed to a random
# forest regressor. Hyper-parameters are tuned by 5-fold grid search.
diagn_pipe2 = Pipeline(steps=[
    ('cst', ColumnSelectTransformer(['DIAGNOSIS'])),
    ('dst', DiagnosisFrameTransformer()),
    ('count', CountVectorizer(max_features=3000)),
    ('tfid', TfidfTransformer()),
    ('rfr', RandomForestRegressor(n_estimators=50)),
])

# Note: 'rfr__ccp_alpha': [0, 0.5, 1] is deliberately left out of this grid.
diagn_params2 = dict(
    rfr__criterion=['mse', 'mae'],
    rfr__max_features=['sqrt', 'log2'],
    rfr__min_samples_split=[0.2, 0.225, 0.25],
    rfr__min_samples_leaf=[0.01, 0.02, 0.03],
    rfr__max_depth=[35, 40, 45, 50],
)

# The imputed target is a column vector; it is flattened with ravel() below
# because passing the 2-D form raises sklearn's DataConversionWarning
# ("A column-vector y was passed when a 1d array was expected").
diagn_gs_classifier2 = GridSearchCV(
    estimator=diagn_pipe2,
    param_grid=diagn_params2,
    cv=KFold(n_splits=5, shuffle=True),
    verbose=3,
    n_jobs=12,
)
tfidf_est2 = diagn_gs_classifier2.fit(X_train, y_train_imp.ravel())
# Show the best parameter set together with its cross-validated score.
tfidf_est2.best_params_, tfidf_est2.best_score_

## linear/nonlinear model fitting both simultaneously

In [7]:
# Feature definitions shared by the combined models below: scaled numeric
# column + one-hot categoricals + TF-IDF of the DIAGNOSIS column.
num_cols = ['ADMIT_AGE']
cols = [
    'GENDER', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'INSURANCE',
    'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
]

# Kept as its own name so the vectorizer can be referenced separately later.
count = CountVectorizer(max_features=3000)
diagn_feats = Pipeline(steps=[
    ('cst', ColumnSelectTransformer(['DIAGNOSIS'])),
    ('dst', DiagnosisFrameTransformer()),
    ('count', count),
    ('tfid', TfidfTransformer()),
])

# One-hot encoder over the categorical columns; unseen categories are ignored.
ohe = ColumnTransformer(
    transformers=[('categorical', OneHotEncoder(handle_unknown='ignore'), cols)]
)
all_feats = FeatureUnion(transformer_list=[
    ('stdscl', ColumnTransformer(transformers=[('numerical', StandardScaler(), num_cols)])),
    ('ohe', ohe),
    ('diagn', diagn_feats),
])

In [None]:
# Linear model on the full feature union (all_feats): Ridge regression tuned
# with a 5-fold grid search. The regression target is the LOS column of X_train.
all_pipe_a = Pipeline([
    ('features', all_feats),
    ('ridge', Ridge())
])

all_params3a = {
    'ridge__alpha': (0.01, 0.1, 1, 10, 100),
    # `normalize` is a boolean flag. The previous values ('uniform', 'distance')
    # are KNN `weights` options; both are truthy strings, so the grid fitted
    # every candidate twice and never evaluated normalize=False.
    'ridge__normalize': (True, False),
    'ridge__fit_intercept': (True, False)
}
# Fill missing LOS values with the median; the imputer returns a 2-D
# single-column array, which Ridge accepts as a target.
simp = SimpleImputer(strategy='median')
y_train_imp = simp.fit_transform(X_train[['LOS']])
lin_gs_regressor3 = GridSearchCV(all_pipe_a, all_params3a, cv=KFold(n_splits=5, shuffle=True),verbose=3,n_jobs=-1)
lin_est3 = lin_gs_regressor3.fit(X_train,y_train_imp)
# Display the winning parameter combination and its mean cross-validated score.
lin_est3.best_params_,lin_est3.best_score_

In [None]:
# Non-linear model on the full feature union (all_feats): random forest
# regressor tuned with a 5-fold grid search over tree-shape parameters.
all_pipe_b = Pipeline([
    ('features', all_feats),
    ('rfr', RandomForestRegressor(n_estimators=75))
])

all_params3b = {
    'rfr__criterion': ['mse','mae'],
    'rfr__max_features': ['sqrt', 'log2'],
    'rfr__min_samples_split': [0.2, 0.225, 0.25],
    'rfr__min_samples_leaf': [0.0075, 0.01, 0.02, 0.03],
    'rfr__ccp_alpha': [0, 0.5, 1],
    'rfr__max_depth': [30, 35, 40, 45, 50, 55]
}
nl_gs_regressor3 = GridSearchCV(all_pipe_b, all_params3b, cv=KFold(n_splits=5, shuffle=True),verbose=3,n_jobs=6)
# Pass the target as a 1-D array. The one-column DataFrame X_train[['LOS']]
# is a column vector and makes the forest emit a DataConversionWarning on
# every fit.
# NOTE(review): this uses the raw LOS column; if it can contain missing
# values, fit on the median-imputed y_train_imp.ravel() instead.
nl_est3 = nl_gs_regressor3.fit(X_train, X_train['LOS'].to_numpy())
# Display the winning parameter combination and its mean cross-validated score.
nl_est3.best_params_,nl_est3.best_score_

In [8]:
# Combined linear + non-linear regressor on the full feature union, using
# fixed hyper-parameters for each part.
# `normalize` is not passed to Ridge: it is a boolean flag (the former value
# 'uniform' was not a valid setting) and it has no effect when
# fit_intercept=False, so dropping it leaves the fitted model unchanged.
linreg = Ridge(alpha=10, fit_intercept=False)
nonlinreg = RandomForestRegressor(n_estimators=75, criterion='mse',
                                  max_depth=45,min_samples_leaf=0.0075,
                                  min_samples_split=0.2, max_features = 'sqrt')
# LinearNonlinear is a project transformer from customTransformers; how it
# combines the two estimators is defined there.
regressor = LinearNonlinear(lin=linreg,nonlin=nonlinreg)
l_nl_pipe = Pipeline([
        ('features', all_feats), # features
        ('regressor', regressor) # Ridge + RandomForest
    ])

# Fit on the raw LOS column as a 1-D target.
# NOTE(review): if LOS can contain missing values, fit on the median-imputed
# y_train_imp.ravel() instead.
l_nl_est4 = l_nl_pipe.fit(X_train,X_train['LOS'].to_numpy().ravel())

## gradient boosting

In [11]:
# Gradient-boosting model: demographic features plus TF-IDF text features,
# with the vectorizer, TF-IDF norm and booster options tuned together in a
# 5-fold grid search. The target is the LOS column of X_train.
num_cols = ['ADMIT_AGE']
cols = [
    'GENDER', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'INSURANCE',
    'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
]

# Demographics: standardised numeric column + one-hot encoded categoricals.
ohe = ColumnTransformer(
    transformers=[('categorical', OneHotEncoder(handle_unknown='ignore'), cols)]
)
demog_feats = FeatureUnion(transformer_list=[
    ('stdscl', ColumnTransformer(transformers=[('numerical', StandardScaler(), num_cols)])),
    ('ohe', ohe),
])

# Text: project transformers prepare the columns, then counts -> TF-IDF.
# Presumably DIAGNOSIS and TEXT are merged into a DIAGNOSIS_TEXT column that
# is then selected — confirm against customTransformers.
text_feats = Pipeline(steps=[
    ('dft', DiagnosisFrameTransformer(['DIAGNOSIS'])),
    ('cmt', ColumnMergeTransformer(['DIAGNOSIS', 'TEXT'])),
    ('cst', ColumnSelectTransformer('DIAGNOSIS_TEXT')),
    ('count', CountVectorizer()),
    ('tfid', TfidfTransformer()),
])

feats_union = FeatureUnion(transformer_list=[
    ('demog_feats', demog_feats),
    ('text_feats', text_feats),
])

reg_pipe = Pipeline(steps=[
    ('features', feats_union),
    ('reg', GradientBoostingRegressor()),
])

# 3 * 2 * 2 * 2 * 2 * 2 = 96 candidate settings.
reg_params = dict(
    features__text_feats__count__max_features=[2500, 5000, 10000],
    features__text_feats__count__min_df=[0.00001, 0.0001],
    features__text_feats__count__ngram_range=[(1, 1), (1, 2)],
    features__text_feats__tfid__norm=['l1', 'l2'],
    reg__loss=['ls', 'huber'],
    reg__criterion=['mse', 'friedman_mse'],
)

# Despite the "classifier" in the name, this search wraps a regressor.
lin_gs_classifier = GridSearchCV(
    estimator=reg_pipe,
    param_grid=reg_params,
    cv=KFold(n_splits=5, shuffle=True),
    verbose=3,
    n_jobs=7,
)
lin_est = lin_gs_classifier.fit(X_train, X_train['LOS'])

# Show the best parameter set together with its cross-validated score.
lin_est.best_params_, lin_est.best_score_

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  18 tasks      | elapsed: 24.7min
[Parallel(n_jobs=7)]: Done 114 tasks      | elapsed: 154.1min
[Parallel(n_jobs=7)]: Done 274 tasks      | elapsed: 388.2min
[Parallel(n_jobs=7)]: Done 480 out of 480 | elapsed: 737.9min finished


({'features__text_feats__count__max_features': 10000,
  'features__text_feats__count__min_df': 0.0001,
  'features__text_feats__count__ngram_range': (1, 2),
  'features__text_feats__tfid__norm': 'l2',
  'reg__criterion': 'mse',
  'reg__loss': 'ls'},
 0.41783777928818394)

In [12]:
# Save the gradient-boosting search (diagnoses + text columns) together with
# the column lists and transformers needed to reuse it.
file = './data/pickle/models/gradientboost__LOS__20210215.pkl'
model_data = {
    'numeric_cols': num_cols,
    'categorical_cols': cols,
    'diagnosis_col': ['DIAGNOSIS'],
    'ohe_categoricals': ohe,
    # NOTE(review): this key stores text_feats (the text pipeline), not the
    # combined feats_union — confirm that is what consumers expect.
    'feature_union': text_feats,
    'gridsearch': lin_gs_classifier,
    'estimator': lin_est
}
# Use a context manager so the file handle is flushed and closed even if
# pickling fails part-way through.
with open(file, 'wb') as model_file:
    pickle.dump(model_data, model_file)

In [3]:
# Load a previously saved model bundle. This is the lin-nonlin artefact dated
# 20210131, not the gradient-boosting file written above.
# Only unpickle files from a trusted source: pickle.load can execute
# arbitrary code embedded in the file.
file = './data/pickle/models/lin-nonlin__LOS__20210131.pkl'
# The context manager closes the file handle as soon as loading finishes.
with open(file, 'rb') as model_file:
    model_data = pickle.load(model_file)

In [5]:
# List the entries stored in the loaded model bundle.
model_data.keys()

dict_keys(['numeric_cols', 'categorical_cols', 'diagnosis_col', 'ohe_categoricals', 'feature_union', 'count_vectorizor', 'estimator'])

In [6]:
# Read the pickled test split; the stored object is unpacked into two names,
# X_test and y_test.
test_pickle_path = './data/pickle/preproc/df_patient_admit_icu__20210130_singleICUSTAY_TEST_final.pkl'
X_test, y_test = pd.read_pickle(test_pickle_path)

In [7]:
# Spot-check: predict for a single test row, chosen by position.
ix = 28

# Rebuild row `ix` as a fresh one-row DataFrame with index 0.
# to_dict() on the transposed row yields {column: {row label: value}} ...
foo = pd.DataFrame(X_test.iloc[ix]).T.to_dict()
# ... which is flattened to {column: value}.
bar = {
    column: value
    for column, cells in foo.items()
    for value in cells.values()
}
xx_test = pd.DataFrame(bar, index=[0])

# Keep only the first (and only) prediction.
predictions = model_data['estimator'].predict(xx_test)
y_pred = predictions[0]

In [8]:
# Show the model's prediction for the selected test row.
y_pred

[3.4874231435949032]

In [9]:
# Show the recorded LOS value of the same test row, for comparison with y_pred.
X_test['LOS'].iloc[ix]

4.515