 # Urgency Score

In [3]:
import pandas as pd
import numpy as np
import pickle

In [4]:
# Load the preprocessed training split; the pickle unpacks into features and labels.
# NOTE: unpickling can execute arbitrary code -- only read files from a trusted source.
train_pickle_path = './data/pickle/preproc/df_patient_admit_icu_notes__20210206_singleICUSTAY_TRAIN_final.pkl'
X_train, y_train = pd.read_pickle(train_pickle_path)

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from customTransformers import (ColumnSelectTransformer, DiagnosisFrameTransformer, 
                                EstimatorTransformer, LinearNonlinear, ColumnMergeTransformer
                               )
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import plot_roc_curve, plot_confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import plot_confusion_matrix
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

In [6]:
# Demographic inputs: the categorical columns are one-hot encoded and the
# numeric column is standardised; the FeatureUnion concatenates both outputs.
cols = [
    'GENDER',
    'ADMISSION_TYPE',
    'ADMISSION_LOCATION',
    'INSURANCE',
    'LANGUAGE',
    'RELIGION',
    'MARITAL_STATUS',
    'ETHNICITY',
]
num_cols = ['ADMIT_AGE']

# handle_unknown='ignore': categories not seen during fit do not raise at
# transform time.
ohe = ColumnTransformer(
    transformers=[('categorical', OneHotEncoder(handle_unknown='ignore'), cols)]
)
demog_feats = FeatureUnion(
    transformer_list=[
        ('stdscl', ColumnTransformer(transformers=[('numerical', StandardScaler(), num_cols)])),
        ('ohe', ohe),
    ]
)

In [7]:
# Pre-process the DIAGNOSIS column of the training frame.
# NOTE(review): this rebinds X_train to its own transformed version, so
# re-running the cell applies the transformer again; the same transformer is
# also the first step of the text pipeline defined later -- confirm that
# applying it more than once is harmless.
diagnosis_transformer = DiagnosisFrameTransformer(['DIAGNOSIS'])
X_train = diagnosis_transformer.fit_transform(X_train)

In [8]:
# Text branch: custom frame transformers followed by bag-of-words counts and
# TF-IDF weighting.
# NOTE(review): presumably 'cmt' merges DIAGNOSIS and TEXT into the
# 'DIAGNOSIS_TEXT' column that 'cst' then selects -- confirm in customTransformers.
text_feats = Pipeline(
    steps=[
        ('dft', DiagnosisFrameTransformer(['DIAGNOSIS'])),
        ('cmt', ColumnMergeTransformer(['DIAGNOSIS', 'TEXT'])),
        ('cst', ColumnSelectTransformer('DIAGNOSIS_TEXT')),
        ('count', CountVectorizer()),
        ('tfid', TfidfTransformer()),
    ]
)

# Demographic and text features side by side.
feats_union = FeatureUnion(
    transformer_list=[
        ('demog_feats', demog_feats),
        ('text_feats', text_feats),
    ]
)

# Full model: feature extraction feeding a logistic regression fitted with the
# 'saga' solver (required for the elastic-net penalty set in the grid below).
lin_pipe = Pipeline(
    steps=[
        ('features', feats_union),
        ('reg', LogisticRegression(solver='saga')),
    ]
)

# Single-candidate grid: each hyper-parameter is pinned to one value.
lin_params = dict(
    features__text_feats__count__max_features=[11000],
    features__text_feats__count__min_df=[5e-05],
    features__text_feats__count__ngram_range=[(1, 2)],
    features__text_feats__tfid__norm=['l2'],
    reg__C=[5],
    reg__class_weight=[None],
    reg__l1_ratio=[0.1],
    reg__multi_class=['ovr'],
    reg__penalty=['elasticnet'],
)

# Wider grid kept for reference (disabled):
# lin_params = {
#     'features__text_feats__count__max_features': [10500, 11000, 12000],
#     'features__text_feats__count__min_df': [0.00001, 0.00005],
#     'features__text_feats__count__ngram_range': [(1,2)],
#     'features__text_feats__tfid__norm': ['l2'],
#     'reg__penalty': ['elasticnet'],
#     'reg__C': [1, 3, 5],
#     'reg__class_weight': [None],
#     'reg__multi_class': ['ovr'],
#     'reg__l1_ratio': [0, 0.1, 0.2]
# }

In [9]:
# Turn off pandas' chained-assignment warning for the rest of the session.
pd.set_option('mode.chained_assignment', None)

# Encode the urgency labels as ordinals, in the order listed here
# (stable=0 ... immediate=3).
ordenc = OrdinalEncoder(categories=[['stable', 'questionable', 'urgent', 'immediate']])
y_train_enc = ordenc.fit_transform(pd.DataFrame(y_train))

# Fixed seed so the shuffled folds -- and therefore the CV score -- are
# reproducible across runs.
CV_SEED = 42
lin_gs_classifier = GridSearchCV(
    lin_pipe,
    lin_params,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=CV_SEED),
    verbose=3,
    n_jobs=14,
)

# OrdinalEncoder returns an (n_samples, 1) array; classifiers expect a 1-d
# target, and passing the column vector triggers a DataConversionWarning.
# y_train_enc itself is left 2-d; only a flattened view is handed to fit().
lin_est = lin_gs_classifier.fit(X_train, y_train_enc.ravel())

lin_est.best_params_, lin_est.best_score_

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=14)]: Using backend LokyBackend with 14 concurrent workers.
[Parallel(n_jobs=14)]: Done   2 out of   5 | elapsed: 29.7min remaining: 44.5min
[Parallel(n_jobs=14)]: Done   5 out of   5 | elapsed: 30.0min finished
  return f(**kwargs)


({'features__text_feats__count__max_features': 11000,
  'features__text_feats__count__min_df': 5e-05,
  'features__text_feats__count__ngram_range': (1, 2),
  'features__text_feats__tfid__norm': 'l2',
  'reg__C': 5,
  'reg__class_weight': None,
  'reg__l1_ratio': 0.1,
  'reg__multi_class': 'ovr',
  'reg__penalty': 'elasticnet'},
 0.6453629946778363)

In [10]:
# Persist the fitted model together with its preprocessing objects
# (variant trained on diagnoses + text columns).
file = './data/pickle/models/log__URGENCY__20210216_withTEXT_diagnNoNumerics.pkl'

# Fit the stand-alone feature objects on the training frame so they can be
# reused after loading.
# NOTE(review): GridSearchCV fits clones of the pipeline, so these objects are
# fitted here with their constructor defaults, not with the grid-search
# parameters (e.g. max_features / ngram_range); the tuned, fitted feature steps
# live inside lin_est.best_estimator_. Confirm consumers of 'feature_union' /
# 'text_vect' do not expect them to match the estimator's feature space.
ohe.fit(X_train)
feats_union.fit(X_train)
text_feats.fit(X_train)

model_data = {
    'numeric_cols': num_cols,
    'categorical_cols': cols,
    'diagnosis_col': ['DIAGNOSIS'],
    'ohe_categoricals': ohe,
    'feature_union': feats_union,
    'text_vect': text_feats,
    'ord_enc_y': ordenc,
    'gridsearch': lin_gs_classifier,
    'estimator': lin_est
}

# Context manager guarantees the file handle is flushed and closed, even if
# pickling fails part-way through.
with open(file, 'wb') as model_file:
    pickle.dump(model_data, model_file)