In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Read the preprocessed training pickle; the stored object is unpacked into
# exactly two names, X_train and y_train.
X_train, y_train = pd.read_pickle(
    './data/pickle/preproc/df_patient_admit_icu__20210130_singleICUSTAY_TRAIN_final.pkl'
)

## SAMEDAY ICU

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from customTransformers import ColumnSelectTransformer, DiagnosisFrameTransformer, EstimatorTransformer, LinearNonlinear
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import plot_roc_curve, plot_confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import plot_confusion_matrix
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Demographic model: standardised age plus one-hot-encoded categorical
# admission attributes, fed into a liblinear logistic regression.
cols = ['GENDER','ADMISSION_TYPE','ADMISSION_LOCATION','INSURANCE','LANGUAGE',
       'RELIGION','MARITAL_STATUS','ETHNICITY']

num_cols = ['ADMIT_AGE']
demog_feats = FeatureUnion([
    # Numeric branch: scale the age column.
    ('stdscl', ColumnTransformer([('numerical', StandardScaler(), num_cols)])),
    # Categorical branch: categories unseen during fit are encoded as all zeros
    # instead of raising at predict time.
    ('ohe',  ColumnTransformer([('categorical', OneHotEncoder(handle_unknown='ignore'), cols)]))
])
demog_pipe = Pipeline([
    ('features', demog_feats),
    ('reg', LogisticRegression(solver='liblinear'))
])
demog_params = {
    # liblinear only implements the l1 and l2 penalties; listing 'none' makes
    # every fit for that grid point fail.
    'reg__penalty': ['l1', 'l2'],
    'reg__C': [0.01, 0.1, 1, 10],
    # "No class weighting" is the None object; the string 'none' is rejected
    # by scikit-learn.
    'reg__class_weight': [None, 'balanced'],
}

# The grid search below is left disabled, exactly as in the original cell.
# lin_gs_regressor = GridSearchCV(demog_pipe, demog_params, cv=StratifiedKFold(n_splits=5, shuffle=True),verbose=3,n_jobs=-1)
# lin_est = lin_gs_regressor.fit(X_train,X_train['SAMEDAY_ADM_TO_ICU'])

In [None]:
# Show the winning hyper-parameters and the cross-validated score of the
# demographic grid search.
# NOTE(review): `lin_est1` is not assigned in any visible cell (the only similar
# name, `lin_est`, is commented out), so this cell relies on hidden kernel
# state -- confirm where it comes from or remove the cell.
(lin_est1.best_params_, lin_est1.best_score_)

In [None]:
# Diagnosis-text model: the DIAGNOSIS column is passed through the project's
# ColumnSelectTransformer / DiagnosisFrameTransformer (presumably yielding one
# text document per row -- verify in customTransformers), turned into TF-IDF
# weighted token counts and classified with a random forest.
diagn_pipe = Pipeline([
    ('cst', ColumnSelectTransformer(['DIAGNOSIS'])),
    ('dst', DiagnosisFrameTransformer()),
    ('count', CountVectorizer(max_features=3000)),
    ('tfid', TfidfTransformer()),
    # Fixed seed so repeated runs of the search rank candidates identically.
    ('rfc', RandomForestClassifier(random_state=42))
])

diagn_params = {
    'rfc__criterion': ['gini', 'entropy'],
    # 'auto' was an alias of 'sqrt' for classifiers, so listing both only
    # repeated the same fits; it is dropped here.
    'rfc__max_features': ['sqrt', 'log2'],
    'rfc__min_samples_split': [2, 5, 8, 11],
    'rfc__min_samples_leaf': [1, 4, 7],
    'rfc__class_weight': ['balanced', 'balanced_subsample'],
}

# Shuffled stratified 5-fold CV; seeded so the fold assignment is reproducible.
diagn_gs_regressor = GridSearchCV(
    diagn_pipe,
    diagn_params,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    verbose=3,
    n_jobs=10,
)
# The target is taken from the training frame itself; the pipeline's first step
# selects only the DIAGNOSIS column as model input.
tfidf_est = diagn_gs_regressor.fit(X_train, X_train['SAMEDAY_ADM_TO_ICU'])

In [None]:
# Best hyper-parameter combination and its mean cross-validated score from the
# diagnosis-text grid search.
(tfidf_est.best_params_, tfidf_est.best_score_)

In [None]:
# Stacked model for SAMEDAY_ADM_TO_ICU: a demographic logistic regression and a
# diagnosis-text random forest are wrapped as transformers, and their outputs
# are the inputs of a gradient-boosting classifier that is grid-searched below.

# Demographic base model with fixed hyper-parameters.
demog_est1c = LogisticRegression(solver='liblinear', C=0.1, class_weight='balanced', penalty='l2')
demog_pipe1c = Pipeline([
    # `demog_feats` is the feature union built in the demographic cell; the
    # previously referenced `demog_feats1` is not defined anywhere visible.
    ('features', demog_feats),
    ('reg', demog_est1c)
])

# Diagnosis-text base model with fixed hyper-parameters (seeded for reproducibility).
diagn_est = RandomForestClassifier(class_weight='balanced_subsample', criterion='entropy',
                                   max_features='log2', min_samples_leaf=1,
                                   min_samples_split=2, random_state=42)
diagn_pipe = Pipeline([
    ('cst', ColumnSelectTransformer(['DIAGNOSIS'])),
    ('dst', DiagnosisFrameTransformer()),
    ('count', CountVectorizer(max_features=3000)),
    ('tfid', TfidfTransformer()),
    ('rfc', diagn_est)
])

# FeatureUnions use the same syntax as Pipelines.
# The union wraps the tuned `demog_pipe1c` built above; the original passed the
# default-parameter `demog_pipe`, leaving the tuned pipeline unused.
union = FeatureUnion([
    ('demog', EstimatorTransformer(demog_pipe1c)),
    ('diagn', EstimatorTransformer(diagn_pipe))
])


# The step is called 'xgb' but is scikit-learn's GradientBoostingClassifier;
# the name is kept because the parameter-grid keys depend on it.
full_pipe = Pipeline([
    ('union', union),
    ('xgb', GradientBoostingClassifier(random_state=42))
])

xgb_params = {
    'xgb__loss': ['deviance', 'exponential'],
    'xgb__learning_rate': [0.05, 0.1, 0.2],
    # subsample must lie in (0, 1]; the former 1.2 entry made a third of the
    # grid fail at fit time.
    'xgb__subsample': [0.8, 1.0],
    'xgb__criterion': ['mse', 'friedman_mse'],
    'xgb__min_samples_leaf': [1, 4, 7],
    'xgb__max_depth': [2, 3, 4, 5],
    'xgb__max_features': ['auto', 'log2', None]
}

# Shuffled stratified 5-fold CV, seeded so fold assignment is reproducible.
xgb_gs_classifier = GridSearchCV(
    full_pipe,
    xgb_params,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    verbose=3,
    n_jobs=12,
)
# Fit the search object defined just above (the original called the undefined
# name `xgb_gs_classifier1c`).
xgb_est = xgb_gs_classifier.fit(X_train, X_train['SAMEDAY_ADM_TO_ICU'])

xgb_est.best_params_, xgb_est.best_score_

In [None]:
# Persist the two base pipelines and the fitted grid search as one tuple.
file = './data/pickle/models/log_rf_xgb__SAMEDAY__20210127.pkl'
# The names below are the ones the stacking cell actually defines; the original
# referenced `diagn_pipe1c` and `xgb_est1c`, which do not exist.
# NOTE(review): GridSearchCV fits clones of its estimator, so `demog_pipe1c` and
# `diagn_pipe` are presumably still unfitted here, with the fitted copies living
# inside `xgb_est.best_estimator_` -- confirm this is what loaders expect.
# The context manager closes the file handle even if pickling fails.
with open(file, 'wb') as model_file:
    pickle.dump((demog_pipe1c, diagn_pipe, xgb_est), model_file)