In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_pickle('./data/pickle/preproc/df_patient_admit_icu__20210119.pkl')

In [3]:
import re
from nltk.corpus import stopwords
stopWords = stopwords.words('english')
stopWords.extend(['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z'])
stopWords = set(stopWords)

def remove_stopwords(diagnosis_list):
    return [d for d in diagnosis_list if d not in stopWords]

def cleanup_raw_diagnoses(series):
    series = series.str.strip()
    
    cleaned = []
    regex_split = re.compile(r'[\|/;|,]')
    regex_sub1 = re.compile(r"[\|/\.-]+")
    for ix,diagnosis in enumerate(series):
        if pd.isna(diagnosis):
            diagnosis = ''
        
        diagnosis = diagnosis.replace('\\',' ')
        diagnosis = diagnosis.replace("'",' ')
        diagnosis_list = regex_split.split(diagnosis)
        diagnosis_list = [d.strip() for d in diagnosis_list]
        diagnosis_list = remove_stopwords(diagnosis_list)
        
        diagnosis = ' '.join(diagnosis_list)
        diagnosis = regex_sub1.sub(' ', diagnosis)
        cleaned.append(diagnosis)
        
    return cleaned

In [4]:
cleaned = cleanup_raw_diagnoses(df['DIAGNOSIS'])

In [5]:
from nltk import word_tokenize

def diagnosis_tokenizer(diagnosis):
    return word_tokenize(diagnosis)

In [6]:
tokens_list = []
for diagn in cleaned:
    tokens_list.extend(diagnosis_tokenizer(diagn))

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

pipe = Pipeline([('count', CountVectorizer(max_features=3000)),
                 ('tfid', TfidfTransformer(sublinear_tf=True))])

pipe.fit(tokens_list)

Pipeline(steps=[('count', CountVectorizer(max_features=3000)),
                ('tfid', TfidfTransformer(sublinear_tf=True))])

In [8]:
tfidf_names=pipe.named_steps['count'].get_feature_names()

In [9]:
pipe['tfid'].idf_

array([11.69512168, 11.98280375, 11.98280375, ..., 12.38826886,
       11.98280375, 11.0019745 ])

In [11]:
tfidf_vals = pipe['tfid'].idf_
sorted_tfids = sorted(zip(tfidf_names,tfidf_vals),key=lambda x: -x[1])

In [12]:
vec = CountVectorizer(max_features=3000)
vec_fit = vec.fit_transform(tokens_list)

In [13]:
count_names = vec.get_feature_names();    
count_vals = np.asarray(vec_fit.sum(axis=0))[0]
sorted_counts = sorted(zip(count_names,count_vals),key=lambda x: -x[1])

In [None]:
from bokeh.plotting import ColumnDataSource, figure, output_notebook,show
from bokeh.layouts import row, column
from bokeh.models import CustomJS, Slider
import numpy as np
output_notebook()

In [None]:
i=7
ix1 = 0
ix2 = 100

foo = [w[0] for w in sorted_counts[ix1:ix2]]
bar = [c[1] for c in sorted_counts[ix1:ix2]]

p1 = figure(x_range=foo,background_fill_color="#fafafa")
p1.vbar(x=foo, top=bar, width=0.9)
p1.xaxis.major_label_orientation = 'vertical'

baz = [w[0] for w in sorted_tfids[ix1:ix2]]
bam = [c[1] for c in sorted_tfids[ix1:ix2]]

p2 = figure(x_range=baz,background_fill_color="#fafafa")
p2.vbar(x=baz, top=bam, width=0.9)
p2.xaxis.major_label_orientation = 'vertical'

show(row(p1,p2))

In [None]:
print(foo)
print(baz)

In [None]:
df.columns

In [14]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from customTransformers import ColumnSelectTransformer, DiagnosisFrameTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

count_pipe = Pipeline([
    ('cst', ColumnSelectTransformer(['DIAGNOSIS'])),
    ('dst', DiagnosisFrameTransformer()),
    ('count',CountVectorizer(max_features=3000)),
    ('rfc', RandomForestClassifier())
])

tfidf_pipe = Pipeline([
    ('cst', ColumnSelectTransformer(['DIAGNOSIS'])),
    ('dst', DiagnosisFrameTransformer()),
    ('count',CountVectorizer(max_features=3000)),
    ('tfid', TfidfTransformer()),
    ('rfc', RandomForestClassifier())
])

params = {
    'rfc__criterion': ['gini','entropy'],
    'rfc__max_features': ['auto', 'sqrt', 'log2'],
    'rfc__min_samples_split': [2,5,8,11],
    'rfc__min_samples_leaf': [1,4,7],
    'rfc__class_weight': ['balanced', 'balanced_subsample'],
}

In [23]:
gs_regressor = GridSearchCV(count_pipe, params, cv=KFold(n_splits=5, shuffle=True),verbose=3,n_jobs=-1)
count_est = gs_regressor.fit(df,df['SAMEDAY_ADM_TO_ICU'])

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 16.5min finished


In [15]:
params = {
    'rfc__criterion': ['entropy'],
    'rfc__max_features': ['auto'],
    'rfc__min_samples_split': [2],
    'rfc__min_samples_leaf': [1],
    'rfc__class_weight': ['balanced_subsample'],
}

In [16]:
gs_regressor = GridSearchCV(tfidf_pipe, params, cv=KFold(n_splits=5, shuffle=True),verbose=3,n_jobs=-1)
tfidf_est = gs_regressor.fit(df,df['SAMEDAY_ADM_TO_ICU'])

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   43.2s remaining:  1.1min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   43.7s finished


In [25]:
count_est.best_params_,count_est.best_score_

({'rfc__class_weight': 'balanced_subsample',
  'rfc__criterion': 'entropy',
  'rfc__max_features': 'auto',
  'rfc__min_samples_leaf': 1,
  'rfc__min_samples_split': 2},
 0.730375594554699)

In [17]:
tfidf_est.best_params_,tfidf_est.best_score_

({'rfc__class_weight': 'balanced_subsample',
  'rfc__criterion': 'entropy',
  'rfc__max_features': 'auto',
  'rfc__min_samples_leaf': 1,
  'rfc__min_samples_split': 2},
 0.7362965392816139)

In [18]:
from sklearn.metrics import confusion_matrix
y_pred = tfidf_est.predict(df)

In [19]:
confusion_matrix(df['SAMEDAY_ADM_TO_ICU'],y_pred)

array([[10847,  3264],
       [ 8951, 37908]])

In [2]:
import pandas as pd
import numpy as np

In [3]:
X_train, y_train = pd.read_pickle('./data/pickle/preproc/df_patient_admit_icu__20210126_TRAIN_final.pkl')

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from customTransformers import ColumnSelectTransformer, DiagnosisFrameTransformer, EstimatorTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [16]:
## TRYING TO IMPLEMENT 4 OVR CLASSIFICATION SYSTEMS FOR ICU_URGENCY.
# 1. CREATE AND TUNE OVR for demographics
# 2. use multinomial classifier for diagnoses
# 3. use all 5 outputs as features in a second multinomial to pool results

cols = ['GENDER','ADMISSION_TYPE','ADMISSION_LOCATION','INSURANCE','LANGUAGE',
       'RELIGION','MARITAL_STATUS','ETHNICITY','ADMIT_AGE']

demog_feats = FeatureUnion([
    ('stdscl', ColumnTransformer([('numerical', StandardScaler(), ['ADMIT_AGE'])])),
    ('ohe',  ColumnTransformer([('categorical', OneHotEncoder(handle_unknown='ignore'), cols)]))
])

demog_pipe = Pipeline([
    ('features', demog_feats),
    ('reg', LogisticRegression(solver='saga'))
])
demog_params = {
    'reg__penalty': ['l1','l2','none'],
    'reg__C': [0.01, 0.1, 1, 10],
    'reg__class_weight': ['none','balanced'],
    'reg__multi_class': ['ovr','multinomial'],
}

lin_gs_regressor = GridSearchCV(demog_pipe, demog_params, cv=StratifiedKFold(n_splits=5, shuffle=True),verbose=3,n_jobs=4)
lin_est = lin_gs_regressor.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   31.8s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  2.5min
[Parallel(n_jobs=4)]: Done 240 out of 240 | elapsed:  7.0min finished


In [17]:
lin_est.best_params_,lin_est.best_score_

({'reg__C': 0.1,
  'reg__class_weight': 'none',
  'reg__multi_class': 'ovr',
  'reg__penalty': 'l2'},
 0.6278292394917798)

In [18]:
diagn_pipe = Pipeline([
    ('cst', ColumnSelectTransformer(['DIAGNOSIS'])),
    ('dst', DiagnosisFrameTransformer()),
    ('count',CountVectorizer(max_features=3000)),
    ('tfid', TfidfTransformer()),
    ('rfc', RandomForestClassifier())
])

diagn_params = {
    'rfc__criterion': ['gini','entropy'],
    'rfc__max_features': ['auto', 'sqrt', 'log2'],
    'rfc__min_samples_split': [2,5,8,11],
    'rfc__min_samples_leaf': [1,4,7],
    'rfc__class_weight': ['balanced', 'balanced_subsample'],
}

diagn_gs_regressor = GridSearchCV(diagn_pipe, diagn_params, cv=StratifiedKFold(n_splits=5, shuffle=True),verbose=3,n_jobs=4)
tfidf_est = diagn_gs_regressor.fit(X_train,y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  4.3min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed: 11.3min
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed: 27.3min
[Parallel(n_jobs=4)]: Done 504 tasks      | elapsed: 50.2min
[Parallel(n_jobs=4)]: Done 720 out of 720 | elapsed: 69.7min finished


In [19]:
tfidf_est.best_params_,tfidf_est.best_score_

({'rfc__class_weight': 'balanced',
  'rfc__criterion': 'gini',
  'rfc__max_features': 'sqrt',
  'rfc__min_samples_leaf': 1,
  'rfc__min_samples_split': 2},
 0.58750192130317)

In [29]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder

demog_est = LogisticRegression(solver='saga', C=0.1, class_weight='none', 
                               penalty='l2', multi_class='ovr')
demog_pipe = Pipeline([
    ('features', demog_feats),
    ('reg', demog_est)
])

diagn_est = RandomForestClassifier(class_weight='balanced',criterion='gini',
                                  max_features='sqrt',min_samples_leaf=1,
                                  min_samples_split=2)
diagn_pipe = Pipeline([
    ('cst', ColumnSelectTransformer(['DIAGNOSIS'])),
    ('dst', DiagnosisFrameTransformer()),
    ('count',CountVectorizer(max_features=3000)),
    ('tfid', TfidfTransformer()),
    ('rfc', diagn_est)
])

union = FeatureUnion([
    ('demog', EstimatorTransformer(demog_pipe)),# FeatureUnions use the same syntax as Pipelines
    ('diagn',EstimatorTransformer(diagn_pipe))
])


full_pipe = Pipeline([
    ('union',union),
    ('xgb', GradientBoostingClassifier())
])

xgb_params = {
    'xgb__loss': ['deviance','exponential'],
    'xgb__learning_rate': [0.05, 0.1, 0.2],
    'xgb__subsample': [0.8, 1], # max is 1
    'xgb__criterion': ['friedman_mse'],
    'xgb__min_samples_leaf': [1, 4, 7],
    'xgb__max_depth': [2,3,4],
    'xgb__max_features': ['auto']
}

xgb_gs_regressor = GridSearchCV(full_pipe, xgb_params, cv=StratifiedKFold(n_splits=3, shuffle=True),verbose=3,n_jobs=4)
xgb_est = xgb_gs_regressor.fit(X_train,LabelEncoder().fit(y_train).transform(y_train))

xgb_est.best_params_,xgb_est.best_score_

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  7.6min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed: 34.4min
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed: 58.0min
[Parallel(n_jobs=4)]: Done 324 out of 324 | elapsed: 64.0min finished


({'xgb__criterion': 'friedman_mse',
  'xgb__learning_rate': 0.05,
  'xgb__loss': 'deviance',
  'xgb__max_depth': 3,
  'xgb__max_features': 'auto',
  'xgb__min_samples_leaf': 7,
  'xgb__subsample': 0.8},
 0.6116328951484848)

In [31]:
X_test, y_test = pd.read_pickle('./data/pickle/preproc/df_patient_admit_icu__20210126_TEST_final.pkl')

In [38]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

y_test_ = LabelEncoder().fit(y_test).transform(y_test)
y_pred = xgb_est.predict(X_test)
print(confusion_matrix(y_test_,y_pred))
print(accuracy_score(y_test_,y_pred))
print(f1_score(y_test_,y_pred, average='weighted'))

[[5963  528   19  224]
 [ 828  553   71  221]
 [ 731  243   72  103]
 [1255  350   63  970]]
0.6198130227980975
0.5807983189727202


In [35]:
len(y_test_)

48776

In [39]:
import pickle
file = './data/pickle/models/log_rf_xgb__URGENCY__20210127.pkl'
pickle.dump((demog_pipe, diagn_pipe, xgb_est),open(file,'wb'))