In [34]:
import boto3
import pandas as pd
import numpy as np
from pathlib import Path
import json
import re
import pickle

from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL

from sklearn.ensemble import forest, gradient_boosting
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.feature_selection import SelectFromModel, SelectKBest

import mlflow
from mlflow import log_metric, log_param, log_artifact
from mlflow.sklearn import log_model
from sklearn.model_selection import ParameterGrid
#import shap

In [2]:
train_start_date = '2017-01-01'
train_end_date = '2018-05-31'
valid_end_date = '2019-10-31'
test_end_date = '2019-06-25'

In [3]:
with open('/code/data/processed/full_notes.pickle', 'rb') as f:
    notes_df = pickle.load(f)

In [4]:
mask = notes_df['provider'] == 'avante'

In [5]:
# notes_df['provider'] = 'avante'
avante_note_text_mask = (notes_df['Section'] == 'Note Text') & (notes_df['provider'] == 'avante')
avante_notes_df = notes_df.loc[avante_note_text_mask,].copy()

In [6]:
master_patient_ids = pd.read_parquet('s3://saiva-restricted-data/raw/avante/master_patient_lookup.parquet')

In [7]:
avante_notes_df = avante_notes_df.merge(master_patient_ids, left_on=['PatientID', 'FacilityID'], right_on=['patientid', 'facilityid'], how='left')

In [None]:
train_y = pd.read_pickle('/code/data/processed/avante/03-train_target_3_day.pickle')
valid_y = pd.read_pickle('/code/data/processed/avante/03-valid_target_3_day.pickle')

In [None]:
len(train_y)

In [8]:
train_x = pd.read_pickle('/code/data/processed/avante/03-train_x.pickle')

In [24]:
valid_x = pd.read_pickle('/code/data/processed/avante/03-valid_x.pickle')
test_x = pd.read_pickle('/code/data/processed/avante/03-test_x.pickle')
# train_y = pd.read_pickle('/code/data/processed/train_y.pickle')

In [9]:
train_y = pd.read_pickle('/code/data/processed/avante/03-train_target_3_day.pickle')

In [10]:
training_cols = list(train_x.columns)

In [11]:
len(train_x)

1174744

In [12]:
avante_notes_df['censusdate'] = avante_notes_df['CreatedDate'].dt.normalize()

In [13]:
notes_patient_days = avante_notes_df.groupby(['masterpatientid', 'censusdate'])

embedding_cols = [c for c in avante_notes_df.columns if c.startswith('e_')]

note_aggs = notes_patient_days[embedding_cols].sum()
note_aggs.reset_index(inplace=True)

In [14]:
# note_aggs['masterpatientid'] = note_aggs['masterpatientid'].apply(lambda x: f'avante_{int(x)}')

In [15]:
train_ids = pd.read_pickle('/code/data/processed/avante/03-train_idens.pickle')
# valid_ids = pd.read_pickle('/code/data/processed/avante/03-valid_idens.pickle')
# test_ids = pd.read_pickle('/code/data/processed/avante/03-test_idens.pickle')

In [16]:
print('train_x', len(train_x))
print('train_ids', len(train_ids))

train_x 1174744
train_ids 1174744


In [17]:
train_x = pd.concat([train_ids.reset_index(drop=True), train_x.reset_index(drop=True)], axis=1)
# valid_x = pd.concat([valid_ids.reset_index(drop=True), valid_x.reset_index(drop=True)], axis=1)
# test_x = pd.concat([test_ids.reset_index(drop=True), test_x.reset_index(drop=True)], axis=1)

In [18]:
train_x = train_x.merge(note_aggs, on=['masterpatientid', 'censusdate'], how='left')
# valid_x = valid_x.merge(note_aggs, on=['masterpatientid', 'censusdate'], how='left')
# test_x = test_x.merge(note_aggs, on=['masterpatientid', 'censusdate'], how='left')

In [19]:
train_x['y'] = train_y

In [20]:
len(train_x)

1174744

In [21]:
valid_x = train_x.loc[train_x['censusdate'] > train_end_date].copy()
train_x = train_x.loc[train_x['censusdate'] <= valid_end_date].copy()

In [22]:
print('train', train_x['censusdate'].min(), train_x['censusdate'].max())
print('valid', valid_x['censusdate'].min(), valid_x['censusdate'].max())
# print('test', test_x['censusdate'].max())

train 2017-01-01 00:00:00 2018-10-31 00:00:00
valid 2018-06-01 00:00:00 2018-10-31 00:00:00


In [23]:
print('train', train_x['e_29'].isna().mean())
print('valid', valid_x['e_29'].isna().mean())
# print('test', test_x['e_29'].isna().mean())

train 0.30260465258813835
valid 0.2920457106067922


In [None]:
# test_x = test_x.loc[test_x['censusdate']<='2019-06-25', ] # Drop the last day of test data because we don't have notes for that day

In [24]:
train_y = train_x['y'].copy()
valid_y = valid_x['y'].copy()
train_ids = train_x[train_ids.columns].copy()
valid_ids = valid_x[train_ids.columns].copy()

In [25]:
training_cols = training_cols + embedding_cols

In [26]:
train_x = train_x[training_cols]
valid_x = valid_x[training_cols]

In [27]:
def fill_na_train(df):
    has_na = df.isna().sum() > 0
    d = df.loc[:, has_na].median()
    df = df.fillna(d)
    
    return df, d

def fill_na_valid(df, na_filler):
    return df.fillna(na_filler)

In [28]:
# fill in any remaining na's - now that we're not forwardfilling past info it's not correct to use a global imputation
# hence we impute on the train and apply to the valid
train_x, na_filler = fill_na_train(train_x)
valid_x = fill_na_valid(valid_x, na_filler)

In [29]:
param_grid = {
    'n_estimators':[48],
    'feat_select_threshold': ['64*median'],
    'max_features': ['auto'], 
    'min_samples_leaf': [400], 
    'class_weight': [None],
}

In [29]:
len(train_ids)

1019419

In [30]:
len(train_x)

1019419

In [31]:
len(valid_x)

155325

In [29]:
train_x.to_pickle('/code/data/processed/avante/03-train_x_with_notes.pickle')
valid_x.to_pickle('/code/data/processed/avante/03-valid_x_with_notes.pickle')
# train_ids.to_pickle('/code/data/processed/train_idens_with_notes.pickle')
# valid_ids.to_pickle('valid_idens_with_notes.pickle')

In [30]:
with open('/code/data/processed/train_y_with_notes.pickle', 'wb') as f:
    pickle.dump(train_y, f)

with open('/code/data/processed/valid_y_with_notes.pickle', 'wb') as f:
    pickle.dump(valid_y, f)

In [11]:
# with open('/code/data/processed/train_y_with_notes.pickle', 'rb') as f:
#     train_y = pickle.load(f)

# with open('/code/data/processed/valid_y_with_notes.pickle', 'rb') as f:
#     valid_y = pickle.load(f)

In [30]:
print('ready')

ready


In [31]:
mlflow.set_experiment('target_hosp_3_day_with_note_text')

In [32]:
train_y[np.isnan(train_y)] = 0
valid_y[np.isnan(valid_y)] = 0

In [33]:
for config in ParameterGrid(param_grid):
    print(f'Trying hyperparamters: {config}')
    
    with mlflow.start_run():
        feat_est = forest.RandomForestClassifier(
            n_estimators=config['n_estimators'],
            max_features=config['max_features'],
            min_samples_leaf=config['min_samples_leaf'],
            class_weight=config['class_weight'],
            n_jobs=-1,
            verbose=3
        )

        feat_selector = SelectFromModel(feat_est, threshold=config['feat_select_threshold'])
        train_x_new = feat_selector.fit_transform(train_x, train_y)

        clf = forest.RandomForestClassifier(
            n_estimators=config['n_estimators'],
            max_features=config['max_features'],
            min_samples_leaf=config['min_samples_leaf'],
            class_weight=config['class_weight'],
            n_jobs=-1,
            verbose=3
        )

        clf.fit(train_x_new, train_y)

        valid_x_new = feat_selector.transform(valid_x)

        train_preds = clf.predict_proba(train_x_new)
        valid_preds = clf.predict_proba(valid_x_new)

        for param in config:
            log_param(param, config[param])

        log_metric('train_aucroc', roc_auc_score(train_y, [pred[1] for pred in train_preds]))
        log_metric('train_ap', average_precision_score(train_y, [pred[1] for pred in train_preds]))
        log_metric('valid_aucroc', roc_auc_score(valid_y, [pred[1] for pred in valid_preds]))
        log_metric('valid_ap', average_precision_score(valid_y, [pred[1] for pred in valid_preds]))

        log_model(feat_selector, 'feat_selector')
        log_model(clf, "model")

        feature_selected_features = pd.DataFrame(zip(train_x.columns[feat_selector.get_support()], clf.feature_importances_), columns=['feature', 'rf_importance']).sort_values('rf_importance', ascending=False)
        feature_selected_features.to_csv('./feature_selected_features.csv', index=False)
        log_artifact('./feature_selected_features.csv')

        input_features = pd.DataFrame(train_x.columns, columns=['feature'])
        input_features.to_csv('./input_features.csv', index=False)
        log_artifact('./input_features.csv')
        
        with open('./na_filler.pickle','wb') as f: pickle.dump(na_filler, f, protocol=4)
        log_artifact('./na_filler.pickle')
        
        

Trying hyperparamters: {'class_weight': None, 'feat_select_threshold': '64*median', 'max_features': 'auto', 'min_samples_leaf': 400, 'n_estimators': 48}


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.


building tree 1 of 48building tree 2 of 48

building tree 3 of 48
building tree 4 of 48
building tree 5 of 48
building tree 6 of 48
building tree 7 of 48
building tree 8 of 48
building tree 9 of 48
building tree 10 of 48
building tree 11 of 48
building tree 12 of 48
building tree 13 of 48
building tree 14 of 48
building tree 15 of 48
building tree 16 of 48
building tree 17 of 48
building tree 18 of 48building tree 19 of 48

building tree 20 of 48building tree 21 of 48
building tree 22 of 48
building tree 23 of 48
building tree 24 of 48
building tree 25 of 48
building tree 26 of 48
building tree 27 of 48
building tree 28 of 48
building tree 29 of 48
building tree 30 of 48
building tree 31 of 48
building tree 32 of 48
building tree 33 of 48
building tree 34 of 48
building tree 35 of 48
building tree 36 of 48
building tree 37 of 48
building tree 38 of 48
building tree 39 of 48
building tree 40 of 48
building tree 41 of 48
building tree 42 of 48

building tree 43 of 48
building tree 44 of 

[Parallel(n_jobs=-1)]: Done   4 out of  48 | elapsed:   45.7s remaining:  8.4min
[Parallel(n_jobs=-1)]: Done  21 out of  48 | elapsed:   48.3s remaining:  1.0min
[Parallel(n_jobs=-1)]: Done  38 out of  48 | elapsed:   50.8s remaining:   13.4s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   54.3s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.


building tree 1 of 48
building tree 2 of 48
building tree 3 of 48
building tree 4 of 48
building tree 5 of 48
building tree 6 of 48
building tree 7 of 48
building tree 8 of 48
building tree 9 of 48
building tree 10 of 48
building tree 11 of 48
building tree 12 of 48
building tree 13 of 48
building tree 14 of 48
building tree 15 of 48
building tree 16 of 48
building tree 17 of 48
building tree 18 of 48
building tree 19 of 48
building tree 20 of 48
building tree 21 of 48
building tree 22 of 48
building tree 23 of 48
building tree 24 of 48
building tree 25 of 48
building tree 26 of 48
building tree 27 of 48
building tree 28 of 48
building tree 29 of 48
building tree 30 of 48
building tree 31 of 48
building tree 32 of 48
building tree 33 of 48
building tree 34 of 48
building tree 35 of 48
building tree 36 of 48
building tree 37 of 48
building tree 38 of 48
building tree 39 of 48
building tree 40 of 48building tree 41 of 48
building tree 42 of 48building tree 43 of 48


building tree 44 of 

[Parallel(n_jobs=-1)]: Done   4 out of  48 | elapsed:   45.5s remaining:  8.3min
[Parallel(n_jobs=-1)]: Done  21 out of  48 | elapsed:   49.0s remaining:  1.1min
[Parallel(n_jobs=-1)]: Done  38 out of  48 | elapsed:   50.7s remaining:   13.3s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   52.8s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done   4 out of  48 | elapsed:    0.5s remaining:    5.0s
[Parallel(n_jobs=48)]: Done  21 out of  48 | elapsed:    0.5s remaining:    0.6s
[Parallel(n_jobs=48)]: Done  38 out of  48 | elapsed:    0.5s remaining:    0.1s
[Parallel(n_jobs=48)]: Done  48 out of  48 | elapsed:    0.6s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done   4 out of  48 | elapsed:    0.1s remaining:    0.7s
[Parallel(n_jobs=48)]: Done  21 out of  48 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=48)]: Done  38 out of  48 | e

In [36]:
print('Done')

Done


In [None]:
feat_est = forest.RandomForestClassifier(
    n_estimators=1000,
    max_features='auto',
    min_samples_leaf=200,
    class_weight=None,
    n_jobs=-1
)

feat_selector = SelectFromModel(feat_est, threshold='32*median')
train_x_new = feat_selector.fit_transform(train_x, train_y)

clf = forest.RandomForestClassifier(
    n_estimators=1000,
    max_features='auto',
    min_samples_leaf=200,
    class_weight=None,
    n_jobs=-1
)

clf.fit(train_x_new, train_y)

valid_x_new = feat_selector.transform(valid_x)
valid_preds = clf.predict_proba(valid_x_new)

In [None]:
average_precision_score(valid_y, [pred[1] for pred in valid_preds])

In [None]:
valid_x_shap = pd.concat([valid_x.loc[:, feat_selector.get_support()].reset_index(drop=True), valid_idens.reset_index(drop=True)], axis=1)
valid_x_shap['preds'] = [pred[1] for pred in valid_preds]
valid_x_shap['target'] = valid_y

In [None]:
valid_x_shap = valid_x_shap[valid_x_shap.censusdate == pd.to_datetime('2019-02-20')]

In [None]:
sorted_valid = valid_x_shap.sort_values(by='preds', ascending=False)
sorted_valid_dr = sorted_valid.drop(columns=['preds', 'masterpatientid', 'censusdate'])
idens = sorted_valid.loc[:, ['masterpatientid', 'censusdate', 'preds', 'target']]

In [None]:
tt = sorted_valid_dr.head(10)

In [None]:
explainer = shap.TreeExplainer(clf)

In [None]:
shap_values = explainer.shap_values(tt)

In [None]:
shap.force_plot(explainer.expected_value[1], shap_values[1][0], tt.iloc[0])

In [None]:
out = []

for i in range(0,10):
    shaps = pd.DataFrame({'feature_name':tt.columns, 'shap_value': shap_values[1][i], 'feature_value': tt.iloc[i]}).sort_values(by='shap_value', ascending=False)
    shaps = shaps.head(n=10)
    shaps['masterpatientid'] = hash(str(idens.iloc[i].masterpatientid))
    shaps['censusdate'] = idens.iloc[i].censusdate
    shaps['prediction'] = idens.iloc[i].preds
    shaps['rehosped'] = idens.iloc[i].target
    out.append(shaps)
    

In [None]:
pd.concat(out).to_csv('/code/data/copd_model_2019-02-20.csv', index=False)

In [None]:
pd.DataFrame({'feature_name':tt.columns, 'shap_value': shap_values[1][0], 'feature_value': tt.iloc[0]}).sort_values(by='shap_value', ascending=False)