In [1]:
import pandas as pd
import pickle

In [2]:
with open('train_x_with_notes.pickle', 'rb') as f:
    train_x = pickle.load(f)

with open('train_idens_with_notes.pickle', 'rb') as f:
    train_idens = pickle.load(f)

In [3]:
print(train_idens['censusdate'].min(), train_idens['censusdate'].max())

2017-01-01 00:00:00 2018-04-30 00:00:00


In [4]:
with open('valid_x_with_notes.pickle', 'rb') as f:
    valid_x = pickle.load(f)

with open('valid_idens_with_notes.pickle', 'rb') as f:
    valid_idens = pickle.load(f)

In [5]:
print(valid_idens['censusdate'].min(), valid_idens['censusdate'].max())

2018-05-01 00:00:00 2018-08-19 00:00:00


In [6]:
notes = pd.read_pickle('data/processed/avante_notes.pickle')

In [16]:
print(notes['CreatedDate'].min(), notes['CreatedDate'].max())

2017-01-01 00:01:52.583000 2019-06-25 23:59:34.697000


In [13]:
import boto3
import pandas as pd
import numpy as np
from pathlib import Path
import json
import re
import pickle

from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL

from sklearn.ensemble import forest, gradient_boosting
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.feature_selection import SelectFromModel, SelectKBest

import mlflow
from mlflow import log_metric, log_param, log_artifact
from mlflow.sklearn import log_model
from sklearn.model_selection import ParameterGrid

In [20]:
rehosps = pd.read_parquet('/code/data/raw/patient_rehosps.parquet')

In [25]:
rehosps['censusdate'] = pd.DatetimeIndex(rehosps['dateoftransfer']).normalize()

In [33]:
combined = pd.read_parquet('/code/data/processed/combined.parquet')

In [40]:
target = combined[['censusdate', 'masterpatientid', 'patientid', 'facilityid', 'hosp_target_3_day_hosp']]

In [43]:
target['masterpatientid'] = target['masterpatientid'].apply(lambda x: f'avante_{x}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [44]:
train_y_df = train_idens.merge(target, on=['censusdate', 'masterpatientid'], how='inner')

In [47]:
valid_y_df = valid_idens.merge(target, on=['censusdate', 'masterpatientid'], how='inner')

In [48]:
len(valid_y_df) == len(valid_idens)

True

In [63]:
train_y_df.head()

Unnamed: 0,masterpatientid,censusdate,patientid,facilityid,hosp_target_3_day_hosp
0,avante_100054,2017-01-01,268105,9,
1,avante_100063,2017-01-01,267941,11,False
2,avante_100084,2017-01-01,268060,8,
3,avante_100373,2017-01-01,268634,15,
4,avante_1004,2017-01-01,1004,10,


In [62]:
train_x['facilityid'].astype(int).

0     9
1    11
2     8
3    15
4    10
5    11
6     4
7    11
8    12
9     3
Name: facilityid, dtype: int64

In [68]:
mlflow.set_experiment('target_hosp_3_day_with_note_text_new')

INFO: 'target_hosp_3_day_with_note_text_new' does not exist. Creating a new experiment


In [77]:
param_grid = {
    'n_estimators':[48],
    'feat_select_threshold': ['64*median'],
    'max_features': ['auto'], 
    'min_samples_leaf': [200], 
    'class_weight': [None],
}

In [104]:
train_y = train_y_df['hosp_target_3_day_hosp'].fillna(False)

In [105]:
valid_y = valid_y_df['hosp_target_3_day_hosp'].fillna(False)

In [107]:
for config in ParameterGrid(param_grid):
    print(f'Trying hyperparamters: {config}')
    
    with mlflow.start_run():
        feat_est = forest.RandomForestClassifier(
            n_estimators=config['n_estimators'],
            max_features=config['max_features'],
            min_samples_leaf=config['min_samples_leaf'],
            class_weight=config['class_weight'],
            n_jobs=-1,
            verbose=3
        )

        feat_selector = SelectFromModel(feat_est, threshold=config['feat_select_threshold'])
        train_x_new = feat_selector.fit_transform(train_x, train_y)

        clf = forest.RandomForestClassifier(
            n_estimators=config['n_estimators'],
            max_features=config['max_features'],
            min_samples_leaf=config['min_samples_leaf'],
            class_weight=config['class_weight'],
            n_jobs=-1,
            verbose=3
        )

        clf.fit(train_x_new, train_y.astype(int))

        valid_x_new = feat_selector.transform(valid_x)

        train_preds = clf.predict_proba(train_x_new)
        valid_preds = clf.predict_proba(valid_x_new)

        for param in config:
            log_param(param, config[param])

        log_metric('train_aucroc', roc_auc_score(train_y, [pred[1] for pred in train_preds]))
        log_metric('train_ap', average_precision_score(train_y, [pred[1] for pred in train_preds]))
        log_metric('valid_aucroc', roc_auc_score(valid_y, [pred[1] for pred in valid_preds]))
        log_metric('valid_ap', average_precision_score(valid_y, [pred[1] for pred in valid_preds]))

        log_model(feat_selector, 'feat_selector')
        log_model(clf, "model")

        feature_selected_features = pd.DataFrame(zip(train_x.columns[feat_selector.get_support()], clf.feature_importances_), columns=['feature', 'rf_importance']).sort_values('rf_importance', ascending=False)
        feature_selected_features.to_csv('./feature_selected_features.csv', index=False)
        log_artifact('./feature_selected_features.csv')

        input_features = pd.DataFrame(train_x.columns, columns=['feature'])
        input_features.to_csv('./input_features.csv', index=False)
        log_artifact('./input_features.csv')
        
        with open('./na_filler.pickle','wb') as f: pickle.dump(na_filler, f, protocol=4)
        log_artifact('./na_filler.pickle')
        
        print('valid_aucroc:', roc_auc_score(valid_y, [pred[1] for pred in valid_preds]))
        

Trying hyperparamters: {'class_weight': None, 'feat_select_threshold': '64*median', 'max_features': 'auto', 'min_samples_leaf': 200, 'n_estimators': 48}


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.


building tree 1 of 48
building tree 2 of 48
building tree 3 of 48
building tree 4 of 48
building tree 5 of 48
building tree 6 of 48
building tree 7 of 48
building tree 8 of 48
building tree 9 of 48
building tree 10 of 48
building tree 11 of 48
building tree 12 of 48
building tree 13 of 48
building tree 14 of 48
building tree 15 of 48
building tree 16 of 48
building tree 17 of 48building tree 18 of 48
building tree 19 of 48

building tree 20 of 48
building tree 21 of 48
building tree 22 of 48building tree 23 of 48
building tree 24 of 48

building tree 25 of 48building tree 26 of 48

building tree 27 of 48
building tree 28 of 48
building tree 29 of 48
building tree 30 of 48
building tree 31 of 48
building tree 32 of 48
building tree 33 of 48
building tree 34 of 48
building tree 35 of 48
building tree 36 of 48
building tree 37 of 48
building tree 38 of 48building tree 39 of 48
building tree 40 of 48
building tree 41 of 48
building tree 42 of 48
building tree 43 of 48
building tree 44 of 4

[Parallel(n_jobs=-1)]: Done   4 out of  48 | elapsed:   34.3s remaining:  6.3min
[Parallel(n_jobs=-1)]: Done  21 out of  48 | elapsed:   36.3s remaining:   46.7s
[Parallel(n_jobs=-1)]: Done  38 out of  48 | elapsed:   37.3s remaining:    9.8s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   39.4s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.


building tree 1 of 48
building tree 2 of 48
building tree 3 of 48
building tree 4 of 48
building tree 5 of 48
building tree 6 of 48
building tree 7 of 48
building tree 8 of 48
building tree 9 of 48
building tree 10 of 48
building tree 11 of 48
building tree 12 of 48
building tree 13 of 48
building tree 14 of 48
building tree 15 of 48building tree 16 of 48
building tree 17 of 48
building tree 18 of 48
building tree 19 of 48
building tree 20 of 48
building tree 21 of 48
building tree 22 of 48
building tree 23 of 48
building tree 24 of 48
building tree 25 of 48
building tree 26 of 48
building tree 27 of 48

building tree 28 of 48
building tree 29 of 48
building tree 30 of 48
building tree 31 of 48
building tree 32 of 48
building tree 33 of 48building tree 34 of 48

building tree 35 of 48
building tree 36 of 48
building tree 37 of 48
building tree 38 of 48
building tree 39 of 48
building tree 40 of 48
building tree 41 of 48
building tree 42 of 48
building tree 43 of 48
building tree 44 of 

[Parallel(n_jobs=-1)]: Done   4 out of  48 | elapsed:   33.7s remaining:  6.2min
[Parallel(n_jobs=-1)]: Done  21 out of  48 | elapsed:   36.5s remaining:   46.9s
[Parallel(n_jobs=-1)]: Done  38 out of  48 | elapsed:   37.8s remaining:   10.0s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   39.5s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done   4 out of  48 | elapsed:    0.4s remaining:    4.4s
[Parallel(n_jobs=48)]: Done  21 out of  48 | elapsed:    0.4s remaining:    0.6s
[Parallel(n_jobs=48)]: Done  38 out of  48 | elapsed:    0.5s remaining:    0.1s
[Parallel(n_jobs=48)]: Done  48 out of  48 | elapsed:    0.5s finished
[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done   4 out of  48 | elapsed:    0.1s remaining:    0.7s
[Parallel(n_jobs=48)]: Done  21 out of  48 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=48)]: Done  38 out of  48 | e

NameError: name 'na_filler' is not defined

In [108]:
print('valid_aucroc:', roc_auc_score(valid_y, [pred[1] for pred in valid_preds]))

valid_aucroc: 0.7873359645676679


In [96]:
train_y.shape

(960181,)

In [97]:
valid_x.shape

(141455, 2808)

In [98]:
train_x.shape

(960181, 2808)