In [1]:
# Learn an estimate of the reward function, i.e. Rhat = f(s, a)

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import os
import cPickle as pickle

In [3]:
# Restrict CUDA-backed libraries in this process to the GPU with index 1.
# NOTE(review): presumably this has to run before TensorFlow first initialises the GPU — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [4]:
# Training split of the RL dataset; the `reward` column read here drives `preproc` below.
train_data = pd.read_csv('../data/rl_train_data_final_cont.csv')

In [5]:
# Validation split of the RL dataset (same file naming scheme as the training split).
val_data = pd.read_csv('../data/rl_val_data_final_cont.csv')

In [6]:
# Test split of the RL dataset (same file naming scheme as the training split).
test_data = pd.read_csv('../data/rl_test_data_final_cont.csv')

In [7]:
# Preview the first rows to sanity-check the loaded columns.
train_data.head()

Unnamed: 0,bloc,icustayid,charttime,gender,age,elixhauser,re_admission,died_in_hosp,mortality_90d,Weight_kg,...,median_dose_vaso,max_dose_vaso,input_total_tev,input_4hourly_tev,output_total,output_4hourly,cumulated_balance_tev,vaso_input,iv_input,reward
0,0.0,3,7245052800,0.0,0.412568,0.0,0.0,0,1,0.262712,...,0.0,0.0,0.797351,0.939195,0.589916,0.750908,0.5545,0.0,4.0,0.125
1,0.22256,3,7245067200,0.0,0.412568,0.0,0.0,0,1,0.262712,...,0.0,0.0,0.83178,0.934543,0.674384,0.819589,0.580033,0.0,4.0,0.657321
2,0.356608,3,7245081600,0.0,0.412568,0.0,0.0,0,1,0.262712,...,0.0,0.0,0.833222,0.656575,0.765423,0.939329,0.555033,0.0,2.0,1.367788
3,0.452837,3,7245096000,0.0,0.412568,0.0,0.0,0,1,0.262712,...,0.0,0.0,0.834033,0.603831,0.783597,0.847073,0.5457,0.0,2.0,1.199099
4,0.527957,3,7245110400,0.0,0.412568,0.0,0.0,0,1,0.262712,...,0.0,0.0,0.834836,0.603831,0.794059,0.811583,0.539533,0.0,2.0,1.057596


In [63]:
def preproc(df, feature_names=None, terminal_threshold=10, reward_scale=15.0):
    """Build the (features, labels) training set for the terminal-outcome classifier.

    Only rows whose absolute reward is at least ``terminal_threshold`` are kept
    (these are the rows that `reward_estimator` later treats as end-of-stay
    rewards). Their rewards are divided by ``reward_scale`` and negative values
    are clamped to 0, so with the defaults a reward of +15 becomes label 1 and
    a reward of -15 becomes label 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``reward`` column, ``iv_input``, ``vaso_input`` and
        every column named in ``feature_names``. It is not modified.
    feature_names : sequence of str, optional
        State feature columns. When omitted they are read from
        ``'../data/state_features.txt'`` (the original behaviour).
    terminal_threshold : number, optional
        Minimum absolute reward for a row to be kept (default 10).
    reward_scale : float, optional
        Divisor applied to the kept rewards (default 15.0).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``features`` with the state features followed by ``iv_input`` and
        ``vaso_input`` (in that order), and the matching ``labels``.
    """
    if feature_names is None:
        # atleast_1d keeps a one-line feature file from collapsing to a 0-d
        # array, which list() cannot iterate.
        feature_names = np.atleast_1d(
            np.loadtxt('../data/state_features.txt', dtype=str))
    keep_arr = list(feature_names) + ['iv_input', 'vaso_input']

    # Boolean filtering already yields a new frame, so the caller's frame is
    # left untouched without copying it up front.
    terminal_rows = df[np.abs(df['reward']) >= terminal_threshold]

    features = terminal_rows[keep_arr].values
    # The division allocates a fresh array, so the clamp below cannot write
    # back into the DataFrame.
    labels = terminal_rows['reward'].values / reward_scale
    labels[labels < 0] = 0
    return features, labels

In [64]:
def batch_sample(batch_size, features, labels):
    """Draw a random minibatch of distinct rows.

    Picks ``batch_size`` row positions uniformly without replacement (using
    NumPy's global random state) and returns the stacked feature rows together
    with the labels at the same positions.
    """
    candidates = np.arange(len(features))
    # replace=False: no row may appear twice within one batch
    idx = np.random.choice(candidates, size=batch_size, replace=False)
    batch_features = np.vstack(features[idx])
    batch_labels = labels[idx]
    return (batch_features, batch_labels)

In [65]:
# Run the same preprocessing over the train, validation and test splits (in that order)
(train_feat, train_labels), (val_feat, val_labels), (test_feat, test_labels) = (
    preproc(split) for split in (train_data, val_data, test_data)
)

In [66]:
from sklearn.linear_model import LogisticRegression
# Fit the outcome classifier on the training rows produced by `preproc`.
# `fit` hands back the estimator, so the fitted model can be bound in one step.
clf = LogisticRegression().fit(train_feat, train_labels)
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [67]:
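# For each state: if it is not terminal, compute the reward deterministically from the SOFA and lactate changes.
# If it is terminal, use clf to predict the outcome; the estimated reward is +15 when the predicted outcome is 1 and -15 when it is 0.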


In [91]:
# Names of the state features, in file order; used below to look up the column
# position of a named feature inside the estimated next-state arrays.
feat_names = list(np.loadtxt('../data/state_features.txt', dtype=str))

In [92]:
envmodel_save_dir = './env_model_regression/'

# Load the estimated next states for the validation and test splits.
# Each file is opened in a `with` block so its handle is closed right after
# loading instead of being left to the garbage collector.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
with open(envmodel_save_dir + "est_next_states_val.p", "rb") as f:
    est_state_val = pickle.load(f)
with open(envmodel_save_dir + "est_next_states_test.p", "rb") as f:
    est_state_test = pickle.load(f)

In [93]:
# Stack the loaded estimates into single 2-D arrays, one per split
est_state_val, est_state_test = (
    np.vstack(chunks) for chunks in (est_state_val, est_state_test)
)

In [94]:
# Pull the lactate and SOFA columns out of the estimated next states and
# attach them to the matching dataframe.
lact_index = feat_names.index('Arterial_lactate')
sofa_index = feat_names.index('SOFA')

for frame, est_next in ((val_data, est_state_val), (test_data, est_state_test)):
    frame['est_lactate'] = est_next[:, lact_index]
    frame['est_sofa'] = est_next[:, sofa_index]

In [95]:
# Recompute the scaling statistics needed to undo the scaling on the next-state
# prediction, so the reward can be calculated in the original units.
# The statistics below describe a z-score followed by a min-max rescale, so:
#   norm     = min_max_normalised * (max_norm - min_norm) + min_norm
#   unscaled = std_dev * norm + mean

df_orig = pd.read_csv('../data/MKdataset07Feb17.csv')
# Only stays that appear in the training split contribute to the statistics
df_orig_train = df_orig.loc[df_orig['icustayid'].isin(train_data['icustayid'])]

sofa_mean = df_orig_train['SOFA'].mean()
sofa_std = df_orig_train['SOFA'].std()
lact_mean = df_orig_train['Arterial_lactate'].mean()
lact_std = df_orig_train['Arterial_lactate'].std()

norm_sofa = (df_orig_train['SOFA'] - sofa_mean)/sofa_std
norm_lact = (df_orig_train['Arterial_lactate']-lact_mean)/lact_std

# Series.min()/max() skip NaN just like mean()/std() above; the builtin
# min()/max() give order-dependent results when a NaN is present and iterate
# the Series element by element.
min_norm_sofa = norm_sofa.min()
max_norm_sofa = norm_sofa.max()

min_norm_lact = norm_lact.min()
max_norm_lact = norm_lact.max()

min_max_norm_sofa = (norm_sofa - min_norm_sofa)/(max_norm_sofa-min_norm_sofa)
min_max_norm_lact = (norm_lact - min_norm_lact)/(max_norm_lact-min_norm_lact)

In [96]:
# Load the agent actions for the test and validation splits; they are added to
# the corresponding dataframes further down.
# `with` closes each file as soon as it has been read.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
with open('../continuous/dqn_normal/dqn_normal_actions_test.p', "rb") as f:
    agent_actions_test = pickle.load(f)
with open('../continuous/dqn_normal/dqn_normal_actions_val.p', "rb") as f:
    agent_actions_val = pickle.load(f)

In [97]:
# Map each flat action id (0..24) back to its [iv, vaso] pair.
# The dict must be created on its own line: when the assignment is fused into
# the comment above it, the loop below fails with a NameError.
inv_action_map = {}
count = 0
for i in range(5):
    for j in range(5):
        inv_action_map[count] = [i,j]
        count += 1

In [98]:
# Translate every flat agent action id into its [iv, vaso] pair
val_actions_arr = np.array(
    [inv_action_map[action_id] for action_id in agent_actions_val])
test_actions_arr = np.array(
    [inv_action_map[action_id] for action_id in agent_actions_test])

In [99]:
# Column 0 of each action array is the iv component, column 1 the vaso component
for frame, actions in ((val_data, val_actions_arr), (test_data, test_actions_arr)):
    frame['agent_iv'] = actions[:, 0]
    frame['agent_vaso'] = actions[:, 1]

In [119]:
# Reward estimator configuration.
# The three coefficients are taken from preproc/new_rewards:
c0 = -0.1 / 4    # added when SOFA is unchanged and non-zero
c1 = -0.5 / 4    # weight on the SOFA change
c2 = -2          # weight on tanh(lactate change)

# Classifier inputs: the state features followed by the agent's iv and vaso actions
clf_features = list(np.loadtxt('../data/state_features.txt', dtype=str)) + [
    'agent_iv',
    'agent_vaso',
]

def reward_estimator(df):
    """Estimate one reward per row of ``df``.

    A row is terminal when it is the last row of ``df`` or when the next row
    has a different ``icustayid``. Terminal rows get -15 if ``clf`` predicts
    outcome 0 for them and +15 otherwise. All other rows get
    ``c1 * (sofa_next - sofa_now) + c2 * tanh(lact_next - lact_now)``, plus
    ``c0`` when the SOFA score is unchanged and non-zero, computed on unscaled
    values.

    Relies on the module-level names ``clf``, ``clf_features``, ``c0``,
    ``c1``, ``c2`` and the SOFA/lactate scaling statistics (``*_mean``,
    ``*_std``, ``min_norm_*``, ``max_norm_*``).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``icustayid``, ``SOFA``, ``est_sofa``, ``Arterial_lactate``,
        ``est_lactate`` and every column in ``clf_features``. Rows of one stay
        must be contiguous. The four ``unscaled_*`` columns are written to
        ``df`` in place.

    Returns
    -------
    numpy.ndarray
        Float array of estimated rewards, aligned with the rows of ``df``.
    """
    def _unscale(column, std, norm_min, norm_max, mean):
        # Invert the min-max rescale, then the z-score. Working on .values
        # keeps this positional, so duplicate index labels are harmless.
        return std * (df[column].values * (norm_max - norm_min) + norm_min) + mean

    df['unscaled_sofa_now'] = _unscale('SOFA', sofa_std, min_norm_sofa, max_norm_sofa, sofa_mean)
    df['unscaled_sofa_next'] = _unscale('est_sofa', sofa_std, min_norm_sofa, max_norm_sofa, sofa_mean)

    df['unscaled_lact_now'] = _unscale('Arterial_lactate', lact_std, min_norm_lact, max_norm_lact, lact_mean)
    df['unscaled_lact_next'] = _unscale('est_lactate', lact_std, min_norm_lact, max_norm_lact, lact_mean)

    sofa_now = df['unscaled_sofa_now'].values
    sofa_next = df['unscaled_sofa_next'].values
    lact_now = df['unscaled_lact_now'].values
    lact_next = df['unscaled_lact_next'].values

    # Terminal = last row overall, or the stay id changes on the next row.
    stay_ids = df['icustayid'].values
    is_terminal = np.ones(len(df), dtype=bool)
    is_terminal[:-1] = stay_ids[:-1] != stay_ids[1:]

    # Intermediate rewards for every row at once; the terms are added in the
    # same order as the former per-row loop (c0 term, SOFA term, lactate term).
    sofa_unchanged = (sofa_next == sofa_now) & (sofa_next != 0)
    rewards = np.where(sofa_unchanged, c0, 0.0)
    rewards = rewards + c1 * (sofa_next - sofa_now)
    rewards = rewards + c2 * np.tanh(lact_next - lact_now)

    # Overwrite the terminal rows using a single batched classifier call
    # instead of one predict() per row.
    if is_terminal.any():
        terminal_feats = df[clf_features].values[is_terminal]
        est_outcome = np.asarray(clf.predict(terminal_feats)).ravel()
        rewards[is_terminal] = np.where(est_outcome == 0, -15.0, 15.0)

    return rewards

In [120]:
# Estimate rewards for the validation split first, then the test split
val_rewards, test_rewards = (
    reward_estimator(frame) for frame in (val_data, test_data)
)

In [121]:
# Persist the estimated validation rewards next to the notebook
with open(r"val_rewards.p", "wb") as val_out:
    pickle.dump(val_rewards, val_out)

In [122]:
# Persist the estimated test rewards next to the notebook
with open(r"test_rewards.p", "wb") as test_out:
    pickle.dump(test_rewards, test_out)