In [1]:
# Implements the doubly robust (DR) off-policy value estimator for the learned policies, evaluated on the test set.

In [285]:
import pandas as pd
import numpy as np
import cPickle as pickle

In [287]:
# Load the training split from CSV and preview its first rows.
# NOTE(review): df_train is not referenced by any later cell shown in this notebook.
df_train = pd.read_csv('../data/rl_train_data_final_disc.csv')
df_train.head()

Unnamed: 0.1,Unnamed: 0,bloc,icustayid,state,reward,max_dose_vaso,iv_tev_in,mortality
0,0,1,3,1128,0,0.0,4.0,0
1,1,2,3,1128,0,0.0,4.0,0
2,2,3,3,1172,0,0.0,2.0,0
3,3,4,3,1172,0,0.0,2.0,0
4,4,5,3,464,0,0.0,2.0,0


In [None]:
# Load the validation split from CSV.
# NOTE(review): this cell has no execution count (it was never run in the saved session)
# and df_val is not referenced by any later cell shown in this notebook.
df_val = pd.read_csv('../data/rl_val_data_final_disc.csv')

In [288]:
# Load the test split from CSV; every value estimate computed later in this notebook
# is derived from this frame.
df_test = pd.read_csv('../data/rl_test_data_final_disc.csv')

In [289]:
# Load the physician policies for the val and test sets.
# Each file is opened in a `with` block so its handle is closed as soon as the
# pickle has been read (the bare open(...) call left the handle open).
# NOTE(review): pickle.load can execute arbitrary code stored in the file - only
# load policy files that were produced by this project.
with open("val_policy", "rb") as policy_file:
    phys_policy_val = pickle.load(policy_file)
with open("test_policy", "rb") as policy_file:
    phys_policy_test = pickle.load(policy_file)

In [291]:
# Get, for each row of the test policy matrix, the column index to read from that row:
# the integer value stored in column 4, offset by 5.
# NOTE(review): presumably column 4 is the action the physician took and columns 5+ hold
# the per-action probabilities - confirm against the code that wrote `test_policy`.
indices = phys_policy_test[:,4].astype(int) + 5
# Select one entry per row: element (i, indices[i]) for every row i.
phys_action_probs = phys_policy_test[range(len(phys_policy_test)),indices]

In [292]:
# The two cells below load the actions and Q-values associated with the agent policy. The actions are
# output by the neural network used to learn the optimal policy; the Q-values are output by eval_policy.

In [334]:
# Load the agent's actions on the test set. The `with` block closes the file handle
# once the pickle has been read (the bare open(...) call left it open).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('../continuous/dqn_normal/dqn_normal_actions_test.p', "rb") as actions_file:
    agent_actions = pickle.load(actions_file)

In [335]:
# Load the Q-values associated with the agent policy on the test set. The `with`
# block closes the file handle once the pickle has been read (the bare open(...)
# call left it open).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('./eval_policy/policy_q_test', "rb") as q_file:
    agent_q = pickle.load(q_file)

In [336]:
# add the actions and q values associated with agent performance on the test set to the dataframe

In [337]:
# Attach the agent's action to every test row (mutates df_test in place).
# NOTE(review): assumes agent_actions has one entry per row of df_test, in the same
# row order as the CSV - confirm against the script that wrote the pickle.
df_test['agent_action'] = np.array(agent_actions)

In [338]:
# Attach the agent's Q-value to every test row (mutates df_test in place).
# NOTE(review): assumes agent_q has one entry per row of df_test, in the same
# row order as the CSV - confirm against eval_policy.
df_test['agent_q'] = np.array(agent_q)

In [339]:
# Attach the per-row value selected from the physician policy matrix; it is used as
# the denominator of the importance weight in the value-estimation loop.
df_test['phys_prob'] = phys_action_probs

In [341]:
# Map every (iv, vaso) pair with both components in 0..4 to a single action id.
# Pairs are numbered in row-major order (iv is the slow index), so the id is
# 5 * iv + vaso and the ids run from 0 to 24.
action_map = {(iv, vaso): 5 * iv + vaso for iv in range(5) for vaso in range(5)}
# Total number of actions enumerated above.
count = len(action_map)

In [344]:
# Distinct ICU stay ids in the test set; the loops below treat all rows sharing an id
# as one trajectory.
unique_ids = df_test['icustayid'].unique()

In [2]:
def doubly_robust_value(trajectory, action_map, gamma):
    """Doubly robust value estimate for a single trajectory.

    Runs the backward recursion

        V_t = Q_t + rho_t * (r_t + gamma * V_{t+1} - Q_t)

    from the last row of `trajectory` to the first, starting from V = 0 after the
    last row. rho_t is the importance sampling factor: 1 / phys_prob when the
    agent's action equals the physician's action on that row, and 0 otherwise.

    Parameters
    ----------
    trajectory : pandas.DataFrame
        Rows of one ICU stay with columns 'iv_tev_in', 'max_dose_vaso',
        'phys_prob', 'agent_action', 'agent_q' and 'reward'. Rows are processed
        last-to-first, so they are assumed to be in time order.
    action_map : dict
        Maps an (iv, vaso) pair to the physician's action id.
    gamma : float
        Discount factor.

    Returns
    -------
    The value estimate at the first row of the trajectory (0 if it has no rows).

    Raises
    ------
    KeyError
        If a row's (iv, vaso) pair is not a key of `action_map`.
    """
    # Read each column once as an array instead of doing a labelled lookup per cell
    # (the original used the deprecated `.ix` indexer for every value). Array
    # elements are numpy scalars, so a zero phys_prob yields inf rather than raising.
    iv_bins = trajectory['iv_tev_in'].values
    vaso_bins = trajectory['max_dose_vaso'].values
    phys_probs = trajectory['phys_prob'].values
    agent_acts = trajectory['agent_action'].values
    agent_qs = trajectory['agent_q'].values
    rewards = trajectory['reward'].values

    cur_val = 0
    # Walk the trajectory backwards so cur_val always holds the estimate for step t+1.
    for step in range(len(trajectory) - 1, -1, -1):
        phys_action = action_map[(iv_bins[step], vaso_bins[step])]

        # rho - the importance sampling factor
        if agent_acts[step] == phys_action:
            rho = 1.0 / phys_probs[step]
        else:
            rho = 0

        agent_val = agent_qs[step]
        cur_val = agent_val + rho * (rewards[step] + gamma * cur_val - agent_val)
    return cur_val


values = [] # contains the value estimates for each trajectory
# This discount factor (gamma) should be the same as that used to find Q(s,a) for the physician policy (sarsa_physician)
# and in eval_policy
gamma = 1
# groupby(sort=False) yields one group per icustayid in order of first appearance, so
# `values` lines up with df_test['icustayid'].unique() without rescanning the whole
# frame for every id.
for uid, trajectory in df_test.groupby('icustayid', sort=False):
    # TODO: sort out the problem when some of the vals are way too high - maybe exclude
    # these trajectories from the evaluation?
    values.append(doubly_robust_value(trajectory, action_map, gamma))

In [346]:
# Number of value estimates collected (one is appended per trajectory).
len(values)

3580

In [347]:
# TODO - this is a hack: trajectories whose estimate falls outside [-15, 15] are
# discarded because a very large rho produced an extreme value. Think of a better way
# of doing this - perhaps by restricting the actions that the agent can take at any
# timestep?
# NOTE(review): after this filter the mean is taken over fewer trajectories than the
# physician-policy mean computed at the end of the notebook.
values = pd.Series(values)
values = values[values.between(-15, 15)]

In [348]:
# Number of value estimates remaining after the [-15, 15] filter.
len(values)

3443

In [349]:
# Mean doubly robust value estimate over the trajectories kept by the filter.
np.mean(values)

9.7414900121408685

In [312]:
# Expected return under the physician policy: the mean, over test trajectories, of the
# discounted sum of observed rewards, sum_t gamma**t * r_t.
# `gamma` is the discount factor defined alongside the doubly robust estimator, so both
# estimates use the same discounting; with gamma = 1 this is the plain sum of rewards.
# groupby(sort=False) yields one group per icustayid in order of first appearance,
# avoiding a scan of the whole frame for every id.
phys_vals = []
for uid, traj in df_test.groupby('icustayid', sort=False):
    phys_vals.append(sum(gamma ** t * r for t, r in enumerate(traj['reward'])))
np.mean(phys_vals)

11.170391061452515