In [None]:
# Learn the physician (clinician) policy, i.e. pi(a|s): the probability of each action a given the state s.

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import os
import cPickle as pickle

In [3]:
# Sets CUDA_VISIBLE_DEVICES to '0' for this process.
# NOTE(review): presumably this limits TensorFlow to the first GPU; confirm it runs
# before any TensorFlow session is created on the target machine.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [4]:
# Training split, read from CSV into a DataFrame (path is relative to the notebook's working directory).
train_data = pd.read_csv('../data/rl_train_data_final_cont.csv')

In [5]:
# Validation split, read from CSV into a DataFrame (path is relative to the notebook's working directory).
val_data = pd.read_csv('../data/rl_val_data_final_cont.csv')

In [6]:
# Test split, read from CSV into a DataFrame (path is relative to the notebook's working directory).
test_data = pd.read_csv('../data/rl_test_data_final_cont.csv')

In [7]:
def preproc(df_in, iv_bins = 5, vaso_bins = 5, features_path = '../data/state_features.txt'):
    """Extract the state features and one-hot action labels from a dataframe.

    The action index of a row is ``iv_bins * iv_input + vaso_input``. Labels are
    one-hot encoded over the full ``iv_bins * vaso_bins`` action space, so every
    split (train / val / test) gets the same number of label columns and column
    k always means action k, even when some actions never occur in that split.

    Args:
        df_in: DataFrame holding 'iv_input', 'vaso_input' and the state-feature
            columns. It is not modified.
        iv_bins: multiplier applied to 'iv_input' when building the action index.
        vaso_bins: number of distinct 'vaso_input' values; together with
            ``iv_bins`` it fixes the width of the one-hot labels.
        features_path: text file listing the names of the columns to keep as
            state features, one name per line.

    Returns:
        (features, labels): ``features`` is the 2-D array of the selected
        columns; ``labels`` is a uint8 array of shape
        (len(df_in), iv_bins * vaso_bins).

    Raises:
        ValueError: if a computed action index falls outside the action space.
    """
    actions_raw = df_in[['iv_input', 'vaso_input']].values
    # atleast_1d: loadtxt returns a 0-d array when the file lists a single feature.
    keep_arr = np.atleast_1d(np.loadtxt(features_path, dtype=str))
    features = df_in[list(keep_arr)].values

    actions_proc = (iv_bins*actions_raw[:, 0] + actions_raw[:, 1]).astype(int)
    num_classes = iv_bins*vaso_bins
    if len(actions_proc) and (actions_proc.min() < 0 or actions_proc.max() >= num_classes):
        raise ValueError('action index outside [0, %d)' % num_classes)

    # Fixed-width one-hot encoding; unlike pd.get_dummies this does not depend
    # on which actions happen to be present in this particular split.
    labels = np.zeros((len(actions_proc), num_classes), dtype=np.uint8)
    labels[np.arange(len(actions_proc)), actions_proc] = 1
    return features, labels

In [8]:
def batch_sample(batch_size, features, labels):
    """Draw a random minibatch of matching feature / label rows.

    Row positions are drawn with np.random.choice (default settings, so the
    same row may be picked more than once). Returns a (features, labels)
    tuple, each stacked with np.vstack.
    """
    n_rows = len(features)
    picked = np.random.choice(n_rows, batch_size)
    feat_batch = np.vstack(features[picked])
    label_batch = np.vstack(labels[picked])
    return (feat_batch, label_batch)

In [9]:
# Run the three splits through the same preprocessing, in train / val / test order.
(train_feat, train_labels), (val_feat, val_labels), (test_feat, test_labels) = map(
    preproc, (train_data, val_data, test_data))

In [10]:
# Training hyper-parameters.
num_steps = 35000   # number of minibatch updates performed by train()
batch_size = 64     # rows per minibatch (also used as the evaluation chunk size)

# Dimensions of the network's input and output.
num_actions = 25
feature_length = train_feat[0].shape[0]   # width of one state-feature row

In [11]:
# TODO: reduce network size
class PolicyModel():
    """Feed-forward classifier mapping a state-feature vector to action logits.

    Builds, in the default TensorFlow graph, two 64-unit ReLU layers (each
    followed by batch normalisation), a linear layer with ``num_actions``
    outputs, a softmax over those outputs, and the loss / accuracy / training
    ops. Reads the module-level ``feature_length`` and ``num_actions``.
    """
    def __init__(self):
        # Inputs: a batch of feature rows, the matching label rows (width
        # num_actions), and a boolean passed to batch_norm as `is_training`.
        self.input_feat = tf.placeholder(tf.float32, shape = [None, feature_length])
        self.labels = tf.placeholder(tf.float32, shape = [None, num_actions])
        self.phase = tf.placeholder(tf.bool)
        
        self.fc_1 = tf.contrib.layers.fully_connected(self.input_feat, 64, activation_fn=tf.nn.relu)
        self.bn_1 = tf.contrib.layers.batch_norm(self.fc_1, center=True, scale=True, is_training=self.phase)
        # Larger intermediate layers, currently disabled:
#         self.fc_2 = tf.contrib.layers.fully_connected(self.bn_1 , 256, activation_fn=tf.nn.relu)    
#         self.bn_2 = tf.contrib.layers.batch_norm(self.fc_2, center=True, scale=True, is_training=self.phase)
#         self.fc_3 = tf.contrib.layers.fully_connected(self.bn_2 , 128, activation_fn=tf.nn.relu)
#         self.bn_3 = tf.contrib.layers.batch_norm(self.fc_3, center=True, scale=True, is_training=self.phase)
        self.fc_4 = tf.contrib.layers.fully_connected(self.bn_1 , 64, activation_fn=tf.nn.relu)
        self.bn_4 = tf.contrib.layers.batch_norm(self.fc_4, center=True, scale=True, is_training=self.phase)
        
        # Unnormalised scores per action, and their softmax.
        self.logits = tf.contrib.layers.fully_connected(self.bn_4 , num_actions, activation_fn=None)
        self.output = tf.nn.softmax(self.logits)
        # NOTE(review): no regularizer is passed to any layer in this class, so
        # this collection looks empty and the reg_constant term below would add
        # nothing to the loss -- confirm whether regularisation was intended.
        self.reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        self.reg_constant = 0.1 
        
        # Fraction of rows whose arg-max prediction equals the arg-max label.
        self.accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(self.labels, 1), tf.argmax(self.output, 1)),'float32'))
        # Mean softmax cross-entropy over the batch plus the weighted regularisation sum.
        self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.logits, labels = self.labels)) + self.reg_constant*sum(self.reg_losses)

        
        # Make the training step depend on the ops registered under UPDATE_OPS
        # (presumably the batch-norm moving-average updates -- confirm).
        self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(self.update_ops):
            # Adam with its default hyper-parameters (no arguments are passed).
            self.train_step = tf.train.AdamOptimizer().minimize(self.loss)

In [12]:
def get_policy(dataset,sess, mdl):
    """Evaluate the model on one split and return its predicted policy.

    The split is processed in consecutive chunks of ``batch_size`` rows with
    the model's ``phase`` input set to False. Every row is evaluated, including
    the rows of a final, shorter chunk.

    Args:
        dataset: 'train', 'val' or 'test'; selects the module-level feature /
            label arrays to evaluate.
        sess: session used to run the model's ops.
        mdl: object exposing ``input_feat``, ``labels``, ``phase``, ``output``,
            ``accuracy`` and ``loss``.

    Returns:
        (op, final_acc, total_loss):
        ``op`` has shape (rows, num_actions) and holds the model's softmax
        output for each row, i.e. the probability of each action in that state;
        ``final_acc`` is the accuracy over all rows (per-chunk accuracies
        weighted by chunk size; 0.0 for an empty split);
        ``total_loss`` is the sum of the per-chunk mean losses.

    Raises:
        ValueError: if ``dataset`` is not one of the three known names.
    """
    if dataset == 'train':
        features, labels = train_feat,train_labels
    elif dataset == 'val':
        features, labels = val_feat,val_labels
    elif dataset == 'test':
        features, labels = test_feat,test_labels
    else:
        raise ValueError("dataset must be 'train', 'val' or 'test', got %r" % (dataset,))

    n_rows = len(features)
    op = np.zeros((n_rows, num_actions))
    weighted_acc = 0.0
    total_loss = 0
    for j in range(0, n_rows, batch_size):
        # Plain slicing already yields the shorter final chunk; slicing to -1
        # would silently drop the last row of the split.
        feat = features[j:j+batch_size]
        lbls = labels[j:j+batch_size]
        feat = feat.reshape(len(feat), feature_length)
        lbls = lbls.reshape(len(lbls), num_actions)
        if j%10000 == 0: print('Processing val set indx: ', j )
        softmax, accuracy, loss = sess.run(
            [mdl.output, mdl.accuracy, mdl.loss],
            feed_dict={mdl.input_feat : feat, mdl.labels: lbls, mdl.phase: False})
        op[j:j+len(feat)] = softmax
        # Weight by chunk size so a short final chunk does not skew the mean.
        weighted_acc += accuracy*len(feat)
        total_loss += loss
    final_acc = weighted_acc/n_rows if n_rows else 0.0
    return op, final_acc, total_loss
    

In [13]:
def train():
    """Train a fresh PolicyModel and return its val / test policies.

    Resets the default graph, builds the model, then runs ``num_steps``
    minibatch updates on the training split. The running mean loss and
    accuracy are printed every 1000 steps and the validation split is
    evaluated every 5000 steps (including step 0, before any update has had an
    effect on the reported numbers beyond the first step).

    Returns:
        (val_policy, test_policy): the arrays returned by ``get_policy`` for
        the validation and test splits after training.
    """
    tf.reset_default_graph()
    mdl = PolicyModel()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # Grow GPU memory use on demand rather than reserving it all up front
    config.allow_soft_placement = True  # Let TF fall back to an available device if the requested one is missing
    init = tf.global_variables_initializer()
    with tf.Session(config=config) as sess:
        sess.run(init)
        net_loss = 0.0
        net_accuracy = 0.0
        window_steps = 0  # number of steps accumulated since the last report
        print('Starting training!')
        for i in range(num_steps):
            feat, labels = batch_sample(batch_size, train_feat, train_labels)

            _, loss, accuracy = sess.run(
                [mdl.train_step, mdl.loss, mdl.accuracy],
                feed_dict={mdl.input_feat : feat, mdl.labels: labels, mdl.phase: True})

            net_loss += loss
            net_accuracy += accuracy
            window_steps += 1
            if i % 1000 == 0 and i > 0:
                # Divide by the real number of accumulated steps: the first
                # window covers steps 0..1000, i.e. 1001 steps, not 1000.
                av_loss = net_loss/float(window_steps)
                av_accuracy = net_accuracy/float(window_steps)
                print("Step: ", i, "Average loss is: ", av_loss, "Average accuracy is: ", av_accuracy)
                net_loss = 0.0
                net_accuracy = 0.0
                window_steps = 0

            if i % 5000 == 0:
                print("Test on validation set")
                _, val_acc, val_loss = get_policy('val', sess, mdl)
                print('Val set accuracy, loss: ', val_acc, val_loss)

        print("Finished, getting final accuracy")
        val_policy, val_acc, val_loss = get_policy('val', sess, mdl)
        test_policy, _, _ = get_policy('test',sess, mdl)
    print('Val set accuracy, loss: ', val_acc, val_loss)
    return val_policy, test_policy


In [14]:
# Train the policy network; keep the per-row action distributions for the validation and test splits.
val_policy, test_policy = train()

Starting training!
Test on validation set
('Processing val set indx: ', 0)
('Val set accuracy, loss: ', 0.013116776315789473, 1259.7755255699158)
('Step: ', 1000, 'Average loss is: ', 2.0802375934123991, 'Average accuracy is: ', 0.31106250000000002)
('Step: ', 2000, 'Average loss is: ', 1.7987357392311096, 'Average accuracy is: ', 0.33792187499999998)
('Step: ', 3000, 'Average loss is: ', 1.7654804812669753, 'Average accuracy is: ', 0.35026562500000002)
('Step: ', 4000, 'Average loss is: ', 1.7371502479314804, 'Average accuracy is: ', 0.356375)
('Step: ', 5000, 'Average loss is: ', 1.728486176609993, 'Average accuracy is: ', 0.35635937499999998)
Test on validation set
('Processing val set indx: ', 0)
('Val set accuracy, loss: ', 0.33115808823587078, 687.07190573215485)
('Step: ', 6000, 'Average loss is: ', 1.7163822487592697, 'Average accuracy is: ', 0.35684375000000002)
('Step: ', 7000, 'Average loss is: ', 1.6976348477602006, 'Average accuracy is: ', 0.36159374999999999)
('Step: ', 8

In [17]:
#  Save the learned policy as a numpy array whose columns are: icustayid, bloc, iv input, vaso input,
#  action index (one of 25), and the probability distribution over actions (25 columns).

In [18]:
# Identifier / raw-action columns, then the combined action index, then one probability column per action.
v_data = val_data[['icustayid', 'bloc', 'iv_input', 'vaso_input']].values
val_actions = (val_data['iv_input'].values*5 + val_data['vaso_input'].values).astype(int)
val_pickle = np.column_stack((v_data, val_actions, val_policy))

In [19]:
# Identifier / raw-action columns, then the combined action index, then one probability column per action.
t_data = test_data[['icustayid', 'bloc', 'iv_input', 'vaso_input']].values
test_actions = (test_data['iv_input'].values*5 + test_data['vaso_input'].values).astype(int)
test_pickle = np.column_stack((t_data, test_actions, test_policy))

In [22]:
# Write the validation-set policy array to val_policy.p (binary pickle, relative to the working directory).
with open(r"val_policy.p", "wb") as f:
    pickle.dump(val_pickle, f)

In [23]:
# Write the test-set policy array to test_policy.p (binary pickle, relative to the working directory).
with open(r"test_policy.p", "wb") as f:
    pickle.dump(test_pickle, f)