In [8]:
from __future__ import division
import pickle
import random
import os
import math
import types
import uuid
import time
from copy import copy
from collections import defaultdict, Counter

import numpy as np
import gym
from gym import spaces, wrappers

import tempfile
import tensorflow as tf
# from tensorflow.contrib import rnn
from tensorflow.compat.v1.nn import rnn_cell as rnn
import zipfile

import baselines.common.tf_util as U

from baselines import logger
from baselines.common.schedules import LinearSchedule
from baselines import deepq
from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
from baselines.deepq.simple import ActWrapper

from scipy.special import logsumexp

from pyglet.window import key as pygkey

ModuleNotFoundError: No module named 'baselines.common'

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Global matplotlib settings: saved figures use 300 dpi and text is rendered with usetex.
# NOTE(review): usetex=True presumably requires a working LaTeX installation — confirm.
import matplotlib as mpl
mpl.rc('savefig', dpi=300)
mpl.rc('text', usetex=True)

In [None]:
# Relative directory that the cells below use for pickles, TF checkpoints and videos.
data_dir = os.path.join('data', 'lunarlander-sim')

train synthetic pilot

In [None]:
throttle_mag = 0.75  # magnitude used for both the main-engine and steering commands
def disc_to_cont(action):
    """Convert a discrete action index into a continuous [main, steering] command.

    The six discrete actions are laid out as ``3 * main + steering``:
    indices 0-2 select a main-engine command of ``-throttle_mag`` and
    indices 3-5 select ``+throttle_mag``; ``action % 3`` selects the steering
    command (0 -> ``-throttle_mag``, 1 -> 0, 2 -> ``+throttle_mag``).

    Args:
        action: a discrete index in [0, 6) (Python/NumPy scalar or 0-d array),
            or an ndarray with at least one dimension, which is treated as an
            already-continuous command.

    Returns:
        `action` itself when it is a non-scalar ndarray, otherwise
        ``np.array([main, steering])``.

    Raises:
        ValueError: if the discrete index lies outside [0, 6).
    """
    # isinstance also accepts ndarray subclasses; 0-d arrays hold a single
    # discrete index, so they are decoded below instead of being passed through.
    if isinstance(action, np.ndarray) and action.ndim > 0:
        return action
    # Reject negative indices explicitly: Python's modulo would otherwise map
    # them silently onto a valid-looking command.
    if not 0 <= action < 6:
        raise ValueError('discrete action must be in [0, 6), got %r' % (action,))
    # main engine
    if action < 3:
        m = -throttle_mag
    else:
        m = throttle_mag
    # steering
    if action % 3 == 0:
        s = -throttle_mag
    elif action % 3 == 1:
        s = 0
    else:
        s = throttle_mag
    return np.array([m, s])

In [None]:
def mask_helipad(obs, replace=0):
  """Return a copy of `obs` whose entry at index 8 is overwritten with `replace`.

  A 1-D input is treated as a single vector; any other input has column 8
  of its second axis overwritten. The argument itself is left untouched.
  """
  masked = copy(obs)
  if len(masked.shape) != 1:
    masked[:, 8] = replace
  else:
    masked[8] = replace
  return masked

def traj_mask_helipad(traj):
  """Apply `mask_helipad` (default replacement) to every element of `traj`; returns a list."""
  masked_traj = []
  for step in traj:
    masked_traj.append(mask_helipad(step))
  return masked_traj

In [None]:
n_act_dim = 6  # number of discrete actions (also the default one-hot width)
n_obs_dim = 9  # observation length used to size rollout arrays and Q_obs

In [None]:
def onehot_encode(i, n=n_act_dim):
    """Return a length-`n` float vector of zeros with position `i` set to 1."""
    encoded = np.zeros(n)
    encoded[i] = 1
    return encoded

def onehot_decode(x):
    """Return the index of the single non-zero entry of `x`.

    Fails with AssertionError unless exactly one entry is non-zero.
    """
    hot_idxes = np.nonzero(x)[0]
    n_hot = len(hot_idxes)
    assert n_hot == 1
    return hot_idxes[0]

In [None]:
def make_env(using_lander_reward_shaping=False):
  """Create a LunarLanderContinuous-v2 env that is driven by discrete actions.

  The underlying env's `_step` is kept as `_step_orig` and replaced by a
  wrapper that first maps the action through `disc_to_cont`. The
  `using_lander_reward_shaping` flag is stored on the underlying env.
  """
  env = gym.make('LunarLanderContinuous-v2')
  env.action_space = spaces.Discrete(n_act_dim)

  lander = env.unwrapped
  lander.using_lander_reward_shaping = using_lander_reward_shaping
  lander._step_orig = lander._step

  def _step(self, action):
      continuous_action = disc_to_cont(action)
      obs, r, done, info = self._step_orig(continuous_action)
      return obs, r, done, info

  lander._step = types.MethodType(_step, lander)
  return env

In [None]:
# Shared environment for the synthetic pilots, created with the reward-shaping flag enabled.
env = make_env(using_lander_reward_shaping=True)

In [None]:
max_ep_len = 1000  # per-episode step budget; also the padded length of rollout arrays
n_training_episodes = 500  # multiplied by max_ep_len to obtain a DQN timestep budget

In [None]:
# Factory for a Q-network builder: baselines' MLP model configured with hiddens [64, 64].
make_q_func = lambda: deepq.models.mlp([64, 64])

In [None]:
# Keyword arguments forwarded verbatim to deepq.learn when training the full pilot.
pilot_dqn_learn_kwargs = {
  'lr': 1e-3,
  'exploration_fraction': 0.1,
  'exploration_final_eps': 0.02,
  'target_network_update_freq': 1500,
  'print_freq': 100,
  'num_cpu': 5,
  'gamma': 0.99
}

In [None]:
full_pilot_scope = 'full_pilot'  # TF variable scope used for the pilot's network
full_pilot_q_func = make_q_func()
# When True, the training budget below shrinks to a single episode's worth of steps.
# NOTE(review): presumably the trained weights are then restored via load_tf_vars — confirm.
load_pretrained_full_pilot = True

In [None]:
# Timestep budget for deepq.learn: one episode's worth when loading a pretrained pilot,
# otherwise n_training_episodes episodes' worth.
max_timesteps = max_ep_len * (1 if load_pretrained_full_pilot else n_training_episodes)

In [None]:
# Train the full pilot with DQN inside `full_pilot_scope`.
# NOTE(review): the two-value return (policy, reward data) presumably relies on a
# locally modified baselines `deepq.learn` — confirm.
raw_full_pilot_policy, full_pilot_reward_data = deepq.learn(
  env,
  q_func=full_pilot_q_func,
  max_timesteps=max_timesteps,
  scope=full_pilot_scope,
  **pilot_dqn_learn_kwargs
)

In [None]:
# Persist the pilot's training reward data.
with open(os.path.join(data_dir, 'full_pilot_reward_data.pkl'), 'wb') as f:
  pickle.dump(full_pilot_reward_data, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the pilot's training reward data.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open(os.path.join(data_dir, 'full_pilot_reward_data.pkl'), 'rb') as f:
  full_pilot_reward_data = pickle.load(f)

In [None]:
def run_ep(policy, env, max_ep_len=max_ep_len, render=False, pilot_is_human=False):
    """Roll out one episode of `policy` in `env`.

    Args:
        policy: callable taking a batched observation (``obs[None, :]``) and
            returning an action accepted by ``env.step``.
        env: environment exposing ``reset``, ``step`` and ``render``; its final
            ``info`` dict must contain a ``'trajectory'`` entry.
        max_ep_len: the loop runs at most ``max_ep_len + 1`` iterations.
        render: call ``env.render()`` after every step.
        pilot_is_human: reset the global `human_agent_action` before starting.

    Returns:
        ``(total_reward, outcome, trajectory, actions)``. `outcome` is the last
        step's reward when it is a multiple of 100, else 0. `trajectory` is
        ``info['trajectory']`` once the episode reported `done`, otherwise None.
    """
    global human_agent_action
    if pilot_is_human:
        human_agent_action = init_human_action()

    obs = env.reset()
    episode_return = 0.
    taken_actions = []
    final_trajectory = None
    done = False
    # The extra iteration lets an episode that finishes on its last allowed
    # step still reach the `done` check and collect its trajectory.
    for _ in range(max_ep_len + 1):
        if done:
            final_trajectory = info['trajectory']
            break
        chosen = policy(obs[None, :])
        obs, r, done, info = env.step(chosen)
        taken_actions.append(chosen)
        if render:
            env.render()
        episode_return += r

    if r % 100 == 0:
        outcome = r
    else:
        outcome = 0
    return episode_return, outcome, final_trajectory, taken_actions

In [None]:
def full_pilot_policy(obs):
  """Return the first action the trained full pilot picks for the batched observation `obs`."""
  with tf.variable_scope(full_pilot_scope, reuse=None):
    batch_actions = raw_full_pilot_policy._act(obs)
    return batch_actions[0]

In [None]:
class LaggyPilotPolicy(object):
  """Pilot that keeps repeating its previous action with probability `lag_prob`.

  The remembered action lives on the instance and is never cleared here,
  so it carries over between successive calls.
  """

  def __init__(self):
    # Most recent action returned; None until the first call.
    self.last_laggy_pilot_act = None

  def __call__(self, obs, lag_prob=0.8):
    """Return the remembered action, refreshing it from `full_pilot_policy` when not lagging."""
    nothing_remembered = self.last_laggy_pilot_act is None
    # Short-circuit: no random draw is made while nothing is remembered yet.
    if nothing_remembered or np.random.random() >= lag_prob:
      self.last_laggy_pilot_act = full_pilot_policy(obs)
    return self.last_laggy_pilot_act

In [None]:
# Single shared laggy pilot instance; it is looked up by name ('laggy') in the evaluation cells.
laggy_pilot_policy = LaggyPilotPolicy()

In [None]:
def noisy_pilot_policy(obs, noise_prob=0.15):
  """Full pilot action with independent random corruption of its two components.

  With probability `noise_prob` the main-engine component is flipped, and
  (independently) with probability `noise_prob` the steering component is
  replaced by one of the two other steering choices.
  """
  action = full_pilot_policy(obs)

  flip_main = np.random.random() < noise_prob
  if flip_main:
    action = (action + 3) % 6

  perturb_steering = np.random.random() < noise_prob
  if perturb_steering:
    main_part = action // 3 * 3
    # A shift of 1 or 2 always lands on a different steering choice.
    steer_shift = np.random.randint(1, 3)
    action = main_part + (action + steer_shift) % 3

  return action

In [None]:
def noop_pilot_policy(obs):
  """Ignore `obs` and always return discrete action 1."""
  constant_action = 1
  return constant_action

In [None]:
def sensor_pilot_policy(obs, thresh=0.1):
  """Steer according to the signed offset ``obs[0, 8] - obs[0, 0]``.

  Returns 0 when the offset is below ``-thresh``, 2 when it is above
  ``thresh`` and 1 otherwise. `obs` must be a batched (2-D) observation.
  """
  offset = obs[0, 8] - obs[0, 0]  # horizontal dist to helipad
  if offset < -thresh:
    return 0
  if offset > thresh:
    return 2
  return 1

In [None]:
# begin debug

In [None]:
# Debug: watch a single rendered episode flown by the full pilot.
run_ep(full_pilot_policy, env, render=True)

In [None]:
# Debug: close the environment after the rendered episode.
env.close()

In [None]:
# end debug

In [None]:
def save_tf_vars(scope, path):
  """Checkpoint every global TF variable whose name starts with ``scope + '/'`` to `path`."""
  session = U.get_session()
  prefix = scope + '/'
  scoped_vars = []
  for var in tf.global_variables():
    if var.name.startswith(prefix):
      scoped_vars.append(var)
  tf.train.Saver(scoped_vars).save(session, save_path=path)

In [None]:
def load_tf_vars(scope, path):
  """Restore every global TF variable whose name starts with ``scope + '/'`` from `path`."""
  session = U.get_session()
  prefix = scope + '/'
  scoped_vars = []
  for var in tf.global_variables():
    if var.name.startswith(prefix):
      scoped_vars.append(var)
  tf.train.Saver(scoped_vars).restore(session, path)

In [None]:
# Checkpoint path for the full pilot's variables.
full_pilot_path = os.path.join(data_dir, 'full_pilot.tf')

In [None]:
# Write the pilot's variables to disk.
# NOTE(review): when run straight before the load cell below, this overwrites the checkpoint with the current weights.
save_tf_vars(full_pilot_scope, full_pilot_path)

In [None]:
# Restore the pilot's variables from the checkpoint.
load_tf_vars(full_pilot_scope, full_pilot_path)

evaluate synthetic pilot

In [None]:
# Each name N must have a matching global `N_pilot_policy`; the cells below resolve it with eval.
pilot_names = ['full', 'laggy', 'noisy', 'noop', 'sensor']

In [None]:
n_eval_eps = 100  # evaluation episodes per pilot

In [None]:
# For every pilot, run n_eval_eps episodes and transpose the results so that each
# entry is [rewards, outcomes, trajectories, actions] (one tuple per field).
pilot_evals = [list(zip(*[run_ep(eval('%s_pilot_policy' % pilot_name), env, render=False) for _ in range(n_eval_eps)])) for pilot_name in pilot_names]

In [None]:
# Persist the evaluation results keyed by pilot name.
with open(os.path.join(data_dir, 'pilot_evals.pkl'), 'wb') as f:
  pickle.dump(dict(zip(pilot_names, pilot_evals)), f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Per-pilot summary: mean total reward and a histogram of episode outcomes.
mean_rewards = [np.mean(pilot_eval[0]) for pilot_eval in pilot_evals]
outcome_distrns = [Counter(pilot_eval[1]) for pilot_eval in pilot_evals]

In [None]:
# One line per pilot: (name, mean reward, outcome counts).
print('\n'.join([str(x) for x in zip(pilot_names, mean_rewards, outcome_distrns)]))

In [None]:
n_videos = 10  # recorded episodes per pilot

In [None]:
# Record n_videos episodes per pilot, each into its own Monitor directory under data_dir/videos.
for pilot_name in pilot_names:
  for i in range(n_videos):
    wrapped_env = wrappers.Monitor(env, os.path.join(data_dir, 'videos', '%s_pilot.%d' % (pilot_name, i)), force=True)
    run_ep(eval('%s_pilot_policy' % pilot_name), wrapped_env, render=False)
    wrapped_env.close()
    # NOTE(review): the shared base env is closed on every iteration and then re-wrapped — confirm this is intended.
    env.close()

train supervised goal decoder

In [None]:
# Pilot whose rollouts train the goal decoder; also used in the file names below.
pilot_name = 'full'

In [None]:
# Resolve the selected pilot's policy by name.
pilot_policy = eval('%s_pilot_policy' % pilot_name)
n_rollouts = 1000  # number of episodes to collect

In [None]:
# Keep only (trajectory, actions) from each episode result.
rollouts = [run_ep(pilot_policy, env, render=False)[2:] for _ in range(n_rollouts)]

In [None]:
# Persist the collected rollouts for this pilot.
with open(os.path.join(data_dir, '%s_pilot_policy_rollouts.pkl' % pilot_name), 'wb') as f:
  pickle.dump(rollouts, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the collected rollouts for this pilot.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open(os.path.join(data_dir, '%s_pilot_policy_rollouts.pkl' % pilot_name), 'rb') as f:
  rollouts = pickle.load(f)

In [None]:
n_val_rollouts = 100  # rollouts held out for validation

In [None]:
# Hold out the last n_val_rollouts for validation.
# NOTE: `rollouts` is rebound, so re-running this cell shrinks the training set again.
rollouts, val_rollouts = rollouts[:-n_val_rollouts], rollouts[-n_val_rollouts:]

In [None]:
def combined_rollout(states, actions):
  """Stack ``[state, onehot(action)]`` rows for a rollout into one 2-D array.

  When `states` has exactly one more entry than `actions`, the final state
  is dropped so that each remaining state is paired with one action.
  """
  if len(states) == len(actions) + 1:
    states = states[:-1]
  rows = []
  for obs, action in zip(states, actions):
    rows.append(np.concatenate((np.array(obs), onehot_encode(action))))
  return np.array(rows)

In [None]:
def format_rollouts(rollouts):
  """Pack variable-length rollouts into zero-padded arrays of length `max_ep_len`.

  Returns:
    X_dat: (n, max_ep_len, n_obs_dim + n_act_dim) masked state/action rows.
    Y_dat: (n, max_ep_len) target, the last entry of each rollout's first
      state repeated over all timesteps.
    M_dat: (n, max_ep_len) mask that is 1 on real timesteps and 0 on padding.
  """
  n_rollouts = len(rollouts)
  feature_dim = n_obs_dim + n_act_dim
  X_dat = np.zeros((n_rollouts, max_ep_len, feature_dim))
  Y_dat = np.zeros((n_rollouts, max_ep_len))
  M_dat = np.zeros((n_rollouts, max_ep_len))
  for i in range(n_rollouts):
    states, actions = rollouts[i]
    n_steps = len(actions)
    Y_dat[i, :] = states[0][-1]
    X_dat[i, :n_steps, :] = traj_mask_helipad(combined_rollout(states, actions))
    M_dat[i, :n_steps] = 1
  return X_dat, Y_dat, M_dat

In [None]:
# Padded training arrays: inputs, targets and timestep mask.
X_dat, Y_dat, M_dat = format_rollouts(rollouts)

In [None]:
# Padded validation arrays built from the held-out rollouts.
val_X_dat, val_Y_dat, val_M_dat = format_rollouts(val_rollouts)

In [None]:
example_idxes = list(range(X_dat.shape[0]))
def next_batch(batch_size):
  """Draw `batch_size` distinct training rollouts; returns matching (X, Y, M) slices."""
  chosen = random.sample(example_idxes, batch_size)
  batch_X = X_dat[chosen, :, :]
  batch_Y = Y_dat[chosen, :]
  batch_M = M_dat[chosen, :]
  return batch_X, batch_Y, batch_M

In [None]:
# Training Parameters
learning_rate = 1e-2
training_steps = 1000
batch_size = 128
display_step = training_steps // 10

# Network Parameters
num_input = X_dat.shape[2]
timesteps = X_dat.shape[1] # timesteps
num_hidden = 32 # hidden layer num of features

In [None]:
# Variable scope of the supervised goal decoder; its '/'-terminated prefix selects the variables to init/save/load.
gd_scope = 'gd_scope'

In [None]:
# Build the supervised goal decoder: an LSTM unrolled over `timesteps` steps with a
# linear read-out producing one scalar prediction per timestep.
with tf.variable_scope(gd_scope, reuse=False):
  # tf Graph input
  X = tf.placeholder("float", [None, timesteps, num_input])
  Y = tf.placeholder("float", [None, timesteps])
  M = tf.placeholder("float", [None, timesteps]) # mask for variable length sequences
  # The two halves of the LSTM's initial state, fed explicitly on every run.
  INIT_STATE_A = tf.placeholder("float", [None, num_hidden])
  INIT_STATE_B = tf.placeholder("float", [None, num_hidden])

  # Linear read-out mapping each LSTM output to a scalar.
  weights = {
      'out': tf.Variable(tf.random_normal([num_hidden, 1]))
  }
  biases = {
      'out': tf.Variable(tf.random_normal([1]))
  }
  
  # One [batch, num_input] tensor per timestep.
  unstacked_X = tf.unstack(X, timesteps, 1)

  lstm_cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0)

  # Manual unroll that keeps the output and state of every step so that
  # decode_goal can fetch the state at an arbitrary timestep.
  state = (INIT_STATE_A, INIT_STATE_B)
  rnn_outputs = []
  rnn_states = []
  for input_ in unstacked_X:
    output, state = lstm_cell(input_, state)
    rnn_outputs.append(output)
    rnn_states.append(state)

  # [batch, timesteps] tensor of per-step predictions, used by the loss.
  prediction = tf.reshape(
    tf.concat([tf.matmul(output, weights['out']) + biases['out'] for output in rnn_outputs], axis=1), 
    shape=[tf.shape(X)[0], timesteps])
  
  # The same read-out as a list of per-step [batch, 1] tensors, used by decode_goal.
  predictions = [tf.matmul(output, weights['out']) + biases['out'] for output in rnn_outputs]

  # Squared error averaged over unmasked timesteps only.
  loss_op = tf.reduce_sum((prediction - Y)**2 * M) / tf.reduce_sum(M)

  optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
  train_op = optimizer.minimize(loss_op)

In [None]:
# Reuse the current baselines session, or create and enter one if none exists.
sess = U.get_session()
if sess is None:
  sess = U.make_session(num_cpu=5)
  sess.__enter__()
  
# Initialise only the variables whose names start with the goal decoder's scope prefix.
sess.run(tf.variables_initializer([v for v in tf.global_variables() if v.name.startswith(gd_scope + '/')]))

In [None]:
# Train the goal decoder on random mini-batches, starting every sequence from a zero LSTM state.
with tf.variable_scope(gd_scope, reuse=False):
  for step in range(1, training_steps+1):
      batch_x, batch_y, batch_mask = next_batch(batch_size)
      sess.run(train_op, feed_dict={X: batch_x, Y: batch_y, M: batch_mask, 
                                   INIT_STATE_A: np.zeros((batch_size, num_hidden)),
                                   INIT_STATE_B: np.zeros((batch_size, num_hidden))})
      if step % display_step == 0 or step == 1:
          # Periodic report: loss on the full training set and on the validation set.
          loss = sess.run(loss_op, feed_dict={X: X_dat,
                                             Y: Y_dat,
                                             M: M_dat,
                                             INIT_STATE_A: np.zeros((X_dat.shape[0], num_hidden)),
                                             INIT_STATE_B: np.zeros((X_dat.shape[0], num_hidden))})
          val_loss = sess.run(loss_op, feed_dict={X: val_X_dat,
                                                 Y: val_Y_dat,
                                                 M: val_M_dat,
                                                 INIT_STATE_A: np.zeros((val_X_dat.shape[0], num_hidden)),
                                                 INIT_STATE_B: np.zeros((val_X_dat.shape[0], num_hidden))})
          print("Step " + str(step) + ", Training Loss= " + \
                "{:.4f}".format(loss), ", Validation Loss= " + "{:.4f}".format(val_loss))

  print("Optimization Finished!")

In [None]:
def train_supervised_goal_decoder(gd_scope, rollouts):
  """Re-initialise and train the goal decoder graph on `rollouts`.

  The graph is not built here: the module-level tensors `X`, `Y`, `M`,
  `INIT_STATE_A`, `INIT_STATE_B`, `train_op`, `loss_op` and the global
  `num_hidden` they were built with are used directly, so the learning
  rate is the one fixed when the optimizer was created.

  Args:
    gd_scope: scope name; every global variable whose name starts with
      ``gd_scope + '/'`` is re-initialised before training.
    rollouts: list of (states, actions) pairs accepted by `format_rollouts`.

  Raises:
    ValueError: if `rollouts` is empty.
  """
  X_dat, Y_dat, M_dat = format_rollouts(rollouts)
  n_examples = X_dat.shape[0]
  if n_examples == 0:
    raise ValueError('cannot train the goal decoder on an empty list of rollouts')

  # Training Parameters
  training_steps = 1000
  # random.sample raises ValueError when asked for more items than exist,
  # so the batch is capped at the number of available rollouts.
  batch_size = min(128, n_examples)
  display_step = training_steps // 10

  example_idxes = list(range(n_examples))
  def next_batch(batch_size):
    batch_idxes = random.sample(example_idxes, batch_size)
    return X_dat[batch_idxes, :, :], Y_dat[batch_idxes, :], M_dat[batch_idxes, :]

  sess = U.get_session()
  if sess is None:
    sess = U.make_session(num_cpu=5)
    sess.__enter__()

  # Training restarts from scratch: all variables under the scope are reset.
  sess.run(tf.variables_initializer([v for v in tf.global_variables() if v.name.startswith(gd_scope + '/')]))

  with tf.variable_scope(gd_scope, reuse=False):
    for step in range(1, training_steps+1):
        batch_x, batch_y, batch_mask = next_batch(batch_size)
        sess.run(train_op, feed_dict={X: batch_x, Y: batch_y, M: batch_mask,
                                     INIT_STATE_A: np.zeros((batch_size, num_hidden)),
                                     INIT_STATE_B: np.zeros((batch_size, num_hidden))})
        if step % display_step == 0 or step == 1:
            loss = sess.run(loss_op, feed_dict={X: X_dat,
                                               Y: Y_dat,
                                               M: M_dat,
                                               INIT_STATE_A: np.zeros((n_examples, num_hidden)),
                                               INIT_STATE_B: np.zeros((n_examples, num_hidden))})
            print("Step " + str(step) + ", Training Loss={:.4f}".format(loss))

    print("Optimization Finished!")

In [None]:
def build_retrain_goal_decoder(pilot_name):
  """Load `pilot_name`'s stored rollouts once and return a retraining callback.

  The returned function takes a list of on-policy rollouts, appends it to the
  stored off-policy rollouts and retrains the goal decoder on the result.
  """
  rollouts_path = os.path.join(data_dir, '%s_pilot_policy_rollouts.pkl' % pilot_name)
  with open(rollouts_path, 'rb') as f:
    off_pol_rollouts = pickle.load(f)

  def retrain_goal_decoder(on_pol_rollouts):
    all_rollouts = off_pol_rollouts + on_pol_rollouts
    train_supervised_goal_decoder(gd_scope, all_rollouts)

  return retrain_goal_decoder

In [None]:
# Checkpoint path for the goal decoder trained on the selected pilot.
gd_path = os.path.join(data_dir, '%s_pilot_goal_decoder.tf' % pilot_name)

In [None]:
# Write the goal decoder's variables to disk.
save_tf_vars(gd_scope, gd_path)

In [None]:
# Restore the goal decoder's variables from the checkpoint.
load_tf_vars(gd_scope, gd_path)

In [None]:
def decode_goal(trajectory, init_state=None, only_final=False):
  """Run the supervised goal decoder on one trajectory.

  Args:
    trajectory: sequence of state/action rows; it is zero-padded to
      `max_ep_len` steps before being fed to the network.
    init_state: optional pair fed as the LSTM's initial state; zeros when None.
    only_final: if True, return only the prediction at the last real step.

  Returns:
    ``(goal, state)``: `goal` is a scalar when `only_final` is set, otherwise a
    list with one prediction per unrolled step (padding included); `state` is
    the LSTM state after the last real step.
  """
  n_steps = len(trajectory)
  padded = np.zeros((1, max_ep_len, n_obs_dim + n_act_dim))
  padded[0, :n_steps, :] = np.array(trajectory)

  if init_state is None:
    state_a = np.zeros((1, num_hidden))
    state_b = np.zeros((1, num_hidden))
  else:
    state_a, state_b = init_state[0], init_state[1]

  with tf.variable_scope(gd_scope, reuse=False):
    inputs = {X: padded, INIT_STATE_A: state_a, INIT_STATE_B: state_b}
    last_state = rnn_states[n_steps - 1]
    if only_final:
      g, s = sess.run([predictions[n_steps - 1], last_state], feed_dict=inputs)
      return g[0, 0], s
    g, s = sess.run([predictions, last_state], feed_dict=inputs)
    return [step_pred[0, 0] for step_pred in g], s

In [None]:
def build_build_goal_decoder(pilot_name):
  """Return a factory that loads `pilot_name`'s decoder checkpoint and yields `decode_goal`."""
  def build_goal_decoder():
    checkpoint_path = os.path.join(data_dir, '%s_pilot_goal_decoder.tf' % pilot_name)
    load_tf_vars(gd_scope, checkpoint_path)
    return decode_goal
  return build_goal_decoder

build model-based goal decoder

In [None]:
# Candidate goal values considered by the model-based decoder: a grid from -0.8 up to (not including) 1 in steps of 0.05.
goals = np.arange(-0.8, 1, 0.05)
n_goals = len(goals)

In [None]:
# Reuse the current baselines session, or create and enter one if none exists.
sess = U.get_session()
if sess is None:
  sess = U.make_session(num_cpu=5)
  sess.__enter__()
    
# Variable (one row per candidate goal) that is fed candidate states when Q_values is evaluated.
with tf.variable_scope(full_pilot_scope, reuse=None):
  Q_obs = tf.get_variable("Q_obs", (n_goals, n_obs_dim))

sess.run(tf.variables_initializer([Q_obs]))

In [None]:
# Apply the full pilot's Q-function to Q_obs; reuse=True so that existing variables are shared
# rather than created anew.
with tf.variable_scope(full_pilot_scope, reuse=True):
  Q_values = full_pilot_q_func(Q_obs, n_act_dim, scope="q_func")

In [None]:
def compute_map_est_goal(s, a, log_prior, temp=1000):
  """One Bayesian update of the goal belief from a single (state, action) pair.

  For each candidate in `goals`, entry 8 of `s` is replaced by that candidate,
  the pilot's Q-values are evaluated and scaled by `temp`, and the
  log-softmax probability of the observed action is used as the likelihood.

  Args:
    s: state vector (index 8 is overwritten per candidate on a copy).
    a: one-hot encoded action.
    log_prior: log prior over `goals`.
    temp: multiplier applied to the Q-values before the softmax.

  Returns:
    ``(log_posterior, map_est_goal)``: the updated (unnormalised) log belief
    and the candidate goal that maximises it.
  """
  def _with_goal(g):
    hypothesis = copy(s)
    hypothesis[8] = g
    return hypothesis

  candidate_states = np.array([_with_goal(g) for g in goals])
  with tf.variable_scope(full_pilot_scope, reuse=True):
    Q = sess.run(Q_values, feed_dict={Q_obs: candidate_states})
  Q *= temp

  action = onehot_decode(a)
  log_cond_likelihood = Q[:, action] - logsumexp(Q, axis=1)
  log_marginal_likelihood = logsumexp(log_cond_likelihood) - np.log(n_goals)
  log_posterior = (log_cond_likelihood - log_marginal_likelihood) + log_prior

  best_idx = max(range(n_goals), key=lambda i: log_posterior[i])
  return log_posterior, goals[best_idx]

In [None]:
# Index into `goals` whose prior weight is doubled by mb_decode_goal.
# NOTE(review): presumably the entry closest to 0 for the current grid; the formula is tied to that grid — confirm if `goals` changes.
zero_goal_idx = len(goals)//2-2

In [None]:
def mb_decode_goal(trajectory, init_state=None, only_final=False):
  """Model-based (Bayesian) goal decoder with the same interface as `decode_goal`.

  Args:
    trajectory: sequence of rows holding a state followed by a one-hot
      action of length `n_act_dim`.
    init_state: None to start from the prior (uniform, with the weight at
      `zero_goal_idx` doubled), or a ``(log_prior, map_est_goals)`` pair
      returned by a previous call; in that case only the last row of
      `trajectory` is processed and the given list is appended to in place.
    only_final: return just the most recent MAP estimate instead of the list.

  Returns:
    ``(estimates, (log_posterior, map_est_goals))``.

  Raises:
    IndexError: if `only_final` is set and no estimate exists at all.
  """
  if init_state is None:
    prior = np.ones(n_goals) / n_goals
    prior[zero_goal_idx] *= 2
    prior = prior / prior.sum()
    log_prior = np.log(prior)
    map_est_goals = []
  else:
    log_prior, map_est_goals = init_state
    trajectory = trajectory[-1:]
  # With nothing to process the belief is unchanged; binding it here keeps
  # the return statement from failing on an empty trajectory.
  log_posterior = log_prior
  for t in trajectory:
    s = np.array(t[:-n_act_dim])
    a = np.array(t[-n_act_dim:])
    log_posterior, map_est_goal = compute_map_est_goal(s, a, log_prior)
    map_est_goals.append(map_est_goal)
    log_prior = log_posterior
  return (map_est_goals[-1] if only_final else map_est_goals), (log_posterior, map_est_goals)

In [None]:
# Rebind `decode_goal` to the model-based decoder.
# NOTE: after this cell the supervised decoder is no longer reachable under this name.
decode_goal = mb_decode_goal

In [None]:
def build_build_goal_decoder(pilot_name):
  """Return a factory that yields the current global `decode_goal`.

  This redefinition shadows the earlier checkpoint-loading version;
  `pilot_name` is accepted but not used.
  """
  def build_goal_decoder():
    decoder = decode_goal
    return decoder
  return build_goal_decoder

In [None]:
# begin debug

In [None]:
# Debug: pick one rollout to inspect.
# NOTE(review): if the train/validation split above was applied to n_rollouts = 1000 rollouts,
# `rollouts` holds 900 entries and index 925 is out of range — confirm which list this cell expects.
rollout = rollouts[925]

In [None]:
# True goal = last entry of the rollout's first state; decode the masked trajectory both ways.
# NOTE(review): once `decode_goal = mb_decode_goal` has run, both calls use the Bayesian decoder.
goal = rollout[0][0][-1]
traj = traj_mask_helipad(combined_rollout(*rollout))
pred_goal, _ = decode_goal(traj)
mb_pred_goal, _ = mb_decode_goal(traj)

In [None]:
# Plot both decoders' per-step predictions against the true goal, truncated to the rollout's length.
plt.xlabel('Step')
plt.ylabel('X-Coordinate')
plt.axhline(y=goal, label='True Goal', linestyle='--', linewidth=5, color='gray', alpha=0.5)
plt.plot(pred_goal[:len(rollout[0])], label='Predicted Goal (SL)', color='orange')
plt.plot(mb_pred_goal[:len(rollout[0])], label='Predicted Goal (BI)', color='teal')
plt.legend(loc='best')
plt.ylim([-1, 1])
plt.show()

In [None]:
#rollout = rollouts[986]
# Same comparison plot as above, repeated for fifty rollouts (one figure each).
for rollout in rollouts[800:850]:
  goal = rollout[0][0][-1]
  traj = traj_mask_helipad(combined_rollout(*rollout))
  pred_goal, _ = decode_goal(traj)
  mb_pred_goal, _ = mb_decode_goal(traj)
  plt.xlabel('Step')
  plt.ylabel('X-Coordinate')
  plt.axhline(y=goal, label='True Goal', linestyle='--', linewidth=5, color='gray', alpha=0.5)
  plt.plot(pred_goal[:len(rollout[0])], label='Predicted Goal (SL)', color='orange')
  plt.plot(mb_pred_goal[:len(rollout[0])], label='Predicted Goal (BI)', color='teal')
  plt.legend(loc='best')
  plt.ylim([-1, 1])
  plt.show()

In [None]:
# Collect, over all training rollouts, the true goal and the Bayesian decoder's per-step estimates.
y_trues = []
y_preds = []
for rollout in rollouts:
  goal = rollout[0][0][-1]
  traj = traj_mask_helipad(combined_rollout(*rollout))
  pred_goal, final_states = mb_decode_goal(traj)
  # Repeat the true goal once per estimate so the two lists stay aligned.
  y_trues.extend([goal] * len(pred_goal))
  y_preds.extend(pred_goal)

In [None]:
# Convert to arrays for the vectorised error computation below.
y_trues = np.array(y_trues)
y_preds = np.array(y_preds)

In [None]:
# Mean squared error of the decoder vs. the baseline of always predicting 0.
np.mean((y_trues - y_preds)**2), np.mean((y_trues - 0)**2)

In [None]:
# Decode `traj` again; at this point `traj` is whatever the last loop iteration above left behind.
mb_pred_goal, final_states = mb_decode_goal(traj)

In [None]:
# Figure with the time axis vertical: predictions are reversed and the tick labels count down.
# Uses `goal`, `rollout` and `pred_goal` left over from the cells above.
# NOTE(review): the hard-coded tick labels presumably assume an episode of about 400 steps — confirm.
plt.ylabel('Timestep')
plt.xlabel('Horizontal Location')
plt.title('Sample Episode from Optimal Synthetic Pilot')
plt.axvline(x=goal, label='True Goal', linestyle='--', linewidth=1, color='green')
plt.plot(list(reversed(pred_goal[:len(rollout[0])])), range(len(pred_goal[:len(rollout[0])])), label='Inferred Goal (Supervised Learning)', color='teal')
plt.plot(list(reversed(mb_pred_goal[:len(rollout[0])])), range(len(mb_pred_goal[:len(rollout[0])])), label='Inferred Goal (Bayesian Inference)', color='gray')
plt.yticks([0, 100, 200, 300, 400], ['400', '300', '200', '100', '0'])
plt.legend(loc='best')
plt.xlim([-1, 1])
plt.show()

In [None]:
# end debug

train assistive copilot

In [None]:
# Copilot training budget in episodes (rebinds the value set for the pilot; default `n_eps` of make_co_policy).
n_training_episodes = 500

In [None]:
# Q-network factory for the copilot (same definition as the one used for the pilot).
make_q_func = lambda: deepq.models.mlp([64, 64])

In [None]:
# Keyword arguments that make_co_policy forwards to co_dqn_learn.
copilot_dqn_learn_kwargs = {
  'lr': 1e-3,
  'exploration_fraction': 0.1,
  'exploration_final_eps': 0.02,
  'target_network_update_freq': 1500,
  'print_freq': 100,
  'num_cpu': 5,
  'gamma': 0.99,
}

In [None]:
def make_co_env(pilot_policy, build_goal_decoder=None, using_lander_reward_shaping=False, **extras):
    """Create the copilot's environment around a given pilot.

    Two observation embeddings are supported:

    * `build_goal_decoder` is None: the pilot's one-hot action is appended
      to every observation and the observation space is widened to match.
    * otherwise: the decoder built by `build_goal_decoder()` infers the goal
      from the latest trajectory entry and pilot action, and the inferred
      goal replaces observation entry 8.

    Args:
        pilot_policy: callable mapping a batched observation to a discrete action.
        build_goal_decoder: optional zero-argument factory returning a decoder
            with the `decode_goal` call signature.
        using_lander_reward_shaping: flag stored on the underlying env.
        **extras: ignored; lets callers pass a wider config dict.

    Returns:
        The patched gym environment with a Discrete(n_act_dim) action space.
    """
    env = gym.make('LunarLanderContinuous-v2')
    lander = env.unwrapped
    lander.using_lander_reward_shaping = using_lander_reward_shaping
    env.action_space = spaces.Discrete(n_act_dim)
    lander.pilot_policy = pilot_policy
    lander._step_orig = lander._step

    if build_goal_decoder is None:
        base_box = env.observation_space
        env.observation_space = spaces.Box(
            np.concatenate((base_box.low, np.zeros(n_act_dim))),
            np.concatenate((base_box.high, np.ones(n_act_dim))))

        def _step(self, action):
            obs, r, done, info = self._step_orig(disc_to_cont(action))
            pilot_onehot = onehot_encode(self.pilot_policy(obs[None, :]))
            obs = np.concatenate((obs, pilot_onehot))
            return obs, r, done, info
    else:
        goal_decoder = build_goal_decoder()

        def _step(self, action):
            obs, r, done, info = self._step_orig(disc_to_cont(action))
            self.actions.append(self.pilot_policy(obs[None, :]))
            latest = combined_rollout(self.trajectory[-1:], self.actions[-1:])
            traj = traj_mask_helipad(latest)
            goal, self.init_state = goal_decoder(traj, init_state=self.init_state, only_final=True)
            obs = mask_helipad(obs, replace=goal)
            return obs, r, done, info

    lander._step = types.MethodType(_step, lander)
    return env

In [None]:
def co_build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, using_control_sharing=True):
  """Build the copilot's action-selection function.

  Args:
    make_obs_ph: callable taking a name and returning the observation input.
    q_func: Q-network builder called as ``q_func(obs, num_actions, scope=...)``.
    num_actions: number of discrete actions.
    scope: variable scope under which everything is created.
    reuse: forwarded to ``tf.variable_scope``.
    using_control_sharing: selects which of the two functions is built.

  Returns:
    A `U.function`. With control sharing its inputs are (observation,
    pilot_action, pilot_tol) and it returns the pilot's action whenever its
    Q-value (after subtracting the per-row minimum) is at least
    ``(1 - pilot_tol)`` times the best one, otherwise the closest acceptable
    alternative in this order: pilot main engine + optimal steering, optimal
    main engine + pilot steering, fully optimal action. Without control
    sharing it is an epsilon-greedy act function with inputs
    (observation, stochastic, update_eps).
  """
  with tf.variable_scope(scope, reuse=reuse):
    observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
    if using_control_sharing:
      pilot_action_ph = tf.placeholder(tf.int32, (), name='pilot_action')
      pilot_tol_ph = tf.placeholder(tf.float32, (), name='pilot_tol')
    else:
      eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))
      stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
      update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

    q_values = q_func(observations_ph.get(), num_actions, scope="q_func")

    batch_size = tf.shape(q_values)[0]

    if using_control_sharing:
      # Shift each row so its minimum is 0. The reduced minimum has shape
      # [batch]; it needs a trailing axis to broadcast against
      # [batch, num_actions] for any batch size (without it the subtraction
      # is only correct for a batch of one).
      q_values -= tf.expand_dims(tf.reduce_min(q_values, axis=1), 1)
      opt_actions = tf.argmax(q_values, axis=1, output_type=tf.int32)
      opt_q_values = tf.reduce_max(q_values, axis=1)
      # An action is acceptable when its shifted Q-value reaches this threshold.
      q_threshold = (1 - pilot_tol_ph) * opt_q_values

      batch_idxes = tf.reshape(tf.range(0, batch_size, 1), [batch_size, 1])
      reshaped_batch_size = tf.reshape(batch_size, [1])

      # The single pilot action is replicated for every row of the batch.
      pi_actions = tf.tile(tf.reshape(pilot_action_ph, [1]), reshaped_batch_size)
      pi_act_idxes = tf.concat([batch_idxes, tf.reshape(pi_actions, [batch_size, 1])], axis=1)
      pi_act_q_values = tf.gather_nd(q_values, pi_act_idxes)

      # if necessary, switch steering and keep main
      mixed_actions = 3 * (pi_actions // 3) + (opt_actions % 3)
      mixed_actions = tf.where(pi_act_q_values >= q_threshold, pi_actions, mixed_actions)

      # if necessary, keep steering and switch main
      mixed_act_idxes = tf.concat([batch_idxes, tf.reshape(mixed_actions, [batch_size, 1])], axis=1)
      mixed_act_q_values = tf.gather_nd(q_values, mixed_act_idxes)
      steer_mixed_actions = 3 * (opt_actions // 3) + (pi_actions % 3)
      mixed_actions = tf.where(mixed_act_q_values >= q_threshold, mixed_actions, steer_mixed_actions)

      # if necessary, switch steering and main
      mixed_act_idxes = tf.concat([batch_idxes, tf.reshape(mixed_actions, [batch_size, 1])], axis=1)
      mixed_act_q_values = tf.gather_nd(q_values, mixed_act_idxes)
      actions = tf.where(mixed_act_q_values >= q_threshold, mixed_actions, opt_actions)

      act = U.function(inputs=[
        observations_ph, pilot_action_ph, pilot_tol_ph
      ],
                       outputs=[actions])
    else:
      deterministic_actions = tf.argmax(q_values, axis=1)

      random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
      chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
      stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

      output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
      # A negative update_eps (the default given below) leaves eps unchanged.
      update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
      act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
                       outputs=[output_actions],
                       givens={update_eps_ph: -1.0, stochastic_ph: True},
                       updates=[update_eps_expr])
    return act

In [None]:
def co_build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0,
    double_q=True, scope="deepq", reuse=None, using_control_sharing=True):
    """Build the act, train and target-update functions for the copilot's DQN.

    Args:
        make_obs_ph: callable taking a name and returning an observation input.
        q_func: Q-network builder called as ``q_func(obs, num_actions, scope=..., reuse=...)``.
        num_actions: number of discrete actions.
        optimizer: TF optimizer used to minimise the weighted Huber loss.
        grad_norm_clipping: if not None, gradients are clipped to this value.
        gamma: discount factor of the Bellman target.
        double_q: pick the next action with the online network and evaluate
            it with the target network; otherwise use the target network's max.
        scope: variable scope shared with `co_build_act`.
        reuse: forwarded to ``tf.variable_scope``.
        using_control_sharing: forwarded to `co_build_act`.

    Returns:
        ``(act_f, train, update_target, {'q_values': q_values})`` where `train`
        takes (obs_t, action, reward, obs_tp1, done, weight) and returns the TD
        error, and `update_target` copies the online network into the target network.
    """
    act_f = co_build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, using_control_sharing=using_control_sharing)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        obs_t_input_get = obs_t_input.get()
        obs_tp1_input_get = obs_tp1_input.get()

        # q network evaluation
        q_t = q_func(obs_t_input_get, num_actions, scope='q_func', reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name('q_func'))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input_get, num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input_get, num_actions, scope='q_func', reuse=True)
            q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        # terminal transitions contribute no future value
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        # (variables of the two networks are paired by sorting both lists by name)
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=td_error,
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

    return act_f, train, update_target, {'q_values': q_values}

In [None]:
def co_dqn_learn(
    env,
    q_func,
    lr=1e-3,
    max_timesteps=100000,
    buffer_size=50000,
    train_freq=1,
    batch_size=32,
    print_freq=1,
    checkpoint_freq=10000,
    learning_starts=1000,
    gamma=1.0,
    target_network_update_freq=500,
    exploration_fraction=0.1,
    exploration_final_eps=0.02,
    num_cpu=5,
    callback=None,
    scope='deepq',
    pilot_tol=0,
    pilot_is_human=False,
    reuse=False,
    using_supervised_goal_decoder=False):
    
    # Create all the functions necessary to train the model

    sess = U.get_session()
    if sess is None:
      sess = U.make_session(num_cpu=num_cpu)
      sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)
      
    using_control_sharing = pilot_tol > 0
    
    act, train, update_target, debug = co_build_train(
        scope=scope,
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        reuse=reuse,
        using_control_sharing=using_control_sharing
    )
    
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    replay_buffer = ReplayBuffer(buffer_size)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    episode_outcomes = []
    saved_mean_reward = None
    obs = env.reset()
    prev_t = 0
    rollouts = []
    
    if pilot_is_human:
      global human_agent_action
      human_agent_action = init_human_action()
    
    if not using_control_sharing:
      exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
        
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, 'model')
        for t in range(max_timesteps):
            masked_obs = obs if using_supervised_goal_decoder else mask_helipad(obs)

            act_kwargs = {}
            if using_control_sharing:
              act_kwargs['pilot_action'] = env.unwrapped.pilot_policy(obs[None, :n_obs_dim])
              act_kwargs['pilot_tol'] = pilot_tol if not pilot_is_human or (pilot_is_human and human_agent_active) else 0
            else:
              act_kwargs['update_eps'] = exploration.value(t)
              
            action = act(masked_obs[None, :], **act_kwargs)[0][0]
            new_obs, rew, done, info = env.step(action)

            if pilot_is_human:
              env.render()
              time.sleep(sim_delay_for_human)

            # Store transition in the replay buffer.
            masked_new_obs = new_obs if using_supervised_goal_decoder else mask_helipad(new_obs)
            replay_buffer.add(masked_obs, action, rew, masked_new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew

            if done:
                if t > learning_starts:
                  for _ in range(t - prev_t):
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                    td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)

                obs = env.reset()

                episode_outcomes.append(rew)
                episode_rewards.append(0.0)

                if pilot_is_human:
                  global human_agent_action
                  human_agent_action = init_human_action()

                prev_t = t
                    
                if pilot_is_human:
                  time.sleep(1)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            mean_100ep_succ = round(np.mean([1 if x==100 else 0 for x in episode_outcomes[-101:-1]]), 2)
            mean_100ep_crash = round(np.mean([1 if x==-100 else 0 for x in episode_outcomes[-101:-1]]), 2)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("mean 100 episode succ", mean_100ep_succ)
                logger.record_tabular("mean 100 episode crash", mean_100ep_crash)
                logger.dump_tabular()

            if checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0 and (saved_mean_reward is None or mean_100ep_reward > saved_mean_reward):
                if print_freq is not None:
                    print('Saving model due to mean reward increase:')
                    print(saved_mean_reward, mean_100ep_reward)
                U.save_state(model_file)
                model_saved = True
                saved_mean_reward = mean_100ep_reward

        if model_saved:
            U.load_state(model_file)

    reward_data = {
      'rewards': episode_rewards,
      'outcomes': episode_outcomes
    }
          
    return ActWrapper(act, act_params), reward_data

In [None]:
def make_co_policy(
  env, scope=None, pilot_tol=0, pilot_is_human=False,
  n_eps=n_training_episodes, copilot_scope=None,
  copilot_q_func=None, build_goal_decoder=None,
  reuse=False, **extras):
  """Train a copilot with `co_dqn_learn` and return it with its scope and Q-function.

  Args:
    env: environment handed to `co_dqn_learn`.
    scope: variable-scope name. Ignored when `copilot_scope` is given; when
      both are None a fresh UUID string is generated.
    pilot_tol: forwarded to `co_dqn_learn`.
    pilot_is_human: forwarded to `co_dqn_learn`.
    n_eps: number of training episodes; the timestep budget passed on is
      `max_ep_len * n_eps`.
    copilot_scope: explicit scope name; takes precedence over `scope`.
    copilot_q_func: Q-function used together with `copilot_scope`. If it is
      None, a new one is built with `make_q_func()`.
    build_goal_decoder: only its presence is inspected here; a non-None value
      sets `using_supervised_goal_decoder=True`.
    reuse: forwarded to `co_dqn_learn`.
    **extras: accepted and ignored, so the same config dict can be unpacked
      into both `make_co_env` and this function.

  Returns:
    A pair `((scope, q_func), learn_result)` where `learn_result` is whatever
    `co_dqn_learn` returns (callers in this notebook unpack it as
    `(policy, reward_data)`).
  """
  if copilot_scope is not None:
    scope = copilot_scope
  elif scope is None:
    scope = str(uuid.uuid4())

  # A caller-supplied Q-function is only honoured together with an explicit
  # copilot scope. Previously a scope given without a Q-function forwarded
  # q_func=None to the learner; fall back to a fresh Q-function instead.
  if copilot_scope is not None and copilot_q_func is not None:
    q_func = copilot_q_func
  else:
    q_func = make_q_func()

  return (scope, q_func), co_dqn_learn(
    env,
    scope=scope,
    q_func=q_func,
    max_timesteps=max_ep_len*n_eps,
    pilot_tol=pilot_tol,
    pilot_is_human=pilot_is_human,
    reuse=reuse,
    using_supervised_goal_decoder=(build_goal_decoder is not None),
    **copilot_dqn_learn_kwargs
  )

In [None]:
def str_of_config(pilot_tol, pilot_type, embedding_type, using_lander_reward_shaping):
  """Render an experiment configuration as a dict-literal-style string.

  The result is used as a human-readable key for a configuration; the pilot
  type and embedding type are quoted, the tolerance and the reward-shaping
  flag are inserted via their `str()` form.
  """
  template = "{'pilot_type': '%s', 'pilot_tol': %s, 'embedding_type': '%s', 'using_lander_reward_shaping': %s}"
  shaping_text = str(using_lander_reward_shaping)
  fields = (pilot_type, pilot_tol, embedding_type, shaping_text)
  return template % fields

train and evaluate copilot

In [None]:
# Number of times copilot training is repeated for each configuration.
n_reps = 10

In [None]:
# Pilot ids to build configurations for. Each id is resolved to an object
# named `<id>_pilot_policy`, which is expected to be defined elsewhere in the
# notebook.
pilot_ids = ['sensor']
# NOTE(review): eval() is applied only to the hard-coded ids above; never feed
# it externally supplied strings.
pilot_policies = [eval('%s_pilot_policy' % pilot_name) for pilot_name in pilot_ids]
# Any value other than 'rawaction' makes the config-building cell construct a
# goal decoder for each pilot.
embedding_type = 'rawaction'
using_lander_reward_shaping = True
# Pilot tolerance value(s) to build one configuration each for.
pilot_tols = [0]

In [None]:
# Build one (name, kwargs) entry for every (pilot, tolerance) combination.
configs = []
for pilot_id, pilot_policy in zip(pilot_ids, pilot_policies):
  # Raw-action embeddings need no goal decoder; every other embedding type
  # gets a decoder builder made for this pilot.
  build_goal_decoder = (
    None if embedding_type == 'rawaction'
    else build_build_goal_decoder(pilot_id))
  for pilot_tol in pilot_tols:
    entry_name = str_of_config(
      pilot_tol, pilot_id, embedding_type, using_lander_reward_shaping)
    entry_kwargs = dict(
      pilot_tol=pilot_tol,
      build_goal_decoder=build_goal_decoder,
      pilot_policy=pilot_policy,
      using_lander_reward_shaping=using_lander_reward_shaping,
      reuse=False)
    configs.append((entry_name, entry_kwargs))

In [None]:
# Maps config name -> {reward_data key -> list with one entry per training repetition}.
reward_logs = {}

In [None]:
# For each configuration, train `n_reps` copilots in a shared environment and
# append every repetition's reward data to `reward_logs`.
for config_name, config_kwargs in configs:
  print(config_name)
  config_log = reward_logs[config_name] = defaultdict(list)
  co_env = make_co_env(**config_kwargs)
  for rep in range(n_reps):
    scope_and_q_func, policy_and_rewards = make_co_policy(co_env, **config_kwargs)
    copilot_scope, copilot_q_func = scope_and_q_func
    raw_copilot_policy, reward_data = policy_and_rewards
    for metric, values in reward_data.items():
      config_log[metric].append(values)

In [None]:
# File name (inside `data_dir`) that the pickled reward logs are written to.
reward_log_file = 'reward_logs.pkl'

In [None]:
# Persist the collected reward logs with the highest available pickle protocol.
reward_log_path = os.path.join(data_dir, reward_log_file)
with open(reward_log_path, 'wb') as log_fh:
  pickle.dump(reward_logs, log_fh, pickle.HIGHEST_PROTOCOL)

Train and test on different pilots

In [None]:
# Pilot id -> the `pilot_tol` value used when training and evaluating a
# copilot with that pilot.
pilot_tol_of_id = {
  'noop': 0,
  'laggy': 0.7,
  'noisy': 0.4,
  'sensor': 0
}

In [None]:
# Ids of all pilots that have a tolerance configured.
training_pilot_ids = list(pilot_tol_of_id)

In [None]:
# Training pilot id -> (copilot scope, raw copilot policy); filled in by the
# pretraining loop and read by make_copilot_policy.
copilot_of_training_pilot = {}

In [None]:
def copilot_path_of_training_pilot(training_pilot_id):
  """Return the path under `data_dir` where this pilot's pretrained copilot is stored.

  The pilot id is interpolated into the file name so that each training pilot
  gets its own file. Previously the '%s' placeholder was never filled in, so
  every pilot shared (and overwrote) the same literal path.
  """
  return os.path.join(data_dir, 'pretrained_%s_copilot' % training_pilot_id)


def copilot_scope_of_training_pilot(training_pilot_id):
  """Return the variable-scope name used for this pilot's pretrained copilot."""
  return 'pretrained_%s_copilot_scope' % training_pilot_id

In [None]:
# Train one copilot per pilot type, remember it in memory, and save its
# variables to disk under the pilot-specific scope.
for training_pilot_id in pilot_tol_of_id:
  pilot_tol = pilot_tol_of_id[training_pilot_id]
  pilot_policy = eval('%s_pilot_policy' % training_pilot_id)
  copilot_scope = copilot_scope_of_training_pilot(training_pilot_id)
  config_kwargs = dict(
    pilot_policy=pilot_policy,
    pilot_tol=pilot_tol,
    copilot_scope=copilot_scope,
    copilot_q_func=make_q_func())
  co_env = make_co_env(**config_kwargs)
  scope_and_q_func, policy_and_rewards = make_co_policy(co_env, **config_kwargs)
  copilot_scope, copilot_q_func = scope_and_q_func
  raw_copilot_policy, reward_data = policy_and_rewards

  copilot_of_training_pilot[training_pilot_id] = (copilot_scope, raw_copilot_policy)
  copilot_path = copilot_path_of_training_pilot(training_pilot_id)
  save_tf_vars(copilot_scope, copilot_path)

In [None]:
def make_copilot_policy(training_pilot_id, eval_pilot_policy, pilot_tol):
  """Build an observation -> action function pairing a pretrained copilot with a pilot.

  Args:
    training_pilot_id: key into `copilot_of_training_pilot` selecting the
      (scope, raw policy) pair to use.
    eval_pilot_policy: callable queried for the pilot's action on the first
      `n_obs_dim` entries of the masked observation.
    pilot_tol: forwarded to the raw copilot policy's `_act`.

  Returns:
    A function taking a batched observation (only row 0 is used) and returning
    element `[0][0]` of the raw copilot policy's `_act` output.
  """
  copilot_scope, raw_copilot_policy = copilot_of_training_pilot[training_pilot_id]

  def copilot_policy(obs):
    with tf.variable_scope(copilot_scope, reuse=None):
      first_obs = mask_helipad(obs)[0]
      pilot_action = eval_pilot_policy(first_obs[None, :n_obs_dim])

      # A bare observation gets the pilot's one-hot action appended; anything
      # of a different size is fed through unchanged.
      is_bare_obs = first_obs.size == n_obs_dim
      feed_obs = (
        np.concatenate((first_obs, onehot_encode(pilot_action)))
        if is_bare_obs else first_obs)

      act_out = raw_copilot_policy._act(
        feed_obs[None, :],
        pilot_tol=pilot_tol,
        pilot_action=pilot_action)
      return act_out[0][0]

  return copilot_policy

In [None]:
# Number of evaluation episodes run per (training pilot, evaluation pilot) pair.
n_eval_eps = 100

In [None]:
# Maps (training pilot id, evaluation pilot id) -> list of per-episode results
# (the first two items returned by run_ep).
cross_evals = {}

In [None]:
# Evaluate every pretrained copilot against every pilot type.
for training_pilot_id in pilot_tol_of_id:
  training_pilot_tol = pilot_tol_of_id[training_pilot_id]
  # load pretrained copilot
  # NOTE(review): make_co_policy is re-run with reuse=True before the saved
  # variables are restored, presumably to recreate the copilot under its scope;
  # confirm against co_dqn_learn.
  copilot_scope = copilot_scope_of_training_pilot(training_pilot_id)
  training_pilot_policy = eval('%s_pilot_policy' % training_pilot_id)
  config_kwargs = dict(
    pilot_policy=training_pilot_policy,
    pilot_tol=training_pilot_tol,
    copilot_scope=copilot_scope,
    copilot_q_func=make_q_func(),
    reuse=True)
  co_env = make_co_env(**config_kwargs)
  make_co_policy(co_env, **config_kwargs)
  copilot_path = copilot_path_of_training_pilot(training_pilot_id)
  load_tf_vars(copilot_scope, copilot_path)
  # evaluate copilot with different pilots
  for eval_pilot_id in pilot_tol_of_id:
    eval_pilot_tol = pilot_tol_of_id[eval_pilot_id]
    eval_pilot_policy = eval('%s_pilot_policy' % eval_pilot_id)
    copilot_policy = make_copilot_policy(
      training_pilot_id, eval_pilot_policy, eval_pilot_tol)
    co_env = make_co_env(pilot_policy=eval_pilot_policy)
    # Keep only the first two items of each episode's result.
    episode_results = [
      run_ep(copilot_policy, co_env, render=False)[:2]
      for _ in range(n_eval_eps)
    ]
    cross_evals[(training_pilot_id, eval_pilot_id)] = episode_results

In [None]:
# Persist the cross-evaluation results with the highest available pickle protocol.
cross_evals_path = os.path.join(data_dir, 'cross_evals.pkl')
with open(cross_evals_path, 'wb') as evals_fh:
  pickle.dump(cross_evals, evals_fh, pickle.HIGHEST_PROTOCOL)