In [1]:
import numpy as np
import tensorflow as tf
import gym
import time
import spinup.algos.ppo.core as core
from spinup.utils.logx import EpochLogger
import spinup.algos.ppo.ppo as ppo
from tensorflow.train import AdamOptimizer

env = gym.make('zrc_learn:zrc-v0')

x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)
ac_kwargs = {'action_space': env.action_space}
pi, logp, logp_pi, v = core.mlp_actor_critic(x_ph, a_ph, **ac_kwargs)


  TensorFlow's `tf-nightly` package will soon be updated to TensorFlow 2.0.

  Please upgrade your code to TensorFlow 2.0:
    * https://www.tensorflow.org/beta/guide/migration_guide

  Or install the latest stable TensorFlow 1.X release:
    * `pip install -U "tensorflow==1.*"`

  Otherwise your code may be broken by the change.

  


Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [2]:
obs_dim = env.observation_space.shape
act_dim = env.action_space.shape
steps_per_epoch = 12000
gamma=0.99
lam=0.97
buf = ppo.PPOBuffer(obs_dim, act_dim, steps_per_epoch, gamma, lam)

# PPO objectives
clip_ratio=0.2
ratio = tf.exp(logp - logp_old_ph)          # pi(a|s) / pi_old(a|s)
min_adv = tf.where(adv_ph>0, (1+clip_ratio)*adv_ph, (1-clip_ratio)*adv_ph)
pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
v_loss = tf.reduce_mean((ret_ph - v)**2)

# Optimizers
pi_lr=3e-4
vf_lr=1e-3
train_pi = AdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
train_v = AdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [3]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

log = EpochLogger(output_dir='log')
log.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})
print(log.tf_saver_info)

log._tf_simple_save()

[32;1mLogging data to log\progress.txt[0m
{'inputs': {'x': 'Placeholder:0'}, 'outputs': {'pi': 'pi/add:0', 'v': 'v/Squeeze:0'}}
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.simple_save.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: log\simple_save\saved_model.pb


In [4]:
def update():
    inputs = {k:v for k,v in zip(all_phs, buf.get())}
    pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

    # Training
    for i in range(train_pi_iters):
        _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
        if kl > 1.5 * target_kl:
            log.log('Early stopping at step %d due to reaching max kl.'%i)
            break
    log.store(StopIter=i)
    for _ in range(train_v_iters):
        sess.run(train_v, feed_dict=inputs)

    # Log changes from update
    pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
    log.store(LossPi=pi_l_old, LossV=v_l_old, 
                 KL=kl, Entropy=ent, ClipFrac=cf,
                 DeltaLossPi=(pi_l_new - pi_l_old),
                 DeltaLossV=(v_l_new - v_l_old))

In [5]:
get_action_ops = [pi, v, logp_pi]
all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

# Info (useful to watch during learning)
approx_kl = tf.reduce_mean(logp_old_ph - logp)      # a sample estimate for KL-divergence, easy to compute
approx_ent = tf.reduce_mean(-logp)                  # a sample estimate for entropy, also easy to compute
clipped = tf.logical_or(ratio > (1+clip_ratio), ratio < (1-clip_ratio))
clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

epochs = 10
max_ep_len=1200
save_freq=10
start_time = time.time()
o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

train_pi_iters=80
train_v_iters=80
target_kl=0.01

env.render()

# Main loop: collect experience in env and update/log each epoch
for epoch in range(epochs):
    for t in range(steps_per_epoch):
        a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1,-1)})

        # save and log
        buf.store(o, a, r, v_t, logp_t)
        log.store(VVals=v_t)

        o, r, d, _ = env.step(a[0])
        ep_ret += r
        ep_len += 1

        terminal = d or (ep_len == max_ep_len)
        if terminal or (t==steps_per_epoch-1):
            if not(terminal):
                print('Warning: trajectory cut off by epoch at %d steps.'%ep_len)
            # if trajectory didn't reach terminal state, bootstrap value target
            last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1,-1)})
            buf.finish_path(last_val)
            if terminal:
                # only save EpRet / EpLen if trajectory finished
                log.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Save model
    if (epoch % save_freq == 0) or (epoch == epochs-1):
        log.save_state({'env': env}, None)

    # Perform PPO update!
    update()

    # Log info about epoch
    log.log_tabular('Epoch', epoch)
    log.log_tabular('EpRet', with_min_and_max=True)
    log.log_tabular('EpLen', average_only=True)
    log.log_tabular('VVals', with_min_and_max=True)
    log.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
    log.log_tabular('LossPi', average_only=True)
    log.log_tabular('LossV', average_only=True)
    log.log_tabular('DeltaLossPi', average_only=True)
    log.log_tabular('DeltaLossV', average_only=True)
    log.log_tabular('Entropy', average_only=True)
    log.log_tabular('KL', average_only=True)
    log.log_tabular('ClipFrac', average_only=True)
    log.log_tabular('StopIter', average_only=True)
    log.log_tabular('Time', time.time()-start_time)
    log.dump_tabular()

INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: log\simple_save\saved_model.pb
[32;1mEarly stopping at step 1 due to reaching max kl.[0m
---------------------------------------
|             Epoch |               0 |
|      AverageEpRet |       -3.19e+04 |
|          StdEpRet |        4.42e+03 |
|          MaxEpRet |       -2.63e+04 |
|          MinEpRet |       -4.07e+04 |
|             EpLen |         1.2e+03 |
|      AverageVVals |          -0.549 |
|          StdVVals |           0.158 |
|          MaxVVals |           0.136 |
|          MinVVals |          -0.724 |
| TotalEnvInteracts |         1.2e+04 |
|            LossPi |       -1.63e-07 |
|             LossV |        8.37e+06 |
|       DeltaLossPi |        -0.00488 |
|        DeltaLossV |       -8.06e+04 |
|           Entropy |            2.76 |
|                KL |         0.00597 |
|          ClipFrac |          0.0409 |
|          StopIter |              

INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: log\simple_save\saved_model.pb
---------------------------------------
|             Epoch |               9 |
|      AverageEpRet |            37.3 |
|          StdEpRet |            52.2 |
|          MaxEpRet |             162 |
|          MinEpRet |           -18.1 |
|             EpLen |        1.09e+03 |
|      AverageVVals |           -20.9 |
|          StdVVals |         0.00485 |
|          MaxVVals |           -20.9 |
|          MinVVals |             -21 |
| TotalEnvInteracts |         1.2e+05 |
|            LossPi |       -4.45e-08 |
|             LossV |             730 |
|       DeltaLossPi |        -0.00419 |
|        DeltaLossV |           -7.96 |
|           Entropy |            2.78 |
|                KL |         0.00732 |
|          ClipFrac |          0.0476 |
|          StopIter |              79 |
|              Time |             157 |
---------------

In [6]:
#from spinup.utils.test_policy import load_policy, run_policy

#x, ga = load_policy('log')

In [8]:
import tensorflow as tf

sess = tf.Session()
model = tf.saved_model.loader.load(
            sess,
            [tf.saved_model.tag_constants.SERVING],
            ''
        )
print(model)

INFO:tensorflow:Restoring parameters from variables\variables
meta_info_def {
  stripped_op_list {
    op {
      name: "Add"
      input_arg {
        name: "x"
        type_attr: "T"
      }
      input_arg {
        name: "y"
        type_attr: "T"
      }
      output_arg {
        name: "z"
        type_attr: "T"
      }
      attr {
        name: "T"
        type: "type"
        allowed_values {
          list {
            type: DT_BFLOAT16
            type: DT_HALF
            type: DT_FLOAT
            type: DT_DOUBLE
            type: DT_UINT8
            type: DT_INT8
            type: DT_INT16
            type: DT_INT32
            type: DT_INT64
            type: DT_COMPLEX64
            type: DT_COMPLEX128
            type: DT_STRING
          }
        }
      }
    }
    op {
      name: "AddV2"
      input_arg {
        name: "x"
        type_attr: "T"
      }
      input_arg {
        name: "y"
        type_attr: "T"
      }
      output_arg {
        name: "z"
      

AttributeError: module 'tensorflow' has no attribute 'run'

In [18]:
sess.run('Placeholder', feed_dict={'Placeholder:0':np.empty((32),dtype=np.float32)})

ValueError: Cannot feed value of shape (32,) for Tensor 'Placeholder:0', which has shape '(?, 32)'