In [1]:
import numpy as np
import tensorflow as tf
import gym
import time
import spinup.algos.ppo.core as core
from spinup.utils.logx import EpochLogger
import spinup.algos.ppo.ppo as ppo
from tensorflow.train import AdamOptimizer
from spinup.utils.logx import restore_tf_graph
from os import path

# Training configuration. The alternative setup is
# train_mode = 'sense' together with hidden_sizes = (32, 32).
train_mode = 'locomotion'
hidden_sizes = (256, 256)

env = gym.make('zrc_learn:zrc-v0', train_mode=train_mode)

# Show the observation space, then the action space.
for space in (env.observation_space, env.action_space):
    print(space)

env.reset()

# Graph inputs: observation/action placeholders are created first, followed by
# the advantage, return and old log-probability placeholders.
x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

# Actor-critic network built on top of the placeholders.
ac_kwargs = dict(action_space=env.action_space, hidden_sizes=hidden_sizes)
pi, logp, logp_pi, v = core.mlp_actor_critic(x_ph, a_ph, **ac_kwargs)


  TensorFlow's `tf-nightly` package will soon be updated to TensorFlow 2.0.

  Please upgrade your code to TensorFlow 2.0:
    * https://www.tensorflow.org/beta/guide/migration_guide

  Or install the latest stable TensorFlow 1.X release:
    * `pip install -U "tensorflow==1.*"`

  Otherwise your code may be broken by the change.

  

Box(35,)
Box(3,)

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [2]:
# Per-step shapes taken from the environment's spaces.
obs_dim, act_dim = env.observation_space.shape, env.action_space.shape

# --- PPO hyperparameters ---
steps_per_epoch = 12000
epochs = 1000
gamma = 0.99
lam = 0.97
clip_ratio = 0.2
pi_lr = 3e-4
vf_lr = 1e-3
train_pi_iters = 80
train_v_iters = 80
max_ep_len = 12000
target_kl = 0.02  # alternative: 0.01
save_freq = 10

# The logger writes under a directory named after the training mode.
log = EpochLogger(output_dir=train_mode)

# Experience buffer holding one epoch (steps_per_epoch steps) of interaction.
buf = ppo.PPOBuffer(obs_dim, act_dim, steps_per_epoch, gamma, lam)

# Report the variable counts of the 'pi' and 'v' scopes.
var_counts = tuple(map(core.count_vars, ('pi', 'v')))
log.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

# PPO clipped objective. The ops are created in the same order as a single
# nested expression would create them, so auto-generated op names are unchanged.
ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
positive_adv = adv_ph > 0
clip_hi = (1 + clip_ratio) * adv_ph
clip_lo = (1 - clip_ratio) * adv_ph
min_adv = tf.where(positive_adv, clip_hi, clip_lo)
surrogate = ratio * adv_ph
pi_loss = -tf.reduce_mean(tf.minimum(surrogate, min_adv))
v_loss = tf.reduce_mean((ret_ph - v)**2)

# Separate Adam optimizers for the policy and the value function.
pi_optimizer = AdamOptimizer(learning_rate=pi_lr)
train_pi = pi_optimizer.minimize(pi_loss)
v_optimizer = AdamOptimizer(learning_rate=vf_lr)
train_v = v_optimizer.minimize(v_loss)

[32;1mLogging data to locomotion\progress.txt[0m
[32;1m
Number of parameters: 	 pi: 75782, 	 v: 75265
[0m
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [3]:
# Start a session and initialise every variable created so far.
sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)

# Register the tensors the logger should export when saving the model.
saver_inputs = {'x': x_ph}
saver_outputs = {'pi': pi, 'v': v}
log.setup_tf_saver(sess, inputs=saver_inputs, outputs=saver_outputs)
print(log.tf_saver_info)

{'inputs': {'x': 'Placeholder:0'}, 'outputs': {'pi': 'pi/add:0', 'v': 'v/Squeeze:0'}}


In [4]:
def update():
    """Run one PPO update on the batch currently held in ``buf``.

    Pairs each placeholder in ``all_phs`` with the corresponding array returned
    by ``buf.get()``, takes up to ``train_pi_iters`` policy steps (stopping
    early once the sampled KL estimate exceeds ``1.5 * target_kl``), then
    ``train_v_iters`` value-function steps, and finally stores before/after
    diagnostics in ``log``.

    Relies on notebook-level globals (``sess``, ``buf``, ``log``, ``all_phs``,
    the loss/diagnostic tensors and the train ops). They only need to exist by
    the time this function is *called*, not when it is defined.
    """
    # Feed dict pairing every placeholder with its batch from the buffer.
    inputs = dict(zip(all_phs, buf.get()))
    pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

    # Policy steps. `stop_iter` is the index of the last step taken; it is
    # pre-set to 0 so that logging still works when `train_pi_iters` is 0
    # (the loop variable would otherwise be unbound).
    stop_iter = 0
    for stop_iter in range(train_pi_iters):
        _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
        if kl > 1.5 * target_kl:
            log.log('Early stopping at step %d due to reaching max kl.'%stop_iter)
            break
    log.store(StopIter=stop_iter)

    # Value-function steps on the same batch.
    for _ in range(train_v_iters):
        sess.run(train_v, feed_dict=inputs)

    # Re-evaluate the losses after training so the change can be logged.
    pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
    log.store(LossPi=pi_l_old, LossV=v_l_old,
              KL=kl, Entropy=ent, ClipFrac=cf,
              DeltaLossPi=(pi_l_new - pi_l_old),
              DeltaLossV=(v_l_new - v_l_old))

In [None]:
# Tensors fetched on every environment step.
get_action_ops = [pi, v, logp_pi]
# Placeholder order matters: update() zips this list with the arrays returned
# by buf.get() to build its feed dict.
all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

# Info (useful to watch during learning)
# These diagnostics are read as globals by update(), so this cell must have run
# before update() is called for the first time.
approx_kl = tf.reduce_mean(logp_old_ph - logp)      # a sample estimate for KL-divergence, easy to compute
approx_ent = tf.reduce_mean(-logp)                  # a sample estimate for entropy, also easy to compute
# Fraction of samples whose probability ratio falls outside [1-clip_ratio, 1+clip_ratio].
clipped = tf.logical_or(ratio > (1+clip_ratio), ratio < (1-clip_ratio))
clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

# Disabled: loading of a previously saved model from 'log/simple_save'.
#if (path.exists('log/simple_save/saved_model.pb')):
#        tf.saved_model.loader.load(
#                sess,
#                [tf.saved_model.tag_constants.SERVING],
#                'log/simple_save'
#            )

start_time = time.time()
# Initial observation plus zeroed reward / done flag / episode return / episode length.
o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

env.render()

# Main loop: collect experience in env and update/log each epoch
for epoch in range(epochs):
    for t in range(steps_per_epoch):
        # Query the policy and value function for the current observation (batch of one).
        a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1,-1)})

        # save and log
        # NOTE: `r` here is the reward returned by the *previous* env.step call
        # (0 right after a reset), because the step for action `a` happens below.
        buf.store(o, a, r, v_t, logp_t)
        log.store(VVals=v_t)

        # `a` carries a leading batch dimension of one; pass the single action to the env.
        o, r, d, _ = env.step(a[0])
        ep_ret += r
        ep_len += 1

        # An episode ends when the env reports done or the length cap is reached.
        terminal = d or (ep_len == max_ep_len)
        if terminal or (t==steps_per_epoch-1):
            if not(terminal):
                print('Warning: trajectory cut off by epoch at %d steps.'%ep_len)
            # if trajectory didn't reach terminal state, bootstrap value target
            # On a true terminal (`d`) the final reward has not been stored yet
            # (see buf.store above), so it is handed over as `last_val` instead.
            last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1,-1)})
            buf.finish_path(last_val)
            if terminal:
                # only save EpRet / EpLen if trajectory finished
                log.store(EpRet=ep_ret, EpLen=ep_len)
            # Start a fresh episode with zeroed bookkeeping.
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Save model
    # Runs before update(), so the saved state is the one that collected this epoch's data.
    if (epoch % save_freq == 0) or (epoch == epochs-1):
        log.save_state({'env': env}, None)

    # Perform PPO update!
    update()

    # Log info about epoch
    # The keys below must match those passed to log.store() here and in update().
    log.log_tabular('Epoch', epoch)
    log.log_tabular('EpRet', with_min_and_max=True)
    log.log_tabular('EpLen', average_only=True)
    log.log_tabular('VVals', with_min_and_max=True)
    log.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
    log.log_tabular('LossPi', average_only=True)
    log.log_tabular('LossV', average_only=True)
    log.log_tabular('DeltaLossPi', average_only=True)
    log.log_tabular('DeltaLossV', average_only=True)
    log.log_tabular('Entropy', average_only=True)
    log.log_tabular('KL', average_only=True)
    log.log_tabular('ClipFrac', average_only=True)
    log.log_tabular('StopIter', average_only=True)
    log.log_tabular('Time', time.time()-start_time)
    log.dump_tabular()

Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.simple_save.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: locomotion\simple_save\saved_model.pb
---------------------------------------
|             Epoch |               0 |
|      AverageEpRet |            11.5 |
|          StdEpRet |             1.1 |
|          MaxEpRet |            12.4 |
|          MinEpRet |            9.95 |
|             EpLen |         3.6e+03 |
|      AverageVVals |               0 |
|          StdVVals |               0 |
|          MaxVVals |               0 |
|          MinVVals |               0 |
| TotalEnvInteracts |         1.2e+04 |
|            LossPi |     

In [None]:
#from spinup.utils.test_policy import load_policy, run_policy

#x, ga = load_policy('log')

In [None]:
import numpy as np
import tensorflow as tf
import gym
import time
import spinup.algos.ppo.core as core
from spinup.utils.logx import EpochLogger
import spinup.algos.ppo.ppo as ppo
from tensorflow.train import AdamOptimizer
from spinup.utils.logx import restore_tf_graph
from os import path

# Restore the graph saved by the training run into a new session.
# `path` is os.path (imported above as `from os import path`); the previous
# `osp.join` referred to a name that is never defined in this notebook.
sess = tf.Session()
model = restore_tf_graph(sess, path.join('locomotion', 'simple_save'))