In [1]:
import numpy as np
import scipy.stats
import random
import tensorflow as tf
import time
import os
import gym
from gym import envs, scoreboard
from gym.spaces import Discrete, Box
import tempfile
import sys
from PGActorContinuous import PGActorContinuous
%matplotlib notebook
import matplotlib.pyplot as plt
from IPython import display

In [2]:
# ==========================
#   Training Parameters
# ==========================
# Maximum number of training episodes (outer loop bound of train())
MAX_EPISODES = 5000
# Maximum number of environment steps per training episode
MAX_EP_STEPS = 1000
# Learning rates handed to PGActorContinuous for the actor (policy)
# network and the critic (value) network, respectively
ACTOR_LEARNING_RATE = 0.01
CRITIC_LEARNING_RATE = 0.03
# Discount factor applied to future rewards when computing returns
GAMMA = 0.97

# ===========================
#   Utility Parameters
# ===========================
# Render gym env during training
# NOTE(review): not referenced by any cell shown in this notebook
RENDER_ENV = True
# Use Gym Monitor
# NOTE(review): not referenced by any cell shown in this notebook
GYM_MONITOR_EN = True
# Gym environment id
ENV_NAME = 'Pendulum-v0'
# Leftover (disabled) output-directory settings:
# # Directory for storing gym results
# MONITOR_DIR = './results/gym_ddpg'
# # Directory for storing tensorboard summary results
# SUMMARY_DIR = './results/tf_ddpg'
# Seed intended for the random number generators
RANDOM_SEED = 1337
# Number of episodes collected between two network updates
EPS_PER_BATCH =20
# Scratch (disabled): visual check of truncated-normal sampling
# r = scipy.stats.truncnorm.rvs(a=-2.-mean, b=2.-mean, loc = mean, size=100000)
# plt.hist(np.concatenate([r, [0]], 0))[0].shape

In [3]:

def _discounted_returns(rewards, gamma):
    """Compute the discounted return-to-go for every step of one episode.

    Args:
        rewards: sequence of per-step rewards (scalars or size-1 arrays),
            in the order they were received.
        gamma: discount factor.

    Returns:
        A list with one float array of shape (1,) per step, where entry
        ``k`` equals ``sum_j rewards[j] * gamma ** (j - k)`` for ``j >= k``.
    """
    returns = [None] * len(rewards)
    running = 0.
    # Walk the episode backwards so each return reuses the next one:
    # G_k = r_k + gamma * G_{k+1}.  This is linear in the episode length.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        # Adding to a length-1 array keeps every entry a (1,) float array
        # regardless of whether the reward is a Python or a numpy scalar.
        returns[t] = np.zeros(1) + running
    return returns


def train(sess, env, actor, value_updates_per_batch=50):
    """Train `actor` with batched policy-gradient updates and plot progress.

    Runs MAX_EPISODES episodes of at most MAX_EP_STEPS steps each.  Actions
    are drawn from a normal distribution (mean/std from ``actor.predict``)
    truncated to ``[-actor.action_bound[0], actor.action_bound[0]]``.
    After every EPS_PER_BATCH completed episodes the policy is updated once
    on the collected (state, action, advantage) samples and the value
    function is fitted ``value_updates_per_batch`` times on the discounted
    returns; the sample buffers are then emptied.  The per-episode
    undiscounted reward is drawn on a live matplotlib figure.

    Args:
        sess: TensorFlow session; not used directly here, kept so existing
            callers keep working.
        env: environment exposing ``reset()`` and
            ``step(action) -> (state, reward, done, info)``.
        actor: object exposing ``predict``, ``predict_value``, ``train``,
            ``train_value`` and the attributes ``action_bound``, ``s_dim``
            and ``a_dim``.
        value_updates_per_batch: number of ``actor.train_value`` calls made
            per batch (defaults to 50).

    Returns:
        None.
    """
    fig, ax = plt.subplots(1, 1)
    ax.set_xlabel('Episode no.')
    ax.set_ylabel('Episode reward')
    ax.set_xlim(0, MAX_EPISODES)
    ax.set_ylim(-9000, 0)
    ep_vals = []

    # Same bound is used for sampling and for the sanity check below.
    bound = actor.action_bound[0]

    # Samples accumulated since the last network update.
    batch_states, batch_actions = [], []
    batch_advantages, batch_returns = [], []

    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0
        rewards = []
        values = []

        for _ in range(MAX_EP_STEPS):
            state_row = np.reshape(s, (1, -1))
            means_wrapped, stds = actor.predict(state_row)
            means = means_wrapped[0]
            # truncnorm takes its clip points in units of `scale` around
            # `loc`, hence the (limit - mean) / std standardisation.
            a = np.asarray([
                scipy.stats.truncnorm.rvs(a=(-bound - mean) / std,
                                          b=(bound - mean) / std,
                                          loc=mean,
                                          scale=std)
                for mean, std in zip(means, stds)])
            if not (np.abs(a) <= bound).all():
                print("!!!Action outside bounds!!!!")
                print("a = %s" % (a,))
                print("means = %s" % (means,))
                print("stds = %s" % (stds,))

            # Critic estimate for the state the action is taken from; it is
            # the baseline subtracted from the return below.
            values.append(actor.predict_value(state_row)[0])

            # get new state and reward
            s2, r, is_done, info = env.step(a)

            # add step to batch
            batch_states.append(np.reshape(s, (actor.s_dim,)))
            batch_actions.append(np.reshape(a, (actor.a_dim,)))
            rewards.append(r)
            ep_reward += r
            s = s2

            if is_done:
                break

        returns = _discounted_returns(rewards, GAMMA)
        advantages = np.asarray(returns) - np.asarray(values)
        batch_advantages.extend(advantages.tolist())
        batch_returns.extend(returns)

        # Refresh the live reward curve.
        ep_vals.append(ep_reward)
        if ax.lines:
            for line in ax.lines:
                line.set_xdata(range(i + 1))
                line.set_ydata(ep_vals)
        else:
            ax.plot(range(i + 1), ep_vals)
        time.sleep(0.001)
        fig.canvas.draw()

        # Update once exactly EPS_PER_BATCH episodes have been collected
        # (i is zero-based, so test the count i + 1, not the index).
        if (i + 1) % EPS_PER_BATCH == 0:
            actor.train(batch_states, batch_actions, batch_advantages)
            for _ in range(value_updates_per_batch):
                actor.train_value(batch_states, batch_returns)
            # Start fresh lists instead of clearing in place so the lists
            # handed to the actor are never mutated afterwards.
            batch_states, batch_actions = [], []
            batch_advantages, batch_returns = [], []

In [None]:
# Build the session, environment and actor, then start training.

# Seed the random number generators so a Restart & Run All is repeatable.
# numpy's global RNG is also what scipy.stats uses when sampling actions;
# the TensorFlow seed must be set before the actor builds its graph.
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)

sess = tf.Session()

# Use the environment id from the configuration cell instead of a literal.
env = gym.make(ENV_NAME)
env.seed(RANDOM_SEED)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
print("state and actions dims: %s %s" % (state_dim, action_dim))

# make sure action bound is symmetric (can change in future,
# but need to remember to scale actor output appropriately).
# `.all()` is required: the comparison yields an array, and asserting on a
# multi-element array raises "truth value ... is ambiguous".
assert (env.action_space.high == -env.action_space.low).all()

action_bound = env.action_space.high
print("Action bound: %s" % (action_bound,))
actor = PGActorContinuous(sess, state_dim, action_dim, action_bound,
                          ACTOR_LEARNING_RATE, CRITIC_LEARNING_RATE)
# Initialize our Tensorflow variables
# NOTE(review): later TensorFlow releases replace this call with
# tf.global_variables_initializer(); kept for the version used here.
sess.run(tf.initialize_all_variables())


train(sess, env, actor)

[2016-11-25 16:06:09,953] Making new env: Pendulum-v0


state and actions dims: 3 1
Action bound: [ 2.]
There are 7 actor params
There are 6 value params


<IPython.core.display.Javascript object>

In [5]:
def run(sess, env, actor, n_eps, max_steps=None, render=True, step_delay=0.02):
    """Roll out the actor's current policy and report the reward obtained.

    Each episode samples actions from a normal distribution (mean/std from
    ``actor.predict``) truncated to
    ``[-actor.action_bound[0], actor.action_bound[0]]`` and ends when the
    environment signals completion or ``max_steps`` steps have been taken.
    A one-line summary (average undiscounted return and average critic
    estimate of the start state) is printed at the end.

    Args:
        sess: TensorFlow session; not used directly here, kept so existing
            callers keep working.
        env: environment exposing ``reset()``, ``render()`` and
            ``step(action) -> (state, reward, done, info)``.
        actor: object exposing ``predict``, ``predict_value`` and
            ``action_bound``.
        n_eps: number of episodes to run.
        max_steps: per-episode step cap.  ``None`` keeps the previous
            behaviour: ``min(MAX_EP_STEPS, 200)``.
        render: when true, render every step (after sleeping
            ``step_delay`` seconds so the animation is watchable).
        step_delay: pause in seconds before each rendered frame.

    Returns:
        The undiscounted reward summed over all episodes (0 when
        ``n_eps <= 0``).

    Raises:
        Exception: whatever the action sampling raises; the offending
            ``stds`` are printed first.
    """
    nep_reward = 0
    nep_est_reward = 0
    if n_eps <= 0:
        # Nothing to run; also avoids dividing by zero in the summary.
        return nep_reward
    if max_steps is None:
        # 200 was the hard-coded evaluation horizon of the original loop.
        max_steps = min(MAX_EP_STEPS, 200)
    bound = actor.action_bound[0]

    for i in range(n_eps):
        s = env.reset()
        # Critic's estimate for the start state, reported next to the
        # return actually obtained.
        nep_est_reward += actor.predict_value(np.reshape(s, (1, -1)))[0]
        ep_reward = 0
        for j in range(max_steps):
            if render:
                time.sleep(step_delay)
                env.render()
            means_wrapped, stds = actor.predict(np.reshape(s, (1, -1)))
            means = means_wrapped[0]
            try:
                # truncnorm takes its clip points in units of `scale`
                # around `loc`, hence the (limit - mean) / std form.
                a = np.asarray([
                    scipy.stats.truncnorm.rvs(a=(-bound - mean) / std,
                                              b=(bound - mean) / std,
                                              loc=mean,
                                              scale=std)
                    for mean, std in zip(means, stds)])
            except Exception:
                # Show the suspicious stds, then re-raise: continuing would
                # act with an undefined or stale action.
                print("stds = %s" % (stds,))
                raise
            if not (np.abs(a) <= bound).all():
                print("!!!Action outside bounds!!!!")
                print("a = %s" % (a,))
                print("means = %s" % (means,))
                print("stds = %s" % (stds,))
            # get new state and reward
            s2, r, is_done, info = env.step(a)

            s = s2
            ep_reward += r

            if is_done:
                break

        # Count the episode however it ended (done flag or step cap).
        nep_reward += ep_reward

    print("| Avg undisc return ( %s eps): %s  | Avg est value ( %s eps): %s"
          % (n_eps, int(nep_reward / n_eps), n_eps, nep_est_reward / n_eps))
    print("")
    return nep_reward
        

In [6]:
# Roll out the current policy for 5 rendered episodes; run() prints a
# summary line and returns the total undiscounted reward.
run(sess, env, actor, 5)

KeyboardInterrupt: 