In [2]:
import numpy as np
import scipy
import random
import tensorflow as tf
import time
import os
import gym
from gym import envs, scoreboard
from gym.spaces import Discrete, Box
import tempfile
import sys
from PGActorDiscrete import PGActorDiscrete

In [3]:

# ==========================
#   Training Parameters
# ==========================
# Maximum number of training episodes
MAX_EPISODES = 10000
# Maximum number of environment steps per episode
MAX_EP_STEPS = 1000
# Base learning rate for the Actor (policy) network
ACTOR_LEARNING_RATE = 0.03
# Base learning rate for the Critic (value) network
CRITIC_LEARNING_RATE = 0.1
# Discount factor applied to future rewards
GAMMA = 0.99

# ===========================
#   Utility Parameters
# ===========================
# Render gym env during training (calls env.render() on every step)
RENDER_ENV = True
# Use Gym Monitor (NOTE: not wired up anywhere in the cells shown here)
GYM_MONITOR_EN = True
# Gym environment. This notebook trains a discrete-action policy and the
# run below is on CartPole, so the name is kept in sync with that.
ENV_NAME = 'CartPole-v0'
# Directory for storing gym results (disabled)
# MONITOR_DIR = './results/gym_ddpg'
# Directory for storing tensorboard summary results (disabled)
# SUMMARY_DIR = './results/tf_ddpg'
# Seed intended for the random number generators (for reproducible runs)
RANDOM_SEED = 1337
# Minimum number of collected steps before a training update is run
MINIBATCH_SIZE = 200

In [4]:
def _discounted_advantages(rewards, baselines, gamma):
    """Return the per-step advantages ``G_t - V(s_t)`` for one episode.

    ``G_t`` is the discounted sum of the rewards from step ``t`` to the end
    of the episode.  It is built with a single backward pass
    (``G_t = r_t + gamma * G_{t+1}``) instead of re-touching every earlier
    step each time a new reward arrives.

    Args:
        rewards: sequence of rewards, one per step of the episode.
        baselines: sequence of value estimates for the state each step was
            taken from; same length as ``rewards``.
        gamma: discount factor.

    Returns:
        A list with one advantage per step, in step order.
    """
    advantages = [None] * len(rewards)
    running_return = 0.0
    for t in reversed(range(len(rewards))):
        running_return = rewards[t] + gamma * running_return
        advantages[t] = running_return - baselines[t]
    return advantages


def train(sess, env, actor):
    """Run the policy-gradient training loop on ``env``.

    Episodes are rolled out with actions sampled from ``actor.predict``.
    Transitions are buffered across episodes; once at least
    ``MINIBATCH_SIZE`` steps have been collected (checked at episode
    boundaries) the policy is updated via ``actor.train`` with the
    advantages and the value function via ``actor.train_value`` with
    one-step bootstrapped targets.  A progress line is printed after every
    100 completed episodes.

    Args:
        sess: TensorFlow session used to initialise the variables.
        env: gym-style environment (``reset``, ``step``, ``render``).
        actor: object exposing ``predict``, ``predict_value``, ``train``,
            ``train_value``, ``s_dim`` and ``a_dim``.
    """
    report_every = 100

    # Initialize our Tensorflow variables.  Newer TensorFlow releases rename
    # the initializer; use the new name when present, else the original one.
    init_op = (getattr(tf, 'global_variables_initializer', None)
               or tf.initialize_all_variables)
    sess.run(init_op())

    # Transition buffers, filled across episodes until a training update.
    batch_states = []
    batch_actions = []
    batch_advantages = []
    batch_value_targets = []

    # Running totals for the current reporting window.
    window_reward = 0
    window_est_value = 0
    window_episodes = 0

    for i in range(MAX_EPISODES):

        s = env.reset()
        v = actor.predict_value(np.reshape(s, (1, -1)))[0]
        # Rebind rather than += so the array returned by the actor is never
        # modified in place.
        window_est_value = window_est_value + v
        ep_reward = 0
        ep_rewards = []
        ep_baselines = []

        for j in range(MAX_EP_STEPS):

            if RENDER_ENV:
                env.render()

            # get the probabilities and sample for an action
            # (the actor is sampled over actor.a_dim + 1 action indices)
            probs = actor.predict(np.reshape(s, (1, -1)))
            a = [np.random.choice(range(actor.a_dim + 1), p=probs[0])]

            # get new state and reward
            s2, r, is_done, info = env.step(a[0])

            # value estimate of the successor state
            v2 = actor.predict_value(np.reshape(s2, (1, -1)))[0]

            # One-step value target: no bootstrapping from a terminal state.
            # The reward is wrapped in a list, presumably so its shape lines
            # up with the length-1 value prediction -- TODO confirm.
            if is_done:
                hs_v = [r]
            else:
                hs_v = r + GAMMA * v2

            # add step to batch
            batch_states.append(np.reshape(s, (actor.s_dim,)))
            batch_actions.append(np.reshape(a, (actor.a_dim,)))
            batch_value_targets.append(hs_v)
            ep_rewards.append(r)
            ep_baselines.append(v)

            s = s2
            v = v2
            ep_reward += r

            if is_done:
                break

        # End of episode (terminal state reached or step limit hit).
        batch_advantages.extend(
            _discounted_advantages(ep_rewards, ep_baselines, GAMMA))
        window_reward += ep_reward
        window_episodes += 1

        # Report once exactly `report_every` episodes have been accumulated,
        # so the printed averages are true averages over the window.
        if window_episodes == report_every:
            # Single-argument print(...) emits the same text on Python 2 and
            # is also valid on Python 3.
            print('| Avg value (%d eps): %.2i  | Avg est value (%d eps):  %s  | Episode %d'
                  % (report_every, int(window_reward / report_every),
                     report_every, window_est_value[0] / report_every, i + 1))
            print('')
            window_reward = 0
            window_est_value = 0
            window_episodes = 0

        if len(batch_states) >= MINIBATCH_SIZE:
            actor.train(batch_states, batch_actions, batch_advantages)
            actor.train_value(batch_states, batch_value_targets)
            # Start fresh lists (rather than clearing in place) in case the
            # actor kept a reference to the ones just passed in.
            batch_states = []
            batch_actions = []
            batch_advantages = []
            batch_value_targets = []
            

In [5]:
# Build the environment and the actor/critic pair, then train.

# Seed every random source before any graph ops or environments are
# created, so that repeated runs are comparable.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)

sess = tf.Session()

env = gym.make('CartPole-v0')
env.seed(RANDOM_SEED)

# State size comes from the environment instead of being hard-coded.
state_dim = env.observation_space.shape[0]
# train() samples among `a_dim + 1` action indices and stores one action
# index per step, so a_dim is 1 for this two-action task.
action_dim = 1
assert env.action_space.n == action_dim + 1, \
    "training loop expects exactly action_dim + 1 discrete actions"

# start up actor and critic pair
actor = PGActorDiscrete(sess, state_dim, action_dim,
                        ACTOR_LEARNING_RATE, CRITIC_LEARNING_RATE)

try:
    train(sess, env, actor)
finally:
    # Release the render window even when training is interrupted by hand.
    # The session is left open so the trained actor stays usable afterwards.
    env.close()

[2016-11-20 00:43:48,730] Making new env: CartPole-v0


| Avg value (100 eps): 22  | Avg est value (100 eps):  9.86195739746  | Episode 100

| Avg value (100 eps): 22  | Avg est value (100 eps):  16.8506506348  | Episode 200

| Avg value (100 eps): 44  | Avg est value (100 eps):  26.8540039062  | Episode 300

| Avg value (100 eps): 213  | Avg est value (100 eps):  59.1013476562  | Episode 400

| Avg value (100 eps): 854  | Avg est value (100 eps):  86.7399121094  | Episode 500

| Avg value (100 eps): 992  | Avg est value (100 eps):  90.8156054687  | Episode 600

| Avg value (100 eps): 846  | Avg est value (100 eps):  81.5569726562  | Episode 700

| Avg value (100 eps): 962  | Avg est value (100 eps):  96.1985742187  | Episode 800



KeyboardInterrupt: 