In [1]:
import numpy as np
import scipy
import random
import tensorflow as tf
import time
import os
import gym
from gym import envs, scoreboard
from gym.spaces import Discrete, Box
import tempfile
import sys
from PGActor import PGActor

In [2]:

# ==========================
#   Training Parameters
# ==========================
# Maximum number of training episodes
MAX_EPISODES = 10000
# Maximum number of steps per episode
MAX_EP_STEPS = 1000
# Base learning rate for the Actor network
ACTOR_LEARNING_RATE = 0.003
# Discount factor
GAMMA = 0.99

# ===========================
#   Utility Parameters
# ===========================
# Render gym env during training (rendering every step slows training down)
RENDER_ENV = True
# Use Gym Monitor
GYM_MONITOR_EN = True
# Gym environment. train() samples one of two discrete actions (0 or 1), so
# this names the discrete-action task the notebook actually runs rather than
# the continuous-action Pendulum task.
ENV_NAME = 'CartPole-v0'
# # Directory for storing gym results
# MONITOR_DIR = './results/gym_ddpg'
# # Directory for storing tensorboard summary results
# SUMMARY_DIR = './results/tf_ddpg'
# Random seed (available for seeding numpy / TensorFlow / the environment)
RANDOM_SEED = 1337
# Number of collected environment steps required before the actor is trained
MINIBATCH_SIZE = 1000

In [3]:
def train(sess, env, actor):
    """Run the batched policy-gradient training loop.

    Plays ``MAX_EPISODES`` episodes of at most ``MAX_EP_STEPS`` steps each.
    At every step ``actor.predict`` is queried for the current state and
    ``probs[0][0]`` is used as the probability of choosing action 0
    (action 1 is chosen otherwise).  Each visited state and chosen action is
    stored, and when an episode ends every one of its steps is credited with
    the episode's total (undiscounted) reward.  As soon as at least
    ``MINIBATCH_SIZE`` steps are stored at the end of an episode -- including
    the final one -- they are handed to ``actor.train`` and the memory is
    emptied.

    Args:
        sess: TensorFlow session; only used here to initialize variables.
        env: Gym-style environment providing ``reset()``, ``render()`` and
            ``step(action)`` returning ``(state, reward, done, info)``.
        actor: Policy object providing ``predict(states)``,
            ``train(states, actions, values)`` and the attributes ``s_dim``
            and ``a_dim``.

    Returns:
        None.  Progress is printed every 100 episodes.
    """
    # Initialize our Tensorflow variables
    # NOTE: later TensorFlow 1.x releases deprecate this call in favour of
    # tf.global_variables_initializer().
    sess.run(tf.initialize_all_variables())

    # On-policy memory: one entry per environment step since the last update.
    old_states = []
    old_actions = []
    old_values = []

    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0

        for j in range(MAX_EP_STEPS):

            if RENDER_ENV:
                env.render()

            # Sample the action: probs[0][0] is treated as P(action == 0).
            # NOTE(review): confirm that this matches the convention used
            # inside the actor's loss.
            probs = actor.predict(np.reshape(s, (1, -1)))
            if np.random.uniform() < probs[0][0]:
                a = np.asarray([0])
            else:
                a = np.asarray([1])

            # get new state and reward
            s2, r, is_done, _ = env.step(a[0])

            # add step to batch
            old_states.append(np.reshape(s, (actor.s_dim,)))
            old_actions.append(np.reshape(a, (actor.a_dim,)))

            s = s2
            ep_reward += r

            # The episode ends when the env says so or the step cap is hit.
            if is_done or j == MAX_EP_STEPS - 1:
                ep_len = j + 1
                # Every step of the episode gets the same target: the total
                # episode reward (no discounting, no reward-to-go).
                old_values.extend(ep_len * [np.asarray([ep_reward])])
                if i % 100 == 0:
                    # Single pre-formatted string so the output is identical
                    # under both the print statement and the print function.
                    print('| Reward: %.2i  | Episode %d' % (int(ep_reward), i))
                break

        # Train once enough steps are stored.  Checking at the end of the
        # episode (rather than at the start of the next one) means a batch
        # completed by the very last episode is not thrown away.
        if len(old_states) >= MINIBATCH_SIZE:
            actor.train(old_states, old_actions, old_values)
            old_states, old_actions, old_values = [], [], []
            

In [4]:
# defining environment
sess = tf.Session()

env = gym.make('CartPole-v0')

# Seed the random number sources used during training so that runs are
# repeatable.  The graph-level TensorFlow seed is set before the actor is
# constructed because it only affects ops created afterwards (presumably
# PGActor builds its network in its constructor).
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)
env.seed(RANDOM_SEED)

# Size of the observation vector, read from the environment instead of being
# hard-coded (it is 4 for CartPole-v0).
state_dim = env.observation_space.shape[0]

# start up the actor
# NOTE(review): the two literal 1s are passed through unchanged; their meaning
# depends on PGActor's signature, which is not visible here.  train() reshapes
# the single sampled action to (actor.a_dim,), so a_dim is presumably 1.
actor = PGActor(sess, state_dim, 1, 1,
                ACTOR_LEARNING_RATE)

train(sess, env, actor)

[2016-11-18 20:18:27,286] Making new env: CartPole-v0


| Reward: 10  | Episode 0
| Reward: 08  | Episode 100
| Reward: 10  | Episode 200
| Reward: 10  | Episode 300
| Reward: 10  | Episode 400
| Reward: 08  | Episode 500
| Reward: 09  | Episode 600
| Reward: 10  | Episode 700
| Reward: 09  | Episode 800
| Reward: 09  | Episode 900
| Reward: 10  | Episode 1000
| Reward: 10  | Episode 1100
| Reward: 08  | Episode 1200
| Reward: 08  | Episode 1300
| Reward: 08  | Episode 1400
| Reward: 10  | Episode 1500
| Reward: 08  | Episode 1600
| Reward: 10  | Episode 1700
| Reward: 08  | Episode 1800
| Reward: 09  | Episode 1900
| Reward: 09  | Episode 2000
| Reward: 09  | Episode 2100
| Reward: 10  | Episode 2200
| Reward: 09  | Episode 2300
| Reward: 11  | Episode 2400
| Reward: 09  | Episode 2500
| Reward: 10  | Episode 2600
| Reward: 10  | Episode 2700
| Reward: 10  | Episode 2800
| Reward: 10  | Episode 2900
| Reward: 08  | Episode 3000
| Reward: 10  | Episode 3100
| Reward: 09  | Episode 3200
| Reward: 11  | Episode 3300
| Reward: 11  | Episode 34