In [1]:
import numpy as np
import scipy
import random
import tensorflow as tf
import time
import os
import gym
from gym import envs, scoreboard
from gym.spaces import Discrete, Box
import tempfile
import sys
from PGActorDiscrete import PGActorDiscrete

In [5]:

# ==========================
#   Training Parameters
# ==========================
# Max training steps
MAX_EPISODES = 10000
# Max episode length
MAX_EP_STEPS = 1000
# Base learning rate for the Actor network
ACTOR_LEARNING_RATE = 0.1
# Discount factor 
GAMMA = 0.99

# ===========================
#   Utility Parameters
# ===========================
# Render gym env during training
RENDER_ENV = True
# Use Gym Monitor
GYM_MONITOR_EN = True
# Gym environment
ENV_NAME = 'Pendulum-v0'
# # Directory for storing gym results
# MONITOR_DIR = './results/gym_ddpg'
# # Directory for storing tensorboard summary results
# SUMMARY_DIR = './results/tf_ddpg'
RANDOM_SEED = 1337
MINIBATCH_SIZE = 1000

In [6]:
def train(sess, env, actor):
    
    # Initialize our Tensorflow variables
    sess.run(tf.initialize_all_variables())

    # Initialize memory
    old_states = []
    old_actions = []
    old_values = []
    mem_fill = 0
    nep_reward = 0
    for i in range(MAX_EPISODES):
        
        s = env.reset()

        ep_reward = 0
        if mem_fill >= MINIBATCH_SIZE:
            
#             print "Training on", mem_fill, "examples"
            actor.train(old_states, old_actions, old_values)
            old_states = []
            old_actions = []
            old_values = []
            mem_fill = 0
        values = []
        for j in range(MAX_EP_STEPS):
            
            if RENDER_ENV:
                env.render()
            
            # get the probabilities
            probs = actor.predict(np.reshape(s, (1, -1)))
            
            if np.random.uniform() < probs[0][0]:
                a = np.asarray([0])
            else:
                a = np.asarray([1])
#             print probs
#             print a
            
            # get new state and reward
            s2, r, is_done, info = env.step(a[0])
            
            # add step to batch
            old_states.append(np.reshape(s, (actor.s_dim,)))
            old_actions.append(np.reshape(a, (actor.a_dim,)))
            values.append([-ep_reward])
            s=s2
                
            ep_reward += r
                
            if is_done or j == MAX_EP_STEPS-1:
#                 if is_done:
#                     print "Done for some reason"
                ep_len = j+1
                mem_fill += ep_len
                values = ep_reward + np.asarray(values)
#                 print values
                old_values = old_values + values.tolist()
                nep_reward += ep_reward
                if i % 100 ==0:
                    print '| Avg Value over 100 eps: %.2i' % (int(nep_reward/100)), " | Episode", i
                    nep_reward=0
                break
            

In [7]:
# defining environment
sess = tf.Session()

env = gym.make('CartPole-v0')
# print env.observation_space
# state_dim = env.observation_space.shape[0]
# action_dim = env.action_space.shape[0]
# print "state and actions dims:", state_dim, action_dim

# make sure action bound is symmetric (can change in future,
# but need to remember to scale actor output appropriately)
# assert (env.action_space.high == -env.action_space.low)

# action_bound = env.action_space.high

# start up actor and critic pair

actor = PGActor(sess, 4, 1, 1, 
                 ACTOR_LEARNING_RATE)

train(sess, env, actor)

[2016-11-18 22:05:36,406] Making new env: CartPole-v0


| Avg Value over 100 eps: 00  | Episode 0
| Avg Value over 100 eps: 22  | Episode 100
| Avg Value over 100 eps: 41  | Episode 200
| Avg Value over 100 eps: 148  | Episode 300
| Avg Value over 100 eps: 119  | Episode 400
| Avg Value over 100 eps: 360  | Episode 500


KeyboardInterrupt: 