In [1]:
import tensorflow as tf
import tflearn
import numpy as np
import gym
from ActorPair import ActorPair
from CriticPair import CriticPair
from ReplayBuffer import ReplayBuffer

In [2]:

# ==========================
#   Training Parameters
# ==========================
# Max training steps
MAX_EPISODES = 20
# Max episode length
MAX_EP_STEPS = 365 * 30
# Base learning rate for the Actor network
ACTOR_LEARNING_RATE = 0.0001
# Base learning rate for the Critic Network
CRITIC_LEARNING_RATE = 0.001
# Discount factor 
GAMMA = 0.99
# Soft target update param
TAU = 0.001
EPSILON = 1
# ===========================
#   Utility Parameters
# ===========================
# Render gym env during training
RENDER_ENV = True
# Use Gym Monitor
GYM_MONITOR_EN = True
# Gym environment
ENV_NAME = 'Pendulum-v0'
# Directory for storing gym results
MONITOR_DIR = './results/gym_ddpg'
# Directory for storing tensorboard summary results
SUMMARY_DIR = './results/tf_ddpg'
RANDOM_SEED = 1337
# Size of replay buffer
BUFFER_SIZE = 10000
MINIBATCH_SIZE = 64

In [3]:
def train(sess, env, actor, critic):
    
    # Initialize our Tensorflow variables
    sess.run(tf.initialize_all_variables())
   
    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE)
    
    for i in range(MAX_EPISODES):
        s = env.reset()
#         print "Episode", i
        ep_reward = 0
        ep_ave_max_q = 0
        for j in range(MAX_EP_STEPS):
            
            if RENDER_ENV:
                env.render()
            
            # generating a step
            
            # adding noise so that actor explores systematically across episode and step
             
            if np.random.uniform() < EPSILON*0.999**j:
                a = np.random.uniform(size = [1, 1])
            else:
                a = actor.predict(np.reshape(s, (1, -1)))
            # get new state and reward
            s2, r, is_done, info = env.step(a[0][0])
            
            # add step to replay buffer
            
            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)), r,
                              is_done, np.reshape(s2, (actor.s_dim,)))
            
            # keep adding steps until there are enough to do a training update
            
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, is_done_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)
                
                
                # calculate targets
                target_qs = critic.predict_target(s2_batch, actor.predict_target(s2_batch))
                
                #if the game has ended target_q not added to get hindsight q
                hindsight_q_vec = (r_batch + (1 - is_done_batch.astype(float)) * GAMMA * np.reshape(target_qs, (MINIBATCH_SIZE,)))
#                 print "r_batch has shape: ", r_batch.shape
#                 print "is_done_batch has shape: ", is_done_batch.shape
#                 print "target_qs has shape: ", target_qs.shape
#                 print "H q vec has shape: ", hindsight_q_vec.shape
                
                
                
                
                #critic training
#                 print hindsight_q_vec.shape
                predicted_q_value, _ = critic.train(s_batch, a_batch,
                                                     np.reshape(hindsight_q_vec, (MINIBATCH_SIZE, 1)))
                ep_ave_max_q += np.amax(predicted_q_value)
                
                # actor training
                actions = actor.predict(s_batch)
                dQda_list = critic.action_gradients(s_batch, actions) # could repeat more than once, or even less than once
                actor.train(s_batch, dQda_list[0])
                
                # updates targets
                actor.update_target_network()
                critic.update_target_network()
                
            s=s2
                
            ep_reward += r
                
            if is_done or j == MAX_EP_STEPS-1:
                print '| Reward: %.4f' % ep_reward, " | Episode", i, \
                    '| Qmax: %.4f' % (ep_ave_max_q / float(j))
                break
            

In [4]:
class DiscreteGBM(object):
    def __init__(self, mu, sigma, init_price, init_cash):
        self.init_price = init_price
        self.init_cash = init_cash
        self.mu = mu
        self.sigma = sigma
        
        self.price = self.init_price
        self.cash = self.init_cash
        self.stock_value = 0
        
    def reset(self):
        self.price = self.init_price
        self.cash = self.init_cash
        return [self.price, self.stock_value, self.cash]
    
    def step(self, action):
        # Apply the action
        invest_prop = min(max(action, -1), 1)
#         print "action =", action
#         print "invest_prop =", invest_prop
#         print "price =", self.price
#         print "stock value =", self.stock_value
#         print "cash =", self.cash
        if invest_prop < 0:
            # Take cash out of stock
            invest_amount = int(invest_prop * self.stock_value)
        else: 
            # Put cash into stock
            invest_amount = int(invest_prop * self.cash)
        
        # Reward is simply the amount of money extracted
        r = -invest_amount
        # Apply the action
        self.cash = self.cash - invest_amount
        self.stock_value = self.stock_value + invest_amount
            
        # Evolve the state (price, cash)
        W = np.random.normal(self.mu, self.sigma)
#         print "W =", W
        new_price = self.price * (1+W)
        self.stock_value = int(self.stock_value * (new_price / self.price))
        self.price = new_price
        
        is_done = (self.stock_value==0 and self.cash==0)
        info = None
        
#         if np.random.uniform() < 1./300:
#             print '| Invest: %.4f' % invest_prop, " | Invest Amount:", invest_amount,  \
#             " | Price:", self.price," | Cash:", self.cash, \
#                         '| Stock: %.4f' % self.stock_value
        
        return np.asarray([self.price, self.stock_value, self.cash]), r, is_done, info
        
    def render(self):
        return 

mu = 1.08**(1./365.) - 1
env = DiscreteGBM(mu, 20*mu, 1, 100 * 10**4 )

In [None]:
# defining environment
with tf.Session() as sess:

    state_dim = 3
    action_dim = 1
    print "state and actions dims:", state_dim, action_dim

    action_bound = 1

    # start up actor and critic pair

    actor = ActorPair(sess, state_dim, action_dim, action_bound, 
                     ACTOR_LEARNING_RATE, TAU)

    critic = CriticPair(sess, state_dim, action_dim,
                       CRITIC_LEARNING_RATE, TAU,  actor.get_num_trainable_vars())

    train(sess, env, actor, critic)

state and actions dims: 3 1
There are 0 before creating main actor
There are 6 after creating main actor
There are 12 after creating target actor
| Reward: 89050.0000  | Episode 0 | Qmax: 531551.5463
| Reward: -54404.0000  | Episode 1 | Qmax: 648693.8036
| Reward: 94892.0000  | Episode 2 | Qmax: 709660.3024
| Reward: 213058.0000  | Episode 3 | Qmax: 782390.4457
| Reward: 245880.0000  | Episode 4 | Qmax: 788071.7815
| Reward: 294022.0000  | Episode 5 | Qmax: 816845.3339
| Reward: 413984.0000  | Episode 6 | Qmax: 929603.0319
| Reward: 68357.0000  | Episode 7 | Qmax: 806903.6353
| Reward: 165641.0000  | Episode 8 | Qmax: 812885.4545
| Reward: 164849.0000  | Episode 9 | Qmax: 871246.6340
| Reward: 128083.0000  | Episode 10 | Qmax: 869331.8503
| Reward: 120416.0000  | Episode 11 | Qmax: 852673.5353
| Reward: 396870.0000  | Episode 12 | Qmax: 1018054.1993
