In [1]:
import pandas as pd
from datetime import datetime, timedelta
from collections import deque
import random
import numpy as np
import tensorflow as tf
import tflearn
import itertools
import keras

import os

from env.portfolio import *
import utils.markets.indicators as ti
from replay_buffer.replay_buffer import PrioritizedReplayBuffer as RBProportional

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Timestamp layout shared by the 'Date' column of both CSV files.
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"


def _read_dated_csv(path):
    """Read a CSV file and return it indexed by its parsed 'Date' column.

    The 'Date' strings are converted in one vectorised call instead of a
    per-row ``datetime.strptime``; the frame is returned as a new object
    rather than being mutated in place, so re-running the cell is safe.
    """
    frame = pd.read_csv(path)
    frame['Date'] = pd.to_datetime(frame['Date'], format=DATE_FORMAT)
    return frame.set_index('Date')


prices_raw = _read_dated_csv('./data/price_data')
volumes_raw = _read_dated_csv('./data/volume_data')

# Give every volume column a second column level 'vol' so that selecting one
# top-level column label later yields a frame with a single 'vol' column.
volumes_raw.columns = pd.MultiIndex.from_product([list(volumes_raw.columns), ['vol']])

In [3]:
# Column labels of the price table; reused below as the per-column keys of the
# `history` dictionaries (one of them is expected to be 'btc').
abbr = prices_raw.columns

In [9]:
# Put prices and volumes on a common hourly grid. Prices are aggregated to
# open/high/low/close per hour; volumes are sampled at the hour boundary.
# NOTE(review): gaps are back-filled, i.e. filled from *later* observations.
# If this data feeds a trading back-test that may be a look-ahead - confirm.
resampled_price = (
    prices_raw
    .resample("1h")
    .ohlc()
    .bfill()
)
resampled_vol = (
    volumes_raw
    .resample("1h")
    .asfreq()
    .bfill()
)

# Sanity check: remaining NaN count in each table and the number of hourly rows.
print(
    np.count_nonzero(np.isnan(resampled_price)),
    np.count_nonzero(np.isnan(resampled_vol)),
    len(resampled_price),
)

0 0 8760


In [10]:
def add_ind(df):
    """Run `df` through the technical-indicator helpers and return the result.

    Applied in order: moving averages with windows 50, 100 and 200, MACD with
    parameters (12, 26), and a relative strength index with parameter 10.
    Each helper receives the previous helper's return value.
    (Presumably every helper returns the frame with extra indicator columns
    appended - verify against utils.markets.indicators.)
    """
    enriched = df
    for window in (50, 100, 200):
        enriched = ti.moving_average(enriched, window)
    enriched = ti.macd(enriched, 12, 26)
    return ti.relative_strength_index(enriched, 10)

In [11]:
# Relabel the OHLC columns as (coin, feature) pairs.
resampled_price.columns = pd.MultiIndex.from_product(
    [abbr, ['Open', 'High', 'Low', 'Close']],
    names=('coins', 'feature'),
)


def _coin_frame(name):
    """Return the hourly price columns of `name` with its volume column appended."""
    return pd.concat([resampled_price[name], resampled_vol[name]], axis=1)


# Per-coin arrays: with indicators (rows containing NaN dropped) and without.
history_ind = {name: add_ind(_coin_frame(name)).dropna().values for name in abbr}
history = {name: _coin_frame(name).values for name in abbr}

Feature engineering

In [12]:
# [number of coins, 500, number of feature columns per coin].
# NOTE(review): the meaning of the hard-coded 500 is not visible here
# (possibly a window length) - confirm before reusing `data_shape`.
n_coins = len(history)
n_features = history['btc'].shape[1]
data_shape = [n_coins, 500, n_features]
print(data_shape)

[10, 500, 5]


In [30]:
class OldReplayBuffer(object):
    """Fixed-capacity FIFO store of ``(s, a, r, s2, t, y)`` transitions.

    Transitions are appended on the right of a deque; once `buffer_size`
    transitions are stored, adding a new one evicts the oldest.

    Sampling returns a *contiguous* run of stored transitions in insertion
    order, starting at a random offset.
    """

    # Number of fields in one stored experience tuple.
    _N_FIELDS = 6

    def __init__(self, buffer_size, random_seed=123):
        """
        The right side of the deque contains the most recent experiences.

        Args:
            buffer_size: maximum number of transitions kept.
            random_seed: seed applied to the global `random` module.
        """
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = deque()
        random.seed(random_seed)

    def add(self, s, a, r, s2, t, y):
        """Append one transition, evicting the oldest one when full."""
        experience = (s, a, r, s2, t, y)
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
        else:
            self.buffer.popleft()
            self.buffer.append(experience)

    def size(self):
        """Return the number of transitions currently stored."""
        return self.count

    def sample_batch(self, batch_size):
        """Return a contiguous window of transitions as six numpy arrays.

        If fewer than `batch_size` transitions are stored, everything stored
        is returned. Otherwise a window of exactly `batch_size` consecutive
        transitions is drawn with a uniformly random start offset.

        The previous implementation computed a random offset but never used
        it, so every call returned the newest `batch_size` transitions.

        Returns:
            Tuple ``(s_batch, a_batch, r_batch, s2_batch, t_batch, y_batch)``.
        """
        if self.count < batch_size:
            start, stop = 0, self.count
        else:
            # randint is inclusive on both ends, so the newest window
            # (start == count - batch_size) can be drawn as well.
            start = random.randint(0, self.count - batch_size)
            stop = start + batch_size

        batch = list(itertools.islice(self.buffer, start, stop))

        return tuple(
            np.array([experience[field] for experience in batch])
            for field in range(self._N_FIELDS)
        )

    def clear(self):
        """Drop every stored transition."""
        self.buffer.clear()
        self.count = 0

In [13]:
class ActorNetwork(object):
    """
    Deterministic policy network: input is the state, output is the action.

    The network built by `create_actor_network` ends in a softmax over axis 1,
    so each output row is non-negative and sums to 1. `action_bound` is stored
    but not used anywhere in this class.

    Two copies of the network are built (main and target); the target copy is
    moved towards the main copy by `update_target_network`.
    """

    def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, tau, batch_size, params):
        """Build the main/target graphs and all training ops.

        Args:
            sess: TensorFlow session used by every method that runs an op.
            state_dim: shape of one state (without the batch axis).
            action_dim: number of action components.
            action_bound: stored only; unused in this class.
            learning_rate: learning rate of both Adam optimizers.
            tau: soft-update rate for the target network.
            batch_size: divisor applied to the summed actor gradients.
            params: dict read for 'conv_width', 'bn', 'weight_decay_1',
                'weight_decay_2' and 'comission_ratio' (and 'dropout' by the
                alternative architecture `create_actor_network_1`).
        """
        self.params = params
        self.sess = sess
        self.s_dim = list(state_dim)
        self.a_dim = action_dim
        self.action_bound = action_bound
        self.learning_rate = learning_rate
        self.tau = tau
        self.batch_size = batch_size

        # Actor Network
        with tf.variable_scope('Actor_main_net'):
            self.inputs, self.out = self.create_actor_network()
            # Takes *every* trainable variable in the default graph so far;
            # this assumes the actor is the first thing built in the graph.
            self.network_params = tf.trainable_variables()
        
        # Target Network
        with tf.variable_scope('Actor_traget_net'):
            self.target_inputs, self.target_out = self.create_actor_network()
            # Variables created after the main network belong to the target copy.
            self.target_network_params = tf.trainable_variables()[
                len(self.network_params):]
        
        # Op for periodically updating target network with online network
        # weights: target <- tau * main + (1 - tau) * target
        self.update_target_network_params = \
            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) +
                                                  tf.multiply(self.target_network_params[i], 1. - self.tau))
                for i in range(len(self.target_network_params))]

        # Combine the gradients here
        with tf.variable_scope("Actor_grads"):
            # This gradient will be provided by the critic network
            self.action_gradient = tf.placeholder(tf.float32, [None, self.a_dim])
            # Back-propagate the negated action gradient through the policy
            # output (negated so that the minimizing optimizer ascends it).
            self.unnormalized_actor_gradients = tf.gradients(
                self.out, self.network_params, -self.action_gradient)
            self.actor_gradients = list(map(lambda x: tf.div(x, self.batch_size), self.unnormalized_actor_gradients))

        # Optimization Op
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).\
            apply_gradients(zip(self.actor_gradients, self.network_params))
            
        
        # Supervised pre-training objective. With w = self.out and p = self.prices:
        #   loss = -mean( log( sum_k w[1:, k] * p[:-1, k]
        #                      - comission_ratio * sum_k |w[1:, k] - w[:-1, k]| ) )
        # Row i+1 of the output is paired with row i of `prices`, so the rows of
        # a batch are treated as an ordered sequence.
        # NOTE(review): presumably `prices` holds per-asset price relatives and
        # the rows are consecutive time steps - confirm against the caller.
        self.prices = tf.placeholder(tf.float32, [None, self.a_dim])
        with tf.name_scope('loss_pretrain'):   
            self.loss_obj = -tf.reduce_mean(tf.log(tf.reduce_sum(self.out[1:,:] * self.prices[:-1,:], reduction_indices=[1]) - \
                        tf.reduce_sum(tf.abs(self.out[1:,:] - self.out[:-1,:])*self.params['comission_ratio'], reduction_indices=[1])))
            
            # No var_list is given, so Adam updates every trainable variable the
            # loss depends on.
            self.optimize_pretrain = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_obj)
            
            

        # Reported to the critic so it can skip the actor's variables when
        # slicing tf.trainable_variables().
        self.num_trainable_vars = len(self.network_params) + len(self.target_network_params)

    def create_actor_network(self):
        """Build the network used by `__init__`; return (inputs, out).

        A two-layer convolutional branch and a two-layer dense branch are
        concatenated, reduced by a 1x1 convolution to one channel, and passed
        through a softmax over axis 1. `out` has shape [batch, s_dim[0]].
        """
        inputs = tflearn.input_data(shape=([None] + self.s_dim), name='ohlcv_input')
        #prev_policy_input = tf.placeholder(shape=[None,size[0],1,1], dtype=tf.float32, name='prev_policy_input')

        with tf.name_scope('CNN'):
            conv_1 = tflearn.layers.conv_2d(inputs, 4, [1, self.params['conv_width']],activation='leaky_relu', padding="valid", bias=True,
                                                name='conv_1')
            if self.params['bn']:
                conv_1 = tflearn.layers.normalization.batch_normalization(conv_1,name='conv_1_BN')
            # Kernel width equals the width left after conv_1 ("valid" padding),
            # so this layer collapses axis 2 to size 1.
            conv_2 = tflearn.layers.conv_2d(conv_1, 8, [1,self.s_dim[1]-self.params['conv_width']+1],
                                            activation='leaky_relu', 
                                            padding="valid", bias=True,
                                            regularizer='L2',
                                            weight_decay=self.params['weight_decay_1'],
                                            name='conv_2') 
            if self.params['bn']:
                conv_2 = tflearn.layers.normalization.batch_normalization(conv_2,name='conv_2_BN')
        with tf.name_scope('Dense'):
            # NOTE(review): this branch starts from the raw `inputs`, not from
            # conv_2 - it runs in parallel with the CNN branch. Confirm intended.
            net = tflearn.fully_connected(inputs, 2*self.s_dim[0], name='1_dense')
            if self.params['bn']:
                net = tflearn.layers.normalization.batch_normalization(net,name='1_dense_BN')
            net = tflearn.activations.leakyrelu(net,name='1_dense_LRelu')
            net = tflearn.fully_connected(net, self.s_dim[0], name='2_dense')
            if self.params['bn']:
                net = tflearn.layers.normalization.batch_normalization(net,name='2_dense_BN')
            net = tflearn.activations.leakyrelu(net,name='2_dense_LRelu')
            # One value per row of axis 1, shaped so it can be concatenated
            # with conv_2 as an extra channel.
            net = tf.reshape(net,shape=[-1,self.s_dim[0],1,1])
        with tf.name_scope('merge'):
            concat_1 = keras.layers.concatenate([conv_2, net])
            # 1x1 convolution mixing all channels into a single score.
            conv_3 = tflearn.layers.conv_2d(concat_1, 1, [1, 1], 
                                            padding="valid", bias=False,
                                            regularizer='L2',
                                            weight_decay=self.params['weight_decay_2'],
                                            name='voting')
            if self.params['bn']:
                conv_3 = tflearn.layers.normalization.batch_normalization(conv_3,name='voting_BN')
        # Softmax across axis 1, then drop the two singleton axes.
        # NOTE(review): the result has s_dim[0] columns while `action_gradient`
        # and `prices` expect a_dim columns - the two must be equal.
        out = tf.nn.softmax(conv_3,axis=1,name='out')[:,:,0,0]
        return inputs, out
    
    def create_actor_network_2(self):
        """Alternative fully-connected architecture (100 -> 50 -> a_dim softmax).

        Not called by `__init__`. Returns (inputs, out).
        """
        inputs = tflearn.input_data(shape=[None]+ self.s_dim)
        net = tflearn.fully_connected(inputs, 100, name='dense_input',bias=False)
        if self.params['bn']:
            net = tflearn.layers.normalization.batch_normalization(net,name='input_BN')
        net = tflearn.activations.leakyrelu(net)
        net = tflearn.fully_connected(net, 50,name='dense',bias=False)
        if self.params['bn']:
            net = tflearn.layers.normalization.batch_normalization(net,name='dense_BN')
        net = tflearn.activations.leakyrelu(net)
        out = tflearn.fully_connected(
            net, self.a_dim, activation='softmax', name='out',bias=False)

        return inputs, out
    
    def create_actor_network_1(self):
        """Alternative architecture: two conv layers, two dense layers, softmax.

        Not called by `__init__`. Dropout is applied after each layer when
        params['dropout'] < 1. Returns (inputs, out).
        """
        inputs = tflearn.input_data(shape=([None] + self.s_dim), name='ohlcv_input')
        with tf.name_scope('CNN'):
            conv_1 = tflearn.layers.conv_2d(inputs, 8, 
                                            [1, self.params['conv_width']],
                                            activation='leaky_relu', padding="valid", 
                                            bias=False,
                                            regularizer='L2',
                                            weight_decay=self.params['weight_decay_1'],
                                            name='conv_1')
            if self.params['dropout'] < 1:
                conv_1 = tflearn.layers.dropout(conv_1, self.params['dropout'])
            if self.params['bn']:
                conv_1 = tflearn.layers.normalization.batch_normalization(conv_1,name='conv_1_BN')
            conv_2 = tflearn.layers.conv_2d(conv_1, 16, [1,self.s_dim[1]-self.params['conv_width']+1],
                                            activation='leaky_relu', 
                                            padding="valid", bias=False,
                                            regularizer='L2',
                                            weight_decay=self.params['weight_decay_2'],
                                            name='conv_2')
            if self.params['dropout'] < 1:
                conv_2 = tflearn.layers.dropout(conv_2, self.params['dropout'])
            if self.params['bn']:
                conv_2 = tflearn.layers.normalization.batch_normalization(conv_2,name='conv_2_BN')
        with tf.name_scope('Dense'):     
            net = tflearn.flatten(conv_2)
            net = tflearn.fully_connected(net, 32, name='dense_1')
            net = tflearn.activations.leakyrelu(net)
            if self.params['dropout'] < 1:
                net = tflearn.layers.dropout(net, self.params['dropout'])
            if self.params['bn']:
                net = tflearn.layers.normalization.batch_normalization(net,name='dense_1_BN')
            net = tflearn.fully_connected(net, 32, name='dense_2')
            net = tflearn.activations.leakyrelu(net)
            if self.params['dropout'] < 1:
                net = tflearn.layers.dropout(net, self.params['dropout'])
            if self.params['bn']:
                net = tflearn.layers.normalization.batch_normalization(net,name='dense_2_BN')
        out = tflearn.fully_connected(
            net, self.a_dim, activation='softmax', name='out')

        return inputs, out

    def train(self, inputs, a_gradient):
        """Apply one policy-gradient step using the critic's action gradient."""
        self.sess.run(self.optimize, feed_dict={
            self.inputs: inputs,
            #self.prev_policy_input: prev_policy_input,
            self.action_gradient: a_gradient
        })
        
    def pretrain(self, inputs, prices):
        """Apply one step of the supervised `loss_pretrain` objective."""
        self.sess.run(self.optimize_pretrain, feed_dict={
            self.inputs: inputs,
            self.prices: prices
        })

    def predict(self, inputs):
        """Return the main network's output for a batch of states."""
        return self.sess.run(self.out, feed_dict={
            self.inputs: inputs
        })

    def predict_target(self, inputs):
        """Return the target network's output for a batch of states."""
        return self.sess.run(self.target_out, feed_dict={
            self.target_inputs: inputs
        })

    def update_target_network(self):
        """Run the soft update: target <- tau * main + (1 - tau) * target."""
        self.sess.run(self.update_target_network_params)

    def get_num_trainable_vars(self):
        """Return the number of trainable variables of main plus target network."""
        return self.num_trainable_vars

In [51]:
class CriticNetwork(object):
    """
    Input to the network is the state and action, output is Q(s,a).
    The action must be obtained from the output of the Actor network.

    Two copies are built (main and target). When
    ``params['with_batch_weights']`` is true, per-sample weights (e.g.
    importance-sampling weights from a prioritised replay buffer) scale both
    the squared TD error in the loss and the critic output used for the
    action gradient.
    """

    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, gamma, num_actor_vars, params):
        """Build the main/target graphs, the loss and the action-gradient op.

        Args:
            sess: TensorFlow session used by every method that runs an op.
            state_dim: shape of one state (without the batch axis).
            action_dim: number of action components.
            learning_rate: Adam learning rate.
            tau: soft-update rate for the target network.
            gamma: discount factor (stored for callers; not used in the graph).
            num_actor_vars: number of trainable variables created before the
                critic; they are skipped when collecting the critic's own.
            params: dict read for 'bn' and 'with_batch_weights'.
        """
        self.params = params
        self.sess = sess
        self.s_dim = list(state_dim)
        self.a_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau
        self.gamma = gamma

        # Create the critic network
        with tf.variable_scope('Critic_main_net'):
            self.inputs, self.action, self.out = self.create_critic_network()
            # Everything after the actor's variables belongs to the main critic.
            self.network_params = tf.trainable_variables()[num_actor_vars:]

        # Target Network
        with tf.variable_scope('Critic_traget_net'):
            self.target_inputs, self.target_action, self.target_out = self.create_critic_network()
            self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):]

        # Op for periodically updating the target network:
        # target <- tau * main + (1 - tau) * target
        self.update_target_network_params = [
            target_var.assign(tf.multiply(main_var, self.tau)
                              + tf.multiply(target_var, 1. - self.tau))
            for main_var, target_var in zip(self.network_params, self.target_network_params)
        ]

        # Network target (y_i)
        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

        # Define loss and optimization Op
        if self.params['with_batch_weights']:
            self.w = tf.placeholder(dtype=tf.float32, shape=[None, 1])
            self.out_w = tf.multiply(self.w, self.out)
            # Weighted mean squared TD error. Previously the weights were fed
            # to `train` but the loss ignored them, so they never influenced
            # the critic update.
            self.loss = tf.reduce_mean(
                tf.multiply(self.w, tf.square(self.predicted_q_value - self.out)))
        else:
            self.loss = tflearn.mean_square(self.predicted_q_value, self.out)

        self.optimize = tf.train.AdamOptimizer(
            self.learning_rate).minimize(self.loss)

        # Get the gradient of the net w.r.t. the action.
        # For each action in the minibatch (i.e., for each x in xs),
        # this will sum up the gradients of each critic output in the minibatch
        # w.r.t. that action. Each output is independent of all
        # actions except for one.
        if self.params['with_batch_weights']:
            self.action_grads = tf.gradients(self.out_w, self.action, name="critic_action_grads")
        else:
            self.action_grads = tf.gradients(self.out, self.action, name="critic_action_grads")

    def create_critic_network(self):
        """Build one critic graph; return (inputs, action, out).

        The state goes through a 128-unit dense layer; the result and the
        action are each projected to 32 units, summed, passed through a leaky
        ReLU and reduced to a single linear output Q(s, a).
        """
        inputs = tflearn.input_data(shape=([None] + self.s_dim))
        action = tflearn.input_data(shape=([None, self.a_dim]))
        with tf.name_scope("dense"):
            net = tflearn.fully_connected(inputs, 128, name='input_dense')
            if self.params['bn']:
                net = tflearn.layers.normalization.batch_normalization(net, name='input_dense_BN')
            net = tflearn.activations.leakyrelu(net, name='input_dense_LRelu')

        # Add the action tensor in the 2nd hidden layer.
        # The two temporary layers exist only to create the weights and biases
        # that are combined by hand below.
        with tf.name_scope("merge"):
            t1 = tflearn.fully_connected(net, 32, name='merge_net')
            t2 = tflearn.fully_connected(action, 32, name='merge_action')
            net = tflearn.activation(
                tf.matmul(net, t1.W) + tf.matmul(action, t2.W) + t1.b + t2.b,
                activation='leaky_relu', name='merge_LRelu')

        # linear layer connected to 1 output representing Q(s,a)
        out = tflearn.fully_connected(net, 1, name='out', bias=False)
        return inputs, action, out

    def _weights_or_ones(self, weights, batch):
        """Return `weights`, or a column of ones (one per sample) when None."""
        if weights is None:
            return np.ones((len(batch), 1), dtype=np.float32)
        return weights

    def train(self, inputs, action, predicted_q_value, weights=None):
        """Run one optimisation step; return ``[Q(s, a) predictions, None]``.

        `weights` is only used when params['with_batch_weights'] is true; if
        it is omitted in that mode every sample gets weight 1.
        """
        feed = {
            self.inputs: inputs,
            self.action: action,
            self.predicted_q_value: predicted_q_value,
        }
        if self.params['with_batch_weights']:
            feed[self.w] = self._weights_or_ones(weights, inputs)
        return self.sess.run([self.out, self.optimize], feed_dict=feed)

    def predict(self, inputs, action):
        """Return the main network's Q(s, a) for a batch."""
        return self.sess.run(self.out, feed_dict={
            self.inputs: inputs,
            self.action: action
        })

    def predict_target(self, inputs, action, weights=None):
        """Return the target network's Q(s, a) for a batch.

        `weights` is accepted for call-site compatibility and ignored.
        """
        return self.sess.run(self.target_out, feed_dict={
            self.target_inputs: inputs,
            self.target_action: action,
        })

    def action_gradients(self, inputs, actions, weights=None):
        """Return the list ``[dQ/da]`` (weighted when batch weights are enabled)."""
        feed = {
            self.inputs: inputs,
            self.action: actions,
        }
        if self.params['with_batch_weights']:
            feed[self.w] = self._weights_or_ones(weights, inputs)
        return self.sess.run(self.action_grads, feed_dict=feed)

    def update_target_network(self):
        """Run the soft update: target <- tau * main + (1 - tau) * target."""
        self.sess.run(self.update_target_network_params)

In [52]:
class ActionNoise:
    """Exploration-noise generator.

    Each call returns
        theta * (mu - x_prev) * dt + sigma * sqrt(dt) * N(0, 1)
    with one independent standard-normal draw per component of `mu`, and
    remembers the returned value as `x_prev` for the next call.
    """

    def __init__(self, mu, sigma=1, theta=0.0, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        """Draw the next noise sample and store it as the previous value."""
        drift = self.theta * (self.mu - self.x_prev) * self.dt
        gaussian = np.random.normal(size=self.mu.shape)
        sample = drift + self.sigma * np.sqrt(self.dt) * gaussian
        self.x_prev = sample
        return sample

    def reset(self):
        """Restore the previous value to `x0`, or to zeros when `x0` is None."""
        if self.x0 is not None:
            self.x_prev = self.x0
        else:
            self.x_prev = np.zeros_like(self.mu)

    def __repr__(self):
        return 'ActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)

def build_summaries():
    """Create the TensorBoard summaries and the placeholders that feed them.

    Scalars: "Reward", "Qmax_Value_ave", "Qmax_Value" and one "<label>_policy"
    per entry of the module-level `abbr`. A histogram is added for every
    trainable variable currently in the graph.

    Returns:
        (summary_ops, summary_vars): the merged summary op and the list of
        placeholders in the order [reward, average max Q, step max Q,
        one placeholder per `abbr` entry].
    """
    reward_ph = tf.placeholder(tf.float32)
    tf.summary.scalar("Reward", reward_ph)

    ave_max_q_ph = tf.placeholder(tf.float32)
    tf.summary.scalar("Qmax_Value_ave", ave_max_q_ph)

    step_max_q_ph = tf.placeholder(tf.float32)
    tf.summary.scalar("Qmax_Value", step_max_q_ph)

    policy_phs = [tf.placeholder(tf.float32) for _ in abbr]
    for label, holder in zip(abbr, policy_phs):
        tf.summary.scalar("%s_policy" % label, holder)

    for variable in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
        # Drop the last two characters of the variable name (the ":0"
        # output suffix) to form the histogram tag.
        tf.summary.histogram(variable.name[:-2], variable)

    placeholders = [reward_ph, ave_max_q_ph, step_max_q_ph] + policy_phs
    return tf.summary.merge_all(), placeholders

In [53]:
def pretrain(sess, env, args, actor, summary_ops, summary_vars, critic=None):
    """Pre-train the actor with its supervised `pretrain` objective.

    For every episode the environment is stepped with the actor's current
    output; once the replay buffer holds more than `minibatch_size`
    transitions, each step also runs one `actor.pretrain` update on a sampled
    batch of states and the matching `y` values returned by `env.step`.

    Args:
        sess: TensorFlow session.
        env: environment whose `reset()` returns a sequence with the state at
            index 0 and whose `step(action)` returns
            ``(state, reward, terminal, info, y)``.
        args: dict read for 'summary_dir', 'buffer_size', 'random_seed',
            'max_episodes_pretrain', 'max_episode_len' and 'minibatch_size'.
        actor: ActorNetwork to train.
        summary_ops: merged summary op from `build_summaries`.
        summary_vars: placeholders from `build_summaries`.
        critic: optional critic whose target network is soft-updated once at
            the start. The function used to reach for a module-level `critic`
            here, which only worked when such a global happened to exist; pass
            it explicitly if that update is wanted.
    """
    # Write the graph once at the root of the summary directory, then release
    # the writer (it was previously left open and immediately shadowed).
    tf.summary.FileWriter(args['summary_dir'], sess.graph).close()

    # Initialize target network weights
    actor.update_target_network()
    if critic is not None:
        critic.update_target_network()

    # Initialize replay memory
    replay_buffer = OldReplayBuffer(int(args['buffer_size']), int(args['random_seed']))
    minibatch_size = int(args['minibatch_size'])

    for i in range(int(args['max_episodes_pretrain'])):

        s = env.reset()[0]

        ep_reward = 0
        # No critic is trained here, so the Q statistics stay at 0; they are
        # fed only because the merged summary op requires every placeholder.
        ep_ave_max_q = 0

        writer = tf.summary.FileWriter('%s/pretrain/episode_%s' % (args['summary_dir'], i), sess.graph)
        try:
            for j in range(int(args['max_episode_len'])):
                ep_q = 0
                a = actor.predict(np.reshape(s, ([1] + actor.s_dim)))
                s2, r, terminal, info, y = env.step(a[0])
                replay_buffer.add(s, np.reshape(a, (actor.a_dim,)), r, s2, terminal, y)

                if replay_buffer.size() > minibatch_size:
                    s_batch, _, _, _, _, y_batch = replay_buffer.sample_batch(minibatch_size)
                    actor.pretrain(s_batch, y_batch)

                s = s2
                ep_reward += r

                if terminal or j % 10 == 1 or j == 0:
                    feed_dict = {
                        summary_vars[0]: ep_reward,
                        summary_vars[1]: ep_ave_max_q / float(j + 1),
                        summary_vars[2]: ep_q
                    }
                    # One summary placeholder per action component.
                    for k in range(len(a[0])):
                        feed_dict[summary_vars[3 + k]] = a[0][k]
                    summary_str = sess.run(summary_ops, feed_dict=feed_dict)

                    writer.add_summary(summary_str, j)
                    writer.flush()

                    if terminal:
                        # j + 1 steps were taken; dividing by j raised
                        # ZeroDivisionError when the first step was terminal.
                        print('| Reward: {:.6f} | Episode: {:d} | Qmax: {:.4f} | j: {:d}'.format(
                            ep_reward, i, (ep_ave_max_q / float(j + 1)), j))
                        break
        finally:
            # Release the event file even if the episode raised.
            writer.close()

        replay_buffer.clear()

In [54]:
def _td_targets(r_batch, t_batch, target_q, gamma):
    """Return one-step TD targets as an (N, 1) array.

    The target is ``r`` for terminal transitions and ``r + gamma * Q'``
    otherwise. Computed in one vectorised step so that a batch mixing
    terminal and non-terminal transitions cannot produce a ragged list.
    """
    rewards = np.reshape(r_batch, (-1, 1))
    terminal = np.reshape(t_batch, (-1, 1)).astype(bool)
    bootstrapped = rewards + gamma * np.reshape(target_q, (-1, 1))
    return np.where(terminal, rewards, bootstrapped)


def train(sess, env, args, actor, critic, actor_noise, summary_ops, summary_vars):
    """Run the DDPG training loop.

    Each step adds zero-mean exploration noise to the actor's output, steps
    the environment, stores the transition, and - once the replay buffer
    holds more than `minibatch_size` transitions - updates the critic, the
    actor and both target networks from a sampled batch.

    Args:
        sess: TensorFlow session.
        env: environment whose `reset()` returns a sequence with the state at
            index 0 and whose `step(action)` returns
            ``(state, reward, terminal, info, y)``.
        args: dict read for 'prioritised_replay', 'prioritised_replay_alpha',
            'beta', 'buffer_size', 'random_seed', 'max_episodes',
            'max_episode_len', 'minibatch_size', 'render_env', 'summary_dir'.
        actor: ActorNetwork.
        critic: CriticNetwork.
        actor_noise: callable returning one noise vector per call.
        summary_ops: merged summary op from `build_summaries`.
        summary_vars: placeholders from `build_summaries`.
    """
    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    minibatch_size = int(args['minibatch_size'])
    prioritised = args['prioritised_replay']

    # Initialize replay memory
    if prioritised:
        replay_buffer = RBProportional(int(args['buffer_size']), args['prioritised_replay_alpha'])
    else:
        replay_buffer = OldReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    for i in range(int(args['max_episodes'])):

        s = env.reset()[0]

        ep_reward = 0
        ep_ave_max_q = 0

        writer = tf.summary.FileWriter('%s/train/episode_%s' % (args['summary_dir'], i), sess.graph)
        try:
            for j in range(int(args['max_episode_len'])):
                ep_q = 0
                if args['render_env']:
                    env.render(mode='ansi')

                # Added exploration noise, shifted so its components sum to
                # zero (the sum of the action vector is left unchanged).
                noise = 0.1 * actor_noise()
                noise = noise - np.sum(noise) / len(noise)
                a = actor.predict(np.reshape(s, ([1] + actor.s_dim))) + noise

                s2, r, terminal, info, y = env.step(a[0])

                replay_buffer.add(s, np.reshape(a, (actor.a_dim,)), r,
                                  s2, terminal, y)

                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if prioritised:
                    if len(replay_buffer._storage) > minibatch_size:
                        s_batch, a_batch, r_batch, s2_batch, t_batch, batch_weights, index = \
                            replay_buffer.sample(minibatch_size, args['beta'])
                        batch_weights = np.transpose([batch_weights])
                        target_q = critic.predict_target(
                            s2_batch, actor.predict_target(s2_batch), batch_weights)

                        # New priorities: absolute TD error of each sample.
                        delta = np.abs(np.transpose([r_batch]) +
                                       critic.gamma * target_q -
                                       critic.predict(s_batch, a_batch))[:, 0]
                        replay_buffer.update_priorities(index, delta)

                        y_i = _td_targets(r_batch, t_batch, target_q, critic.gamma)

                        # Update the critic given the targets
                        predicted_q_value, _ = critic.train(
                            s_batch, a_batch, y_i, batch_weights)

                        ep_q = np.amax(predicted_q_value)
                        ep_ave_max_q += ep_q

                        # Update the actor policy using the sampled gradient
                        a_outs = actor.predict(s_batch)
                        grads = critic.action_gradients(s_batch, a_outs, batch_weights)
                        actor.train(s_batch, grads[0])

                        # Update target networks
                        actor.update_target_network()
                        critic.update_target_network()
                elif replay_buffer.size() > minibatch_size:
                    s_batch, a_batch, r_batch, s2_batch, t_batch, y_batch = \
                        replay_buffer.sample_batch(minibatch_size)
                    # Calculate targets
                    target_q = critic.predict_target(
                        s2_batch, actor.predict_target(s2_batch))
                    y_i = _td_targets(r_batch, t_batch, target_q, critic.gamma)

                    # Update the critic given the targets
                    predicted_q_value, _ = critic.train(s_batch, a_batch, y_i)

                    # Record the step's max Q as well, so the "Qmax_Value"
                    # summary is populated in this branch too.
                    ep_q = np.amax(predicted_q_value)
                    ep_ave_max_q += ep_q

                    # Update the actor policy using the sampled gradient.
                    # The actor is left untouched during the first four
                    # episodes so that only the critic is trained there.
                    if i > 3:
                        a_outs = actor.predict(s_batch)
                        grads = critic.action_gradients(s_batch, a_outs)
                        actor.train(s_batch, grads[0])

                    # Update target networks
                    actor.update_target_network()
                    critic.update_target_network()

                s = s2
                ep_reward += r

                if terminal or j % 10 == 1 or j == 0:
                    feed_dict = {
                        summary_vars[0]: ep_reward,
                        summary_vars[1]: ep_ave_max_q / float(j + 1),
                        summary_vars[2]: ep_q
                    }
                    # One summary placeholder per action component.
                    for k in range(len(a[0])):
                        feed_dict[summary_vars[3 + k]] = a[0][k]

                    summary_str = sess.run(summary_ops, feed_dict=feed_dict)

                    writer.add_summary(summary_str, j)
                    writer.flush()

                    if terminal:
                        # j + 1 steps were taken; dividing by j raised
                        # ZeroDivisionError when the first step was terminal.
                        print('| Reward: {:.6f} | Episode: {:d} | Qmax: {:.4f} | j: {:d}'.format(
                            ep_reward, i, (ep_ave_max_q / float(j + 1)), j))
                        break
        finally:
            # One writer is opened per episode; close it so event files and
            # their background threads do not pile up over thousands of episodes.
            writer.close()

        # Disabled periodic checkpointing, kept for reference:
        # if i%100==0 and i!=0:
        #     model_save_path="./tmp/ddpg/1/"
        #     if not os.path.exists(model_save_path):
        #         os.makedirs(model_save_path, exist_ok=True)
        #
        #     saver = tf.train.Saver()
        #     model_path = saver.save(sess, model_save_path+("%s"%i))
        #     print("Model saved in %s" % model_path)

In [62]:
# Run-level settings read by pretrain(), train() and the main cell.
args = {
    "random_seed": 123,
    "actor_lr": 5e-5,
    "critic_lr": 2.5e-4,
    "minibatch_size": 32,
    "tau": 1e-3,
    "buffer_size": 100,
    "gamma": 0.999,
    "max_episodes": 10000,
    "max_episodes_pretrain": 1,
    "max_episode_len": 7000,
    "render_env": False,
    "use_gym_monitor": False,
    "monitor_dir": "./logs/mon/",
    "summary_dir": "./logs/ddpg/13",
    "prioritised_replay": False,
    "prioritised_replay_alpha": 1,
    "beta": 1,
    "comission_ratio": 0.0025,
}

# Architecture / regularisation settings passed to ActorNetwork as `params`.
actor_net_params = {
    "conv_width": 16,
    "weight_decay_1": 1e-5,
    "weight_decay_2": 1e-5,
    "bn": False,
    "dropout": 0.5,
    "comission_ratio": 0.0025,
}

# Settings passed to CriticNetwork as `params`.
critic_net_params = {
    "bn": False,
    "with_batch_weights": False,
}

In [66]:
# Start from an empty summary directory. -f / -p keep the cell from failing on
# a fresh checkout where ./logs or ./logs/ddpg does not exist yet.
!rm -rf ./logs/ddpg/
!mkdir -p ./logs/ddpg/13

In [70]:
# Build a fresh graph, construct environment and networks, then run
# pre-training followed by DDPG training inside one session.
tf.reset_default_graph()
with tf.Session() as session:
    env = PortfolioEnv(
        np.array(list(history_ind.values())),
        abbr,
        steps=7000,
        window_length=128,
        trading_cost=0,
        sample_start_date='2017-05-10 19:00:00',
        start_idx=128,
    )

    # Seed numpy, TensorFlow and the environment from the same configured value.
    np.random.seed(int(args['random_seed']))
    tf.set_random_seed(int(args['random_seed']))
    env.seed(int(args['random_seed']))

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    action_bound = 1

    # The actor must be created first: the critic skips the actor's variables
    # by count when collecting its own.
    actor = ActorNetwork(
        session, state_dim, action_dim, action_bound,
        float(args['actor_lr']), float(args['tau']),
        int(args['minibatch_size']), actor_net_params,
    )

    critic = CriticNetwork(
        session, state_dim, action_dim,
        float(args['critic_lr']), float(args['tau']),
        float(args['gamma']),
        actor.get_num_trainable_vars(), critic_net_params,
    )

    actor_noise = ActionNoise(mu=np.zeros(action_dim))

    # Optional gym monitor; video recording is disabled unless rendering is on.
    if args['use_gym_monitor'] and not args['render_env']:
        env = wrappers.Monitor(
            env, args['monitor_dir'], video_callable=False, force=True)
    elif args['use_gym_monitor']:
        env = wrappers.Monitor(env, args['monitor_dir'], force=True)

    summary_ops, summary_vars = build_summaries()
    session.run(tf.global_variables_initializer())

    pretrain(session, env, args, actor, summary_ops, summary_vars)
    train(session, env, args, actor, critic, actor_noise, summary_ops, summary_vars)

    if args['use_gym_monitor']:
        env.monitor.close()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: Could not seed environment <PortfolioEnv instance>[0m


RuntimeError: Attempted to use a closed Session.