In [11]:
import pandas as pd
from datetime import datetime, timedelta
from collections import deque
import random
import numpy as np
import tensorflow as tf
import tflearn
import itertools
import os

import utils.for_tf as util
from env.portfolio import *
import utils.markets.indicators as ti
from replay_buffer.replay_buffer import PrioritizedReplayBuffer as RBProportional

In [5]:
prices_raw = pd.read_csv('./data/price_data')
prices_raw['Date'] = prices_raw['Date'].apply(lambda x: datetime.strptime(x,"%Y-%m-%d %H:%M:%S"))
volumes_raw = pd.read_csv('./data/volume_data')
volumes_raw['Date'] = volumes_raw['Date'].apply(lambda x: datetime.strptime(x,"%Y-%m-%d %H:%M:%S"))
volumes_raw.set_index('Date',inplace=True)

volumes_raw.columns = pd.MultiIndex.from_product([[i for i in volumes_raw.columns],['vol']])

prices_raw.set_index('Date',inplace=True)

In [6]:
abbr = prices_raw.columns

In [7]:
resampled_price = prices_raw.resample("1h").ohlc().bfill()
resampled_vol = volumes_raw.resample("1h").asfreq().bfill()

print(np.count_nonzero(np.isnan(resampled_price)),np.count_nonzero(np.isnan(resampled_vol)),len(resampled_price))

0 0 8760


In [8]:
def add_ind(df):
    tmp = df
    for a in [50,100,200]:
        tmp = ti.moving_average(tmp,a)
    tmp = ti.macd(tmp,12,26)
    tmp = ti.relative_strength_index(tmp,10)
    return tmp

In [9]:
resampled_price.columns = pd.MultiIndex.from_product([abbr,['Open','High','Low','Close']], names=('coins', 'feature'))
#history_ind = {name : add_ind(pd.concat([resampled_price[name],resampled_vol[name]],axis=1)).dropna().values for name in abbr}
history = {name : pd.concat([resampled_price[name],resampled_vol[name]],axis=1).values for name in abbr}

Feature engineering

In [10]:
data_shape = [len(history),500,history['btc'].shape[1]]
print(data_shape)

[10, 500, 5]


In [8]:
class Agent(object):
    def __init__(self, network, obs_dim,
            num_actions, gamma=0.9, lam=0.95, reuse=tf.AUTO_REUSE):
        self.num_actions = num_actions
        self.gamma = gamma
        self.lam = lam
        self.t = 0
        self.obss = []
        self.actions = []
        self.rewards = []
        self.values = []
        self.next_values = []

        act, train, update_old, backup_current = build_train(
            network=network,
            obs_dim=obs_dim,
            num_actions=num_actions,
            gamma=gamma,
            reuse=reuse
        )
        self._act = act
        self._train = train
        self._update_old = update_old
        self._backup_current = backup_current

    def act(self, obs):
        return self._act([obs])[0][0]

    def act_and_train(self, last_obs, last_action, last_value, reward, obs):
        action, value = self._act([obs])
        action = action[0]
        value = value[0]
        action = np.clip(action, -2, 2)

        if last_obs is not None:
            self._add_trajectory(
                last_obs,
                last_action,
                reward,
                last_value,
                value
            )

        self.t += 1
        return action, value

    def train(self, obs, actions, returns, deltas):
        self._backup_current()
        loss, value_loss, ratio = self._train(obs, actions, returns, deltas)
        print(loss, value_loss, ratio)
        self._update_old()
        return ratio

    def stop_episode(self, last_obs, last_action, last_value, reward):
        self._add_trajectory(
            last_obs,
            last_action,
            reward,
            last_value,
            0
        )

    def _reset_trajectories(self):
        self.obss = []
        self.rewards = []
        self.actions = []
        self.values = []
        self.next_values = []

    def _add_trajectory(self, obs, action, reward, value, next_value):
        self.obss.append(obs)
        self.actions.append(action)
        self.rewards.append(reward)
        self.values.append(value)
        self.next_values.append(next_value)

    def get_training_data(self):
        obss = list(self.obss)
        actions = list(self.actions)
        deltas = []
        returns = []
        V = 0
        for i in reversed(range(len(self.obss))):
            reward = self.rewards[i]
            value = self.values[i]
            next_value = self.next_values[i]
            delta = reward + self.gamma * next_value - value
            V = delta + self.lam * self.gamma * V
            deltas.append(V)
            returns.append(V + value)
        deltas = np.array(list(reversed(deltas)), dtype=np.float32)
        returns = np.array(list(reversed(returns)), dtype=np.float32)
        # standardize advantages
        deltas = (deltas - deltas.mean()) / (deltas.std() + 1e-5)
        self._reset_trajectories()
        return obss, actions, list(returns), list(deltas)

    def sync_old(self):
        self._update_old()

In [57]:

def build_train(network, obs_dim,
            num_actions, gamma=1.0, epsilon=0.2, beta=0.01, scope='ppo', reuse=tf.AUTO_REUSE):
    with tf.variable_scope(scope, reuse=reuse):
        # input placeholders
        obs_t_input = tf.placeholder(tf.float32, [None] + obs_dim, name='obs_t')
        act_t_ph = tf.placeholder(tf.float32, [None, num_actions], name='action')
        return_t_ph = tf.placeholder(tf.float32, [None, 1], name='return')
        advantage_t_ph = tf.placeholder(tf.float32, [None, 1], name='advantage')

        policy, value, dist = network(
            obs_t_input,
            num_actions,
            scope='network',
            reuse=reuse
        )
        network_func_vars = util.scope_vars(
            util.absolute_scope_name('network'),
            trainable_only=True
        )

        old_policy, old_value, old_dist = network(
            obs_t_input,
            num_actions,
            scope='old_network',
            reuse=reuse
        )
        old_network_func_vars = util.scope_vars(
            util.absolute_scope_name('old_network'),
            trainable_only=True
        )

        tmp_policy, tmp_value, tmp_dist = network(
            obs_t_input,
            num_actions,
            scope='tmp_network',
            reuse=reuse
        )
        tmp_network_func_vars = util.scope_vars(
            util.absolute_scope_name('tmp_network'),
            trainable_only=True
        )

        # clipped surrogate objective
        cur_policy = dist.log_prob(act_t_ph + 1e-5)
        old_policy = old_dist.log_prob(act_t_ph + 1e-5)
        ratio = tf.exp(cur_policy - old_policy)
        clipped_ratio = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon)
        surrogate = -tf.reduce_mean(
            tf.minimum(ratio, clipped_ratio) * advantage_t_ph,
            name='surrogate'
        )

        with tf.variable_scope('loss'):
            # value network loss
            value_loss = tf.reduce_mean(tf.square(value - return_t_ph))

            # entropy penalty for exploration
            entropy = tf.reduce_mean(dist.entropy())
            penalty = -beta * entropy

            # total loss
            loss = surrogate + value_loss + penalty

        # optimize operations
        optimizer = tf.train.AdamOptimizer(3 * 1e-4)
        optimize_expr = optimizer.minimize(loss, var_list=network_func_vars)

        # update old network operations
        with tf.variable_scope('update_old_network'):
            update_old_expr = []
            sorted_tmp_vars = sorted(
                tmp_network_func_vars,
                key=lambda v: v.name
            )
            sorted_old_vars = sorted(
                old_network_func_vars,
                key=lambda v: v.name
            )
            for var_tmp, var_old in zip(sorted_tmp_vars, sorted_old_vars):
                update_old_expr.append(var_old.assign(var_tmp))
            update_old_expr = tf.group(*update_old_expr)

        # update tmp network operations
        with tf.variable_scope('update_tmp_network'):
            update_tmp_expr = []
            sorted_vars = sorted(network_func_vars, key=lambda v: v.name)
            sorted_tmp_vars = sorted(
                tmp_network_func_vars,
                key=lambda v: v.name
            )
            for var, var_tmp in zip(sorted_vars, sorted_tmp_vars):
                update_tmp_expr.append(var_tmp.assign(var))
            update_tmp_expr = tf.group(*update_tmp_expr)

        # action theano-style function
        act = util.function(inputs=[obs_t_input], outputs=[policy, value])

        # train theano-style function
        train = util.function(
            inputs=[
                obs_t_input, act_t_ph, return_t_ph, advantage_t_ph
            ],
            outputs=[loss, value_loss, tf.reduce_mean(ratio)],
            updates=[optimize_expr]
        )

        # update target theano-style function
        update_old = util.function([], [], updates=[update_old_expr])
        backup_current = util.function([], [], updates=[update_tmp_expr])

        return act, train, update_old, backup_current

In [58]:
def _make_network(inpt, num_actions, scope='network', reuse=tf.AUTO_REUSE):
    with tf.variable_scope(scope, reuse=reuse):
        out = inpt

        conv_1 = tflearn.layers.conv_2d(inpt, 4, [1, 16],activation='leaky_relu', padding="valid", bias=True,
                                            name='conv_1')

        conv_2 = tflearn.layers.conv_2d(conv_1, 8, [1,128-16+1],
                                        activation='leaky_relu', 
                                        padding="valid", bias=True,
                                        regularizer='L2',
                                        weight_decay=1e-5,
                                        name='conv_2') 

        with tf.name_scope('Dense'):
            net = tflearn.fully_connected(inpt, 2*10, name='1_dense')
            net = tflearn.activations.leakyrelu(net,name='1_dense_LRelu')
            net = tflearn.fully_connected(net, 10, name='2_dense')
            net = tflearn.activations.leakyrelu(net,name='2_dense_LRelu')
            net = tf.reshape(net,shape=[-1,10,1,1])
        with tf.name_scope('merge'):
            out = tf.concat([conv_2, net], axis = -1)
            out = tf.layers.flatten(out)

        # policy branch
        mu = tf.layers.dense(
            out,
            num_actions,
            kernel_initializer=tf.random_uniform_initializer(
                minval=-3e-3,
                maxval=3e-3
            ),
            name='mu3'
        )
        mu = tf.nn.tanh(mu + 1e-5)

        sigma = tf.layers.dense(
            out,
            num_actions,
            kernel_initializer=tf.random_uniform_initializer(
                minval=-3e-3,
                maxval=3e-3
            ),
            name='sigma3'
        )
        sigma = tf.nn.softplus(sigma + 1e-5)

        dist = tf.distributions.Normal(mu, sigma)
        policy = tf.nn.softmax(dist.sample())
        
        # value branch
        
        value = tf.layers.dense(
            out,
            1,
            kernel_initializer=tf.random_uniform_initializer(
                minval=-3e-3,
                maxval=3e-3
            ),
            name='d3'
        )
    return policy, value, dist

def make_network():
    return lambda *args, **kwargs: _make_network(*args, **kwargs)

In [59]:
def build_summaries():
    pol_sum = [tf.placeholder(tf.float64) for i in range(len(abbr))]
    for ind, x in enumerate(abbr):
        tf.summary.scalar("%s_policy"%x, pol_sum[ind])

    reward_summary = tf.placeholder(tf.float32, (), name='Reward')
    tf.summary.scalar('Reward', reward_summary)

    summary_vars = [reward_summary] + pol_sum
    summary_ops = tf.summary.merge_all()
    
    vardic = {v.name: v for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)}
    hist = []
    for key, value in vardic.items():
        hist.append(tf.summary.histogram(key[:-2], value))
    hist_op = tf.summary.merge(hist)   

    return hist_op, summary_ops, summary_vars

In [60]:
args = {
'outdir': './logs/ppo/out',
'logdir': './logs/ppo/log',
'load': None,
'final_steps': 10 ** 7,
'batch': 64,
'epoch': 100
}

In [68]:
!rm -r ./logs/ppo/
!mkdir ./logs/ppo

In [69]:
tf.reset_default_graph()

env = PortfolioEnv(np.array(list((history.values()))),\
                       abbr,steps=7000, \
                       window_length=128, 
                       trading_cost=0,\
                       sample_start_date='2017-05-10 19:00:00',\
                       start_idx=128)

obs_dim = list(env.observation_space.shape)
n_actions = env.action_space.shape[0]

network = make_network()

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    
    agent = Agent(network, obs_dim, n_actions)

    util.initialize()
    #agent.sync_old()

    saver = tf.train.Saver()

    if args['load'] is not None:
        saver.restore(sess, args['load'])
        
    hist_op, summary_ops, summary_vars = build_summaries()

    global_step = 0
    episode = 0
    while True:
        local_step = 0

        while True:
            training_data = []
            sum_of_reward = 0
            reward = 0
            obs = env.reset()[0]
            last_obs = None
            last_action = None
            last_value = None
            done = False
            train_writer = tf.summary.FileWriter('%s/train/episode_%s' % (args['logdir'], episode), sess.graph)
            while not done:
                action, value = agent.act_and_train(
                        last_obs, last_action, last_value, reward,  obs)

                last_obs = obs
                last_action = action
                last_value = value
                obs, reward, done, info, y = env.step(action)

                sum_of_reward += reward
                global_step += 1
                local_step += 1

                # save model
                if global_step % 20 ** 6 == 0:
                    path = os.path.join(args['outdir'],
                            '{}/model.ckpt'.format(global_step))
                    saver.save(sess, path)

                # summary
                if (global_step % 7001) % 10 == 1:
                    feed_dict={
                        summary_vars[0]: sum_of_reward
                    }
                    pol_dict = {}
                    for k in range(len(action)):
                        pol_dict[summary_vars[1+k]] = action[k]
                    feed_dict.update(pol_dict)

                    summary_str = sess.run(summary_ops, feed_dict=feed_dict) 
                    train_writer.add_summary(summary_str, global_step % 7001)
                    train_writer.flush()
                    
                #end of episode    
                if done:
                    agent.stop_episode(
                            last_obs, last_action, last_value, reward)
                    print(
                        'Episode: {}, Step: {}: Reward: {}'.format(
                        episode,
                        global_step,
                        sum_of_reward
                    ))
                    episode += 1
                    break
                    
            # append data for training
            training_data.append(agent.get_training_data())

            if local_step > 2048:
                break

        # train network
        obs = []
        actions = []
        returns = []
        deltas = []
        k = 0
        for o, a, r, d in training_data:
            obs.extend(o)
            actions.extend(a)
            returns.extend(r)
            deltas.extend(d)
        for epoch in range(50):       
            indices = range(k,(k+1)*args['batch'])
            sampled_obs = np.array(obs)[indices]
            sampled_actions = np.array(actions)[indices]
            sampled_returns = np.array(returns)[indices]
            sampled_deltas = np.array(deltas)[indices]
            ratio = agent.train(
                sampled_obs,
                sampled_actions,
                sampled_returns,
                sampled_deltas
            )
            k += 1
            summary_hist = sess.run(hist_op) 
            train_writer.add_summary(summary_hist, epoch)
            train_writer.flush()

        if args['final_steps'] < global_step:
            break

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Episode: 0, Step: 7001: Reward: 2.877212305978641
-0.24707846 9.507516e-05 0.9996441
-0.38075662 9.2216585e-05 1.0034016
-0.7227026 0.00047442105 1.0041225
-0.3937245 0.0009817599 1.0049675
-0.2966677 0.0013893408 1.0064046
-0.2883636 0.0012458336 1.0077386
-0.24225627 0.001103835 1.0092189
-0.2736369 0.0009131008 1.0107764
-0.23813507 0.00078166486 1.0125216
-0.27875057 0.00073807716 1.0142123
-0.25138706 0.0008039543 1.0159403
-0.23959762 0.0008694541 1.0176742
-0.2694517 0.00080885115 1.0193305
-0.23108041 0.0006801973 1.0214344
-0.20063347 0.0006311163 1.0242211
-0.13394165 0.00079379434 1.027716
-0.12428826 0.00078722305 1.0306537
-0.094245136 0.0006747592 1.03431
-0.09336728 0.0005894797 1.0377527
-0.067096904 0.00067417894 1.0419908
-0.033223186 0.00070248486 1.04540

InvalidArgumentError: Nan in summary histogram for: ppo/network/conv_2/W_1
	 [[Node: ppo/network/conv_2/W_1 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](ppo/network/conv_2/W_1/tag, ppo/network/conv_2/W/read)]]

Caused by op 'ppo/network/conv_2/W_1', defined at:
  File "/Users/gidman/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/gidman/anaconda3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2802, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-69-ef77825fe786>", line 27, in <module>
    hist_op, summary_ops, summary_vars = build_summaries()
  File "<ipython-input-59-14bc663dd031>", line 15, in build_summaries
    hist.append(tf.summary.histogram(key[:-2], value))
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/tensorflow/python/summary/summary.py", line 203, in histogram
    tag=tag, values=values, name=scope)
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_logging_ops.py", line 283, in histogram_summary
    "HistogramSummary", tag=tag, values=values, name=name)
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
    op_def=op_def)
  File "/Users/gidman/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1718, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): Nan in summary histogram for: ppo/network/conv_2/W_1
	 [[Node: ppo/network/conv_2/W_1 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](ppo/network/conv_2/W_1/tag, ppo/network/conv_2/W/read)]]
