In [1]:
"""
Actor

function 1, make decision: 

    input (s_t), output(action_t)

function 2, update decision model:

    input (learning_rate, G_t), output(new model)
    
function 3, update the target actor network

"""


import tensorflow as tf
from tensorlayer.layers import (BatchNorm, Conv2d, Dropout , Dense, Flatten, Input, LocalResponseNorm, MaxPool2d)
from tensorlayer.models import Model
import tensorlayer as tl
import numpy as np
import os 

In [None]:
# ===========================
#   Actor
# ===========================


class Actor_NetWork(object):
    '''
    Monte Carlo policy-gradient (REINFORCE style) actor for portfolio weights.

    gamma: discount factor applied to the return
    state_shape: shape of the network input; indices 1, 2 and 3 are read as
        (m_stock, historic_window, feature), index 0 is the batch axis
    action_shape: shape of an action batch (stored, not used by the class itself)
    a_learning_rate: learning rate of the Adam optimizer that trains the actor
    '''

    # Standard deviation of the Gaussian exploration noise added to the policy
    # output. The same value is used in `learn` to score the taken actions.
    EXPLORATION_STD = 0.01

    def __init__(self, gamma, state_shape, action_shape, a_learning_rate):

        self.gamma = gamma  # discount factor

        self.state_shape = state_shape  # should be [None, m_stock, historic_window, feature]
        self.action_shape = action_shape  # should be [None, m_stock]

        # Actor network and its optimizer
        self.actor_learning_rate = a_learning_rate
        self.actor_network = self.get_cnn_actor_model(self.state_shape, "Actor_Network")  # tensorlayer model
        self.actor_network.train()
        self.actor_opt = tf.optimizers.Adam(self.actor_learning_rate)

    def Generate_action(self, states, greedy = False):
        '''
        Produce portfolio weights for a batch of states.

        states: input accepted by the actor network. The network is built from
            `state_shape`, which carries a leading batch axis.
            NOTE(review): the earlier docstring listed the un-batched shape
            [m_stock, historic_window, feature] - confirm what the environment
            actually supplies.
        greedy: when True the raw network output is returned unchanged; when
            False Gaussian exploration noise is added and the result is clipped
            to [0, 1] and re-normalised.

        returns: the network output (greedy) or a float32 numpy array whose
            rows each sum to 1 (exploration).
        '''
        # Act in evaluation mode so Dropout is disabled and BatchNorm uses its
        # moving statistics; `learn` switches back to training mode.
        self.actor_network.eval()
        policy_out = self.actor_network(states)

        if greedy:
            return policy_out

        weights = np.asarray(policy_out, dtype=np.float64)
        weights = weights + np.random.normal(0, self.EXPLORATION_STD, weights.shape)
        weights = np.clip(weights, 0, 1)  # values outside the interval are clipped to the interval edges
        # Normalise every sample on its own. Dividing by the sum of the whole
        # array would make a batch of k portfolios sum to 1/k each.
        weights = weights / np.sum(weights, axis=-1, keepdims=True)
        return weights.astype(np.float32)

    def learn(self, inputs, t, Gt):
        '''
        Apply one Monte Carlo policy-gradient update to the actor.

        inputs: (states_t, actions_t, rewards_t, states_t+1); only the states
            and the taken actions are used here
        inputs shape: [[batch_size, m_stock, historic_window, feature], [batch_size, actions],
            [batch_size], [batch_size, m_stock, historic_window, feature]]
        t: time step(s) of the samples, used as the exponent of gamma
        Gt: return(s) observed from step t onwards

        The update minimises -gamma^t * G_t * log pi(a_t | s_t), where pi is
        taken to be a Gaussian centred on the network output with standard
        deviation EXPLORATION_STD, matching the noise in `Generate_action`.
        NOTE(review): clipping and re-normalisation in `Generate_action` make
        this likelihood an approximation.
        '''
        states = inputs[0]
        taken_actions = tf.cast(inputs[1], tf.float32)

        discount_factor = tf.constant(self.gamma ** t, dtype=tf.float32)
        G_t = tf.constant(Gt, dtype=tf.float32)
        # Shape [1] for scalar t / Gt, [batch] for per-sample values; both
        # broadcast against the per-sample log-likelihood below.
        return_weight = tf.reshape(discount_factor * G_t, [-1])

        self.actor_network.train()
        with tf.GradientTape() as tape:
            policy_mean = self.actor_network(states)
            # Align stored actions (e.g. [batch, 1, m_stock]) with the network output.
            target_actions = tf.reshape(taken_actions, tf.shape(policy_mean))
            squared_error = tf.reduce_sum(tf.square(target_actions - policy_mean), axis=-1)
            log_prob = -squared_error / (2.0 * self.EXPLORATION_STD ** 2)
            # Optimizers minimise, so the objective is negated to ascend it.
            loss = -tf.reduce_mean(return_weight * log_prob)

        actor_grads = tape.gradient(loss, self.actor_network.trainable_weights)
        self.actor_opt.apply_gradients(zip(actor_grads, self.actor_network.trainable_weights))

    def _weights_file(self):
        '''Return the path of the hdf5 file used by `save` and `load`.'''
        return os.path.join('model', '_'.join(["MCPG", "PM"]), 'actor.hdf5')

    def save(self):
        """
        save trained weights
        :return: None
        """
        weights_file = self._weights_file()
        os.makedirs(os.path.dirname(weights_file), exist_ok=True)  # create the dir when missing
        tl.files.save_weights_to_hdf5(weights_file, self.actor_network)

    def load(self):
        """
        load trained weights
        :return: None
        """
        tl.files.load_hdf5_to_weights_in_order(self._weights_file(), self.actor_network)

    def get_cnn_actor_model(self, inputs_shape, model_name):
        '''
        Build the convolutional actor network.

        inputs_shape: [batch, m_stock, historic_window, feature]
        model_name: name given to the tensorlayer Model

        returns: a tensorlayer Model whose output is a softmax over m_stock values
        '''
        # self defined initialization
        stock_num = inputs_shape[1]
        his_window = inputs_shape[2]
        feature_num = inputs_shape[3]
        W_init = tl.initializers.truncated_normal(stddev=5e-2)
        W_init2 = tl.initializers.truncated_normal(stddev=0.04)
        b_init2 = tl.initializers.constant(value=0.1)

        # build network
        ni = Input(inputs_shape)
        # 1x1 convolution: mixes the features of each (stock, time) cell
        nn = Conv2d(feature_num, (1, 1), (1, 1), padding='SAME', act=tf.nn.relu, W_init=W_init, b_init=None, name='conv1')(ni)
        nn = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool1')(nn)

        # convolution along the time axis with a kernel as wide as the history window
        nn = Conv2d(feature_num, (1, his_window), (1, 1), padding='SAME', act=tf.nn.relu, W_init=W_init, b_init=None, name='conv2')(nn)
        nn = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool2')(nn)

        nn = Flatten(name='flatten')(nn)
        nn = Dropout(keep=0.5)(nn)
        nn = Dense(32, act=tf.nn.relu, W_init=W_init2, b_init=b_init2, name='dense1relu')(nn)
        nn = BatchNorm()(nn)
        nn = Dense(32, act=tf.nn.relu, W_init=W_init2, b_init=b_init2, name='dense2relu')(nn)
        nn = BatchNorm()(nn)
        # softmax head: one non-negative weight per stock, summing to 1
        nn = Dense(stock_num, act=tf.nn.softmax, W_init=W_init2, name='output')(nn)
        M = Model(inputs=ni, outputs=nn, name=model_name)
        return M
           

        

In [None]:
import time
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import tensorflow as tf
import tensorlayer as tl
import matplotlib.pyplot as plt
import os


# import our model and env
from env import PortfolioEnv
from ddpg import actor_critic ,replay_buffer

In [None]:
import fix_yahoo_finance as yf

# Ticker symbols of the traded universe, in the order used everywhere below.
stocks = [
    'amt', 'axp', 'ba', 'cvx', 'jnj',
    'ko', 'mcd', 'msft', 't', 'wmt',
]

# Download window passed to the price request.
start = datetime(year=2012, month=1, day=1)
end = datetime(year=2022, month=11, day=11)

# Fetch the price history for all tickers in one request.
data = yf.download(stocks, start=start, end=end)

# Quick visual check of the adjusted closing prices.
data_close = data['Adj Close']
data_close.plot()

In [None]:
# using US stock

# Yahoo field -> lower-case name used downstream. Columns are selected and
# renamed by label. An alphabetical column sort orders the fields as
# 'Adj Close', 'High', 'Low', 'Open', 'Volume', so relabelling by position
# after such a sort files adjusted closes under 'open' and opens under 'close'.
price_fields = {
    'Open': 'open',
    'High': 'high',
    'Low': 'low',
    'Adj Close': 'close',
    'Volume': 'volume',
}

df = data[list(price_fields)]  # the unadjusted 'Close' field is left out
df = df.rename(columns=price_fields, level=0)
# Match ticker labels to `stocks` regardless of the case the download used.
df = df.rename(columns=lambda ticker: str(ticker).lower(), level=1)
df.columns = df.columns.swaplevel(0, 1)

expected_columns = pd.MultiIndex.from_product([stocks, list(price_fields.values())],
                                              names=['stock', 'price'])
missing_columns = expected_columns.difference(df.columns)
if len(missing_columns) > 0:
    # Fail loudly: reindexing would otherwise add all-NaN columns and the
    # dropna below would silently remove every row.
    raise KeyError("downloaded data lacks columns: {}".format(list(missing_columns)))
df = df.reindex(columns=expected_columns)
df.columns = expected_columns
df = df.dropna()

start = datetime.strptime("2013-01-01", '%Y-%m-%d' )
df_ = df.loc[start:]

# set the env input
history = df_ # the stock data
abbreviation = stocks # name of stock
steps = 100 # step for one eps
trading_cost = 0.0025
time_cost=0.00 # cost of holding equity
window_length = 10 # for obs
eps_move = 0 # move the start date after each reset
sample_start_date = "2018-05-01"

# build the replay_buffer
buffer_size = 50 # the max size of buffer

# build the actor, critic, and target network 
gamma = 0.99986 # discount factor
state_shape = [None,len(abbreviation),window_length,5] # (none, m_stock, history_window, feature)
action_shape = [None,len(abbreviation)] # (none, m_stock)
a_learning_rate = 0.001 # learning rate for actor


In [None]:
# build our agent
# Actor_Critic_NetWork(gamma, state_shape, action_shape, a_learning_rate, target_lr, c_learning_rate)

# The constructor takes two hyper-parameters that no earlier cell defines, so
# the call raised NameError on a fresh kernel. They are defined here.
# NOTE(review): the values below are placeholders - TODO confirm the intended
# settings for this experiment.
target_lr = 0.01         # learning rate of the target networks
c_learning_rate = 0.002  # learning rate for critic

# The training loop refers to the agent as `ac`; `a` is kept as an alias for
# the name this cell originally bound.
# NOTE(review): the notebook-local `Actor_NetWork` has a different
# `learn(inputs, t, Gt)` signature and is not a drop-in for that loop.
ac = actor_critic.Actor_Critic_NetWork(gamma, state_shape, action_shape, a_learning_rate, target_lr, c_learning_rate)
a = ac

In [None]:
RANDOM_SEED = 1234

# Trading environment, configured from the settings defined earlier.
PM_env = PortfolioEnv.PortfolioEnv(
    history,
    abbreviation,
    steps,
    trading_cost,
    time_cost,
    window_length,
    eps_move,
    sample_start_date,
)

# Replay memory, created with the buffer size and the seed above.
rb = replay_buffer.ReplayBuffer(
    buffer_size,
    RANDOM_SEED,
)

In [None]:
# Seed both random number generators with the shared seed.
tf.random.set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Shapes and bound reported by the environment's observation / action spaces.
action_range = PM_env.action_space.high  # scale action, [-action_range, action_range]
action_dim = PM_env.action_space.shape
state_dim = PM_env.observation_space.shape

In [None]:
import matplotlib.pyplot as plt
import os

TRAIN_EPISODES = 5

batch_size = 50 # mini batch

train = True
RENDER = False

ALG_NAME = "DDPG"
ENV_ID = "Portfolio_Env"

t0 = time.time()
if train:  # train
    # Exponentially smoothed discounted episode rewards, one entry per episode.
    all_episode_reward = []
    for episode in range(TRAIN_EPISODES):
        state, info = PM_env.reset()
        episode_reward = 0
        for step in range(steps):
            if RENDER:
                PM_env.render()
            # Add exploration noise
            action = ac.Generate_action(state, greedy=False)
            # compute the immediate reward and move to the next step
            state_, reward, done, info = PM_env.step(action)
            rb.add(state, action, reward, state_)  # replay buffer add memory

            # Learn as soon as one full mini-batch is stored. The previous test
            # `rb.count > buffer_size` can never pass if the buffer caps its
            # count at `buffer_size`, so no update would ever run.
            # NOTE(review): assumes `rb.count` is the number of stored
            # transitions - confirm against ReplayBuffer.
            if rb.count >= batch_size:
                inputs = rb.sample_batch(batch_size)
                ac.learn(inputs)

            state = state_
            episode_reward += reward * gamma**step
            if done:
                break

        if episode == 0:
            all_episode_reward.append(episode_reward)
        else:
            # running average: 90% history, 10% newest episode
            all_episode_reward.append(all_episode_reward[-1] * 0.9 + episode_reward * 0.1)
        print(
            'Training  | Episode: {}/{}  | Episode Reward: {:.4f}  | Running Time: {:.4f}'.format(
                episode + 1, TRAIN_EPISODES, episode_reward,
                time.time() - t0
            )
        )
    ac.save()
    plt.plot(all_episode_reward)
    plt.xlabel('episode')
    plt.ylabel('smoothed discounted reward')
    os.makedirs('image', exist_ok=True)
    plt.savefig(os.path.join('image', '_'.join([ALG_NAME, ENV_ID])))

