In [None]:
import datetime as dt
import os

import gym
import pandas as pd
import quantstats as qs
from stable_baselines3 import A2C, DQN, PPO
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback, CallbackList
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

from dailyenv import DailyTradingEnv

In [None]:
# --- Experiment configuration -------------------------------------------
n_tickers = 5
experiment_name = 'small_policy'

ticker_universe = [
    "AMM", "CIMB", "DIGI", "GAM", "GENM", "GENT", "HLBK", "IOI", "KLK", "MAY",
    "MISC", "NESZ", "PBK", "PEP", "PETD", "PTG", "RHBBANK", "ROTH", "T", "TNB",
]

# A single-ticker run is pinned to NESZ; otherwise take the first n_tickers
# entries of the universe.
tickers = ['NESZ'] if n_tickers == 1 else ticker_universe[:n_tickers]

# Pick a run name whose tensorboard directory does not exist yet by appending
# "(1)", "(2)", ... to the base name until the path is free.
base_experiment_name = experiment_name
run_suffix = 0
while os.path.exists(f'tensorboard_log/{experiment_name}_train'):
    run_suffix += 1
    experiment_name = f'{base_experiment_name}({run_suffix})'

# Monitor-wrapped environments: 2010-2018 for training, 2018-2020 for evaluation.
train_env = Monitor(
    DailyTradingEnv(tickers, dt.datetime(2010, 1, 1), dt.datetime(2018, 1, 1), f'{experiment_name}_train')
)
eval_env = Monitor(
    DailyTradingEnv(tickers, dt.datetime(2018, 1, 1), dt.datetime(2020, 1, 1), f'{experiment_name}_eval')
)

In [None]:
# Test threshold baselines
#
# For each integer threshold 0..10, run one full episode in which every ticker
# whose observation satisfies obs[k] * 10 >= threshold is selected; if no
# ticker qualifies, slot 0 of the action vector is selected instead.
# (Slot 0 is presumably the cash / no-position slot -- confirm in dailyenv.)

baseline_train_env = DailyTradingEnv(tickers, dt.datetime(2010, 1, 1), dt.datetime(2018, 1, 1))
baseline_val_env = DailyTradingEnv(tickers, dt.datetime(2018, 1, 1), dt.datetime(2020, 1, 1))

# Only the training baseline env gets the pretrain flag, as in the original cell.
baseline_train_env.pretrain = True

n_assets = len(tickers)
for baseline_env in (baseline_train_env, baseline_val_env):
    for threshold in tqdm(range(11)):
        obs = baseline_env.reset()
        done = False
        while not done:
            action = np.zeros(n_assets + 1)
            for slot in range(n_assets):
                if obs[slot] * 10 >= threshold:
                    action[slot + 1] = 1
            # Nothing cleared the threshold: fall back to slot 0.
            if action.sum() == 0:
                action[0] = 1

            obs, reward, done, _ = baseline_env.step(action)

In [None]:
# 2x3 grid: rows are the train / eval baseline envs, columns are the metrics
# (mean reward, CAGR, Sharpe) passed to plot_change for the threshold sweep.
fig, axs = plt.subplots(2, 3, figsize=(15, 10))

column_metrics = ('rewards', qs.stats.cagr, qs.stats.sharpe)
for row_axes, baseline_env in zip(axs, (baseline_train_env, baseline_val_env)):
    for ax, metric in zip(row_axes, column_metrics):
        baseline_env.plot_change(metric, ax)

# Row labels on the left-most column.
for row_axes, row_label in zip(axs, ('Train', 'Eval')):
    row_axes[0].set_ylabel(row_label, fontsize=18)

# Column titles on the top row, x-axis labels on the bottom row.
for top_ax, bottom_ax, column_title in zip(axs[0], axs[1], ('reward', 'CAGR', 'Sharpe')):
    top_ax.set_title(column_title, fontsize=16)
    bottom_ax.set_xlabel('threshold', fontsize=16)

fig.suptitle('Attention', fontsize=20)
# fig.savefig('linear_gru.png')

Training

In [None]:
class ProgressBarCallback(BaseCallback):
    """Training callback that drives a tqdm progress bar and the env loggers.

    On every callback step it advances the progress bar by one and, for the
    notebook-level ``train_env`` and ``eval_env`` globals, calls
    ``tf_logger.inc_step()`` when that logger is not ``None``.

    Args:
        total_steps: Value passed as ``total`` to the tqdm progress bar.
    """

    def __init__(self, total_steps):
        self.total_steps = total_steps
        # Created in _on_training_start; kept as None until training begins.
        self.pbar = None
        super().__init__()

    def _on_training_start(self):
        self.pbar = tqdm(total=self.total_steps)

    def _on_step(self):
        """Advance the bar and loggers; always ask SB3 to keep training.

        Returns:
            bool: ``True``. Stable-Baselines3 documents ``_on_step`` as
            returning a bool where ``False`` aborts training; the previous
            implicit ``None`` is falsy and can end training after the first
            step on SB3 versions that check truthiness.
        """
        self.pbar.update()
        for env in (train_env, eval_env):
            if env.tf_logger is not None:
                env.tf_logger.inc_step()
        return True

    def _on_training_end(self):
        # Guard so a training end without a matching start does not raise.
        if self.pbar is not None:
            self.pbar.close()

In [None]:
# Total number of environment steps to train for.
ts = 1_000_000

# Progress bar + periodic evaluation (one episode every ts // 100 callback steps).
progress_cb = ProgressBarCallback(ts)
evaluation_cb = EvalCallback(eval_env, n_eval_episodes=1, eval_freq=ts // 100)
training_callbacks = CallbackList([progress_cb, evaluation_cb])

# Small policy: one hidden layer of 5 units each for the policy (pi) and value (vf) nets.
small_policy_kwargs = {'net_arch': [{'pi': [5], 'vf': [5]}]}
model = A2C(
    'MlpPolicy',
    train_env,
    device='cpu',
    gamma=0,
    n_steps=5,
    policy_kwargs=small_policy_kwargs,
)
#model = PPO('MlpPolicy', train_env, device='cpu', gamma=0)
model.learn(total_timesteps=ts, callback=training_callbacks, log_interval=ts)

In [None]:
# rewards, cagr, and sharpe
#
# Re-draw the threshold-baseline sweep (train row / eval row) and overlay the
# trained agent's result as an orange horizontal line on each panel.
fig, axs = plt.subplots(2, 3, figsize=(15, 10))

column_metrics = ('rewards', qs.stats.cagr, qs.stats.sharpe)
for row_axes, baseline_env in zip(axs, (baseline_train_env, baseline_val_env)):
    for ax, metric in zip(row_axes, column_metrics):
        baseline_env.plot_change(metric, ax)

axs[0][0].set_ylabel('Train')
axs[1][0].set_ylabel('Eval')
for top_ax, bottom_ax, column_title in zip(axs[0], axs[1], ('reward', 'CAGR', 'Sharpe')):
    top_ax.set_title(column_title)
    bottom_ax.set_xlabel('threshold')

# The title previously hard-coded "20-stock"; report the portfolio size actually used.
fig.suptitle(f'RL-baseline comparison on {len(tickers)}-stock portfolio', fontsize=16)

# Agent results: get_series(name, -1) presumably selects the most recent
# episode -- confirm against DailyTradingEnv.
for row_axes, rl_env in zip(axs, (train_env, eval_env)):
    row_axes[0].axhline(np.mean(rl_env.get_series('rewards', -1)), color='orange')
    row_axes[1].axhline(qs.stats.cagr(rl_env.get_series('bankroll', -1)), color='orange')
    row_axes[2].axhline(qs.stats.sharpe(rl_env.get_series('bankroll', -1)), color='orange')

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('graphs', exist_ok=True)
fig.savefig(f'graphs/{experiment_name}.png')
# train_env.save_logs(f'saves/{experiment_name}.npz')
# model.save(f'saves/{experiment_name}')

In [None]:
# Report for the training environment (report() is defined in dailyenv; its
# exact output is not visible from this notebook).
train_env.report()

In [None]:
# Same report for the evaluation environment, for comparison with the
# training report.
eval_env.report()

1. sanity check x 1000, verify all stats and graphs that it's working
2. can it fit all 20 stocks in portfolio (higher dimensionality might make some parameters bad etc)
3. next steps (E.g. limits, costs)
- We're trading on good predictions for the train set, but when the predictions for the val set are bad, the strategy breaks down

Notes
- Single-stock run overfits heavily: it can fit to the best baseline on the train set but absolutely bombs the validation run
- 5 stocks actually performing negatively on val set
- 20 tickers run significantly slower than 1 ticker; maybe we can change the data structure from pandas to numpy
- what do the dividends mean?