<a href="https://colab.research.google.com/github/ammarhusain/XCS229ii-project/blob/main/xcs229ii_final_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# XCS229ii class project:

- Base algorithm : stock trading using FinRL
- Grid searching RL agent : stable baselines3
- Baseline hyperparameter optimization : [Optuna](https://optuna.readthedocs.io/en/stable/reference/trial.html)
  - Full [list](https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/utils/hyperparams_opt.py) of the optimizable hyperparameters for each algorithm


In [None]:
# #Installing FinRL
# %%capture
# !pip install git+https://github.com/AI4Finance-LLC/FinRL-Library.git

# # #Installing Optuna
# !pip install optuna
# !pip install dm_tree
# !pip install ray[tune]
from IPython.display import clear_output

#Importing the libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# matplotlib.use('Agg')
import datetime
import optuna
%matplotlib inline
from finrl.apps import config
from optuna.integration import PyTorchLightningPruningCallback
from finrl.finrl_meta.preprocessor.yahoodownloader import YahooDownloader
from finrl.finrl_meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl.finrl_meta.env_stock_trading.env_stocktrading import StockTradingEnv
from finrl.finrl_meta.env_stock_trading.env_stocktrading_np import StockTradingEnv as StockTradingEnv_numpy
from finrl.drl_agents.stablebaselines3.models import DRLAgent
from finrl.drl_agents.rllib.models import DRLAgent as DRLAgent_rllibca
from finrl.finrl_meta.data_processor import DataProcessor
import joblib
from finrl.plot import backtest_stats, backtest_plot, get_daily_return, get_baseline
import ray
from pprint import pprint

import numpy as np
import gym
from gym import spaces
import copy

import sys
sys.path.append("../FinRL-Library")

import itertools

import os
exp_name = "HypRL-FinRLbase/"
# Make sure each directory named in the FinRL config exists underneath the
# experiment root; directories that are already present are left untouched.
for _config_dir in (
    config.DATA_SAVE_DIR,
    config.TRAINED_MODEL_DIR,
    config.TENSORBOARD_LOG_DIR,
    config.RESULTS_DIR,
):
    _experiment_dir = exp_name + _config_dir
    if not os.path.exists(_experiment_dir):
        os.makedirs(_experiment_dir)

  "Distutils was imported before Setuptools. This usage is discouraged "
  'Module "zipline.assets" not found; multipliers will not be applied'


## Dataset Loading

In [None]:
%%capture
## Collecting and preprocessing data
# DATASET_INFO maps a dataset key to a two-element list:
#   [0] path of the processed pickle, located under the experiment root `exp_name`
#   [1] the ticker list from the FinRL config that is downloaded for this dataset
# The sp500 entry is intentionally disabled (commented out).
DATASET_INFO = {'dow30' : [exp_name + "dow_30_processed_full.pkl", config.DOW_30_TICKER],
                'hsi_50' : [exp_name + "hsi_50_processed_full.pkl", config.HSI_50_TICKER],
                'dax_30' : [exp_name + "dax_30_processed_full.pkl", config.DAX_30_TICKER],
                'nas100' : [exp_name + "nas_100_processed_full.pkl", config.NAS_100_TICKER],
                #'sp500' : [exp_name + "sp_500_processed_full.pkl", config.SP_500_TICKER]
                 }
# Filled by load_datasets(): dataset key -> processed DataFrame.
DATASETS = {}
# Search budget shared by the notebook: used as HypRLGridEnv.MAX_ITER and as
# the number of Optuna trials / agent steps in the experiment loop.
N_TRIALS = 30
def load_datasets():
  """Fill the module-level ``DATASETS`` dict with one DataFrame per dataset.

  For every key of ``DATASET_INFO`` the processed pickle is read from disk.
  If that pickle does not exist yet it is built first: prices are fetched with
  ``YahooDownloader``, passed through ``FeatureEngineer``, re-indexed onto the
  full (date, ticker) grid restricted to dates present in the processed data,
  sorted, missing values replaced by 0, and the result is written to the
  pickle path.
  """
  for ds, ds_info in DATASET_INFO.items():
    pickle_path = ds_info[0]
    print(f"DATASET_INFO[ds][0] {type(ds_info)} .. {ds}")
    if not os.path.exists(pickle_path):
      print(f"Processing {ds}")
      raw_prices = YahooDownloader(
          start_date='2009-01-01',
          end_date='2021-10-01',
          ticker_list=ds_info[1],
      ).fetch_data()

      feature_engineer = FeatureEngineer(
          use_technical_indicator=True,
          tech_indicator_list=config.TECHNICAL_INDICATORS_LIST,
          use_vix=True,
          use_turbulence=True,
          user_defined_feature=False,
      )
      processed = feature_engineer.preprocess_data(raw_prices)

      # Cartesian product of every calendar day in range with every ticker.
      tickers = processed["tic"].unique().tolist()
      calendar_days = list(
          pd.date_range(processed['date'].min(), processed['date'].max()).astype(str)
      )
      date_ticker_pairs = list(itertools.product(calendar_days, tickers))

      processed_full = (
          pd.DataFrame(date_ticker_pairs, columns=["date", "tic"])
          .merge(processed, on=["date", "tic"], how="left")
          # keep only the dates that actually occur in the processed data
          .loc[lambda frame: frame['date'].isin(processed['date'])]
          .sort_values(['date', 'tic'])
          .fillna(0)
      )
      processed_full.to_pickle(pickle_path)

    DATASETS[ds] = pd.read_pickle(pickle_path)


In [None]:
## Setup datasets
# Date windows as [start, end] strings. The keys TRAIN_PERIOD and
# TRAIN_EVAL_PERIOD match what get_string() / get_environments() read from
# their `params` argument.
# NOTE(review): TRADER_BOT itself is not referenced by any other cell visible
# in this notebook, and it has no 'DATASET' key.
TRADER_BOT = {
    'TRAIN_PERIOD' : ['2010-01-01', '2015-12-31'],
    'TRAIN_EVAL_PERIOD' : ['2016-01-01', '2017-12-31'],
    'TRADE_PERIOD' : ['2018-01-01', '2021-10-31'],
}

def get_string(params):
  """Return the identifier ``<DATASET>_train_<start>_<end>_train_eval_<start>_<end>``.

  ``params`` must provide 'DATASET' (str) plus 'TRAIN_PERIOD' and
  'TRAIN_EVAL_PERIOD', each indexable with [0] (start) and [1] (end) strings.
  """
  pieces = [
      params['DATASET'],
      "train",
      params['TRAIN_PERIOD'][0],
      params['TRAIN_PERIOD'][1],
      "train_eval",
      params['TRAIN_EVAL_PERIOD'][0],
      params['TRAIN_EVAL_PERIOD'][1],
  ]
  return "_".join(pieces)

def get_environments(params):
  """Build the training and train-evaluation trading environments.

  ``params`` supplies 'DATASET' (a key of ``DATASETS``) and the
  'TRAIN_PERIOD' / 'TRAIN_EVAL_PERIOD' [start, end] windows used to split it.
  Returns ``(env_train_gym, env_train_eval_gym)``, two ``StockTradingEnv``
  instances that share the same configuration.
  """
  train_start, train_end = params['TRAIN_PERIOD'][0], params['TRAIN_PERIOD'][1]
  train_set = data_split(DATASETS[params['DATASET']], train_start, train_end)
  eval_start, eval_end = params['TRAIN_EVAL_PERIOD'][0], params['TRAIN_EVAL_PERIOD'][1]
  train_eval_set = data_split(DATASETS[params['DATASET']], eval_start, eval_end)

  stock_dimension = len(train_set["tic"].unique())
  indicator_count = len(config.TECHNICAL_INDICATORS_LIST)
  # one scalar + two values per stock + one value per (indicator, stock) pair
  state_space = 1 + 2*stock_dimension + indicator_count*stock_dimension
  print(
      f"Size of dataset splits - Training: {len(train_set)} , Trading: {len(train_eval_set)}, "
      f"          Stock Dimension: {stock_dimension}, State Space: {state_space}"
  )

  env_kwargs = dict(
      hmax=100,
      initial_amount=1000000,
      buy_cost_pct=0.001,
      sell_cost_pct=0.001,
      state_space=state_space,
      stock_dim=stock_dimension,
      tech_indicator_list=config.TECHNICAL_INDICATORS_LIST,
      action_space=stock_dimension,
      reward_scaling=1e-4,
  )
  # Gym environments for the FinRL child model; the evaluation environment is
  # created with the turbulence threshold explicitly disabled.
  training_env = StockTradingEnv(df=train_set, **env_kwargs)
  evaluation_env = StockTradingEnv(df=train_eval_set, turbulence_threshold=None, **env_kwargs)
  return training_env, evaluation_env

## Build the RL environment and agent

In [None]:
# Objective for tuning : Sharpe ratio
def calculate_sharpe(df):
  """Return the Sharpe-style score of an account-value series.

  Side effect: adds (or overwrites) a ``daily_return`` column on ``df`` holding
  the one-step percentage change of ``df['account_value']``.

  The score is ``sqrt(252) * mean(daily_return) / std(daily_return)``
  (252 is presumably the number of trading days per year).

  Returns 0 whenever that ratio is undefined: the returns have zero standard
  deviation, or there are fewer than two returns so the standard deviation is
  NaN. Previously the NaN case slipped past the ``!= 0`` test and NaN was
  returned as the score.
  """
  df['daily_return'] = df['account_value'].pct_change(1)
  volatility = df['daily_return'].std()
  # A standard deviation is never negative, so "not > 0" is true exactly for
  # 0 and for NaN (every comparison with NaN is False).
  if not volatility > 0:
    return 0
  return (252**0.5) * df['daily_return'].mean() / volatility

def evaluate(hyperparameters, env_train_gym, env_train_eval_gym, total_timesteps):
  """Train a DDPG model with ``hyperparameters`` and score it.

  The model is trained for ``total_timesteps`` on the stable-baselines
  environment obtained from ``env_train_gym``; its predictions are then run on
  ``env_train_eval_gym`` and the resulting account values are scored with
  ``calculate_sharpe``, whose result is returned.
  """
  sb_train_env = env_train_gym.get_sb_env()[0]
  drl_agent = DRLAgent(env=sb_train_env)
  untrained_model = drl_agent.get_model("ddpg", model_kwargs=hyperparameters, verbose=0)
  trained_model = drl_agent.train_model(
      model=untrained_model,
      tb_log_name="ddpg_optuna",
      total_timesteps=total_timesteps,
  )
  # Account values produced by the trained model on the evaluation
  # environment; the per-step actions are not needed for the score.
  account_values, _actions = DRLAgent.DRL_prediction(
      model=trained_model,
      environment=env_train_eval_gym,
  )
  return calculate_sharpe(account_values)

def get_n_maxvalues(grid, n=20):
  """Return the ``n`` largest entries of ``grid`` together with their positions.

  Returns ``(values, indices)``: ``values`` lists the largest entries in
  descending order and ``indices[k]`` is the multi-dimensional index of
  ``values[k]`` inside ``grid``.
  """
  flat_values = grid.flatten()
  ascending_order = np.argsort(flat_values)
  # last n positions of the ascending sort, reversed -> largest first
  top_positions = ascending_order[-n:][::-1]
  coordinates = np.unravel_index(top_positions, grid.shape)
  return (flat_values[top_positions], np.stack(coordinates, axis=1))

class TunableHP:
  """Discrete hyperparameter grid with a cache of evaluation scores.

  A *state* is a sequence of integer indices, one per hyperparameter in the
  order of ``hyperparameter_keys``; each index selects a value from the
  corresponding list in ``hyperparameters``. ``eval_cache`` is an array with
  one cell per grid point; ``NOT_EVALUATED`` (-1.0) marks a cell that has no
  score yet.
  """
  # Sentinel stored in eval_cache cells that have not been evaluated.
  NOT_EVALUATED = -1.0
  # Training budget handed to evaluate() for every uncached grid point.
  EVAL_TIMESTEPS = 5000

  def __init__(self, eval_cache_path: str, env_train_gym = None, env_train_eval_gym = None):
    """Set up the grid and load ``cached_hyp_results/<eval_cache_path>`` if it exists.

    The two environments are optional: without them only grid points already
    present in the cache can be evaluated.
    """
    self.hyperparameters = {
      "gamma" : [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999],
      "learning_rate" : [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1],
      "batch_size" : [16, 32, 64, 100, 128, 256, 512, 1024, 2048],
      "buffer_size" : [int(1e4), int(1e5), int(1e6)]
    }
    self.hyperparameter_keys = list(self.hyperparameters)
    self.env_train_gym = env_train_gym
    self.env_train_eval_gym = env_train_eval_gym
    # Start with every cell marked as "not evaluated".
    self.eval_cache = np.zeros(self.getGridSize()) + self.NOT_EVALUATED

    cache_file = 'cached_hyp_results/' + eval_cache_path
    if os.path.exists(cache_file):
      self.eval_cache = np.load(cache_file)

  def mapStateToHP(self, state):
    """Translate a state (grid indices) into a ``{hyperparameter: value}`` dict."""
    hp_dict = {}
    for position, value_index in enumerate(state):
      param_key = self.hyperparameter_keys[position]
      hp_dict[param_key] = self.hyperparameters[param_key][value_index]
    return hp_dict

  def mapHPToState(self, hp_dict):
    """Translate a ``{hyperparameter: value}`` dict into a list of grid indices.

    Raises ValueError (from ``list.index``) if a value is not on the grid.
    """
    return [self.hyperparameters[hpk].index(hp_dict[hpk]) for hpk in self.hyperparameter_keys]

  def getGridSize(self):
    """Return the number of candidate values per hyperparameter, in key order."""
    return [len(self.hyperparameters[k]) for k in self.hyperparameter_keys]

  def _train_and_score(self, hp_dict):
    """Reset both environments, train with ``hp_dict`` and return the score."""
    self.env_train_gym.reset()
    self.env_train_eval_gym.reset()
    return evaluate(hp_dict, self.env_train_gym, self.env_train_eval_gym, self.EVAL_TIMESTEPS)

  def eval(self, state):
    """Return the score of the grid point ``state``.

    A cached score is returned directly. Otherwise the agent is trained and
    scored, and the score is written into ``eval_cache`` so that the same
    grid point is never trained twice and so that saving ``eval_cache``
    actually persists the new result.

    Raises RuntimeError if the grid point is not cached and no environments
    were supplied.
    """
    state = tuple(state)
    cached_score = self.eval_cache[state]
    if cached_score != self.NOT_EVALUATED:
      return cached_score
    if self.env_train_gym is None or self.env_train_eval_gym is None:
      raise RuntimeError("No gym environments set. Can only depend on cache")
    hp_dict = self.mapStateToHP(state)
    print(f"Running evaluation for : {state} -> {hp_dict}")
    score = self._train_and_score(hp_dict)
    self.eval_cache[state] = score
    return score

  def sample_optuna_params(self, trial: "optuna.Trial"):
    """Let the Optuna ``trial`` pick one grid value per hyperparameter.

    The choice is also kept on ``self.hyperparameters_optuna``.
    """
    self.hyperparameters_optuna = {
      hpk: trial.suggest_categorical(hpk, self.hyperparameters[hpk])
      for hpk in self.hyperparameter_keys
    }
    return self.hyperparameters_optuna

  def optuna_objective(self, trial: "optuna.Trial"):
    """Optuna objective: score of the grid point suggested by ``trial``."""
    hyperparameters_optuna_trial = self.sample_optuna_params(trial)
    return self.eval(self.mapHPToState(hyperparameters_optuna_trial))

class HypRLGridEnv(gym.Env):
  """
  Gym environment in which an agent walks over a hyperparameter grid.

  The observation is the agent's current grid coordinate (one index per
  hyperparameter). An action holds one move per dimension; each move is
  rounded to an integer, added to the coordinate and clipped to the grid.
  The value of a cell is ``tunableParams.eval(cell)``; ``reward_mechanism``
  (one of ``REWARD_MECHANISMS``) decides how that value becomes a reward.

  Note: the constructor default ``"accumulate"`` is kept for backward
  compatibility but is not a valid mechanism; an environment built with it
  can be used to reach ``tunableParams`` but ``step`` raises ValueError.
  """
  MAX_ITER = N_TRIALS
  REWARD_MECHANISMS = (
    "best_state_end",
    "best_state_ongoing",
    "current_state_end",
    "current_state_ongoing",
    "accumulate_ongoing",
    "accumulate_end",
  )

  def __init__(self, tunableParams: TunableHP, reward_mechanism = "accumulate", is_test = False):
    """Create the environment around ``tunableParams``.

    :param tunableParams: grid definition and evaluator
    :param reward_mechanism: name of the reward scheme used by ``step``
    :param is_test: when true, episodes only end at ``MAX_ITER`` steps
    """
    super().__init__()
    self.tunableParams = tunableParams
    # Size of the grid
    self.grid_size = tunableParams.getGridSize()
    # Define action and observation space
    n_actions = 5
    max_move = int(n_actions/2)
    self.action_space = spaces.Box(low=-max_move, high=max_move, shape=(len(self.grid_size),), dtype=np.int32)
    # The observation will be the coordinate of the agent
    self.observation_space = spaces.MultiDiscrete(self.grid_size)
    self.reward_mechanism = reward_mechanism
    self.is_test = is_test

  def reset(self):
    """
    Important: the observation must be a numpy array
    :return: (np.array)
    """
    # reset the number of iterations for this agent
    self.iter = 0
    # Start the agent on a randomly drawn cell of the grid
    self.agent_state = np.random.randint(self.grid_size)
    self.reward = self.tunableParams.eval(self.agent_state)
    self.best = {'state': copy.deepcopy(self.agent_state), 'val': self.reward}
    self.visited = {}
    self.visited[tuple(self.agent_state)] = True
    return np.array(self.agent_state)

  def step(self, action):
    """
    Move the agent by ``action`` and score the cell it lands on.
    :return: (observation, reward, done, info); ``info`` carries the best
             cell seen so far ('best') and the visited cells ('visited')
    :raises ValueError: if ``reward_mechanism`` is not in ``REWARD_MECHANISMS``
    """
    # Fail with a clear message before touching any state; previously an
    # unknown mechanism surfaced as UnboundLocalError on `reward`.
    if self.reward_mechanism not in self.REWARD_MECHANISMS:
      raise ValueError(
        f"Unknown reward_mechanism {self.reward_mechanism!r}; "
        f"expected one of {self.REWARD_MECHANISMS}"
      )

    self.iter += 1

    for i, _ in enumerate(action):
      self.agent_state[i] += round(action[i])
      # Account for the boundaries of the grid
      self.agent_state[i] = np.clip(self.agent_state[i], 0, self.grid_size[i]-1)

    if self.is_test:
      done = self.iter >= self.MAX_ITER
    else:
      # We are done when we visit the same state twice or have taken more iterations than MAX
      done = bool(self.iter >= self.MAX_ITER or tuple(self.agent_state) in self.visited)

    self.visited[tuple(self.agent_state)] = True

    # Evaluate the current cell exactly once and reuse the value below.
    current_val = self.tunableParams.eval(self.agent_state)
    if current_val > self.best['val']:
      self.best = {'state': copy.deepcopy(self.agent_state), 'val': current_val}

    if self.reward_mechanism == "best_state_end":
      # reward idea #1
      # Reward is the best value seen so far, paid only when the episode terminates
      reward = self.best['val'] if done else 0
    elif self.reward_mechanism == "best_state_ongoing":
      # reward idea #5
      # set the agent reward to the current best value
      reward = self.best['val']
    elif self.reward_mechanism == "current_state_end":
      # reward idea #2
      # set the reward to that observed in the final state
      # Null reward everywhere except when the episode terminates
      reward = current_val if done else 0
    elif self.reward_mechanism == "current_state_ongoing":
      # reward idea #4
      # the agent reward is whatever is at the current state
      reward = current_val
    elif self.reward_mechanism == "accumulate_ongoing":
      # reward idea #3
      # let the agent accumulate reward as it goes
      self.reward += current_val
      reward = self.reward
    else:  # "accumulate_end" (membership was validated above)
      # reward idea #3
      # accumulate as it goes, but only pay out when the episode terminates
      self.reward += current_val
      reward = self.reward if done else 0

    # Optionally we can pass additional info
    info = {}
    info['best'] = self.best
    info['visited'] = self.visited

    return np.array(self.agent_state), reward, done, info

# # check and make sure the environment is sane and working
# from stable_baselines3.common.env_checker import check_env
# # If the environment doesn't follow the interface, an error will be thrown
# hyprl_env = HypRLGridEnv(TunableHP(*get_environments(RL_AGENT_PARAMS)))
# check_env(hyprl_env, warn=True)


### Hyp-RL Agent

In [None]:
from stable_baselines3 import DQN, PPO, A2C, DDPG, TD3
from stable_baselines3.common.env_util import make_vec_env
import statistics as sts
# Raise Optuna's logging threshold to WARN for the whole notebook session.
optuna.logging.set_verbosity(optuna.logging.WARN)

def get_name(ds):
  """Map a cache file name to its short display label.

  Returns 'HSI_50', 'DOW_30' or 'NAS_100' for the three known cache files and
  None for any other value.
  """
  period_suffix = '_train_2010-01-01_2015-12-31_train_eval_2016-01-01_2017-12-31.npy'
  known_files = (
    ('hsi_50' + period_suffix, 'HSI_50'),
    ('dow30' + period_suffix, 'DOW_30'),
    ('nas100' + period_suffix, 'NAS_100'),
  )
  for file_name, label in known_files:
    if ds == file_name:
      return label
  return None

# Experiment driver: for every (training cache, reward mechanism) pair train an
# A2C agent on HypRLGridEnv, then test it on every cache and compare against an
# Optuna TPE baseline and a random baseline. Only pre-computed caches are used
# here (TunableHP is built without environments), so an uncached grid cell
# makes TunableHP.eval fail.
# Cache files used below (file names, looked up under cached_hyp_results/):
# hsi_50_train_2010-01-01_2015-12-31_train_eval_2016-01-01_2017-12-31.npy
# dow30_train_2010-01-01_2015-12-31_train_eval_2016-01-01_2017-12-31.npy
# Number of independent repetitions per (train, test) pair.
OPT_SAMPLES = 20
available_datasets = ['hsi_50_train_2010-01-01_2015-12-31_train_eval_2016-01-01_2017-12-31.npy', 'dow30_train_2010-01-01_2015-12-31_train_eval_2016-01-01_2017-12-31.npy', 'nas100_train_2010-01-01_2015-12-31_train_eval_2016-01-01_2017-12-31.npy']
RL_agent_reward_mechanisms = ["best_state_end", "best_state_ongoing", "current_state_end", "current_state_ongoing", "accumulate_ongoing", "accumulate_end"]
# results_dict[<training label>] is a flat list of table rows: for each
# (reward mechanism, test dataset) a header row followed by a value row.
results_dict = {}
for ds_train in available_datasets:
  print("=====================================================================")
  results_dict[get_name(ds_train)] = []
  for reward_mechanism in RL_agent_reward_mechanisms:
    print(f"Training RL agent on {ds_train} with reward mechanism - '{reward_mechanism}'")

    RL_agent_train_tunableHP = TunableHP(ds_train)
    # Train the agent
    hyprl_train_env = HypRLGridEnv(RL_agent_train_tunableHP, reward_mechanism)
    # NOTE(review): the lambda looks up the global name `hyprl_train_env` when
    # it is called; this relies on make_vec_env calling it before the
    # assignment on this line rebinds the name to the vectorised env — confirm.
    hyprl_train_env = make_vec_env(lambda: hyprl_train_env, n_envs=1)
    hyprl_model = A2C('MlpPolicy', hyprl_train_env, verbose=0)
    ## may be try
    # hyprl_model = DQN('MlpPolicy', hyprl_env, verbose=1)
    # hyprl_model = PPO('MlpPolicy', hyprl_env, verbose=1)
    # hyprl_model = TD3('MlpPolicy', hyprl_train_env, verbose=1).learn(5000)

    hyprl_model.learn(total_timesteps=50000)

    for ds_test in available_datasets:
      print(f"Testing agent on {ds_test}")    
      test_tunableHP = TunableHP(ds_test)
      # is_test=True: an episode only ends once MAX_ITER steps were taken.
      hyprl_test_env = HypRLGridEnv(test_tunableHP, reward_mechanism, is_test=True)
      # Same late-binding lambda pattern as for the training env above.
      hyprl_test_env = make_vec_env(lambda: hyprl_test_env, n_envs=1)
      print(f"Top 20 ground truth : mean {sts.mean(get_n_maxvalues(test_tunableHP.eval_cache, 20)[0]):.3f}")
      RL_agent_eval = []
      optuna_baseline_eval = []
      random_baseline_eval = []
      for i in range(OPT_SAMPLES):
        # RL agent: roll out one episode and record the best value it found.
        obs = hyprl_test_env.reset()
        for step in range(N_TRIALS):
          action, _ = hyprl_model.predict(obs, deterministic=True)
          obs, reward, done, info = hyprl_test_env.step(action)
          if done:
            # info is indexed with [0] because the env is vectorised (n_envs=1).
            RL_agent_eval.append(info[0]['best']['val'])
            break

        # Run the optuna baseline
        # The sampler seed is itself drawn from numpy's global RNG.
        sampler = optuna.samplers.TPESampler(seed=np.random.randint(100))
        # NOTE(review): a pruner is configured, but TunableHP.optuna_objective
        # does not report intermediate values, so it presumably has no effect.
        study = optuna.create_study(study_name="optuna_hyprl",direction='maximize',
                                    sampler = sampler, pruner=optuna.pruners.HyperbandPruner())
        study.optimize(test_tunableHP.optuna_objective, n_trials=N_TRIALS,catch=(ValueError,))
        optuna_baseline_eval.append(study.best_trial.value) 
      # Random baseline: OPT_SAMPLES cells drawn from the flattened cache.
      # NOTE(review): each draw is a single cell, whereas the RL and Optuna
      # numbers are a best-of-N_TRIALS; uncached (-1.0) cells can be drawn too.
      random_baseline_eval = np.random.choice(hyprl_test_env.envs[0].tunableParams.eval_cache.flatten(), OPT_SAMPLES)
      results_dict[get_name(ds_train)].append([get_name(ds_test), 'Top-20 (mean)', 'RL agent', 'Optuna baseline', 'Random baseline'])
      results_dict[get_name(ds_train)].append([reward_mechanism, f"{sts.mean(get_n_maxvalues(test_tunableHP.eval_cache, 20)[0]):.3f}", 
                                               f"{sts.mean(RL_agent_eval):.3f}", f"{sts.mean(optuna_baseline_eval):.3f}", f"{sts.mean(random_baseline_eval):.3f}"])

      if sts.mean(RL_agent_eval) > sts.mean(optuna_baseline_eval):
        print(f"%%%%%%%% BETTER THAN BASELINE %%%%%%%%")
        print(f"RL Agent            : mean {sts.mean(RL_agent_eval):.3f}")
        print(f"Optuna baseline     : mean {sts.mean(optuna_baseline_eval):.3f}")

      print(".....................................................................")


Training RL agent on hsi_50_train_2010-01-01_2015-12-31_train_eval_2016-01-01_2017-12-31.npy with reward mechanism - 'best_state_end'


KeyboardInterrupt: ignored

In [None]:
## Write one result table per training dataset to experiments.txt
from tabulate import tabulate

# `with` closes (and flushes) the file even if a lookup in results_dict or a
# tabulate call raises part-way through; the manual open/close pair left the
# handle open in that case.
with open("experiments.txt", "w") as f:
  for position, trained_on in enumerate(('DOW_30', 'HSI_50', 'NAS_100')):
    # Every section header after the first starts on a fresh line.
    leading_newline = "" if position == 0 else "\n"
    f.write(f"{leading_newline}Trained of {trained_on}\n")
    f.write(f"{tabulate(results_dict[trained_on], tablefmt='grid')}")

print(f"{tabulate(results_dict['DOW_30'], tablefmt='grid')}")


In [None]:
# Scratch check of np.random.choice: draw 5 entries from `a` using the
# function's defaults (sampling with replacement, uniform probabilities).
a = [5, 2, 7, 1, 9, 0, 58, 47, 2]
np.random.choice(a, 5)


In [None]:
#   #For the given hyperparamters, determine the account value in the trading period
#   df_account_value, df_actions = DRLAgent.DRL_prediction(
#     model=trained_ddpg, 
#     environment = e_train_eval_gym)
#   #Calculate sharpe from the account value
#   sharpe = calculate_sharpe(df_account_value)

# perf_stats_all = pd.DataFrame(backtest_stats(account_value=df_account_value))
# #print(f"Baseline stats: {perf_stats_all}")

## Archive code

Generates and saves the full cache to disk

In [None]:
import numpy as np
# Load the nas100 score cache as a flat vector and show, as a shape tuple, how
# many of its cells still equal -1.0 (the value TunableHP uses for
# "not evaluated yet").
nas = np.load(
    'cached_hyp_results/nas100_train_2010-01-01_2015-12-31_train_eval_2016-01-01_2017-12-31.npy'
).flatten()
np.where(nas == -1.0)[0].shape



In [None]:
# cache saver
%%time
SAVER_PARAMS = {
    'TRAIN_PERIOD' : ['2010-01-01', '2015-12-31'],
    'TRAIN_EVAL_PERIOD' : ['2016-01-01', '2017-12-31'],
    
}

load_datasets()

import itertools
for ds in DATASET_INFO.keys():
  print(f"Caching for dataset: {ds}")
  SAVER_PARAMS['DATASET'] = ds 
  file_path = 'cached_hyp_results_2/' + get_string(SAVER_PARAMS) + '.npy'
  hyprl_env = HypRLGridEnv(TunableHP(get_string(SAVER_PARAMS) + '.npy', *get_environments(SAVER_PARAMS)))
  # generate the eval_cache
  grid_size = hyprl_env.tunableParams.getGridSize()
  hyperparameter_ranges = [range(gs) for gs in grid_size]
  for s in list(itertools.product(*hyperparameter_ranges)):
    hyprl_env.tunableParams.eval(s)  
    np.save(file_path, hyprl_env.tunableParams.eval_cache)


  # hyprl_env.eval(np.array([0,0]))

DATASET_INFO[ds][0] <class 'list'> .. dow30
DATASET_INFO[ds][0] <class 'list'> .. hsi_50
DATASET_INFO[ds][0] <class 'list'> .. dax_30
DATASET_INFO[ds][0] <class 'list'> .. nas100
Caching for dataset: dow30
Size of dataset splits - Training: 43761 , Trading: 14587,           Stock Dimension: 29, State Space: 291
Caching for dataset: hsi_50
Size of dataset splits - Training: 44702 , Trading: 14942,           Stock Dimension: 31, State Space: 311
Caching for dataset: dax_30
Size of dataset splits - Training: 40068 , Trading: 13419,           Stock Dimension: 27, State Space: 271
Running evaluation for : (5, 0, 7, 0) -> {'gamma': 0.999, 'learning_rate': 1e-05, 'batch_size': 1024, 'buffer_size': 10000}
{'gamma': 0.999, 'learning_rate': 1e-05, 'batch_size': 1024, 'buffer_size': 10000}
hit end!
Running evaluation for : (5, 0, 7, 1) -> {'gamma': 0.999, 'learning_rate': 1e-05, 'batch_size': 1024, 'buffer_size': 100000}
{'gamma': 0.999, 'learning_rate': 1e-05, 'batch_size': 1024, 'buffer_size': 