<a href="https://colab.research.google.com/github/ammarhusain/XCS229ii-project/blob/main/xcs229ii_FinRL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# #Installing FinRL
# %%capture
# !pip install git+https://github.com/AI4Finance-LLC/FinRL-Library.git

# #Installing Optuna
# !pip install optuna
# !pip install dm_tree
# !pip install ray[tune]
from IPython.display import clear_output

#Importing the libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# matplotlib.use('Agg')
import datetime
import optuna
%matplotlib inline
from finrl.apps import config
from optuna.integration import PyTorchLightningPruningCallback
from finrl.finrl_meta.preprocessor.yahoodownloader import YahooDownloader
from finrl.finrl_meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl.finrl_meta.env_stock_trading.env_stocktrading import StockTradingEnv
from finrl.finrl_meta.env_stock_trading.env_stocktrading_np import StockTradingEnv as StockTradingEnv_numpy
from finrl.drl_agents.stablebaselines3.models import DRLAgent
from finrl.drl_agents.rllib.models import DRLAgent as DRLAgent_rllib
from finrl.finrl_meta.data_processor import DataProcessor
import joblib
from finrl.plot import backtest_stats, backtest_plot, get_daily_return, get_baseline
import ray
from pprint import pprint

import numpy as np
import gym
from gym import spaces
import copy

import sys
sys.path.append("../FinRL-Library")

import itertools

import os
exp_name = "HypRL-FinRLbase/"
if not os.path.exists(exp_name + config.DATA_SAVE_DIR):
    os.makedirs(exp_name + config.DATA_SAVE_DIR)
if not os.path.exists(exp_name + config.TRAINED_MODEL_DIR):
    os.makedirs(exp_name + config.TRAINED_MODEL_DIR)
if not os.path.exists(exp_name + config.TENSORBOARD_LOG_DIR):
    os.makedirs(exp_name + config.TENSORBOARD_LOG_DIR)
if not os.path.exists(exp_name + config.RESULTS_DIR):
    os.makedirs(exp_name + config.RESULTS_DIR)

  "Distutils was imported before Setuptools. This usage is discouraged "
  'Module "zipline.assets" not found; multipliers will not be applied'


## Dataset Loading

In [None]:
%%capture
## Collecting and preprocessing data
dataset_info = {'dow30' : [exp_name + "dow_30_processed_full.pkl", config.DOW_30_TICKER],
                'hsi_50' : [exp_name + "hsi_50_processed_full.pkl", config.HSI_50_TICKER],
                'dax_30' : [exp_name + "dax_30_processed_full.pkl", config.DAX_30_TICKER],
                 'nas100' : [exp_name + "nas_100_processed_full.pkl", config.NAS_100_TICKER],
                 'sp500' : [exp_name + "sp_500_processed_full.pkl", config.SP_500_TICKER]
                 }
datasets = {}
print(dataset_info)
for ds in dataset_info.keys():
  print(f"dataset_info[ds][0] {type(dataset_info[ds])} .. {ds}")
  if not os.path.exists(dataset_info[ds][0]):
    print(f"Processing {ds}")
    df = YahooDownloader(start_date = '2009-01-01',
                        end_date = '2021-10-01',
                        ticker_list = dataset_info[ds][1]).fetch_data()

    fe = FeatureEngineer(
                        use_technical_indicator=True,
                        tech_indicator_list = config.TECHNICAL_INDICATORS_LIST,
                        use_vix=True,
                        use_turbulence=True,
                        user_defined_feature = False)

    processed = fe.preprocess_data(df)

    list_ticker = processed["tic"].unique().tolist()
    list_date = list(pd.date_range(processed['date'].min(),processed['date'].max()).astype(str))
    combination = list(itertools.product(list_date,list_ticker))

    processed_full = pd.DataFrame(combination,columns=["date","tic"]).merge(processed,on=["date","tic"],how="left")
    processed_full = processed_full[processed_full['date'].isin(processed['date'])]
    processed_full = processed_full.sort_values(['date','tic'])

    processed_full = processed_full.fillna(0)
    processed_full.to_pickle(dataset_info[ds][0])

  datasets[ds] = pd.read_pickle(dataset_info[ds][0])


In [None]:
## Setup datasets
RL_AGENT_PARAMS = {
    'DATASET' : 'dow30',
    'TRAIN_PERIOD' : ['2010-01-01', '2015-12-31'],
    'TRADE_PERIOD' : ['2016-01-01', '2017-12-31'],
    
}
TRADER_BOT = {
    'TRAIN_PERIOD' : ['2010-01-01', '2017-12-31'],
    'TRADE_PERIOD' : ['2018-01-01', '2021-10-31'],
}

def get_environments(params):
  train_set = data_split(datasets[params['DATASET']], 
                        params['TRAIN_PERIOD'][0], params['TRAIN_PERIOD'][1])
  trade_set = data_split(datasets[params['DATASET']], 
                        params['TRADE_PERIOD'][0], params['TRADE_PERIOD'][1])
  stock_dimension = len(train_set.tic.unique())
  state_space = 1 + 2*stock_dimension + len(config.TECHNICAL_INDICATORS_LIST)*stock_dimension
  print(f"Size of dataset splits - Training: {len(train_set)} , Trading: {len(trade_set)}, \
          Stock Dimension: {stock_dimension}, State Space: {state_space}")

  env_kwargs = {
      "hmax": 100, 
      "initial_amount": 1000000, 
      "buy_cost_pct": 0.001,
      "sell_cost_pct": 0.001,
      "state_space": state_space, 
      "stock_dim": stock_dimension, 
      "tech_indicator_list": config.TECHNICAL_INDICATORS_LIST, 
      "action_space": stock_dimension, 
      "reward_scaling": 1e-4
  }
  # Setup gym environments for the FinRL child model
  env_train_gym = StockTradingEnv(df = train_set, **env_kwargs)
  env_trade_gym = StockTradingEnv(df = trade_set, turbulence_threshold = None, **env_kwargs)   
  return env_train_gym, env_trade_gym

## Build the RL environment and agent

In [None]:
# Objective for tuning : Sharpe ratio
def calculate_sharpe(df):
  df['daily_return'] = df['account_value'].pct_change(1)
  if df['daily_return'].std() !=0:
    sharpe = (252**0.5)*df['daily_return'].mean()/ \
          df['daily_return'].std()
    return sharpe
  else:
    return 0

def evaluate(hyperparameters, env_train_gym, env_trade_gym, total_timesteps):
  ## HACK
  return np.random.randint(100)
  agent = DRLAgent(env = env_train_gym.get_sb_env()[0])
  model_ddpg = agent.get_model("ddpg", model_kwargs = hyperparameters, verbose=0)
  trained_ddpg = agent.train_model(model=model_ddpg,
                                  tb_log_name="ddpg_optuna",
                                  total_timesteps=total_timesteps)
  # trained_ddpg.save('trained_models/optuna/ddpg_{}.pth'.format(trial.number))
  clear_output(wait=True)
  # For the given hyperparamters, determine the account value in the trading period
  df_account_value, df_actions = DRLAgent.DRL_prediction(
    model=trained_ddpg, 
    environment = env_trade_gym)
  # Calculate sharpe from the account value
  return calculate_sharpe(df_account_value)

class TunableHP:
  def __init__(self, env_train_gym, env_trade_gym):
    self.hyperparameters = {
      # "gamma" : [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999],
      # "learning_rate" : [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1],
      "batch_size" : [16, 32, 64, 100, 128, 256, 512, 1024, 2048],
      "buffer_size" : [int(1e4), int(1e5), int(1e6)]    
    }
    self.hyperparameter_keys = list(self.hyperparameters) 
    self.env_train_gym = env_train_gym
    self.env_trade_gym = env_trade_gym
  def mapStateToHP(self,state):
    hp_dict = {}
    for p,i in enumerate(state):
      param_key = self.hyperparameter_keys[p]
      hp_dict[param_key] = self.hyperparameters[param_key][i]
    return hp_dict
  
  def getGridSize(self):
    return [len(self.hyperparameters[k]) for k in self.hyperparameter_keys]

  def evaluateRLAgent(self, state):
    hp_dict = self.mapStateToHP(state)
    print(f"Running evaluation for : {state} -> {hp_dict}")
    # log to tensorboard if needed
    return evaluate(hp_dict, self.env_train_gym, self.env_trade_gym, 5000)


class HypRLGridEnv(gym.Env):
  """
  Custom Environment that follows gym interface.
  """
  MAX_ITER = 10

  def __init__(self, tunableParams: TunableHP):
    super(HypRLGridEnv, self).__init__()

    self.tunableParams = tunableParams

    # Size of the grid
    self.grid_size = tunableParams.getGridSize()
    
    # Define action and observation space
    n_actions = 5
    self.action_space = spaces.Box(low=-int(n_actions/2), high=int(n_actions/2), shape=(len(self.grid_size),), dtype=np.int32)
    # The observation will be the coordinate of the agent
    self.observation_space = spaces.MultiDiscrete(self.grid_size)
    self.eval_cache = np.zeros(self.grid_size)

  def eval(self, state):
    state = tuple(state)
    if self.eval_cache[state] == [0.0]:
      # train & test the model for these hyperparameters
      self.eval_cache[state] = self.tunableParams.evaluateRLAgent(state)
    return self.eval_cache[state]

  def reset(self):
    """
    Important: the observation must be a numpy array
    :return: (np.array) 
    """
    # reset the number of iterations for this agent
    self.iter = 0
    # Initialize the agent at the right of the grid
    self.agent_state = np.random.randint(self.grid_size)
    self.reward = self.eval(self.agent_state)
    self.best = {'state':copy.deepcopy(self.agent_state), 'val':self.eval(self.agent_state)}
    self.visited = {}
    self.visited[tuple(self.agent_state)] = True
    return np.array(self.agent_state) 

  def step(self, action):
    self.iter += 1

    print(f"self.agent_state {self.agent_state} action {action}")
    for i, _ in enumerate(action):
      self.agent_state[i] += round(action[i])
      # Account for the boundaries of the grid
      self.agent_state[i] = np.clip(self.agent_state[i], 0, self.grid_size[i]-1)
    print(f"self.agent_state {self.agent_state} after")

    # We are done when we visit the same state twice or have taken more iterations than MAX
    done = bool(self.iter >= self.MAX_ITER or tuple(self.agent_state) in self.visited)

    self.visited[tuple(self.agent_state)] = True

    # reward idea #1
    # Reward is minimum of whatever val loss we saw so far
    self.reward = max(self.reward, self.eval(self.agent_state))
    # Null reward everywhere except when the episode terminates
    reward = self.reward if done else 0

    # reward idea #2
    # set the reward to that observed in the final state
    #reward = self.eval(self.agent_state) if done else 0

    # reward idea #3
    # let the agent accumulate reward as it goes
    # self.reward += self.eval(self.agent_state)
    # reward = self.reward

    if self.eval(self.agent_state) > self.best['val']:
      self.best = {'state':copy.deepcopy(self.agent_state), 'val':self.eval(self.agent_state)}

    # Optionally we can pass additional info
    info = {}
    info['best'] = self.best
    info['visited'] = self.visited

    return np.array(self.agent_state), reward, done, info


# # check and make sure the environment is sane and working
# from stable_baselines3.common.env_checker import check_env
# # If the environment doesn't follow the interface, an error will be thrown
# hyprl_env = HypRLGridEnv(TunableHP(*get_environments(RL_AGENT_PARAMS)))
# check_env(hyprl_env, warn=True)


### Hyp-RL Agent

In [None]:
from stable_baselines3 import DQN, PPO, A2C
from stable_baselines3.common.env_util import make_vec_env
#from stable_baselines3.common.policies import MlpPolicy
# # Instantiate the env
hyprl_env = HypRLGridEnv(TunableHP(*get_environments(RL_AGENT_PARAMS)))
# wrap it
hyprl_env = make_vec_env(lambda: hyprl_env, n_envs=1)

# Train the agent
##model = ACKTR('MlpPolicy', env, verbose=1).learn(5000)
hyprl_model = A2C('MlpPolicy', hyprl_env, verbose=1)
#hyprl_model = DQN('MlpPolicy', hyprl_env, verbose=1)
# hyprl_model = PPO('MlpPolicy', hyprl_env, verbose=1)
hyprl_model.learn(total_timesteps=50000)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
self.agent_state [7 2] action [ 0.08136038 -0.97359872]
self.agent_state [7 1] after
self.agent_state [4 2] action [-2. -2.]
self.agent_state [2 0] after
self.agent_state [2 0] action [ 0.59847158 -1.40313959]
self.agent_state [3 0] after
self.agent_state [3 0] action [-0.71976215 -1.83019435]
self.agent_state [2 0] after
self.agent_state [4 0] action [-2. -2.]
self.agent_state [2 0] after
self.agent_state [2 0] action [ 0.0795549 -1.4893595]
self.agent_state [2 0] after
self.agent_state [5 2] action [ 2.         -0.43466666]
self.agent_state [7 2] after
self.agent_state [7 2] action [-0.03029279 -0.83024669]
self.agent_state [7 1] after
self.agent_state [7 1] action [-0.05845634 -0.23215619]
self.agent_state [7 1] after
self.agent_state [4 2] action [-2. -2.]
self.agent_state [2 0] after
self.agent_state [2 0] action [ 0.33653378 -1.19052684]
self.agent_state [2 0] after
self.agent_state [7 1] action [-0.15418194  0.1347

<stable_baselines3.a2c.a2c.A2C at 0x7fdd3f6241d0>

In [None]:
print(f"{hyprl_env.envs[0].eval_cache}")
hyprl_env.action_space.sample()

[[22. 51. 41.]
 [20. 96. 62.]
 [98. 12. 28.]
 [89. 57. 14.]
 [63. 71. 21.]
 [53. 59. 86.]
 [56. 98. 81.]
 [44. 98. 64.]
 [ 4. 53. 23.]]


array([-1,  2], dtype=int32)

In [1]:
# Test the trained agent for sanity checking on the same environment

obs = hyprl_env.reset()
n_steps = 20
for step in range(n_steps):
  action, _ = hyprl_model.predict(obs, deterministic=True)
  # print("Step {}".format(step + 1))
  # print("Action: ", action)
  #pdb.set_trace()
  obs, reward, done, info = hyprl_env.step(action)
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered
    print("Goal reached!", "reward=", reward, "final_state=", info[0]['terminal_observation'], "best=", info[0]['best'])
    print(f"info {info}")
    break
  print('obs=', obs, 'reward=', reward, 'done=', done, 'info', info)
  #env.render(mode='console')

np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
print(f"{hyprl_env.envs[0].eval_cache}")

NameError: ignored

In [None]:
round(0.51)

In [None]:
# %%capture
### Trading bot model
## Setup Training environment
env_kwargs = {
    "hmax": 100, 
    "initial_amount": 1000000, 
    "buy_cost_pct": 0.001,
    "sell_cost_pct": 0.001,
    "state_space": state_space, 
    "stock_dim": stock_dimension, 
    "tech_indicator_list": config.TECHNICAL_INDICATORS_LIST, 
    "action_space": stock_dimension, 
    "reward_scaling": 1e-4
}
e_train_gym = StockTradingEnv(df = train, **env_kwargs)
agent = DRLAgent(env = e_train_gym.get_sb_env()[0])
## DDPG model
model_ddpg = agent.get_model("ddpg")
# Setup Trading environment
e_trade_gym = StockTradingEnv(df = trade, turbulence_threshold = None, **env_kwargs)

In [None]:
%%time
%%capture
trained_ddpg = agent.train_model(model=model_ddpg, 
                             tb_log_name='ddpg',
                             total_timesteps=10000)

In [None]:
  #For the given hyperparamters, determine the account value in the trading period
  df_account_value, df_actions = DRLAgent.DRL_prediction(
    model=trained_ddpg, 
    environment = e_trade_gym)
  #Calculate sharpe from the account value
  sharpe = calculate_sharpe(df_account_value)

perf_stats_all = pd.DataFrame(backtest_stats(account_value=df_account_value))
#print(f"Baseline stats: {perf_stats_all}")

## Optuna baseline

Full [link](https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/utils/hyperparams_opt.py) of optimizable parameters for each algorithm

In [None]:
from IPython.display import clear_output

def sample_ddpg_params(trial:optuna.Trial):
  # Size of the replay buffer
  buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)])
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
  batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256, 512])
  
  return {"buffer_size": buffer_size,
          "learning_rate":learning_rate,
          "batch_size":batch_size}

# Objective for tuning : Sharpe ratio
def calculate_sharpe(df):
  df['daily_return'] = df['account_value'].pct_change(1)
  if df['daily_return'].std() !=0:
    sharpe = (252**0.5)*df['daily_return'].mean()/ \
          df['daily_return'].std()
    return sharpe
  else:
    return 0
  
  
def objective(trial:optuna.Trial):
  #Trial will suggest a set of hyperparamters from the specified range
  hyperparameters = sample_ddpg_params(trial)
  model_ddpg = agent.get_model("ddpg",model_kwargs = hyperparameters, verbose=0)
  trained_ddpg = agent.train_model(model=model_ddpg,
                                  tb_log_name="ddpg_optuna",
                                  total_timesteps=5000)
  trained_ddpg.save('trained_models/optuna/ddpg_{}.pth'.format(trial.number))
  clear_output(wait=True)
  #For the given hyperparamters, determine the account value in the trading period
  df_account_value, df_actions = DRLAgent.DRL_prediction(
    model=trained_ddpg, 
    environment = e_trade_gym)
  #Calculate sharpe from the account value
  sharpe = calculate_sharpe(df_account_value)

  return sharpe

#Create a study object and specify the direction as 'maximize'
#As you want to maximize sharpe
#Pruner stops not promising iterations
#Use a pruner, else you will get error related to divergence of model
#You can also use Multivariate samplere
#sampler = optuna.samplers.TPESampler(multivarite=True,seed=42)
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(study_name="ddpg_study",direction='maximize',
                            sampler = sampler, pruner=optuna.pruners.HyperbandPruner())

#You can increase the n_trials for a better search space scanning
study.optimize(objective, n_trials=30,catch=(ValueError,))

In [None]:
#Get the best hyperparamters
print('Hyperparameters after tuning',study.best_params)
print('Hyperparameters before tuning',config.DDPG_PARAMS)
study.best_trial
from stable_baselines3 import DDPG
tuned_model_ddpg = DDPG.load('trained_models/optuna/ddpg_{}.pth'.format(study.best_trial.number),env=env_train)
#Trading period account value with tuned model
df_account_value_tuned, df_actions_tuned = DRLAgent.DRL_prediction(
    model=tuned_model_ddpg, 
    environment = e_trade_gym)

perf_stats_all_tuned = backtest_stats(account_value=df_account_value_tuned)
perf_stats_all_tuned = pd.DataFrame(perf_stats_all_tuned)
print(f"Baseline stats: {perf_stats_all_tuned}")

## XCS229ii Algorithm