In [1]:
# module import

import warnings

import gym_trading_env  # noqa
import gymnasium as gym
import numpy as np
import pandas as pd
import torch
from gym_trading_env.renderer import Renderer
from gymnasium.envs.registration import register
from sb3_contrib import QRDQN, RecurrentPPO
from sb3_contrib.qrdqn import MlpPolicy
from sklearn.preprocessing import robust_scale
from stable_baselines3.common.utils import get_device
# from stable_baselines3.ppo import MlpPolicy
from tqdm import TqdmExperimentalWarning

In [2]:
# Preprocess


def SMA(df, ndays):
    """Simple moving average of ``df.close`` over a window of ``ndays`` rows.

    Returns a float Series named ``SMA_<ndays>``, rounded to two decimals.
    Rows before the window is full are NaN.
    """
    rolling_mean = df.close.rolling(ndays).mean()
    labelled = rolling_mean.rename(f"SMA_{ndays}")
    return labelled.astype(float).round(2)


def BBANDS(df, n):
    """Bollinger Bands of ``df.close`` over a rolling window of ``n`` rows.

    The bands sit two rolling standard deviations above and below the
    rolling mean. Returns ``(upper_band, lower_band)`` as float Series
    rounded to two decimals.
    """
    window = df.close.rolling(window=n)
    center = window.mean()
    spread = 2 * window.std()
    bands = (center + spread, center - spread)
    return tuple(band.astype(float).round(2) for band in bands)


def RSI(df, periods=14):
    """Relative Strength Index of ``df.close``.

    Gains and losses (positive and negative parts of the close-to-close
    change) are each smoothed with an exponentially weighted mean using
    ``com = periods - 1`` and ``adjust=True``. The result is
    ``100 - 100 / (1 + avg_gain / avg_loss)`` as a float Series rounded to
    two decimals. Values are NaN until ``periods`` price changes have been
    observed.
    """
    delta = df.close.diff()
    smoothing = dict(com=periods - 1, adjust=True, min_periods=periods)

    avg_gain = delta.clip(lower=0).ewm(**smoothing).mean()
    avg_loss = (-1 * delta.clip(upper=0)).ewm(**smoothing).mean()

    relative_strength = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + relative_strength))
    return rsi.astype(float).round(2)


def MACD(df):
    """MACD line, signal line and histogram of ``df["close"]``.

    The MACD line is the 12-span EMA minus the 26-span EMA, the signal line
    is the 9-span EMA of the MACD line, and the histogram is their
    difference. All EMAs use ``adjust=False`` and ``min_periods`` equal to
    their span. Returns ``(macd, signal, histogram)`` as float Series
    rounded to two decimals.
    """
    def _ema(series, span):
        return series.ewm(span=span, adjust=False, min_periods=span).mean()

    close = df["close"]
    macd_line = _ema(close, 12) - _ema(close, 26)
    signal_line = _ema(macd_line, 9)
    histogram = macd_line - signal_line

    return tuple(
        series.astype(float).round(2)
        for series in (macd_line, signal_line, histogram)
    )


def add_robust_features(df):
    """Add robust-scaled price/volume feature columns to ``df`` in place.

    Each feature is a ratio derived from the OHLCV columns and passed through
    ``robust_scale``. Rows containing NaN are then dropped from ``df`` itself
    (the caller's frame is modified), and the same frame is returned.
    """
    raw_features = {
        "feature_close": df.close.pct_change(),
        "feature_open": df.open / df.close,
        "feature_high": df.high / df.close,
        "feature_low": df.low / df.close,
        # Volume relative to its rolling maximum over 7 * 24 rows.
        "feature_volume": df.volume / df.volume.rolling(7 * 24).max(),
    }
    for column, values in raw_features.items():
        df[column] = robust_scale(values)

    df.dropna(inplace=True)
    return df


def normalize(df):
    """Min-max scale every column whose name contains ``"feature"`` to [0, 1].

    Returns a copy of ``df``; the input frame and its non-feature columns are
    left untouched. A constant feature column (max == min) is mapped to 0.0
    instead of being divided by zero, which would fill it with NaN.
    """
    result = df.copy()
    feature_columns = [name for name in df.columns if "feature" in name]
    for name in feature_columns:
        column = df[name]
        low = column.min()
        span = column.max() - low
        if span == 0:
            # Degenerate range: every value equals the minimum.
            result[name] = (column - low).astype(float)
        else:
            result[name] = (column - low) / span
    return result


def robust(df):
    """Return a copy of ``df`` with every ``"feature"`` column robust-scaled.

    Columns whose name does not contain ``"feature"`` are copied unchanged;
    the input frame is not modified.
    """
    scaled = df.copy()
    for name in df.columns:
        if "feature" not in name:
            continue
        scaled[name] = robust_scale(df[name])
    return scaled


def stochastic_fast_k(df, n=5):
    """Fast %K of the stochastic oscillator over a window of ``n`` rows.

    Computes ``(close - lowest_low) / (highest_high - lowest_low) * 100``
    where the extremes are rolling over ``df.low`` and ``df.high``. A window
    whose highest high equals its lowest low divides by zero.
    """
    lowest_low = df.low.rolling(n).min()
    highest_high = df.high.rolling(n).max()
    return ((df.close - lowest_low) / (highest_high - lowest_low)) * 100


def stochastic_slow_k(fast_k, n=3):
    """Slow %K: the ``n``-row rolling mean of the ``fast_k`` series."""
    return fast_k.rolling(n).mean()


def stochastic_slow_d(slow_k, n=3):
    """Slow %D: the ``n``-row rolling mean of the ``slow_k`` series."""
    return slow_k.rolling(n).mean()


def OBV(df):
    """On-Balance Volume of ``df``.

    Running total of ``df.volume`` signed by the direction of the
    close-to-close move: the row's volume is added when the close rises,
    subtracted when it falls, and ignored when it is unchanged. The first
    row has no previous close and contributes 0.

    Returns a float Series rounded to two decimals.
    """
    # +1 / -1 / 0 per row; the leading NaN from diff() becomes 0.
    direction = np.sign(df.close.diff()).fillna(0.0)
    signed_volume = df.volume * direction
    return signed_volume.cumsum().astype(float).round(2)


def preprocess(df):
    """Build the scaled feature frame from a raw OHLCV DataFrame.

    Adds raw price/volume, SMA (7/25/99), Bollinger band and MACD feature
    columns, drops rows where any of them is still NaN, robust-scales every
    ``feature`` column via ``robust``, and finally appends unscaled RSI
    (6/12/24) columns.

    The caller's frame is not modified. Rows whose RSI is still warming up
    are dropped, so the returned frame contains no NaN.
    """
    # Work on a private copy so the caller's DataFrame is left untouched.
    df = df.copy()

    df["volume"] = df.volume.astype(float).round(2)
    df["feature_close"] = df.close
    df["feature_open"] = df.open
    df["feature_high"] = df.high
    df["feature_low"] = df.low
    df["feature_volume"] = df.volume
    df["feature_SMA_7"] = SMA(df, 7)
    df["feature_SMA_25"] = SMA(df, 25)
    df["feature_SMA_99"] = SMA(df, 99)
    # NOTE(review): BBANDS returns (upper, lower), so "feature_MiddleBand"
    # actually holds the UPPER band. The column name is kept so downstream
    # column names do not change.
    df["feature_MiddleBand"], df["feature_LowerBand"] = BBANDS(df, 21)
    df["feature_MACD"], df["feature_MACD_S"], df["feature_MACD_H"] = MACD(df)
    df = df.dropna()

    df_robust = robust(df)

    # RSI is added after robust() so it stays unscaled.
    for periods in (6, 12, 24):
        df_robust[f"feature_RSI_{periods}"] = RSI(df, periods=periods)

    # RSI is NaN for its first rows; drop them so no NaN feature is returned.
    return df_robust.dropna()


def only_sub_indicators(df):
    """Attach RSI and MACD feature columns to ``df`` and drop incomplete rows.

    The columns are written onto the frame that was passed in; the returned
    frame is the NaN-free result of ``dropna()``. The stochastic-oscillator
    and OBV indicators are not applied here.
    """
    for span in (6, 12, 24):
        df[f"feature_RSI_{span}"] = RSI(df, periods=span)

    macd_line, signal_line, histogram = MACD(df)
    df["feature_MACD"] = macd_line
    df["feature_MACD_S"] = signal_line
    df["feature_MACD_H"] = histogram

    return df.dropna()


In [7]:
# Reward Function

def reward_only_position_changed(history):
    """Reward paid only on the step where the position changes.

    ``history`` is indexed as ``history[column, step]`` and supports
    ``len()``. Returns:

    * ``-1`` if the latest portfolio valuation is zero or negative;
    * ``0`` while the position is unchanged from the previous step;
    * otherwise the change in portfolio valuation since the step just before
      the previous position was entered, divided by the initial valuation.
    """
    if history["portfolio_valuation", -1] <= 0:
        return -1

    prev_position = history["position", -2]
    curr_position = history["position", -1]
    if prev_position == curr_position:
        return 0

    # Walk back over the run of steps that held prev_position. The scan must
    # start at -2: step -1 already holds the new position, so starting there
    # stops immediately and compares the latest valuation with itself,
    # which always gives a reward of 0.
    index = 2
    index_limit = len(history)
    while index < index_limit and history["position", -index] == prev_position:
        index += 1

    valuation_change = (
        history["portfolio_valuation", -1] - history["portfolio_valuation", -index]
    )
    return valuation_change / history["portfolio_valuation", 0]

In [4]:
def reward_by_pnl(history):
    """Reward the profit/loss against the entry valuation when the position changes.

    Returns ``-1`` if the latest portfolio valuation is zero or negative and
    ``0`` while the position is unchanged. Otherwise returns the latest
    valuation minus the latest ``entry_valuation``, divided by the initial
    portfolio valuation.
    """
    latest_value = history["portfolio_valuation", -1]
    if latest_value <= 0:
        return -1

    position_changed = history["position", -2] != history["position", -1]
    if not position_changed:
        return 0

    pnl = latest_value - history["entry_valuation", -1]
    return pnl / history["portfolio_valuation", 0]

In [8]:
from gte import MultiDatasetDiscretedTradingEnv
# Environment
from stable_baselines3.common.env_util import make_vec_env

# Build 32 copies of the trading environment, each with a flattened
# observation. No vec_env_cls is passed, so make_vec_env's default
# vectorisation is used.
vec_env = make_vec_env(
    MultiDatasetDiscretedTradingEnv,
    n_envs=32,
    env_kwargs={
        "dataset_dir": "./data/train/month/**/*.pkl",
        "preprocess": only_sub_indicators,
        "reward_function": reward_only_position_changed,
        "positions": [-10, 0, 10],
        "trading_fees": 0.0001,
        "borrow_interest_rate": 0.00003,
        "window_size": 60,
        "portfolio_initial_value": 100,
    },
    wrapper_class=gym.wrappers.FlattenObservation,
)

In [9]:
# Train

# Suppress tqdm's experimental-feature warning (presumably raised by the
# training progress bar).
warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)

# QRDQN agent with an MLP policy on the vectorised trading environment.
# Only the arguments below are set explicitly; every other hyperparameter
# is left at the library default.
model = QRDQN(
    policy=MlpPolicy,
    env=vec_env,
    tensorboard_log="./tensorboard/",
    device="cuda",
    verbose=0,
)

In [10]:
# Train for 100 million environment steps with a progress bar.
model.learn(total_timesteps=100_000_000, progress_bar=True)

Output()

<sb3_contrib.qrdqn.qrdqn.QRDQN at 0x1e00c148110>

In [None]:
# Roll the trained model through the vectorised environment for 1000 steps,
# always choosing the deterministic action, and render after every step.
obs = vec_env.reset()
# NOTE(review): `_` is also IPython's "last output" variable; using it as the
# loop variable rebinds it for the rest of the session.
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    # rewards, dones and info are captured but not used in this cell.
    obs, rewards, dones, info = vec_env.step(action)
    vec_env.render()

In [None]:
def _annualized_return_pct(df, column):
    """Format the annualised return of ``df[column]`` as a percentage string.

    The total growth between the first and last row is raised to the power
    ``365 days / elapsed time`` (elapsed time taken from the frame's index)
    and reported with two decimals.
    """
    growth = df[column].iloc[-1] / df[column].iloc[0]
    periods_per_year = pd.Timedelta(days=365) / (df.index.values[-1] - df.index.values[0])
    return f"{(growth ** periods_per_year - 1) * 100:0.2f}%"


# Render the logged episodes with annualised market and portfolio returns
# as extra metrics.
renderer = Renderer(render_logs_dir="render_logs")
renderer.add_metric(
    "Annual Market Return",
    lambda df: _annualized_return_pct(df, "close"),
)
renderer.add_metric(
    "Annual Portfolio Return",
    lambda df: _annualized_return_pct(df, "portfolio_valuation"),
)
renderer.run()

In [None]:
# Save the trained model to disk at the path below.
model.save("./model/QRDQN/1m_+-10_0.01_2.zip")