In [None]:
# module import

import os
import random
import warnings

import gym_trading_env  # noqa
import gymnasium as gym
import numpy as np
import pandas as pd
import torch
from gym_trading_env.renderer import Renderer
from sb3_contrib import QRDQN, ARS, TRPO
from sb3_contrib.qrdqn import MlpPolicy
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.utils import get_device
# from stable_baselines3.ppo import MlpPolicy
from tqdm import TqdmExperimentalWarning

In [None]:
# Print whether PyTorch can see a CUDA device, then display the device
# that stable-baselines3 resolves for its default device setting.
cuda_available = torch.cuda.is_available()
print(cuda_available)
get_device()

In [None]:
# Preprocess


def SMA(df, ndays):
    """Simple moving average of ``df.close`` over a window of ``ndays`` rows.

    Returns a float Series named ``"SMA_<ndays>"`` rounded to two decimals.
    Rows before the window is full are NaN.
    """
    rolling_mean = df.close.rolling(ndays).mean()
    labelled = rolling_mean.rename(f"SMA_{ndays}")
    return labelled.astype(float).round(2)


def BBANDS(df, n):
    """Bollinger bands of ``df.close`` at two standard deviations.

    Both the centre line (rolling mean) and the spread (rolling standard
    deviation) use a window of ``n`` rows.

    Returns:
        ``(upper_band, lower_band)`` as float Series rounded to two decimals.
    """
    window = df.close.rolling(window=n)
    centre = window.mean()
    spread = window.std()
    upper = centre + (2 * spread)
    lower = centre - (2 * spread)
    return upper.astype(float).round(2), lower.astype(float).round(2)


def RSI(df, periods=14):
    """Relative Strength Index of ``df.close``.

    Gains and losses are smoothed with an exponentially weighted mean
    (``com=periods - 1``, ``adjust=True``); no value is produced until
    ``periods`` observations are available.

    Returns:
        Float Series rounded to two decimals.
    """
    delta = df.close.diff()
    gains = delta.clip(lower=0)
    losses = -1 * delta.clip(upper=0)

    smoothing = dict(com=periods - 1, adjust=True, min_periods=periods)
    avg_gain = gains.ewm(**smoothing).mean()
    avg_loss = losses.ewm(**smoothing).mean()

    relative_strength = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + relative_strength))
    return rsi.astype(float).round(2)


def MACD(df):
    """MACD line, signal line and histogram of ``df["close"]``.

    Uses 12- and 26-span exponential means for the MACD line and a 9-span
    exponential mean of that line for the signal. Each result is looked up
    through ``df.index.map`` before being returned.

    Returns:
        ``(macd, signal, histogram)``.
    """
    close = df["close"]
    fast_ema = close.ewm(span=12, adjust=False, min_periods=12).mean()
    slow_ema = close.ewm(span=26, adjust=False, min_periods=26).mean()

    macd_line = fast_ema - slow_ema
    signal_line = macd_line.ewm(span=9, adjust=False, min_periods=9).mean()
    histogram = macd_line - signal_line

    return tuple(
        df.index.map(series) for series in (macd_line, signal_line, histogram)
    )


def preprocess(df):
    """Build the model's feature columns from a raw OHLCV frame.

    Expects ``open``, ``high``, ``low``, ``close`` and ``volume`` columns.
    Adds ``feature_*`` columns (raw OHLCV plus SMA-50, Bollinger bands,
    RSI-14 and MACD) and drops every row that contains a NaN, which removes
    the indicator warm-up rows at the start.

    The input frame is not modified; a new DataFrame is returned.
    """
    # Work on a copy so repeated calls (or re-running a notebook cell) never
    # see a frame that was already altered by a previous call.
    df = df.copy()

    df["volume"] = df.volume.astype(float).round(2)
    df["feature_close"] = df.close
    df["feature_open"] = df.open
    df["feature_high"] = df.high
    df["feature_low"] = df.low
    df["feature_volume"] = df.volume
    df["feature_SMA"] = SMA(df, 50)
    # BBANDS returns (upper, lower). The middle band is the 50-row mean,
    # which is already present as feature_SMA, so the first value is the
    # upper band and is labelled as such. Column position is unchanged.
    df["feature_UpperBand"], df["feature_LowerBand"] = BBANDS(df, 50)
    df["feature_RSI"] = RSI(df, periods=14)
    df["feature_MACD"], df["feature_MACD_S"], df["feature_MACD_H"] = MACD(df)

    return df.dropna()

In [None]:
# Reward function

def reward_function(history):
    """Reward based on total return, overridden by fixed penalties for low trade counts.

    The number of position changes over the whole history selects the reward:

    * 1-10 changes  -> -100
    * 11-20 changes -> -90
    * 21-30 changes -> -80
    * otherwise     -> percentage return since the first recorded valuation

    NOTE(review): zero position changes falls into the "otherwise" branch, so
    never trading is not penalised even though 1-10 trades is. Confirm whether
    that is intended.
    """
    start_value = history["portfolio_valuation", 0]
    latest_value = history["portfolio_valuation", -1]

    n_changes = np.sum(np.diff(history["position"]) != 0)

    # Guard clauses, ordered from the harshest penalty to the mildest.
    if 0 < n_changes <= 10:
        return -100
    if 10 < n_changes <= 20:
        return -90
    if 20 < n_changes <= 30:
        return -80

    return 100 * (latest_value / start_value - 1)

In [None]:

def basic_reward_function(history):
    """Log return of the portfolio valuation over the most recent step."""
    latest_value = history["portfolio_valuation", -1]
    previous_value = history["portfolio_valuation", -2]
    return np.log(latest_value / previous_value)

In [None]:
def adjusted_reward_function_with_holding_penalty(history):
    """Last-step log return plus a trade bonus and a holding penalty.

    Both the bonus and the penalty are computed over the entire history
    passed in, not only the most recent step:

    * bonus:   0.01 for every position change
    * penalty: 0.001 for every step elapsed between consecutive position changes
    """
    # Constants
    TRADE_FREQUENCY_REWARD = 0.01  # bonus granted per position change
    HOLDING_PENALTY_FACTOR = -0.001  # penalty coefficient per step a position is held

    # Log return of the latest step
    log_return = np.log(history["portfolio_valuation", -1] / history["portfolio_valuation", -2])

    # Boolean mask of the steps at which the position changed
    changed = np.diff(history["position"]) != 0

    # Bonus proportional to the number of position changes
    position_changes = np.sum(changed)
    trade_frequency_reward = TRADE_FREQUENCY_REWARD * position_changes

    # Penalty proportional to the gaps between consecutive position changes
    change_steps = np.nonzero(changed)[0]
    holding_periods = np.diff(change_steps)
    holding_penalty = np.sum(holding_periods * HOLDING_PENALTY_FACTOR)

    # Final reward
    return log_return + trade_frequency_reward + holding_penalty


In [None]:
# Environment


def _count_position_changes(history):
    """Number of steps at which the position differs from the previous step."""
    return np.sum(np.diff(history["position"]) != 0)


def _episode_length(history):
    """Number of position entries recorded in the history."""
    return len(history["position"])


# Training environment built from every pickle matched by the glob below;
# each dataset is passed through `preprocess` before use.
env = gym.make(
    "MultiDatasetTradingEnv",
    dataset_dir="D:/Destktop/PyCharm_Projects/sb3/data/train/day/**/**/*.pkl",
    preprocess=preprocess,
    # No reward_function is passed, so the environment's default applies.
    # reward_function=adjusted_reward_function_with_holding_penalty,
    positions=[-10, 10],
    trading_fees=0.0001,
    borrow_interest_rate=0.01,
)

# Extra metrics reported alongside the environment's own.
env.unwrapped.add_metric("Position Changes", _count_position_changes)
env.unwrapped.add_metric("Episode Length", _episode_length)

In [None]:
# Train

# Number of environment steps to train for.
TOTAL_TIMESTEPS = 10_000_000

warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)
model = QRDQN(
    MlpPolicy,
    env,
    # The replay buffer is preallocated for `buffer_size` transitions. A
    # single learn() call of TOTAL_TIMESTEPS steps can never store more than
    # that, so the previous value of 100_000_000 reserved ten times the
    # memory that could ever be used. Sizing it to the run keeps every
    # transition while cutting the allocation by 90%.
    buffer_size=TOTAL_TIMESTEPS,
    batch_size=256,
    verbose=0,
    device="auto",
    tensorboard_log="./tensorboard/",
)
model.learn(total_timesteps=TOTAL_TIMESTEPS, progress_bar=True)

# vec_env = model.get_env()
# obs = vec_env.reset()

In [77]:
# Play one full episode on the training environment with the deterministic
# policy, summing the per-step rewards into `reward`.
reward = 0.0
done = truncated = False
observation, info = env.reset()

warnings.filterwarnings("ignore", category=UserWarning)
while not (done or truncated):
    # predict() returns (action, state); only the action is needed here.
    action, _state = model.predict(observation, deterministic=True)
    observation, r, done, truncated, info = env.step(action)
    reward += r

# env.unwrapped.save_for_render(dir="render_logs")

Market Return : -1.03%   |   Portfolio Return : -19.27%   |   Position Changes : 0   |   Episode Length : 1330   |   


In [None]:
# data = pd.read_csv("1.csv", parse_dates=["date"], index_col="date")

def get_random_data(data_root="D:/Destktop/PyCharm_Projects/sb3/data/train/day"):
    """Load one randomly chosen pickled dataset from ``<data_root>/<year>/<month>/``.

    A year in 2018-2022 and a month in 1-12 are drawn uniformly, then one
    ``*.pkl`` file is drawn uniformly from that directory. Entries that are
    not ``.pkl`` files are ignored, matching the ``*.pkl`` glob used for the
    training environment.

    Args:
        data_root: Directory containing the ``<year>/<month>`` sub-directories.
            Defaults to the location used by the training environment.

    Returns:
        Whatever object the selected pickle contains (a DataFrame is expected
        by the evaluation code).

    Raises:
        FileNotFoundError: If the chosen directory is missing or holds no
            ``.pkl`` file.
    """
    year = random.randint(2018, 2022)
    month = random.randint(1, 12)
    data_dir = os.path.join(data_root, str(year), str(month))

    # Sorted so that a seeded `random` picks the same file on every platform.
    candidates = sorted(
        name for name in os.listdir(data_dir) if name.endswith(".pkl")
    )
    if not candidates:
        raise FileNotFoundError(f"no .pkl files found in {data_dir}")

    # NOTE: unpickling can execute arbitrary code; only point this at
    # trusted, locally produced data.
    return pd.read_pickle(os.path.join(data_dir, random.choice(candidates)))

# Evaluate a saved checkpoint on one randomly chosen dataset.
data = get_random_data()

evaluate_env = gym.make(
    "TradingEnv",
    # The training environment builds its feature columns through
    # `preprocess`, so the evaluation frame must go through the same function
    # for the observations to have the layout the network was trained on.
    df=preprocess(data),
    positions=[-10, 10],
    trading_fees=0.0001,
    # NOTE(review): the training environment uses borrow_interest_rate=0.01;
    # confirm that a different rate is intended for evaluation.
    borrow_interest_rate=0.0003,
)
evaluate_env.unwrapped.add_metric(
    "Position Changes", lambda history: np.sum(np.diff(history["position"]) != 0)
)
evaluate_env.unwrapped.add_metric("Episode Length", lambda history: len(history["position"]))

# `load` is a classmethod that RETURNS the restored model. Calling it on an
# existing instance and ignoring the result leaves that instance with its
# freshly initialised (untrained) weights, so the returned model must be kept.
evaluate_model = QRDQN.load(
    "./model/QRDQN/1m_+-10_0.01.zip",
    env=evaluate_env,
    device="auto",
)

reward = 0.0
done, truncated = False, False
observation, info = evaluate_env.reset()

warnings.filterwarnings("ignore", category=UserWarning)
while not done and not truncated:
    # predict() returns (action, state); only the action is needed here.
    action, _ = evaluate_model.predict(observation, deterministic=True)
    observation, r, done, truncated, info = evaluate_env.step(action)
    reward += r

print(reward)
# evaluate_env.unwrapped.save_for_render(dir="render_logs")

In [None]:
def _annualized_return_pct(values, index):
    """Format the annualised return of ``values`` as a percentage string.

    The total growth factor (last value / first value) is raised to the power
    ``365 days / elapsed time``, where the elapsed time is the difference
    between the last and first entries of ``index``.
    Assumes ``index`` holds datetimes - TODO confirm against the render logs.
    """
    growth = values.iloc[-1] / values.iloc[0]
    exponent = pd.Timedelta(days=365) / (index.values[-1] - index.values[0])
    return f"{(growth ** exponent - 1) * 100:0.2f}%"


renderer = Renderer(render_logs_dir="render_logs")
renderer.add_metric(
    "Annual Market Return",
    lambda df: _annualized_return_pct(df["close"], df.index),
)
renderer.add_metric(
    "Annual Portfolio Return",
    lambda df: _annualized_return_pct(df["portfolio_valuation"], df.index),
)
renderer.run()

In [None]:
# Write the trained agent to disk as a zip checkpoint.
MODEL_SAVE_PATH = "./model/QRDQN/1m_+-10_0.01_2.zip"
model.save(MODEL_SAVE_PATH)