In [None]:
# module import

import warnings

import gym_trading_env  # noqa
import gymnasium as gym
import numpy as np
import pandas as pd
import torch
from gym_trading_env.renderer import Renderer
from gymnasium.envs.registration import register
from sb3_contrib import QRDQN, RecurrentPPO
from sb3_contrib.qrdqn import MlpPolicy
from sb3_contrib.ppo_recurrent import MlpLstmPolicy
from sklearn.preprocessing import robust_scale
from stable_baselines3.common.utils import get_device
# from stable_baselines3.ppo import MlpPolicy
from tqdm import TqdmExperimentalWarning

In [None]:
# Preprocess


def SMA(df, ndays):
    """Return the simple moving average of ``df.close`` over ``ndays`` rows.

    The result is a float Series named ``SMA_<ndays>``, rounded to two
    decimals. Rows that do not yet have a full window are NaN.
    """
    rolling_mean = df.close.rolling(ndays).mean()
    labelled = pd.Series(rolling_mean, name=f"SMA_{ndays}")
    return labelled.astype(float).round(2)


def BBANDS(df, n):
    """Return the ``(upper, lower)`` Bollinger Bands of ``df.close``.

    The bands sit two rolling standard deviations above and below the
    ``n``-row rolling mean. Both Series are cast to float and rounded to two
    decimals.
    """
    window = df.close.rolling(window=n)
    center = window.mean()
    spread = 2 * window.std()
    upper = (center + spread).astype(float).round(2)
    lower = (center - spread).astype(float).round(2)
    return upper, lower


def RSI(df, periods=14):
    """Return the Relative Strength Index of ``df.close``, rounded to 2 decimals.

    Gains and losses of the close-to-close change are each smoothed with an
    exponentially weighted mean (``com=periods - 1``, ``min_periods=periods``),
    so rows with fewer than ``periods`` observed changes are NaN.
    """
    delta = df.close.diff()
    ewm_options = dict(com=periods - 1, adjust=True, min_periods=periods)

    avg_gain = delta.clip(lower=0).ewm(**ewm_options).mean()
    avg_loss = (-1 * delta.clip(upper=0)).ewm(**ewm_options).mean()

    relative_strength = avg_gain / avg_loss
    return (100 - (100 / (1 + relative_strength))).astype(float).round(2)


def MACD(df):
    """Return the ``(MACD line, signal line, histogram)`` of ``df["close"]``.

    The MACD line is the 12-span EMA minus the 26-span EMA, the signal line
    is the 9-span EMA of the MACD line, and the histogram is their
    difference. Each Series is cast to float and rounded to two decimals.
    """
    close = df["close"]

    def ema(series, span):
        # Non-adjusted EMA that stays NaN until `span` observations exist.
        return series.ewm(span=span, adjust=False, min_periods=span).mean()

    macd_line = ema(close, 12) - ema(close, 26)
    signal_line = ema(macd_line, 9)
    histogram = macd_line - signal_line
    return tuple(
        part.astype(float).round(2)
        for part in (macd_line, signal_line, histogram)
    )


def add_robust_features(df):
    """Write robust-scaled price/volume ratio features onto ``df`` in place.

    The features are the close-to-close percentage change, open/high/low
    relative to close, and volume relative to its rolling ``7*24``-row
    maximum. Rows containing NaN are then dropped from ``df`` itself, and the
    same frame is returned.
    """
    raw_features = {
        "feature_close": df.close.pct_change(),
        "feature_open": df.open / df.close,
        "feature_high": df.high / df.close,
        "feature_low": df.low / df.close,
        "feature_volume": df.volume / df.volume.rolling(7 * 24).max(),
    }
    for column, values in raw_features.items():
        df[column] = robust_scale(values)

    df.dropna(inplace=True)
    return df


def normalize(df):
    """Min-max scale every column whose name contains "feature" into [0, 1].

    Args:
        df: DataFrame with string column names; it is not modified.

    Returns:
        A copy of ``df`` in which each feature column is replaced by
        ``(x - min) / (max - min)``. A constant column (``max == min``) is
        mapped to 0.0 rather than the NaN a 0/0 division would produce; NaN
        entries stay NaN.
    """
    result = df.copy()
    feature_columns = [name for name in df.columns if "feature" in name]
    for feature_name in feature_columns:
        column = df[feature_name]
        min_value = column.min()
        value_range = column.max() - min_value
        if value_range == 0:
            # Constant column: every finite value equals the minimum.
            result[feature_name] = (column - min_value).astype(float)
        else:
            result[feature_name] = (column - min_value) / value_range
    return result


def robust(df):
    """Return a copy of ``df`` whose "feature" columns are robust-scaled.

    Every column whose name contains "feature" is passed through
    ``robust_scale``; all other columns are copied unchanged. The input frame
    is not modified.
    """
    scaled = df.copy()
    for column in df.columns:
        if "feature" not in column:
            continue
        scaled[column] = robust_scale(df[column])
    return scaled


def stochastic_fast_k(df, n=5):
    """Return the fast stochastic %K of ``df.close`` over an ``n``-row window.

    %K is the position of the close inside the rolling
    [lowest low, highest high] range, expressed as a percentage.
    """
    lowest_low = df.low.rolling(n).min()
    highest_high = df.high.rolling(n).max()
    return ((df.close - lowest_low) / (highest_high - lowest_low)) * 100


def stochastic_slow_k(fast_k, n=3):
    """Return slow %K: the ``n``-row rolling mean of the fast %K series."""
    return fast_k.rolling(window=n).mean()


def stochastic_slow_d(slow_k, n=3):
    """Return slow %D: the ``n``-row rolling mean of the slow %K series."""
    return slow_k.rolling(window=n).mean()


def OBV(df):
    """Return the On-Balance Volume of ``df`` as a float Series (2 decimals).

    OBV is the running total of volume, where each row's volume is added when
    the close rose versus the previous row, subtracted when it fell, and
    ignored when it was unchanged. The first row has no previous close and
    therefore contributes 0.

    Args:
        df: DataFrame with ``close`` and ``volume`` columns.
    """
    # sign(diff) is +1 / -1 / 0 for up / down / flat closes; the first row's
    # NaN diff is treated as "no move".
    direction = np.sign(df.close.diff()).fillna(0)
    obv = (direction * df.volume).cumsum()
    return obv.astype(float).round(2)


def preprocess(df):
    """Build the NaN-free feature frame from a raw OHLCV DataFrame.

    Adds ``feature_*`` columns for close/open/high/low/volume, SMA(7/25/99),
    Bollinger Bands(21), MACD and RSI(6/12/24), drops every row that contains
    a NaN, and robust-scales all feature columns except the RSI ones, which
    are kept on their native scale.

    Args:
        df: DataFrame with ``open``, ``high``, ``low``, ``close`` and
            ``volume`` columns. It is not modified.

    Returns:
        A new DataFrame; the RSI columns are the last three columns.
    """
    rsi_periods = (6, 12, 24)
    rsi_columns = [f"feature_RSI_{period}" for period in rsi_periods]

    df = df.copy()  # keep the caller's frame free of feature columns
    df["volume"] = df.volume.astype(float).round(2)
    df["feature_close"] = df.close
    df["feature_open"] = df.open
    df["feature_high"] = df.high
    df["feature_low"] = df.low
    df["feature_volume"] = df.volume
    df["feature_SMA_7"] = SMA(df, 7)
    df["feature_SMA_25"] = SMA(df, 25)
    df["feature_SMA_99"] = SMA(df, 99)
    # NOTE(review): BBANDS returns (upper, lower), so "feature_MiddleBand"
    # actually holds the UPPER band. The column name is kept so existing
    # consumers of this frame keep working.
    df["feature_MiddleBand"], df["feature_LowerBand"] = BBANDS(df, 21)
    df["feature_MACD"], df["feature_MACD_S"], df["feature_MACD_H"] = MACD(df)

    # RSI is computed before the dropna so that its warm-up NaNs are removed
    # together with the other indicators' instead of leaking into the output.
    for period, column in zip(rsi_periods, rsi_columns):
        df[column] = RSI(df, periods=period)
    df = df.dropna()

    # Scale everything except RSI, then re-attach the unscaled RSI columns at
    # the end (same row index, so no realignment is needed).
    df_robust = robust(df.drop(columns=rsi_columns))
    for column in rsi_columns:
        df_robust[column] = df[column]

    return df_robust


def only_sub_indicators(df):
    """Attach RSI(6/12/24) and MACD feature columns, then drop incomplete rows.

    The indicator columns are written onto the frame that was passed in; the
    returned frame is the result of ``dropna`` on it.
    """
    # Stochastic-oscillator and OBV features are currently disabled.
    for period in (6, 12, 24):
        df[f"feature_RSI_{period}"] = RSI(df, periods=period)

    macd_columns = ("feature_MACD", "feature_MACD_S", "feature_MACD_H")
    for column, values in zip(macd_columns, MACD(df)):
        df[column] = values

    return df.dropna()


In [None]:
# Reward Function

def reward_only_position_changed(history):
    """Return the normalised valuation change since the previous position.

    The numerator is the latest portfolio valuation minus the previous step's
    ``prev_position_valuation``. The denominator is the larger of the initial
    portfolio valuation and that same ``prev_position_valuation``.
    """
    initial_valuation = history["portfolio_valuation", 0]
    reference_valuation = history["prev_position_valuation", -2]

    # On a tie the reference valuation is the one used as denominator.
    denominator = max(reference_valuation, initial_valuation)

    change = history["portfolio_valuation", -1] - reference_valuation
    return change / denominator

In [None]:
# Reward Function

def reward_only_position_changed_roe(history):
    """Return a return-on-equity reward that is non-zero only on position changes.

    * ``-1`` when the latest portfolio valuation is not positive.
    * ``0`` when the position is the same as on the previous step.
    * Otherwise the total return since the first step if the last step
      gained value while the total return is not positive, else the return
      relative to ``entry_valuation``.
    """
    latest = history["portfolio_valuation", -1]
    if latest <= 0:
        return -1

    # Both ratios are evaluated up front, before the position check.
    position_roe = latest / history["entry_valuation", -1] - 1
    total_roe = latest / history["portfolio_valuation", 0] - 1

    if history["position", -2] == history["position", -1]:
        return 0

    step_gained = history["portfolio_valuation", -2] < latest
    if step_gained and not total_roe > 0:
        return total_roe
    return position_roe

In [None]:
def reward_by_pnl(history):
    """Return the profit since entry, as a share of the initial valuation.

    Yields ``-1`` when the latest portfolio valuation is not positive and
    ``0`` when the position is the same as on the previous step. Otherwise
    the reward is the latest valuation minus ``entry_valuation``, divided by
    the first recorded portfolio valuation.
    """
    latest = history["portfolio_valuation", -1]
    if latest <= 0:
        return -1

    if history["position", -2] == history["position", -1]:
        return 0

    profit = latest - history["entry_valuation", -1]
    return profit / history["portfolio_valuation", 0]

In [None]:
def reward_done(history):
    """Return the total return since the first step, or ``-10`` when wiped out.

    A latest portfolio valuation that is not positive yields ``-10``;
    otherwise the reward is ``(latest - initial) / initial``.
    """
    latest = history["portfolio_valuation", -1]
    if latest <= 0:
        return -10

    initial = history["portfolio_valuation", 0]
    return (latest - initial) / initial

In [None]:
def calculate_volatility(portfolio_valuations):
    """Return the standard deviation of the log returns of the valuations.

    Returns 0 when fewer than two valuations are supplied, because no return
    can be computed from them.
    """
    if len(portfolio_valuations) >= 2:
        return np.std(np.diff(np.log(portfolio_valuations)))
    return 0

def reward_function_volatility(history):
    """Reward the total return, scaled by recent stability and episode length.

    Volatility is measured over a trailing window covering the last 10% of
    the steps so far (at least one step). The total return since the first
    step is multiplied by ``1 / (volatility + 0.01)`` and by
    ``max(1, step / 100)``, so a loss produces a negative reward.
    """
    latest = history[-1]
    current_step = latest['step']

    # Trailing window: the last 10% of the steps so far, never fewer than one.
    window = max(int((current_step + 1) * 0.1), 1)
    first_index = len(history) - window
    recent_valuations = [
        history[max(0, first_index + offset)]['portfolio_valuation']
        for offset in range(window)
    ]
    volatility = calculate_volatility(recent_valuations)

    # Total return since the first recorded step.
    total_roe = latest['portfolio_valuation'] / history[0]['portfolio_valuation'] - 1

    long_term_bonus = max(1, current_step / 100)
    stability_bonus = 1 / (volatility + 0.01)  # offset keeps the denominator non-zero

    # Losses keep their sign; everything else passes through max(0, ...).
    effective_roe = total_roe if total_roe < 0 else max(0, total_roe)
    return effective_roe * stability_bonus * long_term_bonus


In [None]:
import numpy as np

def calculate_volatility_and_sharpe(portfolio_valuations, risk_free_rate=0.0):
    """Return ``(volatility, sharpe_ratio)`` for a sequence of valuations.

    Volatility is the standard deviation of the log returns. The Sharpe ratio
    is the mean log return minus ``risk_free_rate / 252`` (``risk_free_rate``
    is an annual risk-free rate, default 0), divided by the volatility; it is
    0 when the volatility is 0. With fewer than two valuations the result is
    ``(0, 0)``.
    """
    if len(portfolio_valuations) < 2:
        return 0, 0

    returns = np.diff(np.log(portfolio_valuations))
    volatility = np.std(returns)
    if volatility == 0:
        return volatility, 0

    excess_return = np.mean(returns) - risk_free_rate / 252
    return volatility, excess_return / volatility

def reward_function_sharp(history, N_ratio=0.1, risk_free_rate=0.0):
    """Reward the total return, amplified by the recent Sharpe ratio.

    The Sharpe ratio is measured over a trailing window covering the last
    ``N_ratio`` share of the steps so far (at least one step). The total
    return since the first step is multiplied by ``max(1, step / 100)`` and by
    ``1 + |sharpe_ratio|``.

    The multiplier is never negative, so the reward always has the sign of
    the total return: a loss yields a negative reward and a profit a positive
    one. (A plain ``1 + sharpe_ratio`` factor turns negative once the Sharpe
    ratio drops below -1, which would reward a steadily losing run.)

    Args:
        history: step history; integer indexing yields a mapping with
            ``'step'`` and ``'portfolio_valuation'`` entries.
        N_ratio: share of the elapsed steps used for the Sharpe window.
        risk_free_rate: forwarded to ``calculate_volatility_and_sharpe``.

    Returns:
        The reward for the latest step.
    """
    latest = history[-1]
    current_step = latest['step']

    N = max(int((current_step + 1) * N_ratio), 1)
    first_index = len(history) - N
    portfolio_valuations = [
        history[max(0, first_index + i)]['portfolio_valuation'] for i in range(N)
    ]

    _, sharpe_ratio = calculate_volatility_and_sharpe(portfolio_valuations, risk_free_rate)

    total_roe = latest['portfolio_valuation'] / history[0]['portfolio_valuation'] - 1
    long_term_bonus = max(1, current_step / 100)

    # |sharpe| keeps the factor >= 1 so it can scale the reward but never
    # flip its sign.
    return total_roe * long_term_bonus * (1 + abs(sharpe_ratio))


In [None]:
import numpy as np

def calculate_volatility_and_sharpe(portfolio_valuations, risk_free_rate=0.0):
    """Return ``(volatility, sharpe_ratio)`` for a sequence of valuations.

    Volatility is the standard deviation of the log returns. The Sharpe ratio
    is the mean log return minus ``risk_free_rate / 252``, divided by the
    volatility; it is 0 when the volatility is 0. With fewer than two
    valuations the result is ``(0, 0)``.

    NOTE(review): this is an identical redefinition of the function defined
    in the previous cell; one of the two definitions should be removed.
    """
    if len(portfolio_valuations) < 2:
        return 0, 0

    returns = np.diff(np.log(portfolio_valuations))
    volatility = np.std(returns)
    if volatility == 0:
        return volatility, 0

    excess_return = np.mean(returns) - risk_free_rate / 252
    return volatility, excess_return / volatility

def calculate_max_drawdown(portfolio_valuations):
    """Return the maximum drawdown of a sequence of portfolio valuations.

    The drawdown at each point is the relative drop from the highest
    valuation seen up to and including that point (the running peak), so
    low values that occur before a later peak do not count as a drawdown.

    Args:
        portfolio_valuations: sequence of valuations in time order.

    Returns:
        The largest drawdown as a fraction (0.5 == 50%); 0.0 for an empty
        sequence.
    """
    values = np.asarray(portfolio_valuations, dtype=float)
    if values.size == 0:
        return 0.0

    running_peak = np.maximum.accumulate(values)
    drawdowns = 1 - values / running_peak
    return np.max(drawdowns)

def reward_function_sharp_with_volatility(history, N_ratio=0.1, risk_free_rate=0.0, volatility_penalty=0.5):
    """Reward the total return, damped by volatility and amplified by Sharpe.

    Volatility and Sharpe ratio are measured over a trailing window covering
    the last ``N_ratio`` share of the steps so far (at least one step). The
    total return since the first step is multiplied by

    * ``max(1, step / 100)``,
    * ``max(0, 1 - volatility * volatility_penalty)``, and
    * ``1 + |sharpe_ratio|``.

    None of the factors can be negative, so the reward never has the opposite
    sign of the total return: losses are never rewarded and profits are never
    penalised. (Unclamped, the volatility factor turns negative for large
    volatility and a plain ``1 + sharpe_ratio`` factor turns negative once the
    Sharpe ratio drops below -1.)

    Args:
        history: step history; integer indexing yields a mapping with
            ``"step"`` and ``"portfolio_valuation"`` entries.
        N_ratio: share of the elapsed steps used for the trailing window.
        risk_free_rate: forwarded to ``calculate_volatility_and_sharpe``.
        volatility_penalty: weight of the volatility damping.

    Returns:
        The reward for the latest step.
    """
    latest = history[-1]
    current_step = latest["step"]

    N = max(int((current_step + 1) * N_ratio), 1)
    first_index = len(history) - N
    portfolio_valuations = [
        history[max(0, first_index + i)]["portfolio_valuation"] for i in range(N)
    ]

    volatility, sharpe_ratio = calculate_volatility_and_sharpe(
        portfolio_valuations, risk_free_rate
    )

    total_roe = latest["portfolio_valuation"] / history[0]["portfolio_valuation"] - 1
    long_term_bonus = max(1, current_step / 100)

    # Clamp at zero so extreme volatility mutes the reward instead of
    # flipping its sign.
    volatility_adjustment = max(0.0, 1 - volatility * volatility_penalty)

    # |sharpe| keeps this factor >= 1 for the same reason.
    sharpe_adjustment = 1 + abs(sharpe_ratio)

    return total_roe * long_term_bonus * volatility_adjustment * sharpe_adjustment


In [None]:
from gte import MultiDatasetDiscretedTradingEnv
# Environment
from stable_baselines3.common.env_util import make_vec_env


# Vectorised training environment: 16 instances of the multi-dataset trading
# env, each wrapped so that its observation is flattened.
vec_env = make_vec_env(
    MultiDatasetDiscretedTradingEnv,
    n_envs=16,
    # vec_env_cls=SubprocVecEnv,
    wrapper_class=gym.wrappers.FlattenObservation,
    env_kwargs={
        "dataset_dir": "./data/train/day/**/**/*.pkl",
        "preprocess": only_sub_indicators,
        "reward_function": reward_only_position_changed,
        "positions": [-5, -2, 0, 2, 5],
        "trading_fees": 0.0001,
        "borrow_interest_rate": 0.00003,
        "window_size": 180,
        "portfolio_initial_value": 1000,
        "verbose": 2,
    },
)

In [None]:
# Register the custom environment under a gymnasium id so gym.make can build it.
register(
    id="MultiDatasetDiscretedTradingEnv",
    entry_point="gte:MultiDatasetDiscretedTradingEnv",
    disable_env_checker=True,
)

# Single (non-vectorised) environment on the monthly data, with a flattened
# observation.
env = gym.wrappers.FlattenObservation(
    gym.make(
        "MultiDatasetDiscretedTradingEnv",
        dataset_dir="./data/train/month/**/*.pkl",
        preprocess=only_sub_indicators,
        reward_function=reward_function_volatility,
        positions=[-5, -2, 0, 2, 5],
        trading_fees=0.0001,
        borrow_interest_rate=0.00003,
        window_size=240,
        portfolio_initial_value=1000,
        verbose=2,
    )
)

In [None]:
# Train: build a QR-DQN agent on the vectorised environment (CPU).
# NOTE(review): the RecurrentPPO cell below rebinds `model`; run only the cell
# for the algorithm you intend to train.

warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)
model = QRDQN(
    policy=MlpPolicy,
    env=vec_env,
    buffer_size=10_000_000,  # 1_000_000
    device="cpu",
    tensorboard_log="./tensorboard/",
    verbose=0,
    # Disabled options, kept for reference:
    # n_steps=1024,
    # batch_size=4,
    # optimize_memory_usage=True,
    # learning_rate =0.0000001, #0.00005
    # use_sde=False,
)

In [None]:
# Train: build a recurrent (LSTM) PPO agent on the vectorised environment.
# NOTE(review): this rebinds `model`, replacing the QR-DQN agent from the
# previous cell if that cell was run.

warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)
model = RecurrentPPO(
    policy=MlpLstmPolicy,
    env=vec_env,
    tensorboard_log="./tensorboard/",
    verbose=0,
    # Disabled options, kept for reference:
    # n_steps=1024*16,
    # batch_size=4,
    # device="cpu",
    # optimize_memory_usage=True,
    # learning_rate =0.0000001, #0.00005
    # use_sde=False,
)

In [None]:
# Train the current `model` for one million environment steps.
model.learn(progress_bar=True, total_timesteps=1_000_000)

In [None]:
import pprint

# Roll the trained policy forward for 1000 vectorised steps.
# A recurrent policy (RecurrentPPO) needs its LSTM state fed back into
# `predict` on every step, together with flags marking which environments
# just started a new episode; otherwise every prediction starts from a zero
# state. Non-recurrent policies accept and ignore both arguments.
obs = vec_env.reset()
lstm_states = None
episode_starts = np.ones((vec_env.num_envs,), dtype=bool)
for _ in range(1000):
    action, lstm_states = model.predict(
        obs,
        state=lstm_states,
        episode_start=episode_starts,
        deterministic=True,
    )
    obs, rewards, dones, info = vec_env.step(action)
    episode_starts = dones  # reset the LSTM state of finished environments
    # vec_env.render()

# Show the info dicts returned by the final step.
pprint.pprint(info)

In [None]:
def _annualized_return_pct(df, column):
    """Format the annualised growth of ``df[column]`` as a percentage string.

    The first-to-last growth ratio is raised to ``365 days / elapsed time``,
    where the elapsed time is taken from the first and last index values.
    """
    growth = df[column].iloc[-1] / df[column].iloc[0]
    annualisation_exponent = pd.Timedelta(days=365) / (
        df.index.values[-1] - df.index.values[0]
    )
    return f"{(growth ** annualisation_exponent - 1) * 100:0.2f}%"


# Render the logs in "render_logs" with two extra summary metrics.
renderer = Renderer(render_logs_dir="render_logs")
renderer.add_metric(
    "Annual Market Return",
    lambda df: _annualized_return_pct(df, "close"),
)
renderer.add_metric(
    "Annual Portfolio Return",
    lambda df: _annualized_return_pct(df, "portfolio_valuation"),
)
renderer.run()

In [12]:
# Persist the trained model to disk.
model.save(path="./model/RPPO/2024_04_11_1.zip")