In [1]:
import multiprocessing
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

from data.coin_load_data_multi import coin_load_data
from env.envE2 import TradingEnv
from env.envE2multi import FinalTradingEnv
from network.finaldecision import FinalDecisionNetwork
from network.transformer import TransformerPolicy

In [3]:
def train_agent(env, timesteps, device="cuda", learning_rate=2e-4):
    """Train a PPO agent on ``env`` and return the trained model.

    Args:
        env: Environment handed straight to ``PPO`` (the notebook passes a
            ``DummyVecEnv``).
        timesteps: Value forwarded to ``learn`` as ``total_timesteps``.
        device: Device string forwarded to ``PPO``; defaults to ``"cuda"``.
        learning_rate: Learning rate forwarded to ``PPO``. Defaults to
            ``2e-4``, the value that was previously hard-coded.

    Returns:
        The ``PPO`` model after ``learn`` has been called on it.
    """
    model = PPO(
        "MlpPolicy",
        env,
        verbose=1,
        device=device,
        learning_rate=learning_rate,
    )
    model.learn(total_timesteps=timesteps)
    return model

In [None]:
# Load the four timeframe datasets returned by coin_load_data().
data_5min, data_15min, data_1hour, data_4hour = coin_load_data()

# Split every dataset the same way: with shuffle=False the row order is kept,
# so the first 85% of rows become the training set and the last 15% the test
# set (random_state has no effect when shuffling is disabled).
(
    (train_5min, test_5min),
    (train_15min, test_15min),
    (train_1hour, test_1hour),
    (train_4hour, test_4hour),
) = (
    train_test_split(frame, test_size=0.15, random_state=42, shuffle=False)
    for frame in (data_5min, data_15min, data_1hour, data_4hour)
)

In [None]:
# Training data per timeframe. Only the training splits are used here so the
# held-out test rows never reach the sub-agents.
train_frames = {
    "5min": train_5min,
    "15min": train_15min,
    "1hour": train_1hour,
    "4hour": train_4hour,
}

# One vectorised TradingEnv per timeframe. `frame` is bound as a default
# argument so each factory keeps a reference to its own dataset.
envs = {
    name: DummyVecEnv([lambda frame=frame: TradingEnv(frame)])
    for name, frame in train_frames.items()
}

# Training budget: number of training rows times a per-timeframe multiplier.
timestep_multipliers = {"5min": 1, "15min": 3, "1hour": 6, "4hour": 10}
timesteps = {
    name: frame.shape[0] * timestep_multipliers[name]
    for name, frame in train_frames.items()
}

# Train the four agents concurrently in a thread pool. A process pool would
# have to pickle the environments and the trained models across process
# boundaries, and under the "spawn" start method worker processes cannot
# import a function that is defined in a notebook cell.
with ThreadPool(processes=len(train_frames)) as pool:
    results = pool.starmap(
        train_agent,
        [(envs[name], timesteps[name]) for name in train_frames],
    )

# starmap preserves input order: 5min, 15min, 1hour, 4hour.
model_5min, model_15min, model_1hour, model_4hour = results

In [None]:
# Build the final environment that combines the four timeframe agents.
# The 5-minute data provides the base state and reward; the training split is
# used so the held-out test rows stay unseen during training.
final_trading_env = FinalTradingEnv(
    models=[model_5min, model_15min, model_1hour, model_4hour],
    base_env=TradingEnv(train_5min),
    render_mode="human",
)

# The raw environment has its own name so the factory below does not close
# over a variable that is rebound to the vectorised wrapper.
final_env = DummyVecEnv([lambda: final_trading_env])

# Train the final decision agent with PPO.
final_model = PPO("MlpPolicy", final_env, verbose=1, device="cuda")
final_model.learn(total_timesteps=100_000)