In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [4]:
df = pd.read_csv("data_hasil_olah_Aave.csv")
df.head()

Unnamed: 0,SNo,Name,Symbol,Date,High,Low,Open,Close,Volume,Marketcap,log_return,ma5,ma20,ma_ratio,volatility,rsi
0,20,Aave,AAVE,2020-10-24 23:59:59,42.439706,38.728822,41.686637,39.455022,57658170.0,421421100.0,-0.737283,37.302737,43.395799,-1.074054,0.526385,-0.981832
1,21,Aave,AAVE,2020-10-25 23:59:59,40.928509,37.601201,39.455022,37.904761,43970450.0,404862600.0,-0.562345,38.447685,42.630075,-0.841598,-0.028793,-1.120489
2,22,Aave,AAVE,2020-10-26 23:59:59,39.163864,35.071968,37.904763,36.033922,50307220.0,384880100.0,-0.685704,38.622399,42.311691,-0.781608,-0.030617,-1.480912
3,23,Aave,AAVE,2020-10-27 23:59:59,38.869182,34.469635,36.033914,34.871531,58581310.0,372464600.0,-0.476899,37.990375,42.051069,-0.833117,-0.335354,-1.483095
4,24,Aave,AAVE,2020-10-28 23:59:59,35.564022,32.000059,34.871536,32.394083,60533870.0,346002800.0,-0.956042,36.131864,41.48255,-1.011292,-1.505724,-1.609274


In [5]:
# df.head()

In [6]:
df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
df['target'] = df['log_return'].shift(-1)

df = df.dropna()

features = ['log_return','Volume']

X = df[features]
y = df['target']

In [11]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class TradingEnv(gym.Env):
    def __init__(self, df, window_size=10):
        super(TradingEnv, self).__init__()
        
        self.df = df
        self.window_size = window_size
        self.current_step = window_size
        self.transaction_cost = 0.0005
        
        # 0 = Short, 1 = Long
        self.action_space = spaces.Discrete(2)
        
        # window log_return
        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(window_size,),
            dtype=np.float32
        )
        
        self.position = 0  # -1 short, 1 long
        self.balance = 1.0

    def reset(self, seed=None, options=None):
        self.current_step = self.window_size
        self.position = 0
        self.balance = 1.0
        return self._get_obs(), {}

    def _get_obs(self):
        return self.df['log_return'].iloc[
            self.current_step-self.window_size:self.current_step
        ].values.astype(np.float32)

    def step(self, action):
        done = False
        
        # ubah action jadi posisi
        new_position = 1 if action == 1 else -1
        
        log_return = self.df['log_return'].iloc[self.current_step]
        
        reward = new_position * log_return
        
        # biaya transaksi kalau ganti posisi
        if new_position != self.position:
            reward -= self.transaction_cost
        
        self.balance *= (1 + reward)
        
        self.position = new_position
        self.current_step += 1
        
        if self.current_step >= len(self.df)-1:
            done = True
            
        return self._get_obs(), reward, done, False, {}

In [12]:
from stable_baselines3 import PPO

env = TradingEnv(df)

model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=0.0003,
    n_steps=64,
    batch_size=32
)

model.learn(total_timesteps=20000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------
| time/              |     |
|    fps             | 440 |
|    iterations      | 1   |
|    time_elapsed    | 0   |
|    total_timesteps | 64  |
----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 418           |
|    iterations           | 2             |
|    time_elapsed         | 0             |
|    total_timesteps      | 128           |
| train/                  |               |
|    approx_kl            | 1.2873672e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.693        |
|    explained_variance   | -0.112        |
|    learning_rate        | 0.0003        |
|    loss                 | 0.0135        |
|    n_updates            | 10            |
|    policy_gradient_loss | -0.00105      |

<stable_baselines3.ppo.ppo.PPO at 0x1c1111acf50>

In [13]:
obs, _ = env.reset()
done = False

total_reward = 0

while not done:
    action, _ = model.predict(obs)
    obs, reward, done, _, _ = env.step(action)
    total_reward += reward

print("Final Balance:", env.balance)
print("Total Reward:", total_reward)

Final Balance: 99.26465982831341
Total Reward: 5.465412499784765
