In [21]:
import gymnasium as gym
from gymnasium import spaces
from gymnasium.envs.registration import register

import numpy as np
import pandas as pd

class TradeEnv(gym.Env):
    
    def __init__(self, input_dim, max_steps, cash_balance, df):
        super(TradeEnv, self).__init__()
        self.input_dim = input_dim
        self.action_space = spaces.Discrete(3)  # actions: buy, sell and hold
        self.observation_space = spaces.Box(low=np.zeros(self.input_dim), high=np.ones(self.input_dim) * np.inf, dtype=np.float64)
        self.df = df
        self.df_counter = 0

        self.current_step = 0
        self.max_steps = max_steps  # episode terminates when day ends

        self.state = None

        self.hold_counter = 0
        self.impossible_sell_counter = 0
        self.impossible_buy_counter = 0

        self.current_price = 0
        self.initial_cash = cash_balance

        self.inventory = {'Last Buy Price': 0,
                          'Last Sell Price': 0,
                          'Holdings': 0,
                          'Cash Balance': cash_balance,
                          'Profit': 0,
                          'Volatility': 0,
                          'Sharpe Ratio': 0}
        

    def step(self, action):
        self.current_step += 1
        self.df_counter += 1
        done = self.current_step == self.max_steps

        reward = self.calculate_pnl(action)
        self.state = self.update_state(self.df[self.df_counter])

        info = {"Profit": self.inventory["Profit"]}

        return self.state, reward, done, False, info
    
    def update_state(self, next_state):
        self.state = next_state
        return self.state
    

    def reset(self, seed=None):
        super().reset(seed=seed)
        self.__init__(self.input_dim, self.max_steps, self.initial_cash, self.df)
        self.df_counter = 0
        self.state = self.df[self.df_counter]

        info = {}

        return self.state, info


    def calculate_pnl(self, action):

        self.current_price = self.state[0]  # select open price for OHLCVT data

        if action == 0: # buy
            if self.inventory["Holdings"] == 0:   # need to have empty inventory to buy
                buy_quantity = self.inventory["Cash Balance"] / self.current_price
                self.inventory['Cash Balance'] = 0
                self.inventory["Holdings"] = buy_quantity
                self.hold_counter = 0
                self.inventory["Last Buy Price"] = self.current_price
                reward = 5
            else:
                self.impossible_buy_counter += 1
                reward = -self.impossible_buy_counter

        elif action == 1: # sell
            if self.inventory["Holdings"] > 0:  # need to own in order to sell
                self.inventory["Cash Balance"] += self.inventory["Holdings"] * self.current_price
                reward = self.inventory["Cash Balance"] - self.inventory["Holdings"] * self.inventory["Last Buy Price"]
                self.inventory["Holdings"] = 0
                self.inventory["Last Buy Price"] = 0
                self.hold_counter = 0
                self.inventory['Profit'] += reward
            else:
                self.impossible_sell_counter += 1
                reward = -self.impossible_sell_counter

        elif action == 2: # hold
            self.hold_counter += 1
            reward = -self.hold_counter

        print(self.hold_counter)

        return reward

**Hyperparameters to tune:**
- Learning Rate: Controls how much the model's weights are updated during training. A higher learning rate might lead to faster learning but can cause instability. A lower learning rate ensures more stable but slower learning.
- N_steps: Number of steps the agent takes before updating its policy. It's a trade-off between performance and memory usage. In a trading environment, this could be aligned with the frequency of decision-making.
- Gamma (Discount Factor): Determines the importance of future rewards. A lower value makes the agent short-sighted by discounting future rewards heavily.
- Gae_lambda (Generalized Advantage Estimator): Balances bias and variance in the advantage estimation. It affects how the agent evaluates the trade-off between immediate and future rewards.
- Ent_coef (Entropy Coefficient): Encourages exploration by adding an entropy bonus to the objective function. Higher entropy can help explore more strategies in a complex trading market.
- Seed: Sets the random seed for reproducibility of training results.
- Use_sde (Stochastic Differential Equations): If enabled, introduces stochasticity in the policy, which can help exploration.

In [9]:
df = pd.read_csv('ETHUSD_5.csv', header=None, names=['Time', 'Open', 'High', 'Low', 'Close', 'Volume', 'Trades'])
df['Time'] = pd.to_datetime(df['Time'], unit='s')
df.set_index('Time', inplace=True)
df = df.to_numpy()

In [11]:
import numpy as np
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.evaluation import evaluate_policy

env = TradeEnv(6, 288, 1000, df)
model = RecurrentPPO("MlpLstmPolicy", env, verbose=1, learning_rate=0.001, n_steps=32, gamma=0.65, ent_coef=0.05, seed=42)
model.learn(20000)

vec_env = model.get_env()
mean_reward, std_reward = evaluate_policy(model, vec_env, n_eval_episodes=20, warn=False)
print(mean_reward)

# model.save("ppo_recurrent")
# del model # remove to demonstrate saving and loading

# model = RecurrentPPO.load("ppo_recurrent")

# obs = vec_env.reset()
# # cell and hidden state of the LSTM
# lstm_states = None
# num_envs = 1
# # Episode start signals are used to reset the lstm states
# episode_starts = np.ones((num_envs,), dtype=bool)
# while True:
#     action, lstm_states = model.predict(obs, state=lstm_states, episode_start=episode_starts, deterministic=True)
#     obs, rewards, dones, info = vec_env.step(action)
#     episode_starts = dones
#     vec_env.render("human")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 1088 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 32   |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 169        |
|    iterations           | 2          |
|    time_elapsed         | 0          |
|    total_timesteps      | 64         |
| train/                  |            |
|    approx_kl            | 0.02348095 |
|    clip_fraction        | 0.188      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.09      |
|    explained_variance   | -0.00518   |
|    learning_rate        | 0.001      |
|    loss                 | 3.12       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0336    |
|    value_loss           | 10.4       |

<sb3_contrib.ppo_recurrent.ppo_recurrent.RecurrentPPO at 0x17a386510>

In [22]:
env_test = TradeEnv(6, 288, 1000, df)
mean_reward, std_reward = evaluate_policy(model, env_test, n_eval_episodes=20, warn=False)
mean_reward, std_reward

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


(-41616.0, 0.0)

In [18]:
env_test.inventory

AttributeError: 'DummyVecEnv' object has no attribute 'inventory'