In [1]:
'''Imports'''
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import gym
import random
import numpy as np
import pandas as pd
from gym import spaces
from gym.utils import seeding
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3 import DQN

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
'''Anytrading env'''

## More improvements
## --> More features, faster obs
## --> Not normalized data

# Position states the environment can be in (stored in `self.position`).
LONG, SHORT, FLAT = 0, 1, 2

# Discrete actions accepted by `TradingEnv.step`.
BUY, SELL, HOLD = 0, 1, 2

class TradingEnv(gym.Env):
    """Single-asset trading environment with LONG / SHORT / FLAT positions.

    The agent sees a sliding window of the last ``window_size`` rows of the
    feature columns (open, close, low, high, volume) and picks one of three
    discrete actions (BUY, SELL, HOLD). At most one position is open at a
    time; an action that is not valid for the current position is treated as
    HOLD.

    Config keys read by ``__init__``:
        train (bool): training mode -> random episode start; otherwise the
            episode starts at ``window_size`` and the final info dict is
            dumped to ``./info/``.
        show_trade (bool): print portfolio/trade counters every 100 ticks.
        window_size (int): number of rows in one observation.
        path (str): CSV file passed to ``pandas.read_csv``.
        fee (float, optional): proportional fee, default ``0.0005``.
    """

    def __init__(self, config):
        """Read the config, load the data and define action/observation spaces."""
        self.train       = config['train']
        self.show_trade  = config['show_trade']
        self.window_size = config['window_size']

        self.path    = config['path']
        # Only the length of this list is used (size of the action space).
        self.actions = ["LONG", "SHORT", "FLAT"]
        # Fee is configurable; the default matches the previous hard-coded value.
        self.fee = config.get('fee', 0.0005)

        self.seed()
        self._process_data()

        # n_features
        self.n_features = self.df.shape[1]
        self.shape      = (self.window_size, self.signal_features.shape[1])

        # defines action space
        self.action_space      = spaces.Discrete(len(self.actions))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.shape), dtype=np.float32)

    def _process_data(self):
        """Load the CSV and build row-aligned price and feature arrays.

        Sets ``self.closingPrices`` (1-D close prices), ``self.df`` and
        ``self.signal_features`` (2-D feature matrix). All three are derived
        from the same NaN-free frame so that row ``i`` means the same bar in
        every array.
        """
        raw_df = pd.read_csv(self.path)

        # These columns are not used as features; tolerate files without them.
        raw_df = raw_df.drop(columns=['date', 'compsum'], errors='ignore')
        feature_list = ['open', 'close', 'low', 'high', 'volume']

        # Drop NaN rows BEFORE extracting any array. Previously the
        # observations were taken from the un-cleaned frame while prices came
        # from the cleaned one, which shifted them against each other
        # whenever the file contained NaN rows.
        clean_df = raw_df.dropna()

        self.closingPrices = clean_df['close'].values
        self.df = clean_df[feature_list].values
        self.signal_features = self.df

    def render(self, mode='human', verbose=False):
        """Rendering is not implemented; always returns ``None``."""
        return None

    def seed(self, seed=None):
        """Create ``self.np_random`` from ``seed`` and return ``[seed]``."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        """Apply ``action`` at the current price and advance one tick.

        Args:
            action: BUY (0), SELL (1) or HOLD (2).

        Returns:
            ``(state, reward, done, info)``. ``reward`` is the realised
            return of a trade closed on this step (0 otherwise); on the
            terminal step the unrealised return of a still-open position is
            added. ``info`` holds ``portfolio``, ``history`` and ``n_trades``
            (it is an empty dict when called after the episode has ended).
        """
        if self.done:
            return self.state, self.reward, self.done, {}
        self.reward = 0

        # action comes from the agent
        # 0 buy, 1 sell, 2 hold
        # single position can be opened per trade
        # valid action sequence would be
        # LONG : buy - hold - hold - sell
        # SHORT : sell - hold - hold - buy
        # invalid action sequence is just considered hold
        # (e.g.) "buy - buy" would be considered "buy - hold"

        self.action = HOLD  # recorded action; stays HOLD unless a trade happens
        if action == BUY:
            if self.position == FLAT:
                # open a long position
                self.position = LONG
                self.action = BUY
                self.entry_price = self.closingPrice
            elif self.position == SHORT:
                # close the short position
                self.position = FLAT
                self.action = BUY
                self.exit_price = self.closingPrice
                # NOTE(review): the short return is normalised by the exit
                # price, not the entry price -- confirm this is intended.
                self.reward += ((self.entry_price - self.exit_price)/self.exit_price + 1)*(1-self.fee)**2 - 1
                self.krw_balance = self.krw_balance * (1.0 + self.reward)  # cumulative balance in krw-won
                self.entry_price = 0
                self.n_short += 1
        elif action == SELL:
            if self.position == FLAT:
                # open a short position
                self.position = SHORT
                self.action = SELL
                self.entry_price = self.closingPrice
            elif self.position == LONG:
                # close the long position
                self.position = FLAT
                self.action = SELL
                self.exit_price = self.closingPrice
                self.reward += ((self.exit_price - self.entry_price)/self.entry_price + 1)*(1-self.fee)**2 - 1
                self.krw_balance = self.krw_balance * (1.0 + self.reward)
                self.entry_price = 0
                self.n_long += 1

        # [coin + krw_won] total value evaluated in krw won: the balance
        # marked to market with the unrealised return of the open position.
        if self.position == FLAT:
            self.portfolio = self.krw_balance
        else:
            self.portfolio = self.krw_balance * (1.0 + self.get_profit())

        self.current_tick += 1
        if self.show_trade and self.current_tick % 100 == 0:
            print("Tick: {0}/ Portfolio (krw-won): {1}".format(self.current_tick, self.portfolio))
            print("Long: {0}/ Short: {1}".format(self.n_long, self.n_short))
        self.history.append((self.action, self.current_tick, self.closingPrice, self.portfolio, self.reward))

        # Move the traded price to the new tick. Without this the price set
        # in reset() was used for the whole episode, so every trade entered
        # and exited at the same price and rewards reduced to fees only.
        # The index is clamped so a very short data set cannot index past
        # the end on the terminal step.
        price_idx = min(self.current_tick, len(self.closingPrices) - 1)
        self.closingPrice = self.closingPrices[price_idx]
        self.state = self.obvs()

        info = {'portfolio': np.array([self.portfolio]),
                "history": self.history,
                "n_trades": {'long': self.n_long, 'short': self.n_short}}
        if self.current_tick > self.df.shape[0] - self.window_size - 1:
            self.done = True
            # Add (not assign) the unrealised profit: assigning discarded the
            # reward of a trade that was closed on this very step.
            self.reward += self.get_profit()
            if not self.train:
                import os
                # ndarray.dump does not create missing directories.
                os.makedirs('./info', exist_ok=True)
                np.array([info]).dump(
                    './info/ppo_{0}_LS_{1}_{2}.info'.format(self.portfolio,
                                                                 self.n_long,
                                                                 self.n_short))
        return self.state, self.reward, self.done, info

    def get_profit(self):
        """Return the unrealised return of the open position at ``self.closingPrice``.

        The gross return is scaled by ``(1 - fee) ** 2`` (presumably one fee
        on entry and one on exit). Returns 0 when the position is FLAT.
        """
        if self.position == LONG:
            return ((self.closingPrice - self.entry_price)/self.entry_price + 1)*(1-self.fee)**2 - 1
        if self.position == SHORT:
            return ((self.entry_price - self.closingPrice)/self.closingPrice + 1)*(1-self.fee)**2 - 1
        return 0

    def reset(self):
        """Start a new episode and return the first observation.

        In training mode the start tick is drawn uniformly from
        ``[window_size, n_rows - 800]``; otherwise it is ``window_size``.
        """
        if self.train:
            # Keep the upper bound >= the lower bound so short data sets do
            # not make random.randint raise ValueError.
            # NOTE: this uses the global `random` module, so it is not
            # controlled by self.seed().
            latest_start = max(self.window_size, self.df.shape[0] - 800)
            self.current_tick = random.randint(self.window_size, latest_start)
        else:
            self.current_tick = self.window_size

        # trade counters
        self.n_long = 0
        self.n_short = 0

        # clear internal variables
        self.history = []  # keep buy, sell, hold action history
        self.krw_balance = 100 * 10000  # initial balance, change it to whatever you like
        self.portfolio = float(self.krw_balance)  # (coin * current_price + current_krw_balance) == portfolio
        self.profit = 0
        self.reward = 0
        self.entry_price = 0  # do not carry a stale entry price across episodes
        self.closingPrice = self.closingPrices[self.current_tick]

        self.action = HOLD
        self.position = FLAT
        self.done = False
        self.state = self.obvs()  # first observation of the episode
        return self.state

    def obvs(self):
        """Return the ``window_size`` feature rows ending at ``current_tick`` as float32."""
        return self.signal_features[(self.current_tick-self.window_size+1):self.current_tick+1].astype("float32")

In [3]:
# Configuration handed to TradingEnv for training runs.
env_config_training = {
    'train': True,
    "window_size": 144,
    "path": "D:/VS_project/ZALK/Rl_signal/MS/data/data.csv",
    "show_trade": False,
}


def environment():
    """Factory returning a new TradingEnv built from the training config."""
    return TradingEnv(env_config_training)


def main():
    """Train a DQN agent on TradingEnv, save it, then roll out one episode.

    Training runs on 8 copies of the environment inside a DummyVecEnv. After
    saving, a single episode is played on a fresh environment and the final
    ``info`` dict returned by ``env.step`` is printed.
    """
    # Train
    vec_env = make_vec_env(environment, n_envs=8, vec_env_cls=DummyVecEnv)
    model = DQN('MlpPolicy', vec_env, verbose=2, batch_size=1024)
    model.learn(total_timesteps=1000000, log_interval=1)
    model.save("agent/")  # place file path

    # Evaluate for a single episode.
    # NOTE(review): this reuses the training config, so the episode starts at
    # a random tick and the env does not dump its info file -- confirm
    # whether a dedicated evaluation config ('train': False) is wanted.
    env = TradingEnv(env_config_training)
    obs = env.reset()
    done = False
    while not done:
        # Add a batch axis for the policy. deterministic=True makes the
        # evaluation use the greedy action instead of an exploratory sample.
        action, _states = model.predict(obs[np.newaxis, ...], deterministic=True)
        # predict() returns one action per batch entry; pass the env a scalar.
        obs, reward, done, info = env.step(int(action[0]))
    print("info", info)

# Run training followed by the evaluation rollout when this cell/script is
# executed directly (as opposed to being imported as a module).
if __name__ == '__main__':
    main()

Using cpu device
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.42e+03 |
|    ep_rew_mean      | -0.328   |
|    exploration_rate | 0.989    |
| time/               |          |
|    episodes         | 1        |
|    fps              | 139      |
|    time_elapsed     | 81       |
|    total_timesteps  | 11360    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.85e+03 |
|    ep_rew_mean      | -0.419   |
|    exploration_rate | 0.983    |
| time/               |          |
|    episodes         | 2        |
|    fps              | 93       |
|    time_elapsed     | 196      |
|    total_timesteps  | 18264    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 2e+03    |
|    ep_rew_mean      | -0.453   |
|    exploration_rate | 0.983    |
| time/               |          |
|  