In [None]:
import random
import gym
from gym import spaces
import pandas as pd
import numpy as np

# --- Normalisation bounds: observation features are divided by these values ---
MAX_ACCOUNT_BALANCE = 2_147_483_647
MAX_NUM_UNITS = 100_000
MAX_UNIT_PRICE = 200_000

# --- Environment settings ---
INITIAL_ACCOUNT_BALANCE = 10_000  # starting cash of every episode
WINDOW_SIZE = 5                   # each observation row holds WINDOW_SIZE + 1 values
DATA_SIZE = 1                     # selects the feature columns of the observation (1, 3, 4 or 6)
MDD_REWARD = 0.5                  # weight of the max-drawdown term in the sell reward

# --- Algorithm selection ---
mode = 'ppo' # 'ppo', 'a2c'

# INTERVAL is passed to learn() as log_interval; EPISODES scales the training budget.
INTERVAL, EPISODES = (1000, 150) if mode == 'a2c' else (1, 200)

# --- Data ---
dsource = 'trunc_data.pkl'
model_name = 'universal_model'

# NOTE: read_pickle can execute arbitrary code; only load pickles from a trusted source.
df = pd.read_pickle(dsource).reset_index(level=0)

# Steps per episode, and the total number of environment steps to train for.
MAX_STEPS = len(df) - (WINDOW_SIZE + 1)
TIME_STEPS_TRAIN = EPISODES * MAX_STEPS

In [None]:
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import TensorBoardOutputFormat
from stable_baselines3.common.vec_env import SubprocVecEnv

#100-period sortino ratio
def roll_sortino(df):
    """Compute a Sortino-style ratio over a series of cumulative values.

    Args:
        df: pandas Series of successive values (the callers in this file pass
            the tail of a cumulative-return column). Consecutive differences
            are treated as the per-period returns.

    Returns:
        ``(mean(returns) - risk_free) / std(negative returns) * sqrt(100)``,
        or 0 when there are no negative returns or their standard deviation
        is 0 (for example a single losing period).
    """
    risk_free = 0  # 0 percent

    # Period return = value[i] - value[i-1]. The previous expression
    # `df - df.shift(-1)` produced value[i] - value[i+1], i.e. the *negated*
    # return, which flipped the sign of the ratio and measured the deviation
    # of the gains instead of the losses. The first diff is NaN, so drop it.
    returns = df.diff().to_numpy()[1:]

    downside = returns[returns < 0]
    downside_std = downside.std() if len(downside) > 0 else 0
    if downside_std > 0:
        return (returns.mean() - risk_free) / downside_std * np.sqrt(100)
    return 0

def sortino(df, t, algorithm):
    """Sortino ratio of the last (up to) 100 recorded returns before row ``t``.

    Args:
        df: DataFrame containing a ``'<algorithm>_return'`` column.
        t: number of leading rows of ``df`` to take into account.
        algorithm: prefix of the return column to read.

    Returns:
        The ratio from ``roll_sortino`` as a float, or 0.0 when ``t`` is 0.
    """
    return_column = '{}_return'.format(algorithm)
    recent_returns = df.head(t)[return_column].tail(100)
    ratio = roll_sortino(recent_returns)
    if t == 0:
        ratio = 0
    return float(ratio)

def sigmoid(x):
    """Logistic function 1 / (1 + e^-x); accepts scalars or NumPy arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

class SummaryWriterCallback(BaseCallback):
    """Write custom metrics of the first training environment to TensorBoard.

    On every ``_log_freq``-th callback invocation the scalars ``rewards``,
    ``mdd``, ``sortino`` and ``return_`` are read from the first
    sub-environment of the vectorized training env and written through the
    logger's TensorBoard writer.
    """

    def _on_training_start(self):
        """Locate the TensorBoard output format of the model's logger."""
        self._log_freq = 1  # log on every callback invocation

        output_formats = self.logger.output_formats
        # Keep a reference to the TensorBoard formatter so scalars can be
        # written directly. next() raises StopIteration when the logger has no
        # TensorBoard output configured; that case is not handled here.
        self.tb_formatter = next(
            formatter for formatter in output_formats
            if isinstance(formatter, TensorBoardOutputFormat)
        )

    def _on_step(self) -> bool:
        """Log the current metrics and tell the algorithm to keep training.

        Returns:
            Always True. The previous version fell off the end and returned
            None; the callback contract is to return a bool where False
            aborts training, so a falsy None could stop training early.
        """
        if self.n_calls % self._log_freq == 0:
            # First sub-environment of the vectorized training env.
            env = self.locals['env'].envs[0]
            return_ = env.net_worth / INITIAL_ACCOUNT_BALANCE

            writer = self.tb_formatter.writer
            writer.add_scalar("rewards", env.total_reward, self.n_calls)
            writer.add_scalar("mdd", env.mdd, self.n_calls)
            writer.add_scalar("sortino", env.sortino, self.n_calls)
            writer.add_scalar("return_", return_, self.n_calls)
        return True

In [None]:
class BTCTradingEnvCont(gym.Env):
    """Gym environment that trades one asset with a continuous two-part action.

    The action is ``[action_type, amount]``: ``action_type`` in ``[0, 1)``
    buys, ``[1, 2)`` sells and anything else holds; ``amount`` is the fraction
    of the cash balance (buy) or of the units held (sell) to trade. Orders that
    cannot be filled (tiny amount, too little cash, nothing to sell) are
    treated as a hold.

    Per-step bookkeeping (return, max drawdown, Sortino ratio, holding, action
    and reward) is written back into the DataFrame passed to the constructor,
    so the caller's frame is mutated in place.
    """
    metadata = {'render.modes': ['human']}

    # Columns (each divided by MAX_UNIT_PRICE) that make up the market part of
    # the observation for the multi-column DATA_SIZE settings. DATA_SIZE == 1
    # uses the unscaled 'return' column instead.
    _PRICE_COLUMNS = {
        3: ('close', 'ewm', 'macd_histo'),
        4: ('close', 'open', 'high', 'low'),
        6: ('close', 'open', 'high', 'low', 'ewm', 'macd_histo'),
    }

    def __init__(self, df, algorithm):
        """Create the environment.

        Args:
            df: market data; must contain a 'close' column plus the feature
                columns required by DATA_SIZE. Bookkeeping columns are added
                to / reset on this very object.
            algorithm: name used as prefix of the '<algorithm>_return' column.
        """
        super().__init__()

        self.df = df
        self.df['{}_return'.format(algorithm)] = 0.
        self.df['mdd'] = 0.
        self.df['sortino'] = 0.

        self.algorithm = algorithm

        # action[0]: [0, 1) buy, [1, 2) sell, [2, 3] hold; action[1]: fraction from 0 to 1
        self.action_space = spaces.Box(low=np.array([0, 0]), high=np.array([3, 1]), dtype=np.float16)

        # DATA_SIZE rows of market features plus one row of account features.
        self.observation_space = spaces.Box(low=0, high=1, shape=(DATA_SIZE + 1, WINDOW_SIZE + 1), dtype=np.float16)

        self.mdd = 0
        self.mdd_last_sell = 0
        self.sortino = 0
        self.current_reward = 0
        self.total_reward = 0

    def _window(self, column):
        """Return WINDOW_SIZE + 1 values of ``column`` starting at the current step.

        Near the end of the data fewer rows are available; the missing tail is
        filled by repeating the last available value.
        """
        values = self.df.loc[self.current_step: self.current_step + WINDOW_SIZE, column].to_numpy()
        return np.pad(values, (0, WINDOW_SIZE + 1 - len(values)), 'edge')

    def _next_observation(self):
        """Build the observation: market feature rows followed by one account row.

        Raises:
            ValueError: if DATA_SIZE is not one of 1, 3, 4, 6.
        """
        # NOTE(review): the slice runs from current_step *forward* to
        # current_step + WINDOW_SIZE, so the agent observes rows it has not
        # traded through yet - confirm this look-ahead is intended.
        if DATA_SIZE == 1:
            # Pin the first row's return to 0 (presumably it is undefined in
            # the source data - TODO confirm).
            self.df.loc[0, 'return'] = 0
            frame = np.array([self._window('return')])
        elif DATA_SIZE in self._PRICE_COLUMNS:
            frame = np.array([
                self._window(column) / MAX_UNIT_PRICE
                for column in self._PRICE_COLUMNS[DATA_SIZE]
            ])
        else:
            raise ValueError(f'Unsupported DATA_SIZE {DATA_SIZE}; expected one of 1, 3, 4, 6')

        # Append the account features, each scaled by its configured maximum.
        # NOTE: this row always has 6 entries, so it only lines up with the
        # market rows when WINDOW_SIZE + 1 == 6.
        obs = np.append(frame, [[
            self.balance / MAX_ACCOUNT_BALANCE,
            self.max_net_worth / MAX_ACCOUNT_BALANCE,
            self.units_held / MAX_NUM_UNITS,
            self.cost_basis / MAX_UNIT_PRICE,
            self.total_units_sold / MAX_NUM_UNITS,
            self.total_sales_value / (MAX_NUM_UNITS * MAX_UNIT_PRICE),
        ]], axis=0)

        return obs

    def _update_portfolio_metrics(self, current_price):
        """Recompute net worth, drawdown, Sortino and return and record them in ``df``."""
        self.net_worth = self.balance + self.units_held * current_price

        if self.net_worth > self.max_net_worth:
            self.max_net_worth = self.net_worth
            self.mdd_base = self.max_net_worth  # new all-time high resets the drawdown base

        if self.net_worth < self.min_net_worth:
            self.min_net_worth = self.net_worth

        if self.net_worth < self.mdd_base:
            self.mdd_base = self.net_worth  # lowest net worth since the last all-time high

        if self.units_held == 0:
            self.cost_basis = 0

        # Maximum drawdown only ever grows within an episode.
        self.mdd = max(1 - self.mdd_base / self.max_net_worth, self.mdd)
        self.df.loc[self.current_step, 'mdd'] = self.mdd
        self.sortino = sortino(self.df, self.current_step, self.algorithm) if self.current_step > 100 else 0
        self.df.loc[self.current_step, 'sortino'] = self.sortino

        self.df.loc[self.current_step, '{}_return'.format(self.algorithm)] = (self.net_worth - INITIAL_ACCOUNT_BALANCE) / INITIAL_ACCOUNT_BALANCE
        self.df.loc[self.current_step, 'holding'] = self.units_held

    def _take_action(self, action):
        """Execute ``action`` at the current step's close price and set the reward.

        Args:
            action: sequence ``[action_type, amount]`` (see the class docstring).
        """
        current_price = self.df.loc[self.current_step, "close"]

        action_type = action[0]
        amount = action[1]  # 0 to 1

        if action_type < 1 and amount > 0.001 and self.balance > 100:
            # Buy: spend `amount` of the cash balance.
            total_possible = self.balance / current_price
            units_bought = total_possible * amount
            prev_cost = self.cost_basis * self.units_held
            additional_cost = units_bought * current_price

            self.balance -= additional_cost
            self.cost_basis = (prev_cost + additional_cost) / (self.units_held + units_bought)
            self.units_held += units_bought
            self.df.loc[self.current_step, 'action'] = f'buy {units_bought:5f} @ {current_price}'

            self._update_portfolio_metrics(current_price)

            # Buying itself earns no reward.
            self.current_reward = 0
            self.df.loc[self.current_step, 'reward'] = 0

        elif 1 <= action_type < 2 and amount > 0.001 and self.units_held > 0.0001:
            # Sell: dispose of `amount` of the units held. The lower bound on
            # action_type keeps an unfillable buy (action_type < 1) from being
            # executed as a sell, which the previous `action_type < 2` allowed.
            units_sold = self.units_held * amount
            proceeds = units_sold * current_price
            self.balance += proceeds
            self.units_held -= units_sold
            self.total_units_sold += units_sold
            self.total_sales_value += proceeds

            # Remember the cost basis before the metrics update: a full
            # liquidation zeroes self.cost_basis, which previously made the
            # profit term 0 whenever everything was sold.
            cost_basis_at_sale = self.cost_basis
            self._update_portfolio_metrics(current_price)

            # Profit term: units sold times their relative gain over cost basis.
            if cost_basis_at_sale > 1:
                reward_1 = units_sold * (current_price - cost_basis_at_sale) / cost_basis_at_sale
            else:
                reward_1 = 0
            # Drawdown term: penalises drawdown growth since the previous sell.
            reward_2 = MDD_REWARD * (self.mdd_last_sell - self.mdd)
            reward_3 = 0  # Sortino term, currently disabled
            self.current_reward = reward_1 + reward_2 + reward_3

            self.mdd_last_sell = self.mdd

            self.df.loc[self.current_step, 'reward'] = f'{self.current_reward:2f} || {reward_1:3f} {reward_2:3f}'
            self.df.loc[self.current_step, 'action'] = f'sell {units_sold:5f} @ {current_price}'

        else:
            # Hold (also the fallback for orders that cannot be filled).
            self._update_portfolio_metrics(current_price)

            self.df.loc[self.current_step, 'action'] = f'hold {self.units_held:5f} @ {self.cost_basis}'

            # Small per-unit holding cost, increased while the portfolio is below its start value.
            self.current_reward = self.units_held * (-0.0005 + min(0, 0.0001 * (self.net_worth - INITIAL_ACCOUNT_BALANCE) / INITIAL_ACCOUNT_BALANCE))
            self.df.loc[self.current_step, 'reward'] = self.current_reward

        self.total_reward += self.current_reward

    def step(self, action):
        """Advance one time step.

        On the last step of the episode the agent's action is replaced by a
        sell-everything order.

        Returns:
            Tuple ``(observation, reward, done, info)`` with an empty info dict.
        """
        end_ = len(self.df) - WINDOW_SIZE - 1
        if self.current_step == end_ - 1:
            self._take_action([1, 1])  # sell all
        else:
            self._take_action(action)

        self.current_step += 1
        obs = self._next_observation()
        done = self.current_step == end_
        if done:
            print('RW', self.total_reward, "MD", self.mdd, 'RET', (self.net_worth - INITIAL_ACCOUNT_BALANCE) / INITIAL_ACCOUNT_BALANCE)
        return obs, self.current_reward, done, {}

    def reset(self):
        """Reset account state and restart at the first row; returns the first observation."""
        self.balance = INITIAL_ACCOUNT_BALANCE
        self.net_worth = INITIAL_ACCOUNT_BALANCE
        self.max_net_worth = INITIAL_ACCOUNT_BALANCE
        self.mdd_base = INITIAL_ACCOUNT_BALANCE
        self.mdd = 0
        self.mdd_last_sell = 0
        self.min_net_worth = INITIAL_ACCOUNT_BALANCE
        self.units_held = 0
        self.cost_basis = 0
        self.total_units_sold = 0
        self.total_sales_value = 0
        self.total_reward = 0
        # Also clear the per-step values so nothing leaks across episodes.
        self.sortino = 0
        self.current_reward = 0

        # Episodes always start at the first row.
        self.current_step = 0

        return self._next_observation()

    def render(self, mode='human', close=False):
        """Print a text summary of the current account state."""
        profit = self.net_worth - INITIAL_ACCOUNT_BALANCE

        print(f'Step: {self.current_step}')
        print(f'Balance: {self.balance}')
        print(f'Units held: {self.units_held} (Total sold: {self.total_units_sold})')
        print(f'Avg cost for held units: {self.cost_basis} (Total sales value: {self.total_sales_value})')
        print(f'Net worth: {self.net_worth} (Max net worth: {self.max_net_worth}, min: {self.min_net_worth})')
        print(f'MDD: {self.mdd} Sortino: {self.sortino})')
        print(f'Profit: {profit}')
        print()


In [None]:
import datetime as dt

from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import PPO, A2C
import pandas as pd
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback, EvalCallback
import pickle, random

# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: BTCTradingEnvCont(df, mode)])

# Gaussian action noise (left over from a TD3 setup); not passed to PPO / A2C below.
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# The log/save paths were missing the f prefix, so they contained a literal "{mode}".
eval_callback = EvalCallback(env, best_model_save_path=f"./models/B3/{mode}_best_model", log_path=f"./models/B3/{mode}_res", eval_freq=len(df))
checkpoint_callback = CheckpointCallback(save_freq=10 * len(df), save_path=f"./models/B3/{mode}-log")
callback = CallbackList([checkpoint_callback, SummaryWriterCallback()])

if mode == 'ppo':
    model = PPO("MlpPolicy", env, verbose=1, tensorboard_log="./models/B3/xlog")
elif mode == 'a2c':
    model = A2C("MlpPolicy", env, verbose=1, tensorboard_log="./models/B3/xlog")
else:
    # Fail here instead of with a NameError on `model` further down.
    raise ValueError(f"Unsupported mode {mode!r}; expected 'ppo' or 'a2c'")

# Training currently runs without callbacks; pass `callback=callback` to enable
# checkpointing and the custom TensorBoard scalars.
model.learn(total_timesteps=TIME_STEPS_TRAIN, log_interval=INTERVAL)

# Save the model and the annotated training frame. The random suffix only
# reduces the chance of clobbering an earlier run, it does not prevent it.
model.save(f'model-cont-{mode}-{DATA_SIZE}-{WINDOW_SIZE}-{MDD_REWARD}-{random.randint(0,99)}')
with open(f"models/B3/data-cont{mode}-{DATA_SIZE}-{WINDOW_SIZE}-{random.randint(0,99)}.pkl", 'ab') as data_file:
    pickle.dump({'name': f'{mode}', 'data': df}, data_file)

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

def plot_fig(df_, title, custom_range=(0, 1)):
    """Plot benchmark vs. portfolio, Sortino ratio and max drawdown in three rows.

    Args:
        df_: DataFrame with 'close', 'sortino', 'mdd' and '<mode>_return'
            columns; the last WINDOW_SIZE + 1 rows are not plotted.
        title: figure title.
        custom_range: currently unused; kept for backward compatibility.

    Returns:
        The plotly figure. Traces are added in the order benchmark, Sortino,
        portfolio, MDD - code that indexes ``fig['data']`` relies on it.
    """
    df = df_.head(len(df_) - WINDOW_SIZE - 1)

    fig = make_subplots(rows=3, cols=1,
                        specs=[[{"secondary_y": False}],
                               [{"secondary_y": False}],
                               [{"secondary_y": False}]])
    fig.update_layout(
        autosize=False,
        width=1300,
        height=800,
        title_text=title,
    )

    # .iloc[0] instead of float(<one-element Series>), which pandas deprecates.
    initial_btc_price = float(df['close'].iloc[0])

    # Row 1: close price normalised to its first value (benchmark).
    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=df['close'] / initial_btc_price,
            name='BTC/benchmark',
            legendgroup='1',
            marker=dict(size=42, color='black'),
        ), row=1, col=1
    )

    # Row 2: Sortino ratio.
    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=df['sortino'],
            name='Sortino',
        ), row=2, col=1
    )

    # Row 1: portfolio value relative to the initial balance (return + 1).
    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=df[f'{mode}_return'] + 1,
            name='Portfolio',
            marker=dict(size=42, color='red'),
        ), row=1, col=1
    )

    # Row 3: max drawdown, plotted as a negative value.
    fig.add_trace(
        go.Scatter(
            x=df.index,
            y=-df['mdd'],
            name='MDD',
            marker=dict(size=42, color='blue'),
        ), row=3, col=1
    )
    return fig

In [None]:
from matplotlib import pyplot as plt 

# Pickled datasets the trained model is evaluated on
data_addrs = [
    'hourly_bull.pkl',
    'hourly_bear.pkl',
    'minutely_crab.pkl',
    'minutely_bull.pkl',
]
# One result dict per evaluated dataset, filled by evaluate_model()
results_ = list()

def evaluate_model(addr):
    """Run the trained ``model`` over the dataset at ``addr`` and store the outcome.

    The dataset is wrapped in a fresh single-environment DummyVecEnv, stepped
    through once with the model's predicted actions, and the annotated frame
    plus its figure are appended to the module-level ``results_`` list.

    Args:
        addr: path of the pickled dataset to evaluate on.
    """
    dat_te = pd.read_pickle(addr).reset_index(level=0)

    env_te = DummyVecEnv([lambda: BTCTradingEnvCont(dat_te, mode)])
    obs_te = env_te.reset()

    n_steps = len(dat_te) - WINDOW_SIZE - 1
    for step_idx in range(n_steps):
        action, _states = model.predict(obs_te)
        obs_te, rewards, done, info = env_te.step(action)

        # Print the account state every 500 steps and on the second-to-last step.
        at_checkpoint = (step_idx + 1) % 500 == 0
        at_penultimate = step_idx == n_steps - 2
        if at_checkpoint or at_penultimate:
            env_te.render()

    return_column = f'{mode}_return'
    fig = plot_fig(
        dat_te,
        f'./models/B3/{mode}-{addr}-test',
        custom_range=[min(dat_te[return_column]), max(dat_te[return_column])],
    )

    results_.append({
        'name': f'test-{addr}-{mode}-{DATA_SIZE}-{WINDOW_SIZE}',
        'data': dat_te,
        'figure': fig,
    })

In [None]:
# Evaluate the trained model on every held-out dataset.
for addr in data_addrs:
    evaluate_model(addr)

# Persist the results; the context manager closes the file handle, which the
# previous bare open() call left open.
with open(f'./models/B3/{mode}-cont-results.pkl', 'ab') as results_file:
    pickle.dump(results_, results_file)

In [None]:
# Display the figure stored for the fourth evaluated dataset (index 3 of results_).
results_[3]['figure']

In [None]:
def get_delta(dataset, indexe):
    """Mean ratio of portfolio to benchmark for one result, minus 1.

    Args:
        dataset: sequence of result dicts, each holding a 'figure' entry.
        indexe: position of the result to inspect.

    Returns:
        ``mean(portfolio / benchmark) - 1`` where the benchmark is the y data
        of trace 0 and the portfolio the y data of trace 2 of the figure.
    """
    traces = dataset[indexe]['figure']['data']
    ratio = traces[2]['y'] / traces[0]['y']
    return ratio.mean() - 1

def get_data_note(dataset, indexe):
    """Final return, max drawdown and Sortino value of one result.

    Args:
        dataset: sequence of result dicts, each holding a 'figure' entry.
        indexe: position of the result to inspect.

    Returns:
        Tuple ``(portfolio - 1, -mdd, sortino)`` built from the last y value
        of figure traces 2, 3 and 1 respectively.
    """
    traces = dataset[indexe]['figure']['data']

    def last_y(trace_idx):
        return traces[trace_idx]['y'][-1]

    return last_y(2) - 1, -last_y(3), last_y(1)

# Print the benchmark-relative delta and the final metrics of every result
# (iterates over what was actually evaluated instead of a hard-coded 4).
for i in range(len(results_)):
    print(get_delta(results_, i), get_data_note(results_, i))