In [29]:
import gym
from gym import spaces
from gym.utils import seeding
from empyrical import max_drawdown, alpha_beta, sharpe_ratio, annual_return

In [30]:
import pandas as pd
import numpy as np

In [31]:
from arctic import CHUNK_STORE, Arctic

In [32]:
import ray
# Ray has to be running before any RLlib trainer is constructed.
ray.init(
    num_cpus=10,
    ignore_reinit_error=True,
    log_to_driver=False,
)

2020-10-19 11:25:12,367	ERROR worker.py:643 -- Calling ray.init() again after it has already been called.


In [54]:

def load_data(price_source='Alpaca_Equity_daily',tickers=['SPY','QQQ'],start='2008-01-02',end='2010-01-02'):
    '''Load per-ticker price data and pack it into a 3-D array for the gym environment.

    Parameters
    ----------
    price_source : str
        Either the name of an Arctic library on ``localhost`` (one of the keys of
        ``arctic_fields`` below) or ``'csvdata'`` to read ``csvdata/<ticker>.csv``.
    tickers : list of str
        Symbols to load; one slice of the output tensor is produced per ticker.
        The default list is never mutated.
    start, end : str
        Inclusive bounds used to slice the ``date`` index of every ticker.

    Returns
    -------
    dict
        ``'dates'``  - index of the merged (outer-joined, date-sorted) frame,
        ``'fields'`` - column labels of the first ticker's frame,
        ``'data'``   - array of shape ``(n_dates, n_fields, n_tickers)``; dates
        missing for a ticker are filled with 0.

    Raises
    ------
    ValueError
        If ``tickers`` is empty or ``price_source`` is not recognised.
    '''
    # Columns kept for each Arctic library. The library names are looked up
    # verbatim in Arctic, so they must match the stored names exactly.
    arctic_fields = {
        'Alpaca_Equity_daily': ['Open','Volume'],
        'Alpaca_Equity_minute': ['Open','Volume'],
        'Quandl_Futures_daily': ['Open','Volume'],
        'Alphavnatage_Equity_daily': ['Close','Volume','Open','High','Low'],
        'Alphavnatage_Equity_minute': ['Close','Volume','Open','High','Low'],
    }

    if len(tickers) == 0:
        raise ValueError('tickers must contain at least one symbol')

    ## Load data
    ## Each dataframe is indexed by date and holds a collection of fields
    if price_source in arctic_fields:
        lib = Arctic('localhost')[price_source]
        fields = arctic_fields[price_source]
        price_df = [lib.read(t).set_index('date').loc[start:end][fields] for t in tickers]
    elif price_source == 'csvdata':
        price_df = [pd.read_csv('csvdata/{}.csv'.format(t)).set_index('date').loc[start:end]
                    for t in tickers]
    else:
        raise ValueError('Unknown price_source: {!r}'.format(price_source))

    ## Merge data
    ## Field labels are taken from the first ticker; every other ticker is assumed
    ## to expose the same columns in the same order.
    ref_df_columns = price_df[0].columns
    n_fields = len(ref_df_columns)
    # Prefix each frame's columns with its position so that repeated merges never
    # create clashing labels (duplicate suffixed columns are rejected by recent
    # pandas). Only the column ORDER matters below, not the labels.
    merged_df = price_df[0].add_prefix('0_')
    for i in range(1, len(price_df)):
        merged_df = merged_df.merge(price_df[i].add_prefix('{}_'.format(i)), how='outer', on='date')
    merged_df = merged_df.sort_values(by='date').fillna(0)

    ## Prepare price tensor for observation space
    ## Ticker i occupies columns [n_fields*i, n_fields*(i+1)) of the merged frame.
    merged_values = merged_df.values
    price_tensor = np.zeros(shape=(merged_df.shape[0], n_fields, len(price_df)))
    for count in range(len(price_df)):
        price_tensor[:,:,count] = merged_values[:, n_fields*count:n_fields*(count+1)]

    return {'dates':merged_df.index, 'fields':ref_df_columns, 'data':price_tensor }
        

In [55]:
class Equitydaily(gym.Env):
    """Gym environment for daily portfolio rebalancing over a set of tickers plus cash.

    An action is a weight vector with one entry per ticker followed by a final
    entry for cash. An observation is a ``(lookback, n_features)`` window that
    concatenates, for each of the last ``lookback`` days: the flattened price
    fields of every ticker, the portfolio return and reward metric recorded for
    that day, and the weights held that day.

    ``env_config`` keys read here: ``'tickers'``, ``'lookback'``,
    ``'pricing_source'``, ``'start'``, ``'end'``.
    """

    # Number of live days over which the drawdown term of the metric is phased in.
    BURNIN_DAYS = 250
    # Multiplicative fee factor applied per unit of turnover
    # (0.05% t-cost for institutional portfolios).
    FEE_FACTOR = 0.9995

    def __init__(self,env_config):
        """Load price data, define the action/observation spaces and reset the state.

        Raises:
            ValueError: if the loaded history is too short to build one lookback
                window and take at least one step.
        """
        self.tickers = env_config['tickers']
        self.lookback = env_config['lookback']
        # Load price data
        price_data = load_data(env_config['pricing_source'],env_config['tickers'],env_config['start'],env_config['end'])
        self.dates = price_data['dates']
        self.fields = price_data['fields']
        self.pricedata = price_data['data']
        # step() reads row index+1 starting from index == lookback, so at least
        # lookback + 2 rows are needed; fail early with a clear message otherwise.
        if self.pricedata.shape[0] < self.lookback + 2:
            raise ValueError(
                'Need at least lookback + 2 = {} rows of price data, got {}'.format(
                    self.lookback + 2, self.pricedata.shape[0]))
        # Set up historical actions and rewards
        self.n_assets = len(self.tickers) + 1  # tickers plus cash
        self.n_metrics = 2  # portfolio return and reward metric
        self.n_assets_fields = len(self.fields)
        # Feature columns: price fields per ticker + one weight per asset + metrics
        self.n_features = self.n_assets_fields * len(self.tickers) + self.n_assets + self.n_metrics

        # Set up action and observation space
        # The last asset is cash
        self.action_space = spaces.Box(low=-1, high=1, shape=(len(self.tickers)+1,), dtype=np.float32)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                            shape=(self.lookback,self.n_features), dtype=np.float32)

        self.reset()

    def _observation_window(self):
        """Build the observation from the ``lookback`` rows that end just before ``self.index``."""
        window = slice(self.index - self.lookback, self.index)
        price_lookback = self.pricedata[window,:,:].reshape(self.lookback,-1)
        metrics = np.vstack((self.log_return_series[window],
                             self.metric_series[window])).transpose()
        return np.concatenate((price_lookback, metrics, self.position_series[window]), axis=1)

    def step(self, action):
        """Rebalance to ``action``, advance one day and return ``(observation, reward, done, info)``.

        The action is scaled so that its absolute weights sum to one. The reward
        is the change in ``self.metric`` (annual return plus half the max-drawdown
        term, the latter phased in linearly over ``BURNIN_DAYS``) since the
        previous step.
        """
        ## Normalise action space
        action = np.asarray(action)
        gross_exposure = np.sum(np.abs(action))
        if gross_exposure > 0:
            normalised_action = action / gross_exposure
        else:
            # An all-zero action cannot be normalised; dividing would put NaN into
            # the position, return and reward series. Hold no position instead.
            normalised_action = np.zeros(action.shape)

        done = False
        # Rebalance portfolio at open, use log return of open price in the following day
        # NOTE(review): this reads field 0 of the price tensor as-is. load_data puts
        # the raw 'Open' column there for the Alpaca sources, so this is only a log
        # return if the stored data is already in return form - confirm.
        next_day_log_return = self.pricedata[self.index+1,0,:]
        # transaction cost
        transaction_cost = self.transaction_cost(normalised_action,self.position_series[-1])

        # Rebalancing
        self.position_series = np.append(self.position_series, [normalised_action], axis=0)
        # The cash weight (last entry) earns nothing, hence [:-1].
        today_portfolio_return = np.sum(normalised_action[:-1] * next_day_log_return) + np.sum(transaction_cost)
        self.log_return_series = np.append(self.log_return_series, [today_portfolio_return], axis=0)

        # Calculate reward
        # Need to cast log_return in pd series to use the functions in empyrical
        live_days = self.index - self.lookback
        whole_series = pd.Series(self.log_return_series)
        drawdown_term = 0.5* max_drawdown(whole_series)
        if live_days <= self.BURNIN_DAYS:
            drawdown_term = drawdown_term *live_days / self.BURNIN_DAYS
        self.metric = annual_return(whole_series) + drawdown_term
        reward = self.metric - self.metric_series[-1]
        self.metric_series = np.append(self.metric_series, [self.metric], axis=0)

        # Check if the end of backtest (the next step would read past the last row)
        if self.index >= self.pricedata.shape[0]-2:
            done = True

        # Prepare observation for next day
        self.index += 1
        self.observation = self._observation_window()

        return self.observation, reward, done, {}

    def reset(self):
        """Clear the return, metric and position histories and return the first observation."""
        self.log_return_series = np.zeros(shape=self.lookback)
        self.metric_series = np.zeros(shape=self.lookback)
        self.position_series = np.zeros(shape=(self.lookback,self.n_assets))

        self.metric = 0
        self.index = self.lookback
        # Observation join the price, metric and position
        self.observation = self._observation_window()

        return self.observation

    def transaction_cost(self,new_action,old_action,):
        """Return the per-asset cost of moving from ``old_action`` to ``new_action``.

        The cost is ``turnover * log(FEE_FACTOR)``; since ``FEE_FACTOR`` is below
        one the values are non-positive and are added to the portfolio return.
        """
        turnover = np.abs(new_action - old_action)
        tcost = turnover * np.log(self.FEE_FACTOR)
        return tcost

In [37]:
import time
# Time one full data load. perf_counter() is a monotonic, high-resolution clock
# intended for measuring elapsed time; time.time() can jump if the system clock
# is adjusted.
startt = time.perf_counter()
load_data(tickers=['SPY','QQQ','SHY','GLD','TLT','LQD'],end='2018-12-31')
time.perf_counter() - startt

0.03921914100646973


76.92202639579773

PPO policy

In [56]:

from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG
from ray.tune.logger import pretty_print

In [57]:
import copy

# Build the PPO config in a single cell so that it works on a fresh kernel:
# `config` must exist before its 'model' entries are edited.
# deepcopy (rather than DEFAULT_CONFIG.copy()) keeps the nested 'model' dict
# separate from the library default, which a shallow copy would mutate.
config = copy.deepcopy(DEFAULT_CONFIG)
config['num_workers'] = 1
config["num_envs_per_worker"] = 1
config["rollout_fragment_length"] = 20
config["train_batch_size"] = 5000
config["batch_mode"] = "complete_episodes"
config['num_sgd_iter'] = 20
config['sgd_minibatch_size'] = 200
config['model']['fcnet_hiddens'] = [100, 100]
config['model']['dim'] = 50
config['model']['conv_filters'] = [[16, [5, 1], 1], [32, [5, 1], 5], [16, [10, 1], 1]]
config['num_cpus_per_worker'] = 2  # This avoids running out of resources in the notebook environment when this cell is re-executed
config['env_config'] = {'pricing_source':'Alpaca_Equity_daily', 'tickers':['SPY','QQQ','SHY','GLD','TLT','LQD'], 'lookback':50, 'start':'2008-01-02', 'end':'2018-12-31'}

In [59]:
config

{'num_workers': 1,
 'num_envs_per_worker': 1,
 'rollout_fragment_length': 20,
 'batch_mode': 'complete_episodes',
 'num_gpus': 0,
 'train_batch_size': 5000,
 'model': {'fcnet_hiddens': [100, 100],
  'fcnet_activation': 'tanh',
  'conv_filters': [[16, [5, 1], 1], [32, [5, 1], 5], [16, [10, 1], 1]],
  'conv_activation': 'relu',
  'free_log_std': False,
  'no_final_linear': False,
  'vf_share_layers': True,
  'use_lstm': False,
  'max_seq_len': 20,
  'lstm_cell_size': 256,
  'lstm_use_prev_action_reward': False,
  '_time_major': False,
  'framestack': True,
  'dim': 50,
  'grayscale': False,
  'zero_mean': True,
  'custom_model': None,
  'custom_model_config': {},
  'custom_action_dist': None,
  'custom_preprocessor': None},
 'optimizer': {},
 'gamma': 0.99,
 'horizon': None,
 'soft_horizon': False,
 'no_done_at_end': False,
 'env_config': {'pricing_source': 'Alpaca_Equity_daily',
  'tickers': ['SPY', 'QQQ', 'SHY', 'GLD', 'TLT', 'LQD'],
  'lookback': 50,
  'start': '2008-01-02',
  'end': 

Check to see if agents can be trained

In [60]:
# Build the PPO trainer on the Equitydaily environment and start the
# best-reward tracker at minus infinity so the first result always counts.
agent = PPOTrainer(config=config, env=Equitydaily)
best_reward = float('-inf')

2020-10-19 12:58:52,033	INFO trainable.py:252 -- Trainable.setup took 79.222 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [61]:
# Run two training iterations; checkpoint whenever the mean episode reward
# beats the best seen so far by more than 0.01.
for i in range(2):
    result = agent.train()
    mean_reward = result['episode_reward_mean']
    if mean_reward > best_reward + 0.01:
        path = agent.save('sampleagent')
        best_reward = mean_reward
        print(path, best_reward, sep='\n')

sampleagent/checkpoint_1/checkpoint-1
-0.6304444350903895


In [62]:
result

{'episode_reward_max': -0.561724132078505,
 'episode_reward_min': -0.652029927175597,
 'episode_reward_mean': -0.6075568131618514,
 'episode_len_mean': 2717.0,
 'episodes_this_iter': 2,
 'policy_reward_min': {},
 'policy_reward_max': {},
 'policy_reward_mean': {},
 'custom_metrics': {},
 'hist_stats': {'episode_reward': [-0.6076142503881217,
   -0.561724132078505,
   -0.652029927175597,
   -0.6088589430051818],
  'episode_lengths': [2717, 2717, 2717, 2717]},
 'sampler_perf': {'mean_env_wait_ms': 1.1325459108908027,
  'mean_raw_obs_processing_ms': 0.10075595122260311,
  'mean_inference_ms': 0.9959995274910309,
  'mean_action_processing_ms': 0.10824959068204312},
 'off_policy_estimator': {},
 'num_healthy_workers': 1,
 'timesteps_total': 10868,
 'timers': {'sample_time_ms': 13268.656,
  'sample_throughput': 409.537,
  'load_time_ms': 51.685,
  'load_throughput': 105137.179,
  'learn_time_ms': 1744.321,
  'learn_throughput': 3115.253,
  'update_time_ms': 2.217},
 'info': {'learner': {'def

In [49]:
# Reload trainer state from the first checkpoint written under 'sampleagent'.
agent.restore('sampleagent/checkpoint_1/checkpoint-1')

2020-10-19 11:42:27,943	INFO trainable.py:481 -- Restored on 155.198.192.44 from checkpoint: sampleagent/checkpoint_1/checkpoint-1
2020-10-19 11:42:27,944	INFO trainable.py:489 -- Current state after restoring: {'_iteration': 1, '_timesteps_total': None, '_time_total': 14.936065912246704, '_episodes_total': 2}


In [50]:
# Five more training iterations after the restore; a checkpoint is written only
# when the mean episode reward improves on the best so far by more than 0.01.
for i in range(5):
    result = agent.train()
    mean_reward = result['episode_reward_mean']
    if mean_reward > best_reward + 0.01:
        path = agent.save('sampleagent')
        best_reward = mean_reward
        print(path, best_reward, sep='\n')

In [51]:
result

{'episode_reward_max': -1.4606538207848538,
 'episode_reward_min': -1.4645063547769122,
 'episode_reward_mean': -1.4626622356814447,
 'episode_len_mean': 2717.0,
 'episodes_this_iter': 2,
 'policy_reward_min': {},
 'policy_reward_max': {},
 'policy_reward_mean': {},
 'custom_metrics': {},
 'hist_stats': {'episode_reward': [-1.4636325476804446,
   -1.4626063880059572,
   -1.4630606312488108,
   -1.4633451856324962,
   -1.46218844444843,
   -1.4617151830460218,
   -1.4626371702060625,
   -1.4626367047668551,
   -1.4607733785111567,
   -1.4627215988366584,
   -1.4629789065195686,
   -1.463788392166825,
   -1.4637532287247266,
   -1.4606538207848538,
   -1.464005891906464,
   -1.4645063547769122,
   -1.461118710916315,
   -1.4635861598443032,
   -1.463104434903402,
   -1.4620578211559732,
   -1.4616518108417296,
   -1.461373642925195,
   -1.4625759648611647,
   -1.4634212836443437],
  'episode_lengths': [2717,
   2717,
   2717,
   2717,
   2717,
   2717,
   2717,
   2717,
   2717,
   2717,

SAC 

In [7]:
from ray.rllib.agents.sac import SACTrainer, DEFAULT_CONFIG
from ray.tune.logger import pretty_print

Instructions for updating:
non-resource variables are not supported in the long term


In [14]:
import copy

# deepcopy (rather than DEFAULT_CONFIG.copy()) so that editing the nested
# 'Q_model' / 'policy_model' dicts below does not mutate the library default.
config = copy.deepcopy(DEFAULT_CONFIG)
config['num_workers'] = 1
config["num_envs_per_worker"] = 1

config["rollout_fragment_length"] = 10
config["train_batch_size"] = 50
config["timesteps_per_iteration"] = 10
config["buffer_size"] = 10000

config['Q_model']['fcnet_hiddens'] = [50, 50]
config['policy_model']['fcnet_hiddens'] = [50, 50]
config['num_cpus_per_worker'] = 5  # This avoids running out of resources in the notebook environment when this cell is re-executed
config['env_config'] = {'pricing_source':'Alpaca_Equity_daily', 'tickers':['SPY','QQQ','SHY','GLD','TLT','LQD'], 'lookback':50, 'start':'2008-01-02', 'end':'2018-12-31'}


In [15]:
# Build the SAC trainer on the Equitydaily environment and start the
# best-reward tracker at minus infinity so the first result always counts.
agent = SACTrainer(config=config, env=Equitydaily)
best_reward = float('-inf')

2020-10-18 11:25:29,626	INFO trainable.py:252 -- Trainable.setup took 79.252 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [23]:
# Twenty SAC training iterations. The mean episode reward is printed every
# iteration; a checkpoint is written only when it improves by more than 0.01.
for i in range(20):
    result = agent.train()
    mean_reward = result['episode_reward_mean']
    if mean_reward > best_reward + 0.01:
        path = agent.save('sampleagent')
        print(path)
        best_reward = mean_reward
    print(mean_reward)

-1.4620830970919299
-1.4620830970919299
-1.4620830970919299
-1.462393085861623
-1.462393085861623
-1.462393085861623
-1.462393085861623
-1.462393085861623
-1.462393085861623
-1.462393085861623
-1.462393085861623
-1.462393085861623
-1.462393085861623
-1.462393085861623
-1.462393085861623
-1.4626715012281806
-1.4626715012281806
-1.4626715012281806
-1.4626715012281806
-1.4626715012281806


In [22]:
result

{'episode_reward_max': -1.4603430197940985,
 'episode_reward_min': -1.4644678157497302,
 'episode_reward_mean': -1.4620830970919299,
 'episode_len_mean': 2717.0,
 'episodes_this_iter': 0,
 'policy_reward_min': {},
 'policy_reward_max': {},
 'policy_reward_mean': {},
 'custom_metrics': {},
 'hist_stats': {'episode_reward': [-1.4603430197940985,
   -1.4623457962025583,
   -1.4621110663130388,
   -1.4611477874002234,
   -1.4644678157497302],
  'episode_lengths': [2717, 2717, 2717, 2717, 2717]},
 'sampler_perf': {'mean_env_wait_ms': 1.429634864225194,
  'mean_raw_obs_processing_ms': 0.15369715130817593,
  'mean_inference_ms': 1.0387717205894833,
  'mean_action_processing_ms': 0.12727329722517183},
 'off_policy_estimator': {},
 'num_healthy_workers': 1,
 'timesteps_total': 15750,
 'timers': {'learn_time_ms': 4.064,
  'learn_throughput': 12301.743,
  'update_time_ms': 4.777},
 'info': {'learner': {'default_policy': {'mean_td_error': 0.28931177,
    'actor_loss': -26.91102,
    'critic_loss':

Run environment 

In [None]:
config

In [None]:
# NOTE(review): when the notebook is run top to bottom, `config` was last
# assigned in the SAC section (built from the SAC DEFAULT_CONFIG), not the PPO
# one - confirm which config this PPO trainer is meant to use.
agent = PPOTrainer(config, Equitydaily)

In [None]:
# Evaluation environment: six tickers, 50-day lookback, 2011-01-02 to 2020-12-31.
eval_env_config = {
    'pricing_source': 'Alpaca_Equity_daily',
    'tickers': ['SPY', 'QQQ', 'SHY', 'GLD', 'TLT', 'EEM'],
    'lookback': 50,
    'start': '2011-01-02',
    'end': '2020-12-31',
}
env = Equitydaily(eval_env_config)

In [None]:
# NOTE(review): no cell visible in this notebook writes this checkpoint path -
# confirm it exists and was trained on the same tickers/lookback as `env`.
agent.restore('checkpoint_1087/checkpoint-1087')

In [None]:
state = env.reset()
done = False
reward_list = []
cum_reward = 0
actions = list()

# Baseline policy: put the whole portfolio in cash (the last asset). The vector
# length is taken from the environment so it stays valid if the ticker list changes.
cash_only_action = np.zeros(env.n_assets, dtype=int)
cash_only_action[-1] = 1

while not done:
    # To evaluate the trained policy instead, use:
    #     action = agent.compute_action(state)
    action = cash_only_action.copy()
    state, reward, done, _ = env.step(action)
    cum_reward += reward
    actions.append(action)
    reward_list.append(reward)

pd.Series(env.log_return_series).cumsum().plot()

In [None]:
pd.Series(reward_list).plot()

In [None]:
pd.DataFrame(actions)

Run the environment with an RNN (recurrent) policy

In [None]:
# NOTE(review): this environment has 2 tickers, while the configs earlier in the
# notebook train on 6 - confirm `agent` is a recurrent policy trained on a
# matching observation/action shape.
env = Equitydaily({'pricing_source':'Alpaca_Equity_daily', 'tickers':['SPY','QQQ'], 'lookback':50, 'start':'2018-01-02', 'end':'2020-12-31'})

state = env.reset()
done = False
cum_reward = 0
actions = list()

# Recurrent policies carry hidden state between calls; start from the initial state.
rnn_state = agent.get_policy().get_initial_state()

while not done:
    action, rnn_state, _ = agent.compute_action(state,rnn_state)
    state, reward, done, _ = env.step(action)
    cum_reward += reward
    # Record the action taken (the original appended the list to itself).
    actions.append(action)

pd.Series(env.log_return_series).cumsum().plot()

In [None]:
# Maximum drawdown of the portfolio return series from the last rollout.
# NOTE(review): the series is named log_return; confirm empyrical is meant to
# receive log returns here rather than simple returns.
max_drawdown(pd.Series(env.log_return_series))

In [None]:
# Annualised return of the portfolio return series from the last rollout.
# NOTE(review): the series is named log_return; confirm empyrical is meant to
# receive log returns here rather than simple returns.
annual_return(pd.Series(env.log_return_series))