### Uses Paper Trade - test money
##### https://alpaca.markets/docs/api-documentation/api-v2/

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import time, datetime as dt
import pandas as pd, numpy as np

import alpha
import alpaca
import database as db
from populate import download_data
from rl_algos import TD3, ReplayBuffer, Actor, Critic
from portfolios import Portfolio
from history import *

DataStore = db.DataStore()

import gym
from gym import spaces

In [2]:
class StockTraderEnvironment(gym.Env):
    def __init__(self,
                portfolio,
                history, 
                short=False):
        
        self.portfolio = portfolio
        self.indicators = history.indicators
        self.prices = history.prices
        self.symbol = history.symbol
        self.short = short
        
        num_indicators = self.indicators.shape[1]
        assert num_indicators > 0, "supply 1 or more indicators"

        self.action_space = spaces.Discrete(3)

        # set space for alpha indicators at +- infinity...?
        low_array = np.full((num_indicators), -np.inf)
        high_array = np.full((num_indicators), np.inf)
        self.observation_space = spaces.Box(low=low_array, high=high_array, dtype=np.float64)
        
        self.nS, self.nA = self.observation_space.shape[0], self.action_space.n
        
        self.previous_price = 0  # didn't exist before first day, so set previous_price to 0
        first_day = self.indicators.index[0] # starting at first day indicators exist
        self.prices = self.prices.loc[first_day:] # rewriting prices to fit indicator list
        
        self.state = np.array(self.indicators.iloc[0]) # first day is inititial state
        self.days = iter(self.prices.index.values)
        
        # Iterate through days, checking action/reward, etc. in step...
        self.trades = pd.DataFrame(0, index = self.prices.index, columns = self.prices.columns)
        self.trades_dupl = self.trades.copy(deep = True)
        
        # position is how much long (positive), short (negative) or holding (zero)
        self.portfolio.positions.append(self.symbol)
        self.portfolio.position_amount[self.symbol] = 0  # how parse?

        
    def reset(self):
        self.previous_price = 0
        self.days = iter(self.prices.index.values)
        self.state = np.array(self.indicators.iloc[0])
        self.trades = pd.DataFrame(0, index = self.prices.index, columns = self.prices.columns)
        self.portfolio.position_amount[self.symbol] = 0
       
    
    def make_trade(self, action, current_price):
        position = self.portfolio.position_amount[self.symbol]
        if not self.short:
            assert position >= 0, "Error in logic - shorted position with shorting disabled"
        buysell_amount = 0
        if action == 0 and position == 0:
            buysell_amount = 100
            self.portfolio.buy(self.symbol, buysell_amount, current_price)
        elif action == 0 and position > 0:
            buysell_amount = 50
            self.portfolio.buy(self.symbol, buysell_amount, current_price)
        elif action == 1 and position < 0:
            if not self.short:
                pass # for clarity
            else:
                buysell_amount = -50
                self.portfolio.sell(self.symbol, -buysell_amount, current_price)
        elif action == 1 and position == 0:
            if not self.short:
                pass
            else:
                buysell_amount = -100
                self.portfolio.sell(self.symbol, -buysell_amount, current_price)
        elif action == 1 and position > 0:
            if not self.short:
                buysell_amount = -position # sell off all of position if not shorting
                self.portfolio.sell(self.symbol, -buysell_amount, current_price)
            else:
                buysell_amount = -position - 50 # sell off all of position if shorting and short additioanl 50
                self.portfolio.sell(self.symbol, -buysell_amount, current_price)
        elif action == 2:
            pass # no action, left for clairty
        return buysell_amount
    
    
    def step(self, action):
        #https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py
        err_msg = "%r (%s) invalid" % (action, type(action))
        assert self.action_space.contains(action), action
        
        #Calculate reward here... first day = 0
        # 0 is buy, 1 is sell, 2 is hold
        try:
            new_day = next(self.days)
            current_price = self.prices.loc[new_day, 'adjusted close']
            self.state = self.indicators.loc[new_day]

            if action == 0 and current_price > self.previous_price:
                reward = 2
            elif action == 0 and current_price < self.previous_price:
                reward = -2
            elif action == 1 and current_price < self.previous_price:
                reward = 2
            elif action == 1 and current_price > self.previous_price:
                reward = -2
            elif action == 2 and (current_price > self.previous_price or current_price < self.previous_price):
                reward = -2
            elif action == 2 and current_price == self.previous_price:
                reward = 2
            else:
                reward = 0
                
            buysell_amount = self.make_trade(action, current_price)
            self.trades[new_day] = buysell_amount
            
            self.previous_price = current_price
            done = False
        except StopIteration:
            if self.trades.equals(self.trades_dupl):
                done = True
            else:
                self.trades_dupl = self.trades.copy(deep = True)

        return self.state, reward, done, info
    
    
    def render(self):
        #ToDo - show progression via graph?
        pass

In [5]:
class Experiment:
    def __init__(self, DataStore, td3_kwargs, portfolio, sym, dates=None, indicators='all', shorting_allowed=False):
        print('Setting up experiment, loading history...')
        self.available_cash = portfolio.cash_remaining
        
        #date format for range: dates=[dt.datetime(2000,1,1), dt.datetime(2020,12,31)]
        self.history = History(DataStore, sym, dates, indicators=indicators)
        self.indicators = self.history.indicators
        self.prices = self.history.prices
        
        self.num_days = self.prices.shape[0]
        training = int(.75* self.num_days)
        validation = int(.3*training)
        test = self.num_days-training
        
        self.price_train = self.prices.iloc[:(training-validation)]
        self.indicator_train = self.indicators.iloc[:(training-validation)]
        
        self.price_validation = self.prices.iloc[(training-validation):training]
        self.indicator_validation = self.indicators.iloc[(training-validation):training] 
        
        self.price_test = self.prices.iloc[training:]
        self.indicator_test = self.indicators.iloc[training:]
        
        self.portfolio = portfolio
        self.symbol = sym
        
        self.env = StockTraderEnvironment(self.portfolio, 
                                          self.history, 
                                          short=shorting_allowed)
        
        self.batch_size = 64 # not parameterized...
        self.buffer = ReplayBuffer(self.env.nS, self.env.nA, max_buffer=int(1e6), batch_size=self.batch_size)
        
        self.max_action = 2  # 3 actions: [0,1,2], so 2 is max
        
        kwargs = {
            "state_dim": self.env.nS,
            "action_dim": self.env.nA,
            "max_action": self.max_action,
            "discount": td3_kwargs['discount'],
            "tau": td3_kwargs['tau'],
            "policy_noise": td3_kwargs['policy_noise'],            
            "noise_clip": td3_kwargs['noise_clip'],
            "policy_freq": td3_kwargs['policy_freq']
        }
        
        self.policy = TD3(**kwargs)
        self.expl_noise = td3_kwargs['expl_noise']

        
    def run(self, num_episodes, max_steps=int(1e6)):
        
        random_warmup = 25e3
        total_days_run = 0
        self.total_reward = 0
        
        for idx in range(num_episodes):
            
            state, done = self.env.reset(), False
            episode_reward = 0
            steps = 0
            
            for days_passed in range(self.num_days):
                
                steps += 1
                
                if total_days_run < random_warmup:
                    action = self.env.action_space.sample()
                else:
                    #ToDo - what is this doing? https://github.com/sfujim/TD3/blob/master/main.py
                    action = (self.policy.select_action(np.array(state)) + np.random.normal(0, self.max_action * self.expl_noise, size=self.env.nA)).clip(-self.max_action, self.max_action)
                    action = np.argmax(action)
            
                next_state, reward, done, _ = self.env.step(action)
                
                experience = [state, action, next_state, reward, done]
                self.buffer.update(experience)
                
                #ToDo: collect per episode, per iteration reward, total reward, etc.; portfolio value (or final portfolio value?)
                self.total_reward += reward
                episode_reward += reward
                
                if done:
                    print(f'Episode finished after {days_passed+1} timesteps')
                    break
                
                state = next_state
                if days_passed >= random_warmup:
                    self.policy.train(self.buffer, self.batch_size)
                
                if days_passed == self.num_days - 1:
                    if num_episodes - idx - 1 == 0:
                        print('Finished all episodes, did not converge')
                    else:
                        print(f'Finished all days without converging, starting from day 1 for {num_episodes - idx - 1} more episodes.')
                        
                total_days_run += 1
                
            print(f'episode reward {episode_reward}')
            
        print(f'total reward {self.total_reward}')
                    
        #self.env.close() # not defined
    
fake_portfolio = Portfolio(use_alpaca=False)

kwargs = {
    "discount": 0.99,
    "tau": 0.005,
    "policy_noise": 0.2,            
    "noise_clip": 0.5,
    "policy_freq": 2,
    "expl_noise": 0.1
}

exp1 = Experiment(DataStore, kwargs, portfolio=fake_portfolio, sym='JPM)

Portfolio created - available cash: 10000
Setting up experiment, loading history...


In [6]:
exp1.run(100)

Finished all days without converging, starting from day 1 for 99 more episodes.
episode reward -3472
Finished all days without converging, starting from day 1 for 98 more episodes.
episode reward -3498
Finished all days without converging, starting from day 1 for 97 more episodes.
episode reward -3826
Finished all days without converging, starting from day 1 for 96 more episodes.
episode reward -3738
Finished all days without converging, starting from day 1 for 95 more episodes.
episode reward -2134


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, int64, int32, int16, int8, uint8, and bool.

In [None]:
ToDo: check negation in replay buffer for done...?