### Uses Alpaca paper trading (test money only)
##### https://alpaca.markets/docs/api-documentation/api-v2/

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import time, datetime as dt
import pandas as pd, numpy as np

import alpha, database as db
import alpaca_trade as alpaca
from populate import download_data
from populate import *
from portfolios import *
from history import *

# Shared data-store handle; it is passed to History and Experiment further down.
DataStore = db.DataStore()

import copy
import gym
from gym import spaces
import torch
import torch.nn as nn
import torch.nn.functional as F

In [7]:
class ReplayBuffer:
    """Fixed-capacity circular store of transitions for off-policy training.

    Each transition is (state, action, next_state, reward, done). Rows are
    written in order and, once ``max_buffer`` rows exist, the oldest rows are
    overwritten.

    Args:
        state_dim: Number of values in one state vector.
        action_dim: Number of values in one action vector.
        max_buffer: Maximum number of transitions kept.
        batch_size: Default number of transitions returned by ``sample``.
    """

    def __init__(self, state_dim, action_dim, max_buffer=int(1e6), batch_size=64):
        # One preallocated table per transition field.
        self.state = np.empty((max_buffer, state_dim))
        self.action = np.empty((max_buffer, action_dim))
        self.next_state = np.empty((max_buffer, state_dim))
        self.reward = np.empty((max_buffer, 1))
        self.done = np.empty((max_buffer, 1))

        self.max_size = max_buffer
        self.batch_size = batch_size
        self.size = 0            # number of valid rows currently stored
        self.current_memory = 0  # not read by this class; kept for compatibility
        self._idx = 0            # row that the next update() writes to

    def update(self, experience):
        """Store one transition, overwriting the oldest row when full.

        Args:
            experience: Sequence ``(state, action, next_state, reward, done)``.
        """
        state, action, next_state, reward, done = experience
        self.state[self._idx] = state
        self.action[self._idx] = action
        self.next_state[self._idx] = next_state
        self.reward[self._idx] = reward
        self.done[self._idx] = done
        self._idx = (self._idx + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size=None):
        """Return a random batch of distinct stored transitions.

        Args:
            batch_size: Number of transitions wanted; defaults to the
                ``batch_size`` given to the constructor.

        Returns:
            Tuple ``(state, action, next_state, reward, done)`` of 2-D numpy
            arrays sharing the same row order. When fewer transitions are
            stored than requested, every stored transition is returned once
            instead of raising.

        Raises:
            ValueError: If the buffer is empty.
        """
        if self.size == 0:
            raise ValueError("cannot sample from an empty replay buffer")

        requested = self.batch_size if batch_size is None else int(batch_size)
        # Sampling without replacement cannot draw more rows than are stored.
        count = min(requested, self.size)
        idxs = np.random.choice(self.size, count, replace=False)

        # Fancy indexing already yields fresh 2-D arrays, so no stacking is needed.
        return (
            self.state[idxs],
            self.action[idxs],
            self.next_state[idxs],
            self.reward[idxs],
            self.done[idxs],
        )


# Prefer the GPU when CUDA is usable; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Implementation of Twin Delayed Deep Deterministic Policy Gradients (TD3)
# Paper: https://arxiv.org/abs/1802.09477
# https://github.com/sfujim/TD3/blob/master/TD3.py

class Actor(nn.Module):
    """Policy network mapping a state batch to bounded actions.

    Two 256-unit ReLU layers feed a tanh output layer whose result is scaled
    by ``max_action``, so every output lies in ``[-max_action, max_action]``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()

        hidden_units = 256
        self.l1 = nn.Linear(state_dim, hidden_units)
        self.l2 = nn.Linear(hidden_units, hidden_units)
        self.l3 = nn.Linear(hidden_units, action_dim)

        self.max_action = max_action

    def forward(self, state):
        """Return the scaled tanh output for ``state``."""
        features = state
        for layer in (self.l1, self.l2):
            features = F.relu(layer(features))
        squashed = torch.tanh(self.l3(features))
        return self.max_action * squashed


class Critic(nn.Module):
    """Twin Q-networks that each score a (state, action) pair.

    Layers ``l1``-``l3`` form the first Q-function and ``l4``-``l6`` the
    second; both read the same concatenated state/action input.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()

        joint_dim = state_dim + action_dim

        # First Q-function
        self.l1 = nn.Linear(joint_dim, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, 1)

        # Second Q-function
        self.l4 = nn.Linear(joint_dim, 256)
        self.l5 = nn.Linear(256, 256)
        self.l6 = nn.Linear(256, 1)

    @staticmethod
    def _evaluate_head(joint, first, second, last):
        """Run one three-layer head: ReLU, ReLU, then a linear output."""
        return last(F.relu(second(F.relu(first(joint)))))

    def forward(self, state, action):
        """Return the pair ``(q1, q2)`` for the given state/action batch."""
        joint = torch.cat([state, action], 1)
        first_q = self._evaluate_head(joint, self.l1, self.l2, self.l3)
        second_q = self._evaluate_head(joint, self.l4, self.l5, self.l6)
        return first_q, second_q

    def Q1(self, state, action):
        """Return only the first Q-function's value for the batch."""
        joint = torch.cat([state, action], 1)
        return self._evaluate_head(joint, self.l1, self.l2, self.l3)


class TD3(object):
    """Twin Delayed DDPG agent: an actor, a twin critic and their target copies.

    Args:
        state_dim: Size of one state vector.
        action_dim: Size of one action vector.
        max_action: Bound applied to actions (outputs lie in +/- max_action).
        discount: Discount factor applied to the bootstrapped target value.
        tau: Interpolation weight for the soft target-network update.
        policy_noise: Standard deviation of the noise added to target actions.
        noise_clip: Bound applied to that noise.
        policy_freq: The actor and the targets are updated once every
            ``policy_freq`` calls to ``train``.
    """

    def __init__(
        self,
        state_dim,
        action_dim,
        max_action,
        discount=0.99,
        tau=0.005,
        policy_noise=0.2,
        noise_clip=0.5,
        policy_freq=2
    ):

        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

        self.max_action = max_action
        self.discount = discount
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

        self.total_it = 0  # number of train() calls so far

    def select_action(self, state):
        """Return the actor's action for a single state as a flat numpy array.

        Args:
            state: One state as any array-like (ndarray, list, pandas Series).
        """
        # np.asarray lets callers pass a Series or list, not only an ndarray.
        obs = np.asarray(state, dtype=np.float32).reshape(1, -1)
        obs = torch.as_tensor(obs, device=device)
        with torch.no_grad():
            return self.actor(obs).cpu().numpy().flatten()

    def train(self, replay_buffer, batch_size=256):
        """Run one TD3 update step from a sampled batch.

        Args:
            replay_buffer: Object whose ``sample(batch_size)`` returns
                ``(state, action, next_state, reward, done)`` as numpy arrays
                or tensors. The fifth element is the raw ``done`` flag, which
                is what the ReplayBuffer in this file stores.
            batch_size: Number of transitions to request from the buffer.
        """
        self.total_it += 1

        # The buffer hands back numpy arrays; the networks need float tensors
        # on the training device. as_tensor also accepts existing tensors.
        batch = replay_buffer.sample(batch_size)
        state, action, next_state, reward, done = (
            torch.as_tensor(part, dtype=torch.float32, device=device)
            for part in batch
        )
        # Bootstrapping must be switched off on terminal transitions, so the
        # stored done flag is inverted before it multiplies the target value.
        not_done = 1.0 - done

        with torch.no_grad():
            # Select action according to the target policy and add clipped noise
            noise = (
                torch.randn_like(action) * self.policy_noise
            ).clamp(-self.noise_clip, self.noise_clip)

            next_action = (
                self.actor_target(next_state) + noise
            ).clamp(-self.max_action, self.max_action)

            # Target Q value uses the smaller of the two target critics
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + not_done * self.discount * target_Q

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(state, action)

        # Both critics regress onto the same target
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:

            # Actor loss: maximise the first critic's value of the actor's action
            actor_loss = -self.critic.Q1(state, self.actor(state)).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Move the frozen target models a fraction tau towards the online ones
            self._soft_update(self.critic, self.critic_target)
            self._soft_update(self.actor, self.actor_target)

    def _soft_update(self, online, target):
        """Blend ``online`` parameters into ``target`` with weight ``tau``."""
        for param, target_param in zip(online.parameters(), target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def save(self, filename):
        """Write actor/critic weights and optimizer states to ``filename`` + suffix."""
        torch.save(self.critic.state_dict(), filename + "_critic")
        torch.save(self.critic_optimizer.state_dict(), filename + "_critic_optimizer")

        torch.save(self.actor.state_dict(), filename + "_actor")
        torch.save(self.actor_optimizer.state_dict(), filename + "_actor_optimizer")

    def load(self, filename):
        """Restore the files written by ``save`` and rebuild both target networks."""
        # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
        self.critic.load_state_dict(torch.load(filename + "_critic", map_location=device))
        self.critic_optimizer.load_state_dict(
            torch.load(filename + "_critic_optimizer", map_location=device))
        self.critic_target = copy.deepcopy(self.critic)

        self.actor.load_state_dict(torch.load(filename + "_actor", map_location=device))
        self.actor_optimizer.load_state_dict(
            torch.load(filename + "_actor_optimizer", map_location=device))
        self.actor_target = copy.deepcopy(self.actor)

In [10]:
class Portfolio:
    """Cash and positions, read from Alpaca or supplied locally.

    Args:
        use_alpaca: When true, cash and positions come from the ``alpaca``
            module; otherwise the ``cash`` and ``positions`` arguments are used.
        cash: Starting cash for the local (non-Alpaca) portfolio.
        positions: Collection of held symbols for the local portfolio.
    """

    def __init__(self, use_alpaca=True, cash=10000, positions=POPULAR):
        self.use_alpaca = use_alpaca
        if use_alpaca:
            # Parsed as float: int() rejects a decimal string such as
            # "9876.54" and would drop the cents of a numeric value.
            # NOTE(review): assumes get_account()['cash'] is a number or a
            # numeric string, as the Alpaca account endpoint reports - confirm.
            self.cash_remaining = float(alpaca.get_account()['cash'])
            # NOTE(review): the structure returned by get_positions() is not
            # visible here; position() only relies on ``sym in self.positions``.
            self.positions = alpaca.get_positions()
        else:
            self.cash_remaining = cash
            self.positions = positions
        print(f'Portfolio loaded - available cash: {self.cash_remaining}')

    def position(self, sym):
        """Look up the holding for ``sym``.

        Returns:
            For a local portfolio, a bool: whether ``sym`` is in ``positions``.
            For an Alpaca portfolio, the result of ``alpaca.get_position(sym)``
            when ``sym`` is in ``positions``, otherwise the string
            ``"no position"``.
        """
        if not self.use_alpaca:
            return sym in self.positions
        if sym in self.positions:
            return alpaca.get_position(sym)
        return "no position"
        

class StockTraderEnvironment(gym.Env):
    """Gym environment that steps through one symbol's history day by day.

    Observations are the indicator values for the current day; actions are
    discrete: 0 = buy, 1 = sell, 2 = hold. The reward scores the action
    against the price move since the previous step.

    Args:
        portfolio: Object exposing ``position(symbol)``.
        history: Object exposing ``indicators`` (DataFrame, one column per
            indicator), ``prices`` (DataFrame) and ``symbol``.
        short: Whether short positions are allowed.
    """

    def __init__(self,
                 portfolio,
                 history,
                 short=False):
        self.portfolio = portfolio
        self.short = short  # step() reads this flag, so it must be stored
        self.indicators = history.indicators
        self.prices = history.prices
        self.symbol = history.symbol
        num_indicators = self.indicators.shape[1]
        assert num_indicators > 0, "supply 1 or more indicators"

        self.action_space = spaces.Discrete(3)

        # Indicator values are unbounded, so the observation box is +/- infinity.
        low_array = np.full((num_indicators), -np.inf)
        high_array = np.full((num_indicators), np.inf)
        self.observation_space = spaces.Box(low=low_array, high=high_array, dtype=np.float64)

        self.nS, self.nA = self.observation_space.shape[0], self.action_space.n

        self.previous_price = 0  # no price exists before the first day
        first_day = self.indicators.index[0]  # first day that has indicators
        self.prices = self.prices.loc[first_day:]  # align prices with the indicators

        self.state = np.array(self.indicators.iloc[0])  # first day is the initial state
        self.days = iter(self.prices.index.values)

        # Trade log and a snapshot of it; step() compares them at end of data.
        self.trades = pd.DataFrame(0, index=self.prices.index, columns=self.prices.columns)
        self.trades_dupl = self.trades.copy(deep=True)

        # position: how much long (positive), short (negative) or flat (zero)
        self.position = 0

    def reset(self):
        """Rewind to the first day and return the initial observation."""
        self.previous_price = 0
        self.days = iter(self.prices.index.values)
        self.state = np.array(self.indicators.iloc[0])
        self.trades = pd.DataFrame(0, index=self.prices.index, columns=self.prices.columns)
        self.trades_dupl = self.trades.copy(deep=True)
        self.position = 0
        return self.state

    def _price_move_reward(self, action, current_price):
        """Score ``action`` against the move from ``previous_price``.

        Buy is rewarded when the price rose, sell when it fell, hold when it
        is unchanged; the opposite outcome is penalised and anything else
        scores 0.
        """
        # NOTE(review): assumes prices.loc[day] compares like a single number;
        # if prices has several columns these comparisons yield a Series and
        # the boolean tests raise - confirm the shape of history.prices.
        if action == 0 and current_price > self.previous_price:
            return 2
        if action == 0 and current_price < self.previous_price:
            return -2
        if action == 1 and current_price < self.previous_price:
            return 2
        if action == 1 and current_price > self.previous_price:
            return -2
        if action == 2 and (current_price > self.previous_price or current_price < self.previous_price):
            return -2
        if action == 2 and current_price == self.previous_price:
            return 2
        return 0

    def step(self, action):
        """Advance one day.

        Args:
            action: 0 (buy), 1 (sell) or 2 (hold).

        Returns:
            ``(state, reward, done, info)``. Once the days are exhausted the
            last state is returned with reward 0, and ``done`` is true when
            the trade log is unchanged since the previous snapshot.
        """
        # https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py
        err_msg = "%r (%s) invalid" % (action, type(action))
        assert self.action_space.contains(action), err_msg

        try:
            new_day = next(self.days)
        except StopIteration:
            # Out of data: there is no price move to score, so reward is 0.
            if self.trades.equals(self.trades_dupl):
                done = True
            else:
                done = False
                self.trades_dupl = self.trades.copy(deep=True)
            return self.state, 0, done, {}

        current_price = self.prices.loc[new_day]
        # Kept as an ndarray so every observation has the same type as reset()'s.
        self.state = np.array(self.indicators.loc[new_day])

        reward = self._price_move_reward(action, current_price)

        # Trade execution is not implemented yet. Open questions per case:
        #   buy,  flat   - how much to buy (use the network's confidence?)
        #   buy,  long   - avoid over-committing to one symbol?
        #   buy,  short  - more signal to buy; keep the same as last?
        #   sell, flat   - sell with no position
        #   sell, long   - sell all or a fraction; cannot exceed the holding
        #                  when shorting is disabled
        #   hold         - keep the long, short or flat position
        # NOTE(review): position() can return a non-numeric value for an
        # Alpaca-backed portfolio; the comparison below assumes a number.
        if action == 1 and self.position < 0 and not self.short:
            print(f'ERROR - short {self.symbol} with shorting disabled!')

        self.position = self.portfolio.position(self.symbol)
        self.previous_price = current_price
        return self.state, reward, False, {}

    def render(self, mode='human'):
        # ToDo - show progression via graph?
        pass

In [30]:
class Experiment:
    """Wire a symbol's history, the trading environment and a TD3 agent together.

    Args:
        DataStore: Data-store handle forwarded to ``History``.
        td3_kwargs: Dict with keys ``discount``, ``tau``, ``policy_noise``,
            ``noise_clip``, ``policy_freq`` and ``expl_noise``.
        portfolio: Portfolio object; its ``cash_remaining`` is recorded.
        sym: Ticker symbol to load.
        dates: Optional range, e.g.
            ``[dt.datetime(2000,1,1), dt.datetime(2020,12,31)]``.
        indicators: Indicator selection forwarded to ``History``.
    """

    def __init__(self, DataStore, td3_kwargs, portfolio, sym, dates=None, indicators='all'):
        print('Setting up experiment, loading history...')
        self.available_cash = portfolio.cash_remaining

        self.history = History(DataStore, sym, dates, indicators=indicators)
        self.indicators = self.history.indicators
        self.prices = self.history.prices

        # Chronological split: the first 75% is training, of which the last
        # 30% is held out for validation; the remaining 25% is the test set.
        self.num_days = self.prices.shape[0]
        training = int(.75 * self.num_days)
        validation = int(.3 * training)
        train_end = training - validation

        self.price_train = self.prices.iloc[:train_end]
        self.indicator_train = self.indicators.iloc[:train_end]

        self.price_validation = self.prices.iloc[train_end:training]
        self.indicator_validation = self.indicators.iloc[train_end:training]

        self.price_test = self.prices.iloc[training:]
        self.indicator_test = self.indicators.iloc[training:]

        self.portfolio = portfolio
        self.symbol = sym

        # The environment receives the full history, not only the training split.
        self.env = StockTraderEnvironment(self.portfolio,
                                          self.history,
                                          short=False)  # hard-coded to prevent shorting

        self.batch_size = 64  # not parameterized...
        self.buffer = ReplayBuffer(self.env.nS, self.env.nA, max_buffer=int(1e6), batch_size=self.batch_size)

        # Bound on each component of the policy's action vector.
        self.max_action = 2

        kwargs = {
            "state_dim": self.env.nS,
            "action_dim": self.env.nA,
            "max_action": self.max_action,
            "discount": td3_kwargs['discount'],
            "tau": td3_kwargs['tau'],
            "policy_noise": td3_kwargs['policy_noise'],
            "noise_clip": td3_kwargs['noise_clip'],
            "policy_freq": td3_kwargs['policy_freq']
        }

        self.policy = TD3(**kwargs)
        self.expl_noise = td3_kwargs['expl_noise']

    def run(self, num_episodes, max_steps=int(1e6)):
        """Run training episodes over the price history.

        The policy emits one score per discrete action; the environment is
        given the index of the highest score, and the score vector itself is
        what gets stored in the replay buffer. Until ``random_warmup`` total
        steps have passed the scores are random and no training happens.

        Args:
            num_episodes: Number of passes over the history.
            max_steps: Upper bound on steps per episode.

        Returns:
            List with the summed reward of each episode.
        """
        random_warmup = 25e3
        total_days_run = 0
        episode_rewards = []
        steps_per_episode = min(self.num_days, int(max_steps))

        for idx in range(num_episodes):

            state = self.env.reset()
            if state is None:
                # Tolerate a reset() that only sets env.state and returns nothing.
                state = self.env.state
            done = False
            episode_reward = 0

            for days_passed in range(steps_per_episode):

                if total_days_run < random_warmup:
                    # Random scores: their argmax is a uniformly random action.
                    action_scores = np.random.uniform(
                        -self.max_action, self.max_action, size=self.env.nA)
                else:
                    # Policy scores plus Gaussian exploration noise, kept in bounds.
                    action_scores = (
                        self.policy.select_action(np.array(state))
                        + np.random.normal(0, self.max_action * self.expl_noise, size=self.env.nA)
                    ).clip(-self.max_action, self.max_action)

                # The environment's action space is Discrete, so it needs an
                # int index rather than the continuous score vector.
                env_action = int(np.argmax(action_scores))

                next_state, reward, done, _ = self.env.step(env_action)
                experience = [state, action_scores, next_state, reward, done]
                self.buffer.update(experience)
                episode_reward += reward

                if done:
                    print(f'Episode finished after {days_passed+1} timesteps')
                    break

                state = next_state
                # Gate on the global step count: the per-episode counter is
                # capped by the history length and may never reach the warm-up.
                if total_days_run >= random_warmup:
                    self.policy.train(self.buffer, self.batch_size)

                if days_passed == steps_per_episode - 1:
                    if num_episodes - idx - 1 == 0:
                        print('Finished all episodes, did not converge')
                    else:
                        print(f'Finished all days without converging, starting from day 1 for {num_episodes - idx - 1} more episodes.')

                total_days_run += 1

            episode_rewards.append(episode_reward)

        self.env.close()  # inherited from gym.Env; the environment adds no cleanup
        return episode_rewards
    
# Local portfolio: no Alpaca calls, constructor defaults for cash and positions.
fake_portfolio = Portfolio(use_alpaca=False)

# TD3 hyperparameters plus the exploration-noise scale that Experiment reads.
kwargs = dict(
    discount=0.99,
    tau=0.005,
    policy_noise=0.2,
    noise_clip=0.5,
    policy_freq=2,
    expl_noise=0.1,
)

exp1 = Experiment(
    DataStore,
    kwargs,
    portfolio=fake_portfolio,
    sym='AAPL',
    indicators=['ADX', 'CCI', 'EMA'],
)

Portfolio loaded - available cash: 10000
Setting up experiment, loading history...


In [2]:
# Load the AAPL history used by the cells below (env, normalised prices, chart).
# NOTE(review): dates=None and indicators='all' presumably mean "no date
# restriction" and "every indicator" - confirm against History.
history = History(DataStore, 'AAPL', None, indicators='all')

In [18]:
# Stand-alone environment over the loaded history, backed by the local portfolio.
env = StockTraderEnvironment(
    portfolio=fake_portfolio,
    history=history,
    short=False,
)

In [5]:
# Divide every price column by its first-row value (prices relative to day one).
prices = history.prices.div(history.prices.iloc[0])

In [18]:
import plotly.express as px
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
# Data behind the chart
df = history.prices

# One dropdown entry per named Plotly colorscale
_colorscale_options = [
    dict(label=scale_name, value=scale_name)
    for scale_name in px.colors.named_colorscales()
]
_colorscale_dropdown = dcc.Dropdown(
    id='colorscale-dropdown',
    clearable=False,
    value='plasma',
    options=_colorscale_options,
)

# Build App: heading, graph placeholder and the labelled dropdown
app = JupyterDash(__name__)
app.layout = html.Div([
    html.H1(f'{history.symbol} price'),
    dcc.Graph(id='graph'),
    html.Label(["colorscale", _colorscale_dropdown]),
])


# Redraw the graph whenever the dropdown value changes
@app.callback(
    Output('graph', 'figure'),
    [Input("colorscale-dropdown", "value")],
)
def update_figure(colorscale):
    """Return a line chart of the "adjusted close" column against the index.

    The selected colorscale is received but is not applied to the figure.
    """
    figure = px.line(
        df,
        x=df.index,
        y="adjusted close",
        render_mode="webgl",
    )
    return figure


# Run app and display result inline in the notebook
app.run_server(mode='inline')