## [Deep Q Reinforcement Learning](http://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html)

Notebook based on the linked tutorial, with adjustments started for our application. General workflow: we create a neural network that takes in features (these could be past prices, technical indicators, etc.); for now the features are the 10 past values. The network is trained with Huber loss to approximate the Q function, and its outputs are the values of the actions we can run: buy, sell, and hold. We will need to create a reward function, which we will tweak until things look good. Some of this notebook may seem bloated, but many of the pieces are recommended training tricks, such as maintaining a replay memory and having two different networks, one for training and one for evaluating.

In [84]:
import sys
sys.path.append("..")
import financial_data as fd
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count

import pixiedust
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.transforms as T
import pdb
# Set up matplotlib: detect whether an inline (notebook) backend is active so
# that plot_durations() can refresh the cell output through IPython.display.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Interactive plotting mode; switched off again with plt.ioff() after training.
plt.ion()
# Pick CUDA tensor types when a GPU is available, CPU tensor types otherwise.
use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
# Default tensor type used for rewards and value buffers below.
Tensor = FloatTensor
# Project data loader; the rest of the notebook reads x_train/y_train,
# x_test/y_test, norm_data_ls and ticker_ls from it.
# NOTE(review): the argument 10 is presumably the look-back window length
# (the notebook uses 10 past values as features) - confirm in financial_data.
pm = fd.financial_data(10)

The training data I am using is from MSFT. You can change `ticker` to use another stock.

In [85]:
# Ticker symbol to pull out of the loaded data set.
ticker='MSFT'
# Find the ticker's position in pm.ticker_ls and read the Close and date
# attributes of the matching entry in pm.norm_data_ls.
# Note: the training loop further down rebinds the name `state`.
state=pm.norm_data_ls[pm.ticker_ls.index(ticker)].Close
date=pm.norm_data_ls[pm.ticker_ls.index(ticker)].date

A class that stores the transitions we observe. Sampling from it at random during training ensures that the samples in a batch aren't correlated.

In [118]:
# One recorded step: the state acted in, the action taken, the state that
# followed, and the reward received.
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'next_state', 'reward'],
)


class ReplayMemory(object):
    """Fixed-capacity ring buffer of ``Transition`` records.

    Once ``capacity`` entries are stored, each new entry overwrites the
    oldest one. ``position`` is the slot the next ``push`` writes to.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Store one transition built from ``args`` (the Transition fields)."""
        buffer = self.memory
        slot = self.position
        if len(buffer) < self.capacity:
            # Still filling up: grow the list by one placeholder slot.
            buffer.append(None)
        buffer[slot] = Transition(*args)
        # Advance the write cursor, wrapping around at capacity.
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

Defining the Deep Q-Learning network. The features (a.k.a. the state) are the input, and from them the network predicts the Q function for each action. The action with the largest Q is chosen. The network learns from the reward function defined later.

In [119]:
class DQN(nn.Module):
    """Three-layer fully connected network mapping a state to Q-values.

    The output has one entry per action (by default 3: Buy, Sell, Hold) and
    is returned as raw, unbounded values. No softmax is applied: Q-values are
    regression targets of the form ``reward + GAMMA * max Q(next_state)``, so
    they must not be squashed into [0, 1]. A softmax over ``dim=0`` would
    additionally normalise across the batch axis for batched input. The
    arg-max over actions, and therefore the greedy action, is the same with
    or without a softmax.

    Args:
        input_size: Number of input features. When ``None`` (the default) it
            is taken from the length of ``pm.x_test[1]``, as before.
        hidden_size: Width of the two hidden layers (tunable).
        actions: Number of actions, i.e. size of the output layer.
    """

    def __init__(self, input_size=None, hidden_size=120, actions=3):
        super(DQN, self).__init__()
        if input_size is None:
            # Default: infer the feature count from the loaded data set.
            input_size = len(pm.x_test[1])
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, actions)

    def forward(self, x):
        """Return Q-values for ``x``; the last dimension indexes the actions."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

In [125]:
# Number of transitions sampled from replay memory per optimisation step.
BATCH_SIZE = 128
# Discount factor applied to the next-state value in optimize_model().
GAMMA = 0.9
# Epsilon-greedy schedule used by select_action(): the exploration threshold
# starts at EPS_START and decays exponentially towards EPS_END, with
# EPS_DECAY setting how many steps the decay takes.
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
# The target network is re-synchronised every TARGET_UPDATE episodes.
TARGET_UPDATE = 10

# Two networks with identical weights: policy_net is the one being trained,
# target_net supplies the next-state values and is put in eval mode.
policy_net = DQN()
target_net = DQN()
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

if use_cuda:
    policy_net.cuda()
    target_net.cuda()

# Only the policy network's parameters are optimised.
optimizer = optim.RMSprop(policy_net.parameters())
# Replay buffer holding up to 10000 transitions.
memory = ReplayMemory(10000)
# Global count of select_action() calls; drives the epsilon decay.
steps_done = 0


def select_action(state):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    A uniform random draw is compared against a threshold that decays
    exponentially from EPS_START towards EPS_END as ``steps_done`` grows.
    Above the threshold, the index of the largest policy-network output is
    returned; otherwise a uniformly random action index in {0, 1, 2} is
    returned as a one-element LongTensor. Every call increments the global
    ``steps_done``.
    """
    global steps_done
    draw = random.random()
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1
    if draw <= eps_threshold:
        # Explore: uniformly random action.
        return LongTensor([random.randrange(3)])
    # Exploit: greedy action according to the policy network.
    net_input = Variable(state, volatile=True).type(FloatTensor)
    q_values = policy_net(net_input).data
    return q_values.max(0)[1]


# Per-episode durations that plot_durations() draws.
episode_durations = []


def plot_durations():
    """Redraw figure 2 with the recorded episode durations.

    Once at least 100 episodes are recorded, a 100-episode moving average is
    drawn as well, left-padded with 99 zeros so that it lines up with the raw
    curve. When an inline backend is active, the notebook output is cleared
    and the current figure is displayed again.
    """
    window = 100
    plt.figure(2)
    plt.clf()
    durations_t = torch.FloatTensor(episode_durations)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    if len(durations_t) >= window:
        # Mean over every sliding window of `window` consecutive episodes.
        rolling = durations_t.unfold(0, window, 1).mean(1).view(-1)
        padded = torch.cat((torch.zeros(window - 1), rolling))
        plt.plot(padded.numpy())

    plt.pause(1)  # give the GUI event loop time to update the plot
    if not is_ipython:
        return
    display.clear_output(wait=True)
    display.display(plt.gcf())

In [126]:
def optimize_model():
    """Run one optimisation step of the policy network on a replay batch.

    Does nothing until the replay memory holds at least BATCH_SIZE
    transitions. Otherwise it samples a batch, computes the Huber loss
    between Q(s_t, a_t) from ``policy_net`` and the target
    ``reward + GAMMA * max_a Q(s_{t+1}, a)`` from ``target_net`` (the
    next-state value is 0 where ``next_state`` is None), clamps every
    gradient to [-1, 1] and takes one optimizer step.

    Batch tensors are reshaped using BATCH_SIZE and the feature width
    inferred from the sampled states, rather than hard-coded 128 and 10, so
    the function keeps working when either is changed.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch: a list of Transitions becomes one Transition whose
    # fields are tuples (see http://stackoverflow.com/a/19343/3343043).
    batch = Transition(*zip(*transitions))

    # One row per sampled state; the feature width is inferred from the data.
    state_batch = Variable(torch.cat(batch.state))
    state_batch = state_batch.view(BATCH_SIZE, -1)
    n_features = state_batch.size(1)

    # Mask of non-final transitions, plus their next states stacked row-wise.
    # The row count is inferred because final transitions are left out.
    non_final_mask = ByteTensor(tuple(s is not None for s in batch.next_state))
    non_final_next_states = Variable(
        torch.cat([s for s in batch.next_state if s is not None]),
        volatile=True)
    non_final_next_states = non_final_next_states.view(-1, n_features)

    action_batch = Variable(torch.cat(batch.action))
    action_batch = action_batch.view(BATCH_SIZE, 1)
    reward_batch = Variable(torch.cat(batch.reward))
    reward_batch = reward_batch.view(BATCH_SIZE, 1)

    # Compute Q(s_t, a): the model computes Q(s_t), then we select the
    # column of the action that was actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states; final states keep the value 0.
    next_state_values = Variable(torch.zeros(BATCH_SIZE).type(Tensor))
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0]
    next_state_values = next_state_values.unsqueeze(1)
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    # Undo volatility (which was used to prevent unnecessary gradients)
    expected_state_action_values = Variable(expected_state_action_values.data)

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

    # Optimize the model, clamping each gradient element to [-1, 1].
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()

In [127]:
def step(action, cur_price, next_price, days, since_buy, price_buy, max_days=500):
    """Score one trading action and update the holding bookkeeping.

    Args:
        action: Action code: 0 = Buy, 1 = Sell, 2 = Hold.
        cur_price: Current price of the stock (last value of the window).
        next_price: Next price of the stock (the y value).
        days: Day count within the current episode.
        since_buy: Days since the last buy (-1 if no holdings).
        price_buy: Price at the last buy (-1 if no holdings).
        max_days: The episode is reported as done once ``days`` exceeds this.

    Returns:
        Tuple ``(reward, done, since_buy, price_buy)`` with the reward for
        the action, the end-of-episode flag and the updated bookkeeping.

    Raises:
        ValueError: If ``action`` is not 0, 1 or 2.
    """
    price_change = next_price - cur_price

    if action == 0:
        # BUY
        if price_buy == -1:
            # Open a position; rewarded (or penalised) by the next move.
            reward = price_change * 5
            price_buy = cur_price
            since_buy = 1
        else:
            # Buying while already holding earns nothing and clears the
            # position bookkeeping.
            reward = 0
            price_buy = -1
            since_buy = -1

    elif action == 1:
        # SELL
        if since_buy > 0 and price_buy > 0:
            # Close the position; reward is the scaled realised profit.
            reward = (cur_price - price_buy) * 100
        else:
            # Nothing to sell.
            reward = 0
        price_buy = -1
        since_buy = -1

    elif action == 2:
        # HOLD
        if price_buy == -1:
            # Staying out of the market: penalised when the price rises,
            # rewarded otherwise.
            reward = -10 if price_change > 0 else 10
            price_buy = -1.0
            since_buy = -1.0
        else:
            # Holding a position: a per-day bonus (price rising) or penalty
            # (otherwise), plus the unrealised profit.
            per_day = 10 if price_change > 0 else -2
            reward = per_day * since_buy + (cur_price - price_buy)
            since_buy = since_buy + 1

    else:
        raise ValueError(
            "unknown action %r; expected 0 (Buy), 1 (Sell) or 2 (Hold)" % (action,))

    done = bool(days > max_days)

    return reward, done, since_buy, price_buy

In [132]:
num_episodes = 5

for i_episode in range(num_episodes):
    # Start on window 0 so the state shown to the policy is the same window
    # that step() scores at t = 0 (it reads pm.x_test[t] and pm.y_test[t]).
    # Starting on window 1 made the first transition of every episode pair
    # the wrong state with the reward.
    state = torch.Tensor(pm.x_test[0])
    # No holdings at the start of an episode.
    since_buy = -1.0
    price_buy = -1.0
    for t in count():
        # Select and perform an action
        action = select_action(state)
        reward, done, since_buy, price_buy = step(action[0], pm.x_test[t][9], pm.y_test[t], t, since_buy, price_buy)
        reward = Tensor([reward])

        # The next observation is the following window of the series.
        next_state = torch.Tensor(pm.x_test[(t+1)])
        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()
        if done:
            print('Episode Done')
            break
    # Update the target network
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())
print('Complete')
plt.ioff()
plt.show()

Episode Done
Episode Done
Episode Done
Episode Done
Episode Done
Complete


In [133]:
# Labels for the network outputs, indexed by action code (0, 1, 2).
action_index = ['Buy', 'Sell', 'Hold']
# Print the target network's preferred action for the first 100 training windows.
for i in range(0,100):
    action = target_net(Variable(torch.Tensor(pm.x_train[i])))
    # `a` (the window with its y value appended) is built here but is not
    # used again within this cell.
    a = pm.x_train[i]
    a = np.append(a, pm.y_train[i])
    # n is the index of the largest network output along dim 0.
    _, n = torch.max(action, 0)
    print(action_index[n.data[0]])



Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Sell
Hold
Sell
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Sell
Hold
Sell
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Sell
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Sell
Hold
Hold
Hold
Sell
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Sell
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold
Hold


In [2]:
# NOTE(review): scratch line that uses `self` at notebook top level, where it
# is not defined; running the cell raises the NameError recorded below.
# Presumably `self.fd` / `self.TICKER` correspond to `pm` / `ticker` in this
# notebook - TODO confirm before adapting.
x, y = self.fd.split_data([self.fd.norm_data_ls[self.fd.ticker_ls.index(self.TICKER)]])

NameError: name 'self' is not defined

In [None]:
        # Draft of the reward logic (never executed: the cell has no run count
        # and the enclosing definition is not part of this cell). It reads
        # action, cur_price, next_price, since_buy and price_buy and assigns
        # reward, price_buy and since_buy.
        if (action == 0):
            # BUY: with no holdings, reward is 20x the next price move and a
            # position is opened at cur_price (both branches are identical).
            if (((next_price - cur_price) > 0) and (since_buy == -1) and (price_buy == -1)):
                reward = (next_price - cur_price)*20
                price_buy = cur_price
                since_buy = 1
            elif (((next_price - cur_price) <= 0) and (since_buy == -1) and (price_buy == -1)):
                reward = (next_price - cur_price)*20
                price_buy = cur_price
                since_buy = 1
            else:
                # Already holding: no reward, position restarted at cur_price.
                reward = 0
                price_buy = cur_price
                since_buy = 1

        elif (action == 1):
            # SELL: 100x the realised profit when holding, otherwise 0; the
            # holding bookkeeping is reset to -1 in both cases.
            if ((since_buy > 0) & (price_buy > 0)):
                reward = (cur_price - price_buy)*100
                price_buy = -1
                since_buy = -1
            else:
                reward = 0
                price_buy = -1
                since_buy = -1



        elif (action == 2):
            # HOLD without holdings: -10 if the price is about to rise,
            # +10 otherwise.
            if (((next_price - cur_price) > 0) and (since_buy == -1) and (price_buy == -1)):
                reward = -10
                price_buy = -1.0
                since_buy = -1.0

            elif (((next_price - cur_price) <= 0) and (since_buy == -1) and (price_buy == -1)):
                reward = 10
                price_buy = -1.0
                since_buy = -1.0

            # HOLD with holdings: per-day bonus or penalty plus unrealised profit.
            elif (((next_price - cur_price) > 0) and (since_buy > 0)):
                reward = 10*since_buy + (cur_price - price_buy)
                price_buy = price_buy
                since_buy = since_buy + 1

            elif (((next_price - cur_price) < 0) and (since_buy > 0)):
                reward = -2*since_buy + (cur_price - price_buy)
                price_buy = price_buy
                since_buy = since_buy + 1

            else:
                # Reached for any remaining combination, including an unchanged
                # price while holding (neither > 0 nor < 0 matches above).
                reward = -100
                price_buy = price_buy
                since_buy = since_buy