In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib ipympl

class Network(nn.Module):
    def __init__(self, output_size, activation = nn.Identity()):
        super(Network, self).__init__()
        # in this case, the number of channels is 10 because we retrieve last 10 snapshots
        self.conv1 = nn.Conv1d(in_channels=10, out_channels=10, kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv1d(in_channels=10, out_channels=10, kernel_size=2, stride=2, padding=0)
        self.conv3 = nn.Conv1d(in_channels=10, out_channels=10, kernel_size=3, stride=1, padding=0)
        self.lstm = nn.LSTM(input_size=1, hidden_size=1, num_layers=1, batch_first=True)
        self.fc1 = nn.Linear(11, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_size)
        self.activation = activation
        

    def forward(self, x1, x2):
        x1 = self.conv1(x1) 
        x1 = F.relu(x1)
        x1 = self.conv2(x1)
        x1 = F.relu(x1)
        x1 = self.conv3(x1)
        x1 = F.relu(x1)
        x1 = x1.reshape(10, 1, 1)
        x1, _ = self.lstm(x1)
        x1 = x1.reshape(10, )
        x1 = torch.cat((x1, x2), dim = 0)
        x1 = self.fc1(x1)
        x1 = F.relu(x1)
        x1 = self.fc2(x1)
        x1 = F.relu(x1)
        x1 = self.fc3(x1)
        x1 = self.activation(x1)
        return x1
    

In [3]:
df = pd.read_csv('cleaned_dataset.csv')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
train_data = df[df['trading_day'] == 25]
train_data = train_data.drop(['trading_day'], axis=1)
train_data = train_data[['AskPrice1', 'AskVolume1', 'BidPrice1', 'BidVolume1', 'AskPrice2', 'AskVolume2', 'BidPrice2', 'BidVolume2', 'AskPrice3', 'AskVolume3', 'BidPrice3', 'BidVolume3']]
train_data = train_data.iloc[:4000, :]
print(train_data.head())
train_data = train_data.to_numpy()
train_data = torch.from_numpy(train_data)
train_data = train_data.float()
train_data = train_data.to(device)

         AskPrice1  AskVolume1  BidPrice1  BidVolume1  AskPrice2  AskVolume2  \
1887801      769.0          40      768.5         116      769.5         293   
1887802      768.5           5      768.0         293      769.0         132   
1887803      768.5           4      768.0         303      769.0         311   
1887804      769.0         185      768.5          24      769.5         409   
1887805      769.0         163      768.5          23      769.5         417   

         BidPrice2  BidVolume2  AskPrice3  AskVolume3  BidPrice3  BidVolume3  
1887801      768.0         293      770.0         603      767.5         154  
1887802      767.5         184      769.5         338      767.0         323  
1887803      767.5         305      769.5         396      767.0         364  
1887804      768.0         357      770.0         754      767.5         317  
1887805      768.0         458      770.0         765      767.5         319  


Therefore, I think it makes sense that to consider snapshots for the last 10 seconds as an input, and start from there. For each snapshot, we only use the first three bid-ask price level, which means 12 features

In [4]:
# write a function to get a path from the policy network
def get_path(policy_network, train_dataset):
    holding_positions = [0]
    pos_changes = []
    rewards = []
    states1 = []
    states2 = [] 
    action_dist_set = []
    action_set = []
    for i in range(9, len(train_dataset)):

        # bid_ask information of past 10 snapshots
        bid_ask = train_dataset[i - 9:i + 1, :]
        bid_ask = bid_ask.unsqueeze(0)
        old_hold = holding_positions[-1]
        hold = torch.tensor([old_hold]).float().to(device)
        states1.append(bid_ask)
        states2.append(hold)
        # get the action from the policy network, which is a probability distribution
        action_dist = policy_network(bid_ask, hold)
        action_dist_set.append(action_dist)
        # sample an action from the probability distribution
        action = torch.multinomial(action_dist, 1).item() - 1
        action_set.append(action + 1)
        
        # decide the position change based on the action and current holding position
        if old_hold == 0:
            # make the action to be integer
            a = int(action)
            holding_positions.append(a)
        if old_hold == 1:
            holding_positions.append(min(action + old_hold, old_hold))
        if old_hold == -1:
            holding_positions.append(max(action + old_hold, old_hold))
           
        # compute the reward (cash change) 
        new_hold = holding_positions[-1]
        position_change = new_hold - old_hold
        pos_changes.append(position_change)
    
        reward = 0
        if position_change == 0:
            reward = 0
        if position_change == 1:
            reward = - train_dataset[i][2] 
        if position_change == -1:
            reward = train_dataset[i][0] 

        rewards.append(reward)
    
    # at the end, the agent needs to liquid all positions
    if holding_positions[-1] == 1:
        rewards.append(train_dataset[-1][2])
    elif holding_positions[-1] == -1:
        rewards.append(-train_dataset[-1][0])
    else:
        rewards.append(0)

    # convert the list of rewards to a tensor
    rewards = torch.tensor(rewards).float().to(device)
    # convert the list of states to a tensor
    states1 = torch.stack(states1).to(device)
    states2 = torch.stack(states2).to(device)
    action_dist_set = torch.stack(action_dist_set).to(device)
    return states1, states2, rewards, action_dist_set, action_set


        
# train the value network using the TD(0) algorithm
def train_value_network(value_network, policy_network, train_dataset, value_optimizer, gamma = 0.99, epochs = 10):
    # define the loss function
    loss = 0
    # loop over the epochs
    for epoch in range(epochs):
        # get the path from the policy network
        # only use one trajectory each epoch
        states1, states2, rewards, action_dist_set, action_set = get_path(policy_network, train_dataset)
        # get the value estimate from the value network
        value_estimate = []
        for i in range(len(states1)):
            value_estimate.append(value_network(states1[i], states2[i]))
        value_estimate = torch.stack(value_estimate).squeeze(1)
        new_value_estimate = torch.cat((value_estimate, torch.tensor([0]).to(device)))
        # compute the TD(0) error
        loss = (rewards[:-1] + gamma * new_value_estimate[1:] - new_value_estimate[:-1]).pow(2).mean()
        # zero the gradient
        value_optimizer.zero_grad()
        # compute the gradient
        loss.backward()
        # update the weights
        value_optimizer.step()
        # print the loss
        print('Epoch: {}, Loss: {:.5f}'.format(epoch, loss.item()))
    return None


# the following define a function that compute advantage estimation for a trajectory
def advantage_estimate(states1, states2, rewards, value_network, gamma):
    value_estimate = []
    for i in range(len(states1)):
        value_estimate.append(value_network(states1[i], states2[i]))
    value_estimate = torch.stack(value_estimate).squeeze(1)
    new_value_estimate = torch.cat((value_estimate, torch.tensor([0]).to(device)))
    advantages = rewards[:-1] + gamma * new_value_estimate[1:] - new_value_estimate[:-1]
    return advantages


        

In [5]:
# states, rewards, actions are trajetory data of old policy
# there is a new_policy_network that is updated by ppo_update()
def ppo_loss(new_policy_network, policy_network, value_network, train_dataset, batch_size, epsilon=0.2, gamma = 0.99):
    # batch size: the number of trajectories
    loss = torch.tensor(0.0, requires_grad=True).to(device)
    for _ in range(batch_size):
        states1, states2, rewards, action_dist_set, action_set = get_path(policy_network, train_dataset)
        new_action_dist_set = [policy_network(states1[i], states2[i]) for i in range(len(states1))]
        new_action_dist_set = torch.stack(new_action_dist_set).to(device)
        action_dist_set = action_dist_set.detach()
        rewards = rewards.detach()
        ratio = []
        for i in range(len(action_dist_set)):
            ratio.append(new_action_dist_set[i][action_set[i]] / action_dist_set[i][action_set[i]])
        ratio = torch.stack(ratio).to(device).detach()
        # compute the advantage of the trajectory
        advantage = advantage_estimate(states1, states2, rewards, value_network, gamma)
        advantage = advantage.detach()
        # compute the clipped ratio
        clipped_ratio = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon)
        # compute the surrogate loss
        policy_loss = -torch.min(ratio * advantage, clipped_ratio * advantage).mean()
        # compute the total loss
        loss = loss + policy_loss
    return loss

    
def ppo_train(new_policy_network, policy_network, value_network, optimizer, train_dataset, batch_size, epochs, epsilon=0.2, gamma = 0.99):
    # this function is used to train the new_policy_network
    for epoch in range(epochs):
        loss = ppo_loss(new_policy_network, policy_network, value_network, train_dataset, batch_size, epsilon, gamma)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # print the loss and epoch
        print("epoch: ", epoch, "loss: ", loss.item())


def wealth_dist(num_traj, policy_network, dataset):
    wealths = []
    for num in range(num_traj):
        states1, states2, rewards, action_dist_set, action_set = get_path(policy_network, dataset)
        wealths.append(sum(rewards))

    return wealths


In [6]:
policy_network = Network(3, activation = nn.Softmax())
value_network = Network(1)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
policy_network.to(device)
value_network.to(device)

policy_optimizer = optim.Adam(policy_network.parameters(), lr=0.1)
value_optimizer = optim.Adam(value_network.parameters(), lr=0.1)


In [9]:
num_iter = 5
# num_iter this is the number of times that we improve the policy
for iteration in range(num_iter):
    train_value_network(value_network, policy_network, train_data, value_optimizer, 0.9)
    new_policy_network = Network(3, activation = nn.Softmax()).to(device)
    new_policy_optimizer = optim.Adam(new_policy_network.parameters(), lr = 0.1)
    ppo_train(new_policy_network, policy_network, value_network, new_policy_optimizer, train_data, batch_size = 5, epochs = 15, epsilon = 0.2, gamma = 0.99)
    policy_network = new_policy_network



        

  x1 = self.activation(x1)


Epoch: 0, Loss: 272686.90625
Epoch: 1, Loss: 264782.40625
Epoch: 2, Loss: 268063.59375
Epoch: 3, Loss: 265880.40625
Epoch: 4, Loss: 270217.68750
Epoch: 5, Loss: 268513.34375
Epoch: 6, Loss: 261726.56250
Epoch: 7, Loss: 253950.43750
Epoch: 8, Loss: 264373.21875
Epoch: 9, Loss: 253227.62500
epoch:  0 loss:  -0.10315828025341034
epoch:  1 loss:  -0.2628318667411804
epoch:  2 loss:  -0.5058820247650146
epoch:  3 loss:  -0.14138999581336975
epoch:  4 loss:  -0.09657317399978638
epoch:  5 loss:  0.07471542805433273
epoch:  6 loss:  0.06681688129901886
epoch:  7 loss:  -0.702826976776123
epoch:  8 loss:  -0.29848170280456543
epoch:  9 loss:  -0.6828583478927612
epoch:  10 loss:  -0.10874849557876587
epoch:  11 loss:  -0.27514711022377014
epoch:  12 loss:  -0.13941770792007446
epoch:  13 loss:  -0.46001237630844116
epoch:  14 loss:  -0.0982646644115448
Epoch: 0, Loss: 242559.14062
Epoch: 1, Loss: 238693.00000
Epoch: 2, Loss: 231260.04688
Epoch: 3, Loss: 233034.73438
Epoch: 4, Loss: 227410.8281

KeyboardInterrupt: 

In [11]:
w = wealth_dist(5, policy_network, train_data)
print(w)

  x1 = self.activation(x1)


[tensor(445., device='cuda:0'), tensor(434., device='cuda:0'), tensor(441.5000, device='cuda:0'), tensor(433.5000, device='cuda:0'), tensor(444.5000, device='cuda:0')]
