In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd

class Network(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, activation=nn.Identity()):
        super(Network, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.activation = activation

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.activation(self.fc3(x))
        return x

In [2]:
df = pd.read_csv('cleaned_dataset.csv')
df.head()

Unnamed: 0,AskPrice1,BidPrice1,AskVolume1,BidVolume1,AskPrice5,AskPrice4,AskPrice3,AskPrice2,BidPrice2,BidPrice3,...,BidPrice5,AskVolume5,AskVolume4,AskVolume3,AskVolume2,BidVolume2,BidVolume3,BidVolume4,BidVolume5,trading_day
0,705.5,705.0,82,304,707.5,707.0,706.5,706.0,704.5,704.0,...,703.0,25,68,16,53,56,95,22,56,1
1,707.0,706.0,7,37,709.5,709.0,708.5,708.0,705.5,705.0,...,704.0,24,124,4,272,376,300,56,95,1
2,707.0,706.5,36,29,709.0,708.5,708.0,707.5,706.0,705.5,...,704.5,133,46,284,97,4,440,396,124,1
3,706.5,705.5,16,74,708.5,708.0,707.5,707.0,705.0,704.5,...,703.5,47,302,95,40,409,80,125,31,1
4,706.5,705.5,35,116,708.5,708.0,707.5,707.0,705.0,704.5,...,703.5,58,296,95,80,414,80,128,39,1


In [4]:
train_data = df[df['trading_day'] == 10]
test_data = df[df['trading_day'] == 13]
train_data = train_data.drop(['trading_day'], axis=1)
test_data = test_data.drop(['trading_day'], axis=1)
train_data.head()

Unnamed: 0,AskPrice1,BidPrice1,AskVolume1,BidVolume1,AskPrice5,AskPrice4,AskPrice3,AskPrice2,BidPrice2,BidPrice3,BidPrice4,BidPrice5,AskVolume5,AskVolume4,AskVolume3,AskVolume2,BidVolume2,BidVolume3,BidVolume4,BidVolume5
686982,727.0,726.5,59,527,729.0,728.5,728.0,727.5,726.0,725.5,725.0,724.5,1147,734,1458,738,811,436,765,372
686983,727.0,726.5,78,528,729.0,728.5,728.0,727.5,726.0,725.5,725.0,724.5,1147,734,1458,735,811,436,765,372
686984,727.0,726.5,84,528,729.0,728.5,728.0,727.5,726.0,725.5,725.0,724.5,1147,734,1458,735,811,436,765,372
686985,727.0,726.5,85,523,729.0,728.5,728.0,727.5,726.0,725.5,725.0,724.5,1148,734,1458,736,811,436,765,372
686986,727.0,726.5,112,528,729.0,728.5,728.0,727.5,726.0,725.5,725.0,724.5,1148,734,1458,737,811,436,764,372


In [43]:
# write a function to get a path from the policy network
def get_path(policy_network, train_dataset):
    holding_positions = [0]
    rewards = []
    states = [] 
    action_dist_set = []
    action_set = []
    for i in range(len(train_dataset) - 1):

        # bid_ask contain the information of bid and ask price and volume
        bid_ask = train_dataset[i] 
        # hold is the current holding position
        hold = holding_positions[-1]
        # concat bid_ask and hold to get the state
        state = torch.cat((bid_ask, torch.tensor([hold]).float().to(device)), dim = 0)
        states.append(state)
        # get the action from the policy network, which is a probability distribution
        action_dist = policy_network(state)
        action_dist_set.append(action_dist)
        # sample an action from the probability distribution
        action = torch.multinomial(action_dist, 1).item() - 1
        action_set.append(action + 1)
        # decide the position change based on the action and current holding position
        if hold == 0:
            holding_positions.append(action)
        if hold == 1:
            holding_positions.append(min(action + hold, hold))
        if hold == -1:
            holding_positions.append(max(action + hold, hold))

        new_hold = holding_positions[-1]
        
        # compute the reward (cash change) 
        position_change = new_hold - hold
        reward = 0
        if position_change == 1:
            reward = - state[0] 
        elif position_change == -1:
            reward = state[1] 

        rewards.append(reward)
    
    # at the end, the agent needs to liquid all positions
    if holding_positions[-1] == 1:
        rewards.append(train_dataset[-1][1])
    elif holding_positions[-1] == -1:
        rewards.append(-train_dataset[-1][0])
    else:
        rewards.append(0)

    # convert the list of rewards to a tensor
    rewards = torch.tensor(rewards).float().to(device)
    # convert the list of states to a tensor
    states = torch.stack(states).to(device)
    action_dist_set = torch.stack(action_dist_set).to(device)
    return states, rewards, action_dist_set, action_set


        
# train the value network using the TD(0) algorithm
def train_value_network(value_network, policy_network, train_dataset, value_optimizer, gamma, epochs = 10, lr = 0.001):
    # define the loss function
    loss = 0
    # loop over the epochs
    for epoch in range(epochs):
        # get the path from the policy network
        # only use one trajectory each epoch
        states, rewards, action_dist_set, action_set = get_path(policy_network, train_dataset)
        # get the value estimate from the value network
        value_estimate = value_network(states)
        value_estimate = value_estimate.squeeze(1)
        new_value_estimate = torch.cat((value_estimate, torch.tensor([0]).to(device)))
        # compute the TD(0) error
        loss = (rewards[:-1] + gamma * new_value_estimate[1:] - new_value_estimate[:-1]).pow(2).mean()
        # zero the gradient
        value_optimizer.zero_grad()
        # compute the gradient
        loss.backward()
        # update the weights
        value_optimizer.step()
        # print the loss
        print('Epoch: {}, Loss: {:.5f}'.format(epoch, loss.item()))
    return None


# the following define a function that compute advantage estimation for a trajectory
def advantage_estimate(states, rewards, value_network, gamma):
    value_estimate = value_network(states)
    value_estimate = value_estimate.squeeze(1)
    new_value_estimate = torch.cat((value_estimate, torch.tensor([0]).to(device)))
    advantages = rewards[:-1] + gamma * new_value_estimate[1:] - new_value_estimate[:-1]
    return advantages


        

In [60]:
# states, rewards, actions are trajetory data of old policy
# there is a new_policy_network that is updated by ppo_update()
def ppo_loss(new_policy_network, policy_network, value_network, train_dataset, batch_size, epsilon=0.2, gamma = 0.99):
    loss = torch.tensor(0.0, requires_grad=True).to(device)
    for _ in range(batch_size):
        states, rewards, action_dist_set, action_set = get_path(policy_network, train_dataset)
        new_action_dist_set = new_policy_network(states)
        action_dist_set = action_dist_set.detach()
        rewards = rewards.detach()
        ratio = []
        for i in range(len(action_dist_set)):
            ratio.append(new_action_dist_set[i][action_set[i]] / action_dist_set[i][action_set[i]])
        ratio = torch.stack(ratio).to(device).detach()
        # compute the advantage of the trajectory
        advantage = advantage_estimate(states, rewards, value_network, gamma)
        advantage = advantage.detach()
        # compute the clipped ratio
        clipped_ratio = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon)
        # compute the surrogate loss
        policy_loss = -torch.min(ratio * advantage, clipped_ratio * advantage).mean()
        # compute the total loss
        loss = loss + policy_loss
    return loss

    
def ppo_train(new_policy_network, policy_network, value_network, optimizer, train_dataset, batch_size, epochs, epsilon=0.2, gamma = 0.99):
    for epoch in range(epochs):
        loss = ppo_loss(new_policy_network, policy_network, value_network, train_dataset, batch_size, epsilon, gamma)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # print the loss and epoch
        print("epoch: ", epoch, "loss: ", loss.item())



In [56]:
policy_network = Network(21, 128, 3, activation = nn.Softmax())
value_network = Network(21, 128, 1)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
policy_network.to(device)
value_network.to(device)

policy_optimizer = optim.Adam(policy_network.parameters(), lr=0.001)
value_optimizer = optim.Adam(value_network.parameters(), lr=0.001)

# convert the training data into tensors
train_dataset = torch.tensor(train_data.values, dtype=torch.float32)
train_dataset = train_dataset.to(device)

# convert the test data into tensors
test_dataset = torch.tensor(test_data.values, dtype=torch.float32)
test_dataset = test_dataset.to(device)

In [22]:
small_train_dataset = train_dataset[:150, :]

In [61]:
num_iter = 10
for iteration in range(num_iter):
    train_value_network(value_network, policy_network, small_train_dataset, value_optimizer, 0.9)
    new_policy_network = Network(21, 128, 3, activation = nn.Softmax()).to(device)
    new_policy_network.load_state_dict(policy_network.state_dict())
    new_policy_optimizer = optim.Adam(new_policy_network.parameters(), lr = 0.001)
    ppo_train(new_policy_network, policy_network, value_network, new_policy_optimizer, small_train_dataset, 12, 2, 0.2, 0.99)



        

  x = self.activation(self.fc3(x))


Epoch: 0, Loss: 2.98779
Epoch: 1, Loss: 1.55526
Epoch: 2, Loss: 1.20920
Epoch: 3, Loss: 1.39280
Epoch: 4, Loss: 1.54465
Epoch: 5, Loss: 1.76629
Epoch: 6, Loss: 1.96971
Epoch: 7, Loss: 1.63758
Epoch: 8, Loss: 0.79369
Epoch: 9, Loss: 0.28855
epoch:  0 loss:  0.36139407753944397
epoch:  1 loss:  0.36139407753944397
Epoch: 0, Loss: 0.64564
Epoch: 1, Loss: 1.26349
Epoch: 2, Loss: 1.28218
Epoch: 3, Loss: 0.75245
Epoch: 4, Loss: 0.35962
Epoch: 5, Loss: 0.38034
Epoch: 6, Loss: 0.52212
Epoch: 7, Loss: 0.57039
Epoch: 8, Loss: 0.58788
Epoch: 9, Loss: 0.56036
epoch:  0 loss:  0.47627732157707214
epoch:  1 loss:  0.47627732157707214
Epoch: 0, Loss: 0.38274
Epoch: 1, Loss: 0.18178
Epoch: 2, Loss: 0.20872
Epoch: 3, Loss: 0.40537
Epoch: 4, Loss: 0.46118
Epoch: 5, Loss: 0.30234
Epoch: 6, Loss: 0.16169
Epoch: 7, Loss: 0.17375
Epoch: 8, Loss: 0.23205
Epoch: 9, Loss: 0.24484
epoch:  0 loss:  -0.23284707963466644
epoch:  1 loss:  -0.23284707963466644
Epoch: 0, Loss: 0.23987
Epoch: 1, Loss: 0.22013
Epoch: 2

KeyboardInterrupt: 