In [1]:
from functions import *
from simulation import *
from policy_simulation import *
import matplotlib.pyplot as plt
import torch 
import numpy as np

In [3]:
class Net(torch.nn.Module):
    """Two-layer fully connected network producing one non-negative scalar.

    The input carries ``2 + 2 * n`` features. It goes through a hidden layer of
    128 ReLU units and a single linear output, which is passed through a final
    ReLU so the result is never below zero.
    """

    def __init__(self, n):
        super().__init__()
        # n determines the input width (2 + 2 * n features)
        self.n = n
        self.fc1 = torch.nn.Linear(2 + 2 * n, 128)
        self.fc2 = torch.nn.Linear(128, 1)

    def forward(self, x):
        hidden = torch.relu(self.fc1(x))
        # trailing ReLU clamps the scalar output at zero from below
        return torch.relu(self.fc2(hidden))
    

# Instantiate the network with n = 2, i.e. an input width of 2 + 2 * 2 = 6.
value_net = Net(2)

# Smoke test on a dummy 6-feature input. The module is called directly instead
# of through .forward(): nn.Module.__call__ dispatches to forward() and also
# runs any registered hooks, which a bare .forward() call silently skips.
value_net(torch.tensor([1, 2, 3, 4, 5, 6], dtype=torch.float32))

tensor([1.2003], grad_fn=<ReluBackward0>)

In [32]:
# once we have a simulated trading path, we can compute the loss function

def temporal_diff_error(new_value_net, policy, t, Q, P, S, t_prime, Q_prime, P_prime, S_prime, dt, delta, gamma):
    """Squared temporal-difference residual of one transition, weighted by ``dt``.

    The residual is::

        profits + (V(state') - V(state)) / dt + dot(Q, delta + gamma) + entropy

    where ``V`` is ``new_value_net`` and each state is the concatenation of
    ``(t, Q, P, S)`` converted to a float32 tensor.

    Parameters
    ----------
    new_value_net : torch.nn.Module
        Value network that estimates the value function of the current policy;
        it is evaluated on both the current and the next state.
    policy : object
        Trading policy object; must provide ``expected_profits(t, Q, P, S)`` and
        ``policy_entropy(t, Q, P, S)``.
    t, Q, P, S : 1-D array-like
        Components of the current state, concatenated in this order.
    t_prime, Q_prime, P_prime, S_prime : 1-D array-like
        Components of the next state, concatenated in this order.
    dt : float
        Time step between the two states.
    delta, gamma : array-like
        Delta and gamma of the options; their sum is dotted with ``Q``.

    Returns
    -------
    torch.Tensor
        ``error ** 2 * dt`` for this transition.
    """
    # pack (t, Q, P, S) and (t', Q', P', S') into float32 tensors for the network
    state = torch.tensor(np.concatenate([t, Q, P, S]), dtype=torch.float32)
    state_prime = torch.tensor(
        np.concatenate([t_prime, Q_prime, P_prime, S_prime]), dtype=torch.float32
    )

    # Call the module itself rather than .forward(): nn.Module.__call__ also runs
    # registered hooks, which .forward() bypasses. The finite difference is built
    # out-of-place instead of mutating an autograd result with `/=`.
    value_diff = (new_value_net(state_prime) - new_value_net(state)) / dt

    # expected reward under the policy
    profits = policy.expected_profits(t, Q, P, S)

    # option related penalty
    option_penalty = np.dot(Q, (delta + gamma))

    # entropy of the policy
    entropy = policy.policy_entropy(t, Q, P, S)

    # Summation order is kept as-is: the terms mix Python/NumPy scalars with a
    # tensor, and which operand comes first decides whose __add__ handles it.
    error = profits + value_diff + option_penalty + entropy

    return (error**2) * dt


def martingale_loss(new_value_net, policy, stock_price_path, options_price_path, options_delta_path, options_gamma_path, inv_path, dt, T):
    """Accumulate the squared TD errors along one simulated trajectory.

    ``new_value_net`` estimates the value function of the current policy and is
    the only network meant to be trained through this loss.

    Parameters
    ----------
    new_value_net : torch.nn.Module
        Value network being fitted.
    policy : object
        Trading policy object, forwarded to ``temporal_diff_error``.
    stock_price_path : sequence
        Path of the stock price; entry ``i`` is wrapped into a length-1 array.
    options_price_path : sequence
        Path of the option prices, indexed by time step.
    options_delta_path : sequence
        Path of the option deltas, indexed by time step.
    options_gamma_path : sequence
        Path of the option gammas, indexed by time step.
    inv_path : sequence
        Path of the inventory, indexed by time step.
    dt : float
        Time step.
    T : float
        Maturity of the option.

    Returns
    -------
    torch.Tensor or int
        Sum of ``temporal_diff_error`` over the transitions ``i -> i + 1`` for
        ``i`` in ``range(N - 1)``; the plain integer ``0`` when that range is
        empty.
    """
    # T / dt can land a hair below an integer in floating point
    # (0.3 / 0.1 == 2.9999999999999996), and a bare int() would then silently
    # drop a step. The small tolerance absorbs that rounding error while still
    # flooring ratios that are genuinely non-integer.
    N = int(T / dt + 1e-9)
    loss = 0

    for i in range(N - 1):
        nxt = i + 1

        # time is passed as a length-1 array so it can be concatenated into the state
        t = np.array([i * dt])
        t_prime = np.array([nxt * dt])

        Q, Q_prime = inv_path[i], inv_path[nxt]
        P, P_prime = options_price_path[i], options_price_path[nxt]

        S = np.array([stock_price_path[i]])
        S_prime = np.array([stock_price_path[nxt]])

        # the greeks are taken at the start of the step
        delta = options_delta_path[i]
        gamma = options_gamma_path[i]

        loss += temporal_diff_error(new_value_net, policy, t, Q, P, S, t_prime, Q_prime, P_prime, S_prime, dt, delta, gamma)

    return loss



In [26]:
# bid_range / ask_range are 10 x 2 arrays whose k-th row is [k, k]
bid_range = np.array([[k, k] for k in range(10)])
ask_range = np.array([[k, k] for k in range(10)])

# a sample state and time step; none of the calls in this cell read them
t = np.array([0])
dt = 0.01
Q = np.array([0, 0])
P = np.array([12, 23])
S = np.array([1])

# scalar settings passed to TradingPolicy / entire_trading below
penalty, A, kappa = 10, 30, 3

# trading policy object built on the existing value_net
policy = TradingPolicy(value_net, penalty, A, kappa, bid_range, ask_range)
# separate, freshly initialised value network (the one that gets trained)
new_value_net = Net(2)

# generate the training data
# NOTE(review): no random seed is set anywhere in view, so the network
# initialisation (and the simulated paths, if the helpers are stochastic)
# will differ from run to run -- confirm whether that is intended.
V = np.array([[1, 0.5], [0.5, 1]])
stock_path = stock_price_path(100, 0.05, 1, 0.01)
K = np.array([30, 50])
daily_sigma = 0.01
option_price_path, delta_path, gamma_path = option_simulation(
    V, stock_path, 1, 0.01, K, 0.01, daily_sigma
)

inv, buy, sell = entire_trading(policy, option_price_path, stock_path, 0.01, A, kappa)



In [33]:
# martingale loss of new_value_net on the simulated trajectory (dt = 0.01, T = 1)
loss = martingale_loss(
    new_value_net,
    policy,
    stock_price_path=stock_path,
    options_price_path=option_price_path,
    options_delta_path=delta_path,
    options_gamma_path=gamma_path,
    inv_path=inv,
    dt=0.01,
    T=1,
)

In [35]:
# Build the optimizer once, before the loop. Adam keeps per-parameter running
# moment estimates and a step counter; constructing a new optimizer inside the
# loop would reset that state on every epoch.
optimizer = torch.optim.Adam(new_value_net.parameters(), lr=0.01)

for i in range(2):
    print('training epoch:', i)
    # re-simulate the option paths and the trading trajectory for this epoch
    option_price_path, delta_path, gamma_path = option_simulation(V, stock_path, 1, 0.01, K, 0.01, daily_sigma)
    inv, buy, sell = entire_trading(policy, option_price_path, stock_path, 0.01, A, kappa)

    loss = martingale_loss(new_value_net, policy, stock_path, option_price_path, delta_path, gamma_path, inv, 0.01, 1)
    print('loss:', loss)

    # clear any stale gradients, backpropagate, then update the value network
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()



training epoch: 0
loss: tensor([118.6872], grad_fn=<AddBackward0>)
training epoch: 1
loss: tensor([60.6772], grad_fn=<AddBackward0>)
