In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as mp
from torch.autograd import Variable

In [3]:
# Dimensions handed to Net(N_S, N_A) by the workers and by the main cell.
N_S=6  # Net's s_dim: input size of both LSTM branches
N_A=3  # Net's a_dim: number of policy logits

In [41]:
def v_wrap(np_array, dtype=np.float32):
    """Wrap a numpy array as a torch tensor, casting to ``dtype`` only when needed.

    Args:
        np_array: numpy array to convert.
        dtype: numpy dtype the tensor should be backed by (default float32).

    Returns:
        The tensor produced by ``torch.from_numpy``. When no cast is required
        the tensor is built directly from ``np_array`` itself.
    """
    needs_cast = np_array.dtype != dtype
    backing = np_array.astype(dtype) if needs_cast else np_array
    return torch.from_numpy(backing)

In [42]:
# Training hyper-parameters read by Worker.run.
UPDATE_GLOBAL_ITER = 60  # a worker syncs with the global net when its step counter is a multiple of this (and at episode end)
GAMMA = 0.9  # discount factor forwarded to push_and_pull
MAX_EP = 5  # workers stop starting episodes once the shared episode counter reaches this

In [None]:
class Net(nn.Module):
    """Actor-critic network made of two independent LSTM branches.

    ``fc1`` -> ``out`` produces the policy logits, ``fc2`` -> ``v1`` produces
    the state value. Both branches read the same input, which is treated as
    ONE sequence: a 2-D ``(seq_len, s_dim)`` array becomes a
    ``(seq_len, 1, s_dim)`` batch of size one, and only the last time step of
    each LSTM output is used.
    """

    def __init__(self, s_dim, a_dim):
        """Build both branches.

        Args:
            s_dim: number of features per time step (LSTM input size).
            a_dim: number of discrete actions (size of the logits vector).
        """
        super(Net, self).__init__()

        self.s_dim = s_dim
        self.a_dim = a_dim
        hidden_size = 20

        # Policy branch: two stacked unidirectional LSTM layers.
        self.fc1 = torch.nn.LSTM(
            input_size=s_dim,
            hidden_size=hidden_size,
            num_layers=2)

        # Value branch: one bidirectional layer, so its output is 2 * hidden_size wide.
        # NOTE(review): PyTorch applies LSTM dropout only between stacked layers,
        # so with num_layers=1 this dropout has no effect (and emits a warning).
        self.fc2 = torch.nn.LSTM(
            input_size=s_dim,
            hidden_size=hidden_size,
            dropout=0.2,
            num_layers=1,
            bidirectional=True)

        self.out = nn.Linear(hidden_size, a_dim)
        self.out.weight.data.normal_(0, 0.1)

        self.v1 = nn.Linear(hidden_size * 2, 1)
        self.v1.weight.data.normal_(0, 0.1)
        self.distribution = torch.distributions.Categorical

    def forward(self, x):
        """Run both branches on a single sequence.

        Args:
            x: 2-D array-like or tensor of shape ``(seq_len, s_dim)``.

        Returns:
            ``(logits, values)`` with shapes ``(1, a_dim)`` and ``(1, 1)``.
        """
        # Convert once and share between the branches; axis 1 is the batch
        # axis (size 1) because the LSTMs use the default batch_first=False.
        seq = torch.as_tensor(x, dtype=torch.float32).unsqueeze(1)

        policy_out, _ = self.fc1(seq)
        logits = self.out(policy_out[-1])  # many-to-one: last time step only

        value_out, _ = self.fc2(seq)
        values = self.v1(value_out[-1])  # many-to-one: last time step only
        return logits, values

    def choose_action(self, s):
        """Sample one action index from the current policy.

        Args:
            s: state window accepted by ``forward``.

        Returns:
            A Python ``int`` in ``range(a_dim)``.
        """
        self.eval()
        # No graph is needed for acting; this also keeps the probabilities
        # detached so they can be sampled from safely.
        with torch.no_grad():
            logits, _ = self.forward(s)
            prob = F.softmax(logits, dim=1)
        return int(self.distribution(prob).sample().item())

    def loss_func(self, s, a, v_t):
        """Combined actor-critic loss for one rollout.

        Args:
            s: states accepted by ``forward`` (the rows are fed as one sequence).
            a: actions taken, array-like or tensor of length T.
            v_t: discounted value targets, array-like or tensor with T elements.

        Returns:
            Scalar tensor: mean of squared TD error plus the policy-gradient term.
        """
        self.train()
        logits, values = self.forward(s)

        # Accept numpy arrays as well as tensors, and flatten so a column
        # vector of targets cannot broadcast against the per-step policy term.
        a = torch.as_tensor(a, dtype=torch.int64).reshape(-1)
        v_t = torch.as_tensor(v_t, dtype=torch.float32).reshape(-1)

        # `values` holds a single estimate (batch of one); it broadcasts over T.
        td = v_t - values.reshape(-1)
        c_loss = td.pow(2)

        probs = F.softmax(logits, dim=1)
        m = self.distribution(probs)
        # The critic's error is used as a fixed advantage for the actor.
        exp_v = m.log_prob(a) * td.detach()
        a_loss = -exp_v
        total_loss = (c_loss + a_loss).mean()
        return total_loss


In [31]:
import pandas as pd
import numpy as np
# Load the price table; later cells index it by its 'Date' column and read columns by position.
data=pd.read_csv('changed data.csv')

In [32]:
# Use the 'Date' column as the index.
# NOTE(review): this mutates `data` in place, so re-running only this cell fails once 'Date' is no longer a column.
data.set_index('Date',inplace=True)

In [33]:
# Separate copy of the indexed frame; getState() uses it as the default for its `data` argument.
data2=data.copy()

In [34]:
# Display the frame; the output below shows the columns STI, HBAN, VIX, spread, splog, dowlog.
data2

Unnamed: 0_level_0,STI,HBAN,VIX,spread,splog,dowlog
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2002-01-02,62.00,17.25,22.71,44.75,7.051570,7.590473
2002-01-03,62.30,17.45,21.34,44.85,7.060708,7.622796
2002-01-04,62.30,17.77,20.45,44.53,7.066902,7.630160
2002-01-07,62.38,17.71,21.94,44.67,7.060382,7.619283
2002-01-08,60.99,17.74,21.83,43.25,7.056787,7.628391
...,...,...,...,...,...,...
2019-09-09,64.97,14.17,15.27,50.80,7.999152,8.998068
2019-09-10,66.19,14.37,15.20,51.82,7.999474,8.997662
2019-09-11,67.28,14.61,14.61,52.67,8.006678,9.008185
2019-09-12,68.16,14.65,14.22,53.51,8.009552,9.011215


In [35]:
import numpy
import copy
def getState(t, data=None, window=30):
    """Return the ``window`` most recent rows of ``data`` ending at row ``t``.

    Args:
        t: positional index of the last row to include.
        data: DataFrame to read from. When omitted, the notebook-level
            ``data2`` is looked up at call time, so the function always sees
            the current frame rather than whatever ``data2`` was when this
            cell was executed.
        window: number of rows in the returned state.

    Returns:
        2-D numpy array with ``window`` rows. When fewer than ``window`` rows
        exist up to ``t``, the first row of ``data`` is repeated at the top to
        pad the state to full height.
    """
    if data is None:
        data = data2

    first = t - window + 1
    if first >= 0:
        return data.iloc[first:t + 1, :].to_numpy()

    # Not enough history yet: left-pad with copies of the very first row.
    padding = np.tile(data.iloc[0, :].to_numpy(), (-first, 1))
    history = data.iloc[0:t + 1, :].to_numpy()
    return np.vstack((padding, history))

In [44]:
class Simulator(object):
    """Trading environment over the first two columns of a price DataFrame.

    Column 0 is asset "A" and column 1 is asset "B". Every trade moves a fixed
    volume of both assets at once: one leg long, the other short.
    ``portfolio['longA']`` is 1 while A is the long leg, -1 while B is the
    long leg, and 0 when no position is open. Open lots are kept in parallel
    FIFO lists (``a_vol``/``a_price``/``b_vol``/``b_price``).

    Actions accepted by ``step``: 1 = buy, 2 = sell, anything else = hold.
    """

    INIT_CASH = 100000
    TRADE_VOLUME = 100

    def __init__(self, data):
        """Store the price frame and start with an all-cash portfolio.

        Args:
            data: DataFrame whose columns 0 and 1 hold the prices of A and B.
        """
        self.data = data
        # Positional index of the current trading day.
        self.dateIdx = 0
        self._reset_portfolio()

    def _reset_portfolio(self):
        """Return to the initial all-cash portfolio with no open lots."""
        self.portfolio = {'cash': self.INIT_CASH, 'a_vol': [], 'a_price': [], 'b_vol': [], 'b_price': [], 'longA': 0}
        self.port_val = self.INIT_CASH

    def init_state(self):
        """Start a new episode and return the initial market state.

        The portfolio is reset as well as the date index, so holdings and
        valuation from a previous episode do not carry over.
        """
        self.dateIdx = 0
        self._reset_portfolio()
        return getState(0, data=self.data)

    def _open_lot(self, price_a, price_b, direction):
        """Open one lot in both assets; ``direction`` becomes ``longA``."""
        volume = self.TRADE_VOLUME
        # Both legs are paid from cash: the long leg's cost plus the amount
        # set aside against the short leg.
        self.portfolio['cash'] -= volume * price_b + volume * price_a
        self.portfolio['a_vol'].append(volume)
        self.portfolio['a_price'].append(price_a)
        self.portfolio['b_vol'].append(volume)
        # The entry price is stored as-is for both directions. The stored B
        # price is never read while longA < 0, so this does not alter valuations.
        self.portfolio['b_price'].append(price_b)
        self.portfolio['longA'] = direction

    def _close_oldest(self, long_leg, short_leg, long_price, short_price, direction):
        """Close the oldest lot (FIFO) and credit the proceeds to cash.

        ``long_leg``/``short_leg`` are the key prefixes ('a' or 'b').
        ``direction`` is the ``longA`` value to keep if lots remain open.
        """
        book = self.portfolio
        long_return = book[long_leg + '_vol'].pop(0) * long_price
        book[long_leg + '_price'].pop(0)

        volume = book[short_leg + '_vol'].pop(0)
        entry = book[short_leg + '_price'].pop(0)
        # Short leg: amount set aside at entry plus the gain (or loss) from the price move.
        short_return = volume * entry
        short_return += volume * (entry - short_price)

        book['cash'] = book['cash'] + long_return + short_return
        book['longA'] = direction if len(book['a_vol']) > 0 else 0

    def step(self, action):
        """Apply ``action`` at the current day, then advance one day.

        Args:
            action: 1 = buy, 2 = sell, any other value = hold.

        Returns:
            ``(state, reward, done)``. ``state`` is ``getState`` for the new
            day, ``reward`` is the change in portfolio value since the
            previous valuation, and ``done`` is True once the new day is the
            last row of the data.
        """
        price_a = self.data.iloc[self.dateIdx, 0]
        price_b = self.data.iloc[self.dateIdx, 1]
        direction = self.portfolio['longA']

        if action == 1:  # buy
            if direction >= 0:
                self._open_lot(price_a, price_b, 1)
            else:  # B is the long leg --> unwind the oldest lot
                self._close_oldest('b', 'a', price_b, price_a, -1)
        elif action == 2:  # sell
            if direction > 0:
                self._close_oldest('a', 'b', price_a, price_b, 1)
            else:  # flat or B already long --> add a lot in that direction
                self._open_lot(price_a, price_b, -1)
        # any other action: hold, the position is left untouched

        # Mark to market at today's prices; the reward is the value change.
        old_port_val = self.port_val
        self.port_val = self.port_value_for_output()
        reward = self.port_val - old_port_val

        self.dateIdx += 1
        state = getState(self.dateIdx, data=self.data)

        done = not (self.dateIdx < len(self.data.index) - 1)
        if done:
            print(self.port_val)
        return (state, reward, done)

    def port_value_for_output(self):
        """Current value of cash plus open lots at the prices of ``dateIdx``.

        The long leg is valued at the current price. Each short lot is valued
        at its entry amount plus ``volume * (entry_price - current_price)``.
        """
        book = self.portfolio
        price_a = self.data.iloc[self.dateIdx, 0]
        price_b = self.data.iloc[self.dateIdx, 1]

        value = book['cash']
        if book['longA'] > 0:
            value += sum(book['a_vol']) * price_a
            for volume, entry in zip(book['b_vol'], book['b_price']):
                value += volume * entry
                value += volume * (entry - price_b)
        if book['longA'] < 0:
            value += sum(book['b_vol']) * price_b
            for volume, entry in zip(book['a_vol'], book['a_price']):
                value += volume * entry
                value += volume * (entry - price_a)
        return value





In [45]:
def push_and_pull(opt, lnet, gnet, done, s_, bs, ba, br, gamma):
    """Train the global net on one rollout, then refresh the local net from it.

    Args:
        opt: optimizer built over ``gnet``'s parameters.
        lnet: worker-local network; provides ``forward`` and ``loss_func``.
        gnet: global network that receives the gradients.
        done: True when the rollout ended the episode (bootstrap value is 0).
        s_: state following the last buffered step, used for bootstrapping.
        bs: list of per-step state rows.
        ba: list of actions taken.
        br: list of rewards received.
        gamma: discount factor.
    """
    if done:
        v_s_ = 0.               # terminal
    else:
        v_s_ = lnet.forward(s_)[-1].detach().numpy()[0, 0]

    # Discounted targets, accumulated backwards from the bootstrap value.
    buffer_v_target = []
    for r in br[::-1]:
        v_s_ = r + gamma * v_s_
        buffer_v_target.append(v_s_)
    buffer_v_target.reverse()

    # Actions and targets are handed over as tensors with explicit dtypes;
    # the loss does tensor arithmetic on them, which plain ndarrays do not support.
    loss = lnet.loss_func(
        np.vstack(bs),
        torch.as_tensor(np.asarray(ba, dtype=np.int64)),
        torch.as_tensor(np.asarray(buffer_v_target, dtype=np.float32)))

    # Clear both sides: `opt` only knows the global parameters, so without the
    # explicit local reset the local gradients can accumulate across calls
    # when the optimizer resets gradients by setting them to None.
    opt.zero_grad()
    lnet.zero_grad()
    loss.backward()
    # Push: hand the local gradients to the matching global parameters.
    for lp, gp in zip(lnet.parameters(), gnet.parameters()):
        gp._grad = lp.grad
    opt.step()

    # Pull: continue from the freshly updated global weights.
    lnet.load_state_dict(gnet.state_dict())


def record(global_ep, global_ep_r, ep_r, res_queue, name):
    """Count a finished episode, update the running reward, and report it.

    Args:
        global_ep: shared episode counter (exposes ``value`` and ``get_lock()``).
        global_ep_r: shared running reward (exposes ``value`` and ``get_lock()``).
        ep_r: total reward of the episode that just ended.
        res_queue: queue that receives the updated running reward.
        name: label printed in front of the progress line.
    """
    with global_ep.get_lock():
        global_ep.value += 1

    with global_ep_r.get_lock():
        # A stored value of exactly 0 means "nothing recorded yet": seed the
        # average with this episode, otherwise blend with weights 0.99 / 0.01.
        is_first = global_ep_r.value == 0.
        global_ep_r.value = ep_r if is_first else global_ep_r.value * 0.99 + ep_r * 0.01

    res_queue.put(global_ep_r.value)
    fields = (
        name,
        "Ep:", global_ep.value,
        "| Ep_r: %.0f" % global_ep_r.value,
    )
    print(*fields)

In [None]:
class Worker(mp.Process):
    """Training process: plays episodes with a local net and updates the global one."""

    def __init__(self, gnet, opt, global_ep, global_ep_r, res_queue, name):
        """
        Args:
            gnet: global network whose weights are trained.
            opt: optimizer over ``gnet``'s parameters.
            global_ep: shared episode counter.
            global_ep_r: shared running episode reward.
            res_queue: queue for reporting rewards; a final ``None`` marks
                this worker as finished.
            name: integer worker id; the process is named ``'w<id>'``.
        """
        super(Worker, self).__init__()
        self.name = 'w%i' % name
        # g_ep_r must be kept as well: run() passes it to record().
        self.g_ep, self.g_ep_r, self.res_queue = global_ep, global_ep_r, res_queue
        self.gnet, self.opt = gnet, opt
        self.lnet = Net(N_S, N_A)           # local network
        self.env = Simulator(data)

    def _sample_action(self, s):
        """Return an integer action for state ``s``.

        ``choose_action`` may return either a ready action index or a
        ``(1, n_actions)`` tensor of probabilities; both are handled here.
        """
        choice = self.lnet.choose_action(s)
        if torch.is_tensor(choice):
            if choice.dim() == 2:
                probs = choice.detach().numpy()
                choice = np.random.choice(probs.shape[1], p=probs[0, :])
            else:
                choice = choice.item()
        return int(choice)

    def run(self):
        """Play episodes until the shared counter reaches ``MAX_EP``."""
        total_step = 1
        try:
            while self.g_ep.value < MAX_EP:
                s = self.env.init_state()
                buffer_s, buffer_a, buffer_r = [], [], []
                ep_r = 0.
                done = False
                while not done:
                    a = self._sample_action(s)
                    s_, r, done = self.env.step(a)
                    # NOTE(review): the environment's last reward is replaced
                    # by -1 here; confirm this is intended for this environment.
                    if done: r = -1
                    ep_r += r
                    buffer_a.append(a)
                    buffer_s.append(s[-1])  # only the newest row of the window is buffered
                    buffer_r.append(r)

                    if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                        # sync
                        push_and_pull(self.opt, self.lnet, self.gnet, done, s_, buffer_s, buffer_a, buffer_r, GAMMA)
                        buffer_s, buffer_a, buffer_r = [], [], []

                        if done:  # done and print information
                            record(self.g_ep, self.g_ep_r, ep_r, self.res_queue, self.name)
                            break
                    s = s_
                    total_step += 1
        finally:
            # Always send the sentinel, even after an error, so the parent
            # process is not left waiting for this worker.
            self.res_queue.put(None)


In [None]:
class SharedAdam(torch.optim.Adam):
    """Adam variant whose moment buffers are created up front in shared memory.

    The per-parameter state is filled in at construction time and both
    moment tensors are moved to shared memory.

    NOTE(review): ``step`` is stored as a plain int; confirm that this matches
    what the installed ``torch.optim.Adam.step`` expects in its state.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.9), eps=1e-8,
                 weight_decay=0):
        super().__init__(params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)

        all_params = (p for group in self.param_groups for p in group['params'])
        for p in all_params:
            slot = self.state[p]
            slot['step'] = 0
            # First and second moment estimates start at zero.
            for key in ('exp_avg', 'exp_avg_sq'):
                slot[key] = torch.zeros_like(p.data)
            # share in memory
            for key in ('exp_avg', 'exp_avg_sq'):
                slot[key].share_memory_()

In [None]:
if __name__ == "__main__":
    import queue  # stdlib; only needed for the queue.Empty raised by a timed-out get()

    gnet = Net(N_S, N_A)        # global network
    gnet.share_memory()         # share the global parameters in multiprocessing
    opt = SharedAdam(gnet.parameters(), lr=0.0008)      # global optimizer
    global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue()

    # parallel training
    workers = [Worker(gnet, opt, global_ep, global_ep_r, res_queue, i) for i in range(4-1)]
    for w in workers:
        w.start()

    # Collect rewards until EVERY worker has sent its None sentinel, so that
    # results from workers finishing later are not dropped. The timeout plus
    # the liveness check prevents waiting forever if a worker dies without
    # sending its sentinel.
    res = []                    # record episode reward to plot
    finished = 0
    while finished < len(workers):
        try:
            r = res_queue.get(timeout=1.0)
        except queue.Empty:
            if not any(w.is_alive() for w in workers):
                break
            continue
        if r is None:
            finished += 1
        else:
            res.append(r)
    for w in workers:
        w.join()

    import matplotlib.pyplot as plt
    plt.plot(res)
    plt.ylabel('Moving average ep reward')
    plt.xlabel('Step')
    plt.show()