In [1]:
import time
import copy
import numpy as np
import pandas as pd
import chainer
import chainer.functions as F
import chainer.links as L
from chainer.serializers import save_npz,load_npz
from plotly import tools
from plotly.graph_objs import *
from plotly.offline import init_notebook_mode, iplot, iplot_mpl
import plotly.express as px
init_notebook_mode()

In [None]:
data = pd.read_csv('stock_data.csv')
data['Date'] = pd.to_datetime(data['Date'])
data = data.set_index('Date')
print(data.index.min(), data.index.max())
data.head()

In [None]:
len(data.columns)

In [None]:
date_split = '2020-01-01'
start_date = '2016-01-01'
train = data[start_date:date_split]
test = data[date_split:]
len(train), len(test)

In [None]:
def plot_train_test(train, test, date_split,ticker_name):
    
    data = [
        Scatter(x=train.index, y=train[ticker_name], mode="lines",name='train'),
        Scatter(x=test.index,  y=test[ticker_name], mode="lines",name='test')
    ]
    layout = {
         'shapes': [
             {'x0': date_split, 'x1': date_split, 'y0': 0, 'y1': 1, 'xref': 'x', 'yref': 'paper', 'line': {'color': 'rgb(0,0,0)', 'width': 1}}
         ],
        'annotations': [
            {'x': date_split, 'y': 1.0, 'xref': 'x', 'yref': 'paper', 'showarrow': False, 'xanchor': 'left', 'text': ' test data'},
            {'x': date_split, 'y': 1.0, 'xref': 'x', 'yref': 'paper', 'showarrow': False, 'xanchor': 'right', 'text': 'train data '}
        ],
        "title": {"text": ticker_name},
    }
    figure = Figure(data=data, layout=layout)
    iplot(figure)
    

In [None]:
plot_train_test(train, test, date_split,"EBAY")
# plot_train_test(train, test, date_split,"AMD")

In [None]:
fig = px.line(data[start_date:], x=data[start_date:].index, y=data.columns[3:6])
fig.show()

In [27]:
class Environment1:
    
    def __init__(self, data, history_t=90, starting_position=10000):
        self.data = data
        self.tickers = data.columns
        self.history_t = history_t
        self.starting_position = starting_position
        self.reset()
        
    def reset(self):
        self.t = 0
        self.done = False
        self.profits = 0
        self.total_invest = 0
        self.positions = {ticker:[] for ticker in self.tickers}
        self.position_value = {ticker:0 for ticker in self.tickers}
        self.history = {ticker:[0 for _ in range(self.history_t)] for ticker in self.tickers}
        return {ticker:[self.position_value[ticker]] + self.history[ticker] for ticker in self.tickers} # obs
    
    def step(self, act):
        reward = 0
        
        # act = 0: stay, 1: buy, 2: sell
        for ticker in self.tickers:
            if act[ticker] == 1: # buy
                if self.data.iloc[self.t, :][ticker] < (self.starting_position - self.total_invest): # have enough money
                    self.positions[ticker].append(self.data.iloc[self.t, :][ticker])
                    self.total_invest += self.data.iloc[self.t, :][ticker]
                else:
                    reward += -1
            elif act[ticker] == 2: # sell
                if len(self.positions[ticker]) == 0:
                    reward += -1
                else:
                    profits = 0
                    for p in self.positions[ticker]:
                        profits += (self.data.iloc[self.t, :][ticker] - p)
                        self.total_invest -= self.data.iloc[self.t, :][ticker]
                    reward += profits
                    self.profits += profits
                    self.positions[ticker] = []
        
        # set next time
        self.t += 1
        for ticker in self.tickers: 
            self.position_value[ticker] = 0
            for p in self.positions[ticker]:
                self.position_value[ticker] += (self.data.iloc[self.t, :][ticker] - p)
            self.history[ticker].pop(0)
            self.history[ticker].append(self.data.iloc[self.t, :][ticker] - self.data.iloc[(self.t-1), :][ticker])
        
        # clipping reward
        if reward > 0:
            reward = 1
        elif reward < 0:
            reward = -1
        
        return {ticker:[self.position_value[ticker]] + self.history[ticker] for ticker in self.tickers}, reward, self.done # obs, reward, done

In [28]:
# naive agent
env = Environment1(train,history_t=4)
print(env.reset())
for _ in range(3):
    pact = {ticker:np.random.randint(3) for ticker in train.columns}
    print(pact)
    print(env.step(pact))

{'DLTR': [0, 0, 0, 0, 0], 'BKNG': [0, 0, 0, 0, 0], 'CTSH': [0, 0, 0, 0, 0], 'VRTX': [0, 0, 0, 0, 0], 'FISV': [0, 0, 0, 0, 0], 'AMD': [0, 0, 0, 0, 0], 'ADP': [0, 0, 0, 0, 0], 'CSX': [0, 0, 0, 0, 0], 'EBAY': [0, 0, 0, 0, 0], 'WBA': [0, 0, 0, 0, 0], 'EXC': [0, 0, 0, 0, 0]}
{'DLTR': 1, 'BKNG': 1, 'CTSH': 1, 'VRTX': 0, 'FISV': 1, 'AMD': 0, 'ADP': 0, 'CSX': 0, 'EBAY': 2, 'WBA': 1, 'EXC': 2}
({'DLTR': [1.1700057983398438, 0, 0, 0, 1.1700057983398438], 'BKNG': [-34.6800537109375, 0, 0, 0, -34.6800537109375], 'CTSH': [0.0, 0, 0, 0, 0.0], 'VRTX': [0, 0, 0, 0, 0.5599975585937642], 'FISV': [0.2050018310546875, 0, 0, 0, 0.2050018310546875], 'AMD': [0, 0, 0, 0, -0.019999980926513672], 'ADP': [0, 0, 0, 0, 0.1999969482421875], 'CSX': [0, 0, 0, 0, -0.08666706085205078], 'EBAY': [0, 0, 0, 0, -0.3099994659423828], 'WBA': [-2.1800003051757812, 0, 0, 0, -2.1800003051757812], 'EXC': [0, 0, 0, 0, 0.13552093505859375]}, -1, False)
{'DLTR': 1, 'BKNG': 1, 'CTSH': 2, 'VRTX': 1, 'FISV': 1, 'AMD': 1, 'ADP': 2, 'CS

In [29]:
# Dueling Double DQN

class Q_Network(chainer.Chain):

    def __init__(self, input_size, hidden_size, output_size, N):
        super(Q_Network, self).__init__(
            fc1 = L.Linear(input_size, hidden_size),
            fc2 = L.Linear(hidden_size, hidden_size),
            fc3 = L.Linear(hidden_size, hidden_size),
            fc4 = L.Linear(hidden_size, hidden_size//2),
            fc5 = L.Linear(hidden_size, hidden_size//2),
            state_value = L.Linear(hidden_size//2, N),
            advantage_value = L.Linear(hidden_size//2, output_size*N)
        )
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.N = N

    def __call__(self, x):
        h = F.relu(self.fc1(x))
        h = F.relu(self.fc2(h))
        h = F.relu(self.fc3(h))
        hs = F.relu(self.fc4(h))
        ha = F.relu(self.fc5(h))
        state_value = self.state_value(hs) # [50,11]
        advantage_value = self.advantage_value(ha).reshape(-1,self.N,self.output_size) # [50,11,3]
        advantage_mean = (F.sum(advantage_value, axis=2)/float(self.output_size)).reshape(-1, self.N) # [50,11]
        state_value_reshape = F.concat([state_value for _ in range(self.output_size)], axis=1).reshape(-1,self.N,self.output_size)
        advantage_mean_reshape = F.concat([advantage_mean for _ in range(self.output_size)], axis=1).reshape(-1,self.N,self.output_size)
        q_value = state_value_reshape - advantage_value - advantage_mean_reshape
        return q_value

    def reset(self):
        self.zerograds()

In [34]:
def train_test_by_q(train_env, test_env, Q):
    
    print("Runing training data...")
    pobs = train_env.reset()
    train_acts = []
    train_rewards = []

    for _ in range(len(train_env.data)-1):
        
        if _%365 == 0:
            print("Now on date: ",str(train_env.data.index[_]))
            print("Profits so far:", train_env.profits)
            print("Last Rewards:", train_rewards[-5:])
            print("How much in?", {ticker:sum(positions) for ticker,positions in train_env.positions.items()})
            print("Sum", sum([sum(positions) for positions in train_env.positions.values()]))
            
        
        pact = Q(np.array(list(pobs.values()), dtype=np.float32).reshape(1,-1)).reshape(N,3)
        pact = np.argmax(pact.data, axis=1)
        pact = {ticker:act for ticker,act in zip(env.data.columns,pact)}
        train_acts.append(pact)
        
        profits_before = train_env.profits
            
        obs, reward, done = train_env.step(pact)
        train_rewards.append(reward)

        pobs = obs
        
        profits_after = train_env.profits
        
        if profits_after-profits_before > 100:
            print(pact)
            print("profits_before",profits_before)
            print("profits_after",profits_after)
            print("date:", str(train_env.data.index[_]))
            print("How much in?", {ticker:sum(positions) for ticker,positions in train_env.positions.items()})
            print("Sum", sum([sum(positions) for positions in train_env.positions.values()]))
            print("reward",reward)
        
    train_profits = train_env.profits
    
    print("Runing test data...")
    pobs = test_env.reset()
    test_acts = []
    test_rewards = []

    for _ in range(len(test_env.data)-1):
        
        if _%100 == 0:
            print("Now on date: ",str(test_env.data.index[_]))
            print("Profits so far:", test_env.profits)
            print("Last Rewards:", test_rewards[-5:])
            print("How much in?", {ticker:sum(positions) for ticker,positions in test_env.positions.items()})
            print("Sum", sum([sum(positions) for positions in test_env.positions.values()]))
    
        pact = Q(np.array(list(pobs.values()), dtype=np.float32).reshape(1,-1)).reshape(N,3)
        pact = np.argmax(pact.data, axis=1)
        pact = {ticker:act for ticker,act in zip(env.data.columns,pact)}
        test_acts.append(pact)
            
        obs, reward, done = test_env.step(pact)
        test_rewards.append(reward)

        pobs = obs
        
    test_profits = test_env.profits
    
    return train_rewards, train_profits, test_rewards, test_profits

In [None]:
# def train_dddqn(env):

env = Environment1(train,history_t=90)
env.reset()
N = len(env.data.columns)

Q = Q_Network(input_size=(env.history_t+1)*N, hidden_size=256, output_size=3,N=N)
Q_ast = copy.deepcopy(Q)
optimizer = chainer.optimizers.Adam()
optimizer.setup(Q)

with open('log.txt', 'w') as the_file:
    the_file.write('\t\t'.join(map(str, ["epoch", "epsilon", "log_reward", "log_loss", "profits", "total_invest","elapsed_time"])))
    the_file.write("\n")
print('\t\t'.join(map(str, ["epoch", "epsilon", "log_reward", "log_loss", "profits", "total_invest","elapsed_time"])))

epoch_num = 100
step_max = len(env.data)-1
memory_size = 200
batch_size = 50
epsilon = 1.0
epsilon_decrease = 1e-5
epsilon_min = 0.1
start_reduce_epsilon = 200
train_freq = 10
update_q_freq = 20
gamma = 0.97
show_log_freq = 1

memory = []
total_step = 0
total_rewards = []
total_losses = []

start = time.time()
for epoch in range(epoch_num):

    pobs = env.reset()
    step = 0
    done = False
    total_reward = 0
    total_loss = 0
    
    while not done and step < step_max:
        
        # select act
        if len(memory) != memory_size:
            pact = {ticker:np.random.randint(3) for ticker in env.data.columns}
        else:
            pact = Q(np.array(list(pobs.values()), dtype=np.float32).reshape(1,-1)).reshape(N,3)
            pact = np.argmax(pact.data, axis=1)
            pact = {ticker:act for ticker,act in zip(env.data.columns,pact)}
            if np.random.rand() < epsilon:
                random_ticker = env.data.columns[np.random.randint(N)]
                pact[random_ticker] = np.random.randint(3)

        # act
        obs, reward, done = env.step(pact)
        
        # add memory
        memory.append((pobs, pact, reward, obs, done))
        if len(memory) > memory_size:
            memory.pop(0)
            
        # train or update q
        if len(memory) == memory_size:
            if total_step % train_freq == 0:
                shuffled_memory = np.random.permutation(memory)
                memory_idx = range(len(shuffled_memory))
                for i in memory_idx[::batch_size]:
                    batch = np.array(shuffled_memory[i:i+batch_size])
                    b_pobs = [list(batch[:, 0][i].values()) for i in range(len(batch[:, 0]))]
                    b_pobs = np.array(b_pobs, dtype=np.float32)
                    b_pact = [list(batch[:, 1][i].values()) for i in range(len(batch[:, 0]))]
                    b_pact = np.array(b_pact, dtype=int)
                    b_reward = np.array(batch[:, 2].tolist(), dtype=np.int32)
                    b_obs = [list(batch[:, 3][i].values()) for i in range(len(batch[:, 0]))]
                    b_obs = np.array(b_obs, dtype=np.float32)
                    b_done = np.array(batch[:, 4].tolist(), dtype=np.bool)
                    
                    q = Q(b_pobs)
                    indices = np.argmax(q.data, axis=2) # [50,11]
                    maxqs = Q_ast(b_obs).data
                    target = copy.deepcopy(q.data)
                    for j in range(batch_size):
                        target[j, b_pact[j]] = b_reward[j]+gamma*maxqs[j, indices[j]]*(not b_done[j])
                    
                    Q.reset()
                    loss = F.mean_squared_error(q, target)
                    total_loss += loss.data
                    loss.backward()
                    optimizer.update()
                
            if total_step % update_q_freq == 0:
                Q_ast = copy.deepcopy(Q)
                    
        if epsilon > epsilon_min and total_step > start_reduce_epsilon:
            epsilon -= epsilon_decrease
        
        # next step
        total_reward += reward
        pobs = obs
        step += 1
        total_step += 1
        
    total_rewards.append(total_reward)
    total_losses.append(total_loss)
    
    if (epoch+1) % show_log_freq == 0:
        log_reward = sum(total_rewards[((epoch+1)-show_log_freq):])/show_log_freq
        log_loss = sum(total_losses[((epoch+1)-show_log_freq):])/show_log_freq
        elapsed_time = time.time()-start
        print('\t'.join(map(str, [epoch+1, epsilon, log_reward, log_loss, env.profits, env.total_invest ,elapsed_time])))
        with open('log.txt', 'a') as the_file:
            the_file.write('\t'.join(map(str, [epoch+1, epsilon, log_reward, log_loss, env.profits , env.total_invest ,elapsed_time])))
            the_file.write('\n')
        start = time.time()
        save_npz('my.model', Q)

print("total_losses",total_losses)
print("total_rewards",total_rewards)
save_npz('my.model', Q)

with open('log.txt', 'a') as the_file:
    the_file.write("Total Losses: ")
    the_file.write(str(total_losses))
    the_file.write("Total Rewards: ")
    the_file.write(str(total_rewards))    

                   
                    


epoch		epsilon		log_reward		log_loss		profits		total_invest		elapsed_time



Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations



1	0.9919600000000366	-696.0	353.5906059592962	159.6025760173776	8409.576512575151	21.810511827468872
2	0.9819100000000823	-775.0	1020.193588450551	3017.483112335205	-1101.2897605895996	32.422977685928345
3	0.9718600000001281	-779.0	1237.8937463462353	1298.632668495179	735.091494560241	29.233184814453125
4	0.9618100000001738	-833.0	1359.1212265193462	1306.1199312210067	10030.080149650576	32.16288113594055
5	0.9517600000002195	-863.0	1022.6845187842846	6749.659621477124	10058.97022891045	31.116665601730347
6	0.9417100000002653	-833.0	860.4015000760555	9204.584937095638	10013.82495594025	31.775076389312744
7	0.931660000000311	-753.0	1861.9803243279457	14074.468946576117	-4186.072239041327	36.498414754867554
8	0.9216100000003568	-752.0	1167.1433885991573	5481.521406531334	4314.761642098427	35.224907875061035
9	0.9115600000004025	-665.0	593.2996732592583	4622.020439267159	6796.409847140311	35.2990608215332
10	0.9015100000004482	-644.0	217.79697972536087	3552.3838019371037	4655.248968601227	

82	0.1779100000019534	-945.0	52.052133712917566	11079.120793938635	9638.892538428308	43.65379023551941
83	0.16786000000194334	-961.0	92.42128460854292	11211.40221464634	6341.587821364403	59.03050637245178
84	0.1578100000019333	-952.0	172.55951770022511	5209.163728833199	10051.800203204155	56.79376482963562
85	0.14776000000192324	-971.0	89.55159099027514	8442.460577845573	10032.399256825447	51.627121925354004
86	0.1377100000019132	-970.0	67.465286411345	4528.671954035759	10010.858130574226	46.937116622924805
87	0.12766000000190314	-969.0	53.38370597921312	10039.20580291748	7915.130873680115	43.91768741607666
88	0.11761000000190334	-942.0	76.61307282373309	10556.525907516478	9976.660776138307	51.156524896621704
89	0.10756000000190724	-989.0	66.195281567052	4478.017648696899	10062.07736980915	57.06883668899536
90	0.09999000000191018	-968.0	64.31526703946292	7456.146258831023	9647.823744297028	50.32973289489746


In [31]:
env = Environment1(train,history_t=90)
env.reset()
N = len(env.data.columns)

Q = Q_Network(input_size=(env.history_t+1)*N, hidden_size=256, output_size=3,N=N)

In [32]:
load_npz("./my.model",Q)

In [35]:
train_rewards, train_profits, test_rewards, test_profits = train_test_by_q(Environment1(train), Environment1(test), Q)

Runing training data...
Now on date:  2016-01-04 00:00:00
Profits so far: 0
Last Rewards: []
How much in? {'DLTR': 0, 'BKNG': 0, 'CTSH': 0, 'VRTX': 0, 'FISV': 0, 'AMD': 0, 'ADP': 0, 'CSX': 0, 'EBAY': 0, 'WBA': 0, 'EXC': 0}
Sum 0
{'DLTR': 2, 'BKNG': 2, 'CTSH': 1, 'VRTX': 0, 'FISV': 1, 'AMD': 1, 'ADP': 2, 'CSX': 0, 'EBAY': 1, 'WBA': 0, 'EXC': 0}
profits_before 28.869983673095703
profits_after 4209.539661407471
date: 2017-03-07 00:00:00
How much in? {'DLTR': 0, 'BKNG': 0, 'CTSH': 0, 'VRTX': 0, 'FISV': 1268.2299995422363, 'AMD': 84.46999883651733, 'ADP': 0, 'CSX': 0, 'EBAY': 680.4500045776367, 'WBA': 0, 'EXC': 19.950071334838867}
Sum 2053.1000742912292
reward 1
Now on date:  2017-06-15 00:00:00
Profits so far: 4209.539661407471
Last Rewards: [-1, -1, -1, -1, -1]
How much in? {'DLTR': 0, 'BKNG': 10551.25, 'CTSH': 0, 'VRTX': 0, 'FISV': 2144.2049980163574, 'AMD': 305.3399968147278, 'ADP': 0, 'CSX': 0, 'EBAY': 1179.8500061035156, 'WBA': 0, 'EXC': 19.950071334838867}
Sum 14200.59507226944
{'DLT