In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from macro_agent import MacroAgent, ReplayMemory, QNetwork, Transition
import torch
import torch.optim
import torch.nn as nn
import torch.nn.functional
#from mpl_finance import candlestick_ohlc
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import datetime as datetime
from datetime import datetime
import matplotlib.dates as mdates
from tqdm import tqdm
from torch.autograd import Variable
import warnings
#warnings.filterwarnings("ignore")

In [3]:
CAPACITY = 500
INIT_EPSILON = 0.9
GAMMA_DISCOUNT = 0.8
EPOCH_COUNT = 500
MINI_BATCH_SIZE = 1
WINDOW_SIZE = 20
ALPHA = 10000
ACTIONS={"BUY":torch.tensor([1,0,0], dtype=torch.float32),
 "HOLD":torch.tensor([0,1,0], dtype=torch.float32),
  "SELL":torch.tensor([0,0,1], dtype=torch.float32)}

In [4]:
class Environment:
    
    def __init__(self, all_data):
        self._all_data = all_data
        self._pointer = WINDOW_SIZE
        self.done = False
        
        pass

    def taken_action(self):
        self._pointer += 1
        if self._pointer >= len(self._all_data) - 1:
            self.done = True
        pass
    
    def get_price(self):
        return self._all_data.iloc[self._pointer]['Close']

    def get_change_zscore(self, column):
        start_idx = np.max([0, self._pointer - WINDOW_SIZE])
        frame = self._all_data.iloc[start_idx:self._pointer + 1]
        market_now = self._all_data.iloc[self._pointer]
        PC = market_now[column] / np.mean(frame[column]) - 1

        PCs = np.empty_like(frame[column])
        for i in range(start_idx, self._pointer + 1):
            start_frame = np.max([0, i - WINDOW_SIZE])
            end_frame = i
            PCs[i - start_idx] = self._all_data.iloc[end_frame][column] / np.mean(
                self._all_data.iloc[start_frame:end_frame+1][column]) - 1

        z_score_price_change = (PC - np.mean(PCs)) / np.std(PCs) 
        return z_score_price_change

    def get_EMA(self, t):
        start_idx = np.max([0, self._pointer - WINDOW_SIZE])
        g = 2 * self._all_data.iloc[t]['Close'] / (WINDOW_SIZE + 1)
        ex = (100 - 2/(WINDOW_SIZE + 1))
        EMA = g + np.mean(self._all_data.iloc[start_idx:t+1]['Close']) * ex
        return EMA
        

    def get_indicators(self):
        start_idx = np.max([0, self._pointer - WINDOW_SIZE])
        market_now = self._all_data.iloc[self._pointer]
        frame = self._all_data.iloc[start_idx:self._pointer + 1]
        # price
        z_score_price = (market_now['Close'] - 
            np.mean(frame['Close'])) / np.std(frame['Close'])
        # price change
        z_score_price_change = self.get_change_zscore('Close')
        # volume
        z_score_volume = (market_now['Volume'] - 
            np.mean(frame['Volume'])) / np.std(frame['Volume'])
        # volume change
        z_score_volume_change = self.get_change_zscore('Volume')
        # Volatility
        volatility = (self.get_EMA(self._pointer) -
         self.get_EMA(self._pointer - WINDOW_SIZE))/ self.get_EMA(self._pointer - WINDOW_SIZE)
        return z_score_price,z_score_price_change,z_score_volume,z_score_volume_change,volatility

    def get_state(self, agent:MacroAgent):
        return torch.tensor(np.hstack((self.get_price(), self.get_indicators(),
            agent.estimate_assets(self.get_price()))), dtype=torch.float32)

def decay_epsilon(cur_epsilon):
    return cur_epsilon * 0.9

In [5]:
def load_data():
    market_data = pd.read_json('RESULT.json')
    market_data.rename(columns={1:'Open',2:'High', 3:'Low', 4:'Close', 5:'Volume'}, inplace=True)
    market_data[0] = market_data[0].transform(datetime.fromtimestamp)
    market_data.set_index([0], inplace=True)
    market_data.sort_index(inplace=True)
    market_data = market_data[(market_data.index >= '2018-11-15 00:00:00') & (market_data.index <= '2018-11-17 17:06:00')]
    return market_data

def get_train_data(market_data):
    return market_data[market_data.index <= '2018-11-16 00:00:00']

def get_test_data(market_data):
    return market_data[market_data.index >= '2018-11-16 00:00:00']
    

In [6]:
replay_memory = ReplayMemory(CAPACITY)
macro_agent = MacroAgent()
optimizer = torch.optim.Adam(macro_agent.q_network.parameters())
criterion = nn.MSELoss()

In [7]:
def calculate_reward(chosen_action, estimated_assets):
    if torch.allclose(chosen_action,ACTIONS['SELL']):
        if  estimated_assets == 0:
            return -1
        else: 
            return 1 if estimated_assets > 0 else -1
    else:
        return 0

def get_best_reward(next_state):
    sell_reward = calculate_reward(ACTIONS['SELL'], next_state[6])
    buy_reward = calculate_reward(ACTIONS['BUY'], next_state[6])
    hold_reward = calculate_reward(ACTIONS['HOLD'], next_state[6])
    if sell_reward >= buy_reward and  sell_reward >= hold_reward:
        return (ACTIONS['SELL'], sell_reward)
    elif buy_reward >= sell_reward and buy_reward >= hold_reward:
        return (ACTIONS['BUY'], buy_reward)
    else:
        return (ACTIONS['HOLD'], hold_reward)

            

In [15]:
replay_memory = ReplayMemory(CAPACITY)
macro_agent = MacroAgent()
optimizer = torch.optim.Adam(macro_agent.q_network.parameters())
criterion = nn.MSELoss()
market_data = load_data()
train_data = get_train_data(market_data)
test_data = get_test_data(market_data)
current_max_balance = 0


for epoch in tqdm(range(EPOCH_COUNT)):
    done = False
    cur_epsilon = INIT_EPSILON
    environment = Environment(train_data)
    macro_agent.sell_assets(0)
    while not done:
        now_state = environment.get_state(macro_agent)
        macro_agent.q_network.eval()
        decision = np.random.rand()
        if decision < cur_epsilon:
            # epsilon
            action = ACTIONS[np.random.choice(list(ACTIONS))]
            est_assets = macro_agent.estimate_assets(environment.get_price())
            cur_reward = calculate_reward(action, est_assets)
        else:
            # 1-epsilon
            action, cur_reward = macro_agent.q_network(now_state)
        

        # reward algo
        environment.taken_action()
        next_state = environment.get_state(macro_agent)
        done = environment.done
        
        replay_memory.push(now_state, action, cur_reward, next_state, done)
        
        # taking batch
        batch = replay_memory.sample(MINI_BATCH_SIZE)
        q = []
        current_states_batch = []

        for i, object in enumerate(batch):
            current_states_batch.append(object.state)
            if not object.done:
                # r_i + gamma * Q()
                q.append(object.action + GAMMA_DISCOUNT * get_best_reward(object.next_state)[0])
            else:
                # r_i
                q.append(object.action)
        # grad d
        macro_agent.q_network.train()
        rewards = []
        actions = []
        for batch_state in current_states_batch:
            rewards.append(macro_agent.q_network(batch_state))
            actions.append(macro_agent.q_network.DQN.forward(batch_state))

        #optimizer.zero_grad()
        print(actions, q)
        for i in range(len(rewards)):
            loss = criterion(actions[i], q[i])
            loss.backward()
            optimizer.step()
        
        cur_epsilon = decay_epsilon(cur_epsilon)  
    

    
  
    torch.save(macro_agent.q_network.state_dict(), 'q_net_epoch_{}'.format(epoch))  

  0%|          | 0/500 [00:00<?, ?it/s]

[tensor([0.1298, 0.5299, 0.3404], grad_fn=<SoftmaxBackward0>)] [tensor([1.8000, 0.0000, 0.0000])]
[tensor([0.1375, 0.5142, 0.3482], grad_fn=<SoftmaxBackward0>)] [tensor([0.8000, 1.0000, 0.0000])]
[tensor([0.1451, 0.5028, 0.3521], grad_fn=<SoftmaxBackward0>)] [tensor([0.8000, 1.0000, 0.0000])]
[tensor([0.1519, 0.4944, 0.3537], grad_fn=<SoftmaxBackward0>)] [tensor([0.8000, 1.0000, 0.0000])]
[tensor([0.1584, 0.4866, 0.3549], grad_fn=<SoftmaxBackward0>)] [tensor([0.8000, 1.0000, 0.0000])]
[tensor([0.1637, 0.4811, 0.3552], grad_fn=<SoftmaxBackward0>)] [tensor([1.8000, 0.0000, 0.0000])]
[tensor([0.1701, 0.4732, 0.3568], grad_fn=<SoftmaxBackward0>)] [tensor([0.8000, 1.0000, 0.0000])]
[tensor([0.1676, 0.4613, 0.3711], grad_fn=<SoftmaxBackward0>)] [tensor([0.8000, 1.0000, 0.0000])]
[tensor([0.1770, 0.4679, 0.3551], grad_fn=<SoftmaxBackward0>)] [tensor([0.8000, 1.0000, 0.0000])]
[tensor([0.1772, 0.4722, 0.3507], grad_fn=<SoftmaxBackward0>)] [tensor([0.8000, 1.0000, 0.0000])]
[tensor([0.1759, 0.4

  0%|          | 0/500 [00:11<?, ?it/s]

[tensor([1.0000e+00, 7.4245e-08, 9.2944e-17], grad_fn=<SoftmaxBackward0>)] [tensor([1.8000, 0.0000, 0.0000])]





KeyboardInterrupt: 

In [16]:
for param in optimizer.param_groups[0]['params']:
    if param.requires_grad:
        print(param.grad)

tensor([[-2.6278e+02, -1.6763e-02,  1.6766e-02, -7.9665e-03, -8.6275e-03,
         -2.0191e-04,  0.0000e+00],
        [ 4.6390e+02,  8.1858e-03, -1.3437e-02,  3.2233e-02,  4.2579e-02,
          1.6054e-04,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00]])
tensor([-0.0473,  0.0836,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000])
tensor([[-112.9418,   -8.2626,    0.0000,    0.0000,    0.0000,    0.0000,
            0.0000],
        [  98.0209,   77.8896,    0.0000,    0.0000,    0.000

In [17]:
ACTIONS[np.random.choice(list(ACTIONS))]

tensor([0., 0., 1.])

In [None]:
  if epoch % 5 == 0:
        # test model
        macro_agent.q_network.eval()
        test_env = Environment(test_data)
        macro_agent.sell_assets(price=0)
        current_balance = [0.0]
        while not test_env.done:
            now_state = test_env.get_state(macro_agent)
            action = soft_argmax(macro_agent.q_network(now_state.unsqueeze(0)))
            price = test_env.get_price()
            print(action)
            if torch.allclose(action, ACTIONS['BUY']):
                prev_assets = macro_agent.estimate_assets(price)
                macro_agent.buy_asset(price)
                after_assets = macro_agent.estimate_assets(price)
                current_balance.append(current_balance[-1] + after_assets - prev_assets)
            elif torch.allclose(action,ACTIONS['SELL']):
                earning = macro_agent.sell_assets(price)
                current_balance.append(current_balance[-1] + earning)
            else:
                current_balance.append(current_balance[-1])
            test_env.taken_action()
        plt.plot(current_balance)
        pd.DataFrame(current_balance).to_csv('macro_agent_epoch_{}.csv'.format(epoch))
        plt.savefig('testing_epoch_{}.jpg'.format(epoch))

        if current_balance[-1] > current_max_balance:
            torch.save(macro_agent.q_network.state_dict(), 'best_q_net_epoch_{}'.format(epoch))
