In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from macro_agent import MacroAgent, ReplayMemory, QNetwork, Transition
import torch
import torch.optim
import torch.nn as nn
#from mpl_finance import candlestick_ohlc
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import datetime as datetime
from datetime import datetime
import matplotlib.dates as mdates
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
CAPACITY = 500
INIT_EPSILON = 0.9
GAMMA_DISCOUNT = 0.8
EPOCH_COUNT = 500
MINI_BATCH_SIZE = 10
WINDOW_SIZE = 20
ACTIONS={"HOLD":0, "SELL":1, "BUY":2}

In [3]:
class Environment:
    
    def __init__(self, all_data):
        self._all_data = all_data
        self._pointer = 0
        self.done = False
        
        pass

    def taken_action(self):
        self._pointer += 1
        if self._pointer >= len(self._all_data) - 1:
            self.done = True
        pass
    
    def get_price(self):
        return self._all_data.iloc[self._pointer]['Close']

    def get_change_zscore(self, column):
        start_idx = np.max([0, self._pointer - WINDOW_SIZE])
        frame = self._all_data.iloc[start_idx:self._pointer + 1]
        market_now = self._all_data.iloc[self._pointer]
        PC = market_now[column] / np.mean(frame[column]) - 1

        PCs = np.empty_like(frame[column])
        for i in range(start_idx, self._pointer + 1):
            start_frame = np.max([0, i - WINDOW_SIZE])
            end_frame = i
            PCs[i - start_idx] = self._all_data.iloc[end_frame][column] / np.mean(
                self._all_data.iloc[start_frame:end_frame+1][column]) - 1

        z_score_price_change = (PC - np.mean(PCs)) / np.std(PCs) 
        return z_score_price_change

    def get_EMA(self, t):
        start_idx = np.max([0, self._pointer - WINDOW_SIZE])
        g = 2 * self._all_data.iloc[t]['Close'] / (WINDOW_SIZE + 1)
        ex = (100 - 2/(WINDOW_SIZE + 1))
        EMA = g + np.mean(self._all_data.iloc[start_idx:t+1]['Close']) * ex
        return EMA
        

    def get_indicators(self):
        start_idx = np.max([0, self._pointer - WINDOW_SIZE])
        market_now = self._all_data.iloc[self._pointer]
        frame = self._all_data.iloc[start_idx:self._pointer + 1]
        # price
        z_score_price = (market_now['Close'] - 
            np.mean(frame['Close'])) / np.std(frame['Close'])
        # price change
        z_score_price_change = self.get_change_zscore('Close')
        # volume
        z_score_volume = (market_now['Volume'] - 
            np.mean(frame['Volume'])) / np.std(frame['Volume'])
        # volume change
        z_score_volume_change = self.get_change_zscore('Volume')
        # Volatility
        volatility = (self.get_EMA(self._pointer) -
         self.get_EMA(self._pointer - WINDOW_SIZE))/ self.get_EMA(self._pointer - WINDOW_SIZE)
        return z_score_price,z_score_price_change,z_score_volume,z_score_volume_change,volatility

    def get_state(self, agent:MacroAgent):
        return torch.tensor(np.hstack((self.get_price(), self.get_indicators(),
            agent.estimate_assets(self.get_price()))), dtype=torch.float32)

def decay_epsilon(cur_epsilon):
    return cur_epsilon * 0.9

In [4]:
def load_data():
    market_data = pd.read_json('RESULT.json')
    market_data.rename(columns={1:'Open',2:'High', 3:'Low', 4:'Close', 5:'Volume'}, inplace=True)
    market_data[0] = market_data[0].transform(datetime.fromtimestamp)
    market_data.set_index([0], inplace=True)
    market_data.sort_index(inplace=True)
    market_data = market_data[(market_data.index >= '2018-11-15 00:00:00') & (market_data.index <= '2018-11-17 17:06:00')]
    return market_data

def get_train_data(market_data):
    return market_data[market_data.index <= '2018-11-16 00:00:00']

def get_test_data(market_data):
    return market_data[market_data.index >= '2018-11-16 00:00:00']
    

In [5]:
replay_memory = ReplayMemory(CAPACITY)
macro_agent = MacroAgent()
optimizer = torch.optim.Adam(macro_agent.q_network.parameters())
criterion = nn.MSELoss()

In [7]:
def reward_algo(agent:MacroAgent, action, environment):
    if action == ACTIONS['SELL']:
        if len(agent.assets) == 0:
            cur_reward = -1
        else:
            earning = agent.sell_assets(environment.get_price())
            cur_reward = 1 if earning > 0 else -1
            
    elif action == ACTIONS['BUY']:
        agent.buy_asset(environment.get_price())
        cur_reward = 0
    else:
        cur_reward = 0
    return cur_reward

def reward_algo_total(taken_actions, agent:MacroAgent, environment):
    results = torch.empty_like(taken_actions, dtype=torch.float32)
    for idx in range(len(taken_actions)):
        results[idx] = reward_algo(agent, taken_actions[idx], environment)
    return results

In [15]:
market_data = load_data()
train_data = get_train_data(market_data)
test_data = get_test_data(market_data)
current_max_balance = 0
for epoch in tqdm(range(EPOCH_COUNT)):
    done = False
    cur_epsilon = INIT_EPSILON
    environment = Environment(train_data)
    macro_agent.sell_assets(0)
    while not done:
    
        now_state = environment.get_state(macro_agent)
        macro_agent.q_network.eval()
        decision = np.random.rand()
        if decision < cur_epsilon:
            # epsilon
            action = np.random.choice(3)
        else:
            # 1-epsilon
            action = torch.argmax(macro_agent.q_network(now_state))
        

        # reward algo
        cur_reward = reward_algo(macro_agent, action, environment)

        environment.taken_action()
        next_state = environment.get_state(macro_agent)
        done = environment.done
        

        replay_memory.push(now_state, action, cur_reward, next_state, done)
        
        # taking batch
        batch = replay_memory.sample(MINI_BATCH_SIZE)
        q = np.empty(len(batch))
        current_states_batch = []

        for i, object in enumerate(batch):
            current_states_batch.append(torch.tensor(object.state, dtype=torch.float32))
            if not object.done:
                # r_i + gamma * Q()
                cur_in = torch.tensor(
                    object.next_state, dtype=torch.float32).unsqueeze(0)
                act = torch.argmax(macro_agent.q_network.forward(cur_in))
                q[i] = object.reward + GAMMA_DISCOUNT * reward_algo(macro_agent, act, environment)
            else:
                # r_i
                q[i] = object.reward
        # grad d
        macro_agent.q_network.train()
        current_states_batch = torch.stack(current_states_batch)
        
        output_actions = torch.tensor(torch.argmax(macro_agent.q_network.forward(
            current_states_batch), dim=1), dtype=torch.float32)

        rewards = reward_algo_total(output_actions, macro_agent, environment)
        #print(rewards, q)
        loss = criterion(torch.tensor(q, dtype=torch.float32, requires_grad=True), 
            rewards)
        print(loss)
        loss.backward()
        optimizer.step()
        #optimizer.zero_grad()

        cur_epsilon = decay_epsilon(cur_epsilon)  
    


    if epoch % 5 == 0:
        # test model
        macro_agent.q_network.eval()
        test_env = Environment(test_data)
        macro_agent.sell_assets(price=0)
        current_balance = [0]
        while not test_env.done:
            now_state = test_env.get_state(macro_agent)
            action = torch.argmax(macro_agent.q_network(now_state)).float()
            price = test_env.get_price()
            if action == ACTIONS['BUY']:
                prev_assets = macro_agent.estimate_assets(price)
                macro_agent.buy_asset(price)
                now_assets = macro_agent.estimate_assets(price) 
                current_balance.append(current_balance[-1] + now_assets - prev_assets - price)
            elif action == ACTIONS['SELL']:
                earning = macro_agent.sell_assets(price)
                current_balance.append(earning)
            else:
                current_balance.append(macro_agent.estimate_assets(price))
            test_env.taken_action()
        plt.plot(current_balance)
        pd.DataFrame(current_balance).to_csv('macro_agent_epoch_{}.csv'.format(epoch))
        plt.savefig('testing_epoch_{}.jpg'.format(epoch))

        if current_balance[-1] > current_max_balance:
            torch.save(macro_agent.q_network.state_dict(), 'best_q_net_epoch_{}'.format(epoch))

    torch.save(macro_agent.q_network.state_dict(), 'q_net_epoch_{}'.format(epoch))  

  0%|          | 0/500 [00:00<?, ?it/s]

tensor(0.6400, grad_fn=<MseLossBackward0>)
tensor(0.5200, grad_fn=<MseLossBackward0>)
tensor(0.6400, grad_fn=<MseLossBackward0>)
tensor(0.6400, grad_fn=<MseLossBackward0>)
tensor(0.6400, grad_fn=<MseLossBackward0>)
tensor(0.6400, grad_fn=<MseLossBackward0>)
tensor(0.6400, grad_fn=<MseLossBackward0>)
tensor(0.5800, grad_fn=<MseLossBackward0>)
tensor(0.6400, grad_fn=<MseLossBackward0>)
tensor(0.5800, grad_fn=<MseLossBackward0>)
tensor(0.5800, grad_fn=<MseLossBackward0>)
tensor(0.6400, grad_fn=<MseLossBackward0>)
tensor(0.6400, grad_fn=<MseLossBackward0>)
tensor(0.6400, grad_fn=<MseLossBackward0>)
tensor(0.6400, grad_fn=<MseLossBackward0>)
tensor(0.6400, grad_fn=<MseLossBackward0>)
tensor(0.6400, grad_fn=<MseLossBackward0>)
tensor(0.6400, grad_fn=<MseLossBackward0>)
tensor(0.5800, grad_fn=<MseLossBackward0>)
tensor(0.5800, grad_fn=<MseLossBackward0>)
tensor(0.5800, grad_fn=<MseLossBackward0>)
tensor(0.6400, grad_fn=<MseLossBackward0>)
tensor(0.6400, grad_fn=<MseLossBackward0>)
tensor(0.64

  0%|          | 0/500 [00:08<?, ?it/s]

tensor(0.5800, grad_fn=<MseLossBackward0>)
tensor(0.6400, grad_fn=<MseLossBackward0>)





KeyboardInterrupt: 

Parameter containing:
tensor([[-0.3746, -0.3235,  0.0076,  0.0131,  0.1609, -0.2423, -0.3071],
        [ 0.0325, -0.3361, -0.0678,  0.1092,  0.2856, -0.0894, -0.0896],
        [ 0.2877,  0.1441,  0.0769,  0.0705,  0.2919,  0.1385,  0.2284],
        [-0.0973,  0.1532,  0.0725, -0.0172, -0.1954,  0.2231,  0.2433],
        [ 0.3384, -0.2137, -0.1595,  0.0083, -0.1695, -0.0801, -0.3170],
        [ 0.2473,  0.0634,  0.0417, -0.1358, -0.2976,  0.1134,  0.1790],
        [-0.0425,  0.3571, -0.0076, -0.0049,  0.0072, -0.1257, -0.1256]],
       requires_grad=True)