In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from macro_agent import MacroAgent, ReplayMemory, QNetwork, Transition
import torch
import torch.optim
import torch.nn as nn
#from mpl_finance import candlestick_ohlc
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import datetime as datetime
from datetime import datetime
import matplotlib.dates as mdates
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
# --- Hyper-parameters shared by the cells below ---------------------------
# Argument handed to ReplayMemory(...) when the buffer is created.
CAPACITY = 500
# Exploration probability each epoch starts from before it is decayed.
INIT_EPSILON = 0.9
# Multiplier applied to the future term of the learning target.
GAMMA_DISCOUNT = 0.8
# Number of passes of the outer training loop.
EPOCH_COUNT = 500
# Upper bound on how many transitions are sampled per learning step.
MINI_BATCH_SIZE = 10
# Look-back length (in rows) used by the Environment indicators.
WINDOW_SIZE = 20
# Integer codes for the three actions the agent can take.
ACTIONS = dict(HOLD=0, SELL=1, BUY=2)

In [3]:
class Environment:
    """Step-by-step view over a market-data frame.

    Rows are read positionally (``iloc``); the frame must provide ``'Close'``
    and ``'Volume'`` columns. An internal pointer marks the current row and
    every indicator looks back at most ``window_size`` rows from it.

    Attributes:
        done: becomes True once the pointer has reached the last row.
    """

    def __init__(self, all_data, window_size=None):
        """Start at the first row of ``all_data``.

        Args:
            all_data: DataFrame with 'Close' and 'Volume' columns.
            window_size: look-back length in rows. When omitted, the
                module-level ``WINDOW_SIZE`` is used.
        """
        self._all_data = all_data
        self._pointer = 0
        self.done = False
        self._window_size = WINDOW_SIZE if window_size is None else int(window_size)

    def taken_action(self):
        """Advance to the next row; flag ``done`` when the last row is reached."""
        self._pointer += 1
        if self._pointer >= len(self._all_data) - 1:
            self.done = True

    def get_price(self):
        """Return the 'Close' value of the current row."""
        return self._all_data.iloc[self._pointer]['Close']

    def _window_start(self, t):
        """Return the first row index of the look-back window ending at row ``t``."""
        return max(0, t - self._window_size)

    @staticmethod
    def _zscore(value, sample):
        """Return the population z-score of ``value`` relative to ``sample``.

        Returns 0.0 when the spread is zero or not finite (e.g. a one-row
        window at the very first step) instead of propagating NaN/inf into
        the state fed to the network.
        """
        sample = np.asarray(sample, dtype=float)
        spread = np.std(sample)
        if not np.isfinite(spread) or spread == 0:
            return 0.0
        return (value - np.mean(sample)) / spread

    def get_change_zscore(self, column):
        """Z-score of the current relative change of ``column``.

        The "change" of a row is ``value / mean(window ending at that row) - 1``.
        The current row's change is standardised against the changes of all
        rows inside the current look-back window.

        Args:
            column: column label, e.g. 'Close' or 'Volume'.
        """
        end = self._pointer
        first = self._window_start(end)
        # The earliest row of the window needs its own look-back as well.
        base = self._window_start(first)
        values = self._all_data[column].iloc[base:end + 1].to_numpy(dtype=float)

        # Explicit float buffer: inheriting the column dtype would truncate
        # every change to an integer when the column (e.g. Volume) is int.
        changes = np.empty(end - first + 1, dtype=float)
        for k, row in enumerate(range(first, end + 1)):
            window = values[self._window_start(row) - base:row - base + 1]
            changes[k] = values[row - base] / window.mean() - 1

        return self._zscore(changes[-1], changes)

    def get_EMA(self, t):
        """One-step EMA approximation of 'Close' at row ``t``.

        Uses smoothing factor ``alpha = 2 / (window_size + 1)`` and the simple
        mean of the window ending at ``t`` as the previous average:
        ``alpha * close[t] + (1 - alpha) * mean(window)``.

        Args:
            t: row index; values below 0 are clamped to 0 so that early steps
                do not wrap around to the end of the frame.
        """
        t = max(0, t)
        alpha = 2 / (self._window_size + 1)
        closes = self._all_data['Close']
        # Window is anchored at t (not at the current pointer) so that lagged
        # EMAs are computed the same way as the current one.
        window_mean = np.mean(closes.iloc[self._window_start(t):t + 1])
        return alpha * closes.iloc[t] + (1 - alpha) * window_mean

    def get_indicators(self):
        """Return the five features describing the current row.

        Returns:
            Tuple ``(z_score_price, z_score_price_change, z_score_volume,
            z_score_volume_change, volatility)`` where ``volatility`` is the
            relative change between the EMA now and the EMA one window ago.
        """
        start_idx = self._window_start(self._pointer)
        market_now = self._all_data.iloc[self._pointer]
        frame = self._all_data.iloc[start_idx:self._pointer + 1]

        # price
        z_score_price = self._zscore(market_now['Close'], frame['Close'])
        # price change
        z_score_price_change = self.get_change_zscore('Close')
        # volume
        z_score_volume = self._zscore(market_now['Volume'], frame['Volume'])
        # volume change
        z_score_volume_change = self.get_change_zscore('Volume')
        # volatility
        ema_now = self.get_EMA(self._pointer)
        ema_past = self.get_EMA(self._pointer - self._window_size)
        volatility = (ema_now - ema_past) / ema_past if ema_past != 0 else 0.0

        return z_score_price, z_score_price_change, z_score_volume, z_score_volume_change, volatility

def decay_epsilon(cur_epsilon, decay_rate=0.9):
    """Return the exploration rate for the next step.

    Args:
        cur_epsilon: current exploration probability.
        decay_rate: multiplicative factor applied per call (default 0.9).

    Returns:
        ``cur_epsilon * decay_rate``.
    """
    return cur_epsilon * decay_rate

In [4]:
def load_data(path='RESULT.json', start='2018-11-15 00:00:00', end='2018-11-17 17:06:00'):
    """Load OHLCV rows from a JSON file and return the slice inside [start, end].

    The file is expected to hold rows whose positional fields are
    ``timestamp, open, high, low, close, volume`` (columns 0..5 after
    ``pd.read_json``). Column 0 is converted to datetimes and becomes the
    sorted index.

    Args:
        path: JSON file to read.
        start: inclusive lower bound on the index.
        end: inclusive upper bound on the index.

    Returns:
        DataFrame indexed by datetime with columns Open/High/Low/Close/Volume.
    """
    column_names = {1: 'Open', 2: 'High', 3: 'Low', 4: 'Close', 5: 'Volume'}
    market_data = pd.read_json(path).rename(columns=column_names)

    # NOTE: datetime.fromtimestamp interprets the value in the machine's local
    # time zone, so the rows selected below depend on where this runs.
    market_data[0] = market_data[0].map(datetime.fromtimestamp)
    market_data = market_data.set_index(0).sort_index()

    in_range = (market_data.index >= start) & (market_data.index <= end)
    return market_data[in_range]

def get_train_data(market_data, split_at='2018-11-16 00:00:00'):
    """Return the rows used for training: index at or before ``split_at``.

    Args:
        market_data: DataFrame with a datetime index.
        split_at: inclusive upper bound of the training period.
    """
    return market_data[market_data.index <= split_at]

def get_test_data(market_data, split_at='2018-11-16 00:00:00'):
    """Return the rows used for testing: index strictly after ``split_at``.

    The comparison is strict so that a row stamped exactly at ``split_at``
    belongs to the training split only and is not evaluated on as well.

    Args:
        market_data: DataFrame with a datetime index.
        split_at: exclusive lower bound of the test period.
    """
    return market_data[market_data.index > split_at]
    

In [5]:
# Loss applied to (target, prediction) pairs in the training cell.
criterion = nn.MSELoss()

# Experience buffer, constructed with the CAPACITY constant.
replay_memory = ReplayMemory(CAPACITY)

# Agent that owns the Q-network and the asset bookkeeping.
macro_agent = MacroAgent()

# Adam with its library-default settings over the Q-network parameters.
optimizer = torch.optim.Adam(macro_agent.q_network.parameters())

In [6]:
def reward_algo(agent:MacroAgent, action, environment):
    """Apply ``action`` to ``agent`` at the environment's current price and score it.

    Side effects: BUY calls ``agent.buy_asset`` and SELL (with assets held)
    calls ``agent.sell_assets``, both with ``environment.get_price()``.

    Returns:
        -1 for SELL with no assets held, or when the value returned by
        ``sell_assets`` is not positive; 1 when that value is positive;
        0 for BUY and for any other action.
    """
    if action == ACTIONS['BUY']:
        agent.buy_asset(environment.get_price())
        return 0

    if action != ACTIONS['SELL']:
        # HOLD (or any unrecognised code) is neutral.
        return 0

    if len(agent.assets) == 0:
        # Nothing to sell: penalise the attempt.
        return -1

    earning = agent.sell_assets(environment.get_price())
    return 1 if earning > 0 else -1

In [7]:
def _snapshot_state(env, agent):
    """Build the state handed to the Q-network: [price, indicators, held assets].

    The asset collection is copied so that transitions kept in replay memory
    are not altered when the agent later buys or sells.
    (assumes ``agent.assets`` is list-like — confirm in macro_agent)
    """
    return [env.get_price(), env.get_indicators(), list(agent.assets)]


# NOTE(review): the recorded output of this cell shows
# "ValueError: expected 3D input (got 1D input)" raised inside the network's
# forward pass. The input-shape contract lives in macro_agent (not visible
# here) and is not changed by this cell.

market_data = load_data()
train_data = get_train_data(market_data)
test_data = get_test_data(market_data)

for epoch in tqdm(range(EPOCH_COUNT)):
    done = False
    # Exploration restarts from INIT_EPSILON every epoch and decays per step.
    cur_epsilon = INIT_EPSILON
    environment = Environment(train_data)
    # Called with price 0 before each run; presumably empties the portfolio
    # left over from the previous run — confirm in macro_agent.
    macro_agent.sell_assets(0)

    while not done:
        now_state = _snapshot_state(environment, macro_agent)

        # Epsilon-greedy action selection.
        macro_agent.q_network.eval()
        if np.random.rand() < cur_epsilon:
            action = int(np.random.choice(len(ACTIONS)))
        else:
            with torch.no_grad():
                action = int(macro_agent.q_network(now_state).argmax())

        # Execute the action on the real portfolio and observe the reward.
        cur_reward = reward_algo(macro_agent, action, environment)
        environment.taken_action()
        next_state = _snapshot_state(environment, macro_agent)
        done = environment.done

        replay_memory.push(now_state, action, cur_reward, next_state, done)

        batch_size = min(MINI_BATCH_SIZE, len(replay_memory.memory))
        batch = replay_memory.sample(batch_size)

        # TD targets: r + gamma * max_a Q(s', a). These are computed from the
        # network only; reward_algo must not be called here because it trades
        # on the live agent/environment and would corrupt the portfolio.
        targets = []
        with torch.no_grad():
            for transition in batch:
                target = float(transition.reward)
                if not transition.done:
                    next_q = macro_agent.q_network(transition.next_state)
                    target += GAMMA_DISCOUNT * float(next_q.max())
                targets.append(target)

        # Regress Q(s, a) of the action actually taken towards its target;
        # gradients are accumulated over the mini-batch before one step.
        macro_agent.q_network.train()
        optimizer.zero_grad()
        for transition, target in zip(batch, targets):
            q_values = macro_agent.q_network(transition.state).reshape(-1)
            # assumes Transition exposes the pushed action as `.action` — confirm
            predicted = q_values[int(transition.action)]
            loss = criterion(torch.tensor(target, dtype=torch.float32), predicted)
            loss.backward()
        optimizer.step()

        cur_epsilon = decay_epsilon(cur_epsilon)

    # test model
    macro_agent.q_network.eval()
    test_env = Environment(test_data)
    macro_agent.sell_assets(price=0)
    # NOTE(review): BUY/HOLD record holdings marked at the current price while
    # SELL adds the sale result to the previous entry; the bookkeeping is kept
    # as written — confirm the intended meaning of this series.
    current_balance = [0]
    with torch.no_grad():
        while not test_env.done:
            now_state = _snapshot_state(test_env, macro_agent)
            action = int(macro_agent.q_network(now_state).argmax())

            if action == ACTIONS['BUY']:
                macro_agent.buy_asset(test_env.get_price())
                current_balance.append(len(macro_agent.assets) * test_env.get_price())
            elif action == ACTIONS['SELL']:
                earning = macro_agent.sell_assets(test_env.get_price())
                current_balance.append(current_balance[-1] + earning)
            else:
                current_balance.append(len(macro_agent.assets) * test_env.get_price())
            test_env.taken_action()

    # One summary line per epoch instead of one line per test step; tqdm.write
    # keeps the progress bar intact.
    tqdm.write('testing epoch {}: last balance entry {}'.format(epoch, current_balance[-1]))

    torch.save(macro_agent.q_network.state_dict(), 'q_net_epoch_{}'.format(epoch))

  0%|          | 0/500 [00:00<?, ?it/s]


ValueError: expected 3D input (got 1D input)

Parameter containing:
tensor([[-0.2423, -0.3367, -0.2342,  0.3480, -0.2947,  0.2375,  0.2133],
        [ 0.0070,  0.2908,  0.3121,  0.3176,  0.0777,  0.3357,  0.0618],
        [ 0.1103,  0.2196, -0.1605, -0.1847, -0.1597,  0.2789,  0.2133]],
       requires_grad=True)