In [1]:
%load_ext autoreload
%autoreload 2

from tqdm import tqdm
import pandas as pd
import numpy as np
import gzip
import json
import math
from datetime import datetime
import seaborn as sns
import matplotlib.dates as mdates
from mpl_toolkits.mplot3d import Axes3D
from IPython.display import display
%matplotlib inline

import sys
sys.path.append('..')
from helper.rl_framework import *
from helper.orderbook_container import OrderbookContainer
from helper.manage_orderbooks import *
from helper.orderbook_trader import *
# from helper.RL_Agent import RLAgent
from helper.RL_Agent_Specific import RLAgent_NN

# Neural Network
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [2]:
filename_train = '/home/axel/data/small/obs_2016-11_USDT_BTC_maxVol100.dict'
V = 100
vol_intervals = 10
T = 4
P = 15
state_variables=['volume','time']


actions = list(np.linspace(-0.4, 1.0, num=5))
print("V={}, T={}, P={}".format(V, T, P))
print("Actions: ", ", ".join(["{:1.2f}".format(a) for a in actions]))

episodes_train = OrderbookEpisodesGenerator(filename=filename_train, episode_length=T*P)
print("Length of episodes_train: {}".format(len(episodes_train)))

V=100, T=4, P=15
Actions:  -0.40, -0.05, 0.30, 0.65, 1.00
Length of episodes_train: 541


### Settings

In [3]:
STATE_DIM = len(state_variables)
NUM_ACTIONS = len(actions)
print(STATE_DIM)
print(NUM_ACTIONS)

2
5


In [33]:
def base_model(input_dim=2, output_dim=15):
    model = Sequential()
    model.add(Dense(32, input_dim=input_dim, activation='relu', init='zero'))
    model.add(Dense(output_dim, activation='linear', init='zero'))
    model.compile(loss='mse', optimizer='Adam')
    return model

In [78]:
def train_RL(orderbooks, V, T, period_length, epochs, model=None, gamma=0.95, DECAY_RATE=0.005, epsilon=1.,
             bufferSize=50, batchSize=10, verbose=False, state_variables=['volume', 'time']):
    
    brain = RLAgent_NN(actions=actions, model=model, state_variables=state_variables, V=V, T=T, period_length=period_length)

    state_dim = len(state_variables)    

    MAX_EXPLORATION_RATE = 1.
    MIN_EXPLORATION_RATE = 0.05

    replay = Memory(bufferSize)
    # stores tuples of (S, A, R, S')

    for i_window, window in tqdm(enumerate(orderbooks)):
        ots = OrderbookTradingSimulator(orderbooks=window, volume=V, tradingperiods=T,
                                                        period_length=P)
        
        for e in range(epochs):
            volume = V
            startpoint = 0
            
            if random.random() < 0.5:
                # randomly start at other states in environment
                volume = random.randint(1, V)
                startpoint = random.randint(0, T-1)
                # print("random - V:{}, T:{}".format(volume, startpoint))
            
            ots.reset(custom_starttime=startpoint, custom_startvolume=volume)
            
            action_history = []

            acc_cost = 0
            for step in range(startpoint, T):
                time_left = T - step

                timepoint = step*period_length
                timepoint_next = min((step+1)*period_length, len(window)-1)
                
                ob_now = window[timepoint]
                ob_next = window[timepoint_next]
                
                state = brain.generate_state(time_left=time_left,
                                         volume_left=volume,
                                         orderbook=ob_now)
                
                action = brain.choose_action(state=state, exploration=epsilon)
                action_history.append(action)

                limit = ob_now.get_ask() * (1. + (action/100.))
                summary = ots.trade(limit=limit, verbose=False, extrainfo={'ACTION':action})  #agression_factor=action

                volume = float(ots.volume)

                new_state = brain.generate_state(time_left=time_left-1,
                                                 volume_left=volume,
                                                 orderbook=ob_next)
                
                cost = ots.history.cost.values[-1]

                acc_cost += cost

                if verbose:
                    print("{} {:1.1f} {} {:1.4f} {:1.4f}".format(step, action, ots.volume, cost, acc_cost))

                # if cost < 0:
                #     print("{} {:1.2f} {:1.4f} {}".format(state, action, cost, new_state))

                replay.add((state, action, cost, new_state))
                if (replay.size() >= batchSize):

                    # wait for buffer to be filled, before getting started with training
                    minibatch = replay.get_random_samples(batchSize)           

                    X_train = []
                    y_train = []
                    for memory in minibatch:
                        state_m, action_m, cost_m, new_state_m = memory
                        

                        qval_old = brain.model.predict(state_m.reshape(1, state_dim))               
                        #print(qval_old)
                        y = np.zeros((1, NUM_ACTIONS))
                        y[:] = qval_old[:]

                        qval_new_m = brain.model.predict(new_state_m.reshape(1, state_dim), batch_size=1)
                        # display(qval_old)
                        
                        minQ = np.min(qval_new_m)
                        # print(cost_m, gamma, minQ)
                        update = cost_m + (gamma*minQ)
                        # print("update", update)
                        action_m_idx = brain.get_action_index(action_m)

                        y[0][action_m_idx] = update  # target output
                        # display(y)
                        X_train.append(state_m.reshape(state_dim,))
                        y_train.append(y.reshape(NUM_ACTIONS,))
                        
                    X_train = np.array(X_train)
                    y_train = np.array(y_train)
                    display(X_train)
                    display(y_train)
                    return
                    if verbose:
                        print("Game #: %s" % (i_window,))

                    brain.model.fit(X_train, y_train, batch_size=batchSize, nb_epoch=1, verbose=0)

                state = new_state
                if summary['done']:
                    # display(ots.history)
                    break

            # reduce exploration rate
            if epsilon > MIN_EXPLORATION_RATE:
                epsilon = MAX_EXPLORATION_RATE *   math.exp(- DECAY_RATE * i_window)

            info = "   {} - {:4d}/{}: epsilon={:5.3f}, acc_cost: {:0.5f}, steps: {} (t={})\n"\
                    .format(e, i_window+1, epochs, epsilon,  acc_cost, step, ots.t)
            #print(info)
    return brain

In [79]:
episodes_train = OrderbookEpisodesGenerator(filename=filename_train, episode_length=T*P)
# cached_episodes = list(episodes_train[:5])

In [80]:
V=100
T=4
P=15

data = cached_episodes  # episodes_train  # cached_episodes

# plot_episode(episodes_train[1], volume=100)
agent = train_RL(orderbooks=data[:], V=V, T=T, period_length=P, epochs=10, model=None,
                 verbose=False, state_variables=['volume', 'time'])
#agent.heatmap_Q()

0it [00:00, ?it/s]

array([[ 0.23,  0.25],
       [ 0.96,  0.75],
       [ 1.  ,  1.  ],
       [ 0.23,  0.5 ],
       [ 0.63,  0.5 ],
       [ 0.2 ,  0.5 ],
       [ 0.91,  0.25],
       [ 1.  ,  1.  ],
       [ 0.63,  0.75],
       [ 0.6 ,  1.  ]])

array([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   6.01393938e-02],
       [  0.00000000e+00,   0.00000000e+00,   2.44749960e-01,
          0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.96703832e-01,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          2.00708085e-01,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          5.03902949e-02,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   7.64759782e-01,
          0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   1.37017957e-01,
          0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,  -3.8366362




AttributeError: 'NoneType' object has no attribute 'heatmap_Q'

In [None]:
heatmap_Q(model=model, vol_intervals=vol_intervals, T=T)
plot_Q(model=model, z_represents='action', state_variables=['volume', 'time'])
plot_Q(model=model, z_represents='Q', state_variables=['volume', 'time'])

In [None]:
def run(V, T, P, epochs=1, overwrite_actions=None):
    for i_episode in range(epochs):
        ots = OrderbookTradingSimulator(orderbooks=episode_windows[1], volume=V,
                                        tradingperiods=T, period_length=P)
        plot_episode(episode_windows[1], volume=V)
        time_left = P*T
        volume = V

        state = np.array([time_left, volume])  # , volume])
        # state = discretize_state(state)

        acc_cost = 0
        for step in range(T):
            qval = model.predict(state.reshape(1, STATE_DIM))
            
            action = actions[np.argmin(qval)]
            
            if overwrite_actions and step < len(overwrite_actions):
                action = overwrite_actions[step]

            # info = ots.trade(orderbooks, limit=lim, verbose=False)
            info = ots.trade(agression_factor=action, verbose=False, extrainfo={'ACTION':action})

            time_left -= 1
            volume = ots.volume
            new_state = np.array([time_left, volume])  # , volume])
            cost = ots.history.cost.values[-1]

            acc_cost += cost

            state = new_state
            if volume == 0:
                break

        info = "{:4d}/{}: acc_cost: {:0.5f}, steps: {} (t={})"\
                .format(i_episode+1, epochs, acc_cost, step, ots.t)
        print(info)
    return ots.history
hist = run(V=V, T=T, P=P)  #, overwrite_actions=[0,0,0,0,0,0,0,0,0])
display(hist)
hist = run(V=V, T=T, P=P, overwrite_actions=[0.7]*10)
display(hist)
hist = run(V=V, T=T, P=P, overwrite_actions=[0.1, 0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8])
display(hist)