In [1]:
%load_ext autoreload
%autoreload 2

from tqdm import tqdm
import pandas as pd
import numpy as np
import gzip
import json
import math
from datetime import datetime
import matplotlib.dates as mdates
from mpl_toolkits.mplot3d import Axes3D
from IPython.display import display
%matplotlib inline


import sys
sys.path.append('..')
from helper.rl_framework import *
from helper.rl_functions import *
from helper.orderbook_container import OrderbookContainer
from helper.manage_orderbooks_v2 import *
from helper.orderbook_trader import *
from helper.Q_learning import QLearn, QLearnEnvironment, state_as_string

Using TensorFlow backend.


In [2]:
# Orderbook snapshot file to read; the commented-out path below is an alternative snapshot dump.
# filename = '../../data/snapshots/orderbooks_USDT_BTC_range1.15_samplesNone_snapshot2017-01-11T17:19.dict'
filename = '../data/3000orderbooks'
currency_pair = 'USDT_BTC'  # NOTE(review): not referenced again in the visible cells

# Size of each data set (the recorded output reports 720 loaded orderbooks per call).
samples = 720
print("# Load Training Set")
# Training set: requested with first_line=0, last_line=samples.
orderbooks_train = load_orderbook_snapshot(infile=filename, first_line=0, last_line=samples)
print(orderbooks_train[0])

print("")
print("# Load Validation Set")
# Validation set: the following block of the same file (first_line=samples, last_line=2*samples).
orderbooks_val = load_orderbook_snapshot(infile=filename, first_line=samples, last_line=2*samples)
print(orderbooks_val[0])

  0%|          | 3/720 [00:00<00:36, 19.80it/s]

# Load Training Set


100%|██████████| 720/720 [00:29<00:00, 21.92it/s]
  0%|          | 0/720 [00:00<?, ?it/s]

Loaded 720 orderbooks from file '../data/3000orderbooks'.
OrderbookContainer from 2016-11-08T10:00
  499 bids (best: 705.0)
  407 asks (best: 705.45)
  kind: 'orderbook'

# Load Validation Set


100%|██████████| 720/720 [00:32<00:00, 21.04it/s]

Loaded 720 orderbooks from file '../data/3000orderbooks'.
OrderbookContainer from 2016-11-08T22:01
  548 bids (best: 712.99)
  410 asks (best: 713.36)
  kind: 'orderbook'





### Settings

In [None]:
T = 4  # Time horizon: 4 trading periods -> P*T = 120 orderbooks per episode
P = 30  # period length (number of orderbooks per period)
print("T={}, P={}".format(T, P))

#### Split orderbook array into non-overlapping episodes

In [None]:
def create_episodes(orderbooks, episode_length):
    """Cut `orderbooks` into consecutive, non-overlapping windows.

    Each window holds `episode_length` items, every one obtained through the
    item's own `.copy()`. A trailing remainder that does not fill a complete
    window is discarded.

    Returns:
        A list of windows (each window is a list of copied orderbooks).
    """
    window_count = int(len(orderbooks) / episode_length)
    return [
        [book.copy() for book in orderbooks[idx * episode_length:(idx + 1) * episode_length]]
        for idx in range(window_count)
    ]

# One episode spans T periods of P orderbooks each, i.e. T*P orderbooks.
episode_windows_train = create_episodes(orderbooks_train, episode_length=T*P)
episode_windows_val = create_episodes(orderbooks_val, episode_length=T*P)

# Summary per data set; indexing [0] requires at least one complete episode in each set.
print("Training Episodes  : {}, episode length: {}, start at: {}".format(len(episode_windows_train), len(episode_windows_train[0]), episode_windows_train[0][0].timestamp))
print("Validation Episodes: {}, episode length: {}, start at: {}".format(len(episode_windows_val), len(episode_windows_val[0]), episode_windows_val[0][0].timestamp))
# plot_episode(episode_windows_val[0], volume=10)
# plot_episode(episode_windows_val[0], volume=100)

In [None]:
# NOTE(review): STATE_DIM is commented out here but still used by train_RL and run further down;
# confirm that one of the helper star-imports provides it.
# STATE_DIM = 2
NUM_ACTIONS = 11
# 11 evenly spaced action values from -1 to 5 (step 0.6).
# optimal_strategy adds an action to the centre price; run_Q treats it as a percentage above the ask.
actions = list(np.linspace(-1, 5, num=NUM_ACTIONS))
print("available actions: {}".format(actions))

In [None]:
def round_custombase(val, *, base):
    """Round `val` to the nearest multiple of `base` and return it as a float.

    `base` is keyword-only. Ties follow Python's built-in `round`
    (round-half-to-even on the quotient `val / base`).
    """
    multiples = round(float(val) / base)
    return float(multiples * base)

# Quick check: 12.43 rounded to the nearest multiple of 5 gives the float 10.0.
test = round_custombase(12.43, base=5)
print(type(test), test)

In [None]:
# Scratch check of QLearnEnvironment's printed representation.
# Note: this rebinds `test`, which held a float in the previous cell.
test = QLearnEnvironment(volume=100)
print(test)

In [None]:
# Show the state key for the same (time_left, volume_left) pair, once with an orderbook
# passed in and once without it. The functions below use the form without an orderbook.
print(state_as_string(time_left=3, volume_left=1.0, orderbook=episode_windows_train[0][1]))
print(state_as_string(time_left=3, volume_left=1.0))  # , orderbook=episode_windows_train[0][1]))

In [None]:
# Sanity check of the iteration order used by optimal_strategy: prints T-1 down to 0.
for tt in tqdm(range(T)[::-1]):
    print(tt)

In [None]:
def optimal_strategy(V, T, decisionfrequency, vol_intervals, actions, verbose=True):
    """Fill a tabular Q-function by sweeping backwards in time over the training episodes.

    For every period start (last period first), every episode of the
    notebook-level ``episode_windows_train``, every discretised volume level
    and every action, one trade is simulated with
    ``OrderbookTradingSimulator`` and the resulting
    ``(state, action, cost, new_state)`` tuple is handed to ``QLearn.learn``.

    Args:
        V: Total volume to be traded in one episode.
        T: Number of trading periods per episode.
        decisionfrequency: Number of orderbooks per trading period.
        vol_intervals: Number of equally sized volume steps used to discretise
            the remaining volume (step size is ``V / vol_intervals``).
        actions: Iterable of action values; each is added to the centre price
            of the orderbook at the start of the period to form the limit.
        verbose: Forwarded to ``ql.plot_Q``.

    Returns:
        The ``QLearn`` instance that was trained.

    Side effects:
        Writes plots to ``../graphs/`` and a pickle to ``../pickles/`` and
        prints every learned transition.
    """
    timestamp = datetime.now()

    print("V: {}, T: {}, decisionfrequency: {}, vol_intervals: {}, num_actions: {}".format(V, T, decisionfrequency, vol_intervals, len(actions)))
    print("actions: {}".format(actions))
    # Relative volume levels 1.0, ..., 1/vol_intervals in descending order (0 is skipped).
    volumes = np.linspace(0, 1, num=vol_intervals+1)[1:][::-1]

    # Size of one discrete volume step in absolute units.
    volumes_base = float(V)/vol_intervals
    print("volumes_base: {}".format(volumes_base))
    print("volumes: {}".format(volumes))
    ql = QLearn(actions=actions, vol_intervals=vol_intervals, V=V, T=T, decisionfrequency=decisionfrequency)
    print("V: ", ql.V)

    for tt in tqdm(range(T)[::-1]):
        trading_startpoint = decisionfrequency*tt
        time_left = T-tt

        for episode in tqdm(episode_windows_train):
            center = episode[trading_startpoint].get_center()
            # Costs are measured relative to the centre price at the very start of the episode.
            initial_center = episode[0].get_center()

            for vol in volumes:
                if tt == 0 and vol != 1.:
                    # at t=0 we always have 100% of the volume left.
                    # `volumes` is descending, so everything after 1.0 can be skipped.
                    print("x", vol)
                    break

                for a in actions:
                    state = state_as_string(time_left=time_left, volume_left=vol)

                    if vol == 0:
                        # Nothing left to trade: no cost and no remaining volume.
                        # (Unreachable while `volumes` skips 0, but kept consistent so that
                        # `volume_left_rounded` is always defined below.)
                        volume_left_rounded = 0
                        cost = 0
                    else:
                        ots = OrderbookTradingSimulator(orderbooks=episode[trading_startpoint:], volume=vol*V, tradingperiods=T-tt,
                                                        decisionfrequency=decisionfrequency)
                        limit = center + a

                        ots.trade(limit = limit)

                        volume_left = ots.volume
                        volume_left_rounded = round_custombase(volume_left, base=volumes_base)

                        # Positional access through `.values[-1]`: plain `series[-1]` is a
                        # label lookup on an integer index and would not return the last row.
                        volume_traded = ots.history.volume_traded.values[-1]
                        volume_traded_rounded = round_custombase(volume_traded, base=volumes_base)

                        # NOTE(review): this check is one-sided; a rounded sum that falls
                        # short of vol*V passes silently. Confirm whether abs() is intended.
                        assert volume_left_rounded + volume_traded_rounded - vol*V <= 1.e-8, "{} {} {} {}".format(
                            volume_left_rounded, volume_traded_rounded, vol, V)

                        avg = ots.history.avg.values[-1]

                        # manually compute costs, since we have to think in discrete volume steps (rounding ...)
                        cost = volume_traded_rounded * (avg - initial_center) / initial_center

                    new_state = state_as_string(time_left=time_left-1, volume_left=volume_left_rounded/V)

                    print(state, a, cost, new_state)

                    ql.learn(state, a, cost, new_state)

            # The plots are regenerated after every episode; the file name depends only on
            # the period, so each episode overwrites the previous plot of that period.
            ql.plot_Q(outfile="../graphs/Q_function_{}_action".format(T-tt), outformat='pdf', z_represents='action', verbose=verbose)
            ql.plot_Q(outfile="../graphs/Q_function_{}_Q".format(T-tt), outformat='pdf', z_represents='Q', verbose=verbose)
        # Checkpoint after each period; the same timestamped file is rewritten every time.
        ql.save("../pickles/Q_function_{}".format(timestamp))
    # Plot the last training episode for reference.
    plot_episode(episode, volume=V)
    return ql

In [None]:
# Train the tabular Q-function on the training episodes with 10 volume steps,
# then plot the learned action and Q-value surfaces.
V = 100
print("V={}, T={}, P={}".format(V, T, P))
ql = optimal_strategy(V=V, T=T, decisionfrequency=P, vol_intervals=10, actions=actions)
ql.plot_Q(z_represents='action')
ql.plot_Q(z_represents='Q')

In [None]:
# Re-draw the two Q-function plots (same calls as at the end of the previous cell).
ql.plot_Q(z_represents='action')
ql.plot_Q(z_represents='Q')

In [None]:
# Replace the freshly trained `ql` with a Q-function loaded from disk.
# NOTE(review): the path is relative to the notebook directory ("pickles/..."), whereas
# optimal_strategy saves to "../pickles/..." - confirm which location is intended.
# NOTE(review): the file name mentions P15 while P is 30 in the settings cell - verify that
# the loaded table matches the episode layout used for evaluation below.
# NOTE(review): plot_Q is called with positional V, T here but without them in the cells
# above - confirm against the current QLearn.plot_Q signature.
V=100
T=4
# P=15
ql = QLearn(actions = actions, vol_intervals=10)
# ql = ql.load("pickles/Q_function_e375_T4_P2_V200")
ql = ql.load("pickles/Q_function_e12_T4_P15_V100_I10")
ql.plot_Q(V, T, z_represents='action')
ql.plot_Q(V, T, z_represents='Q')

print(ql)
# ql.plot_Q(V, T, z_represents='both')

# for key in sorted(ql.q)[::-1]:
#     print("")
#     print(key)
#     print(ql.q[key])

In [None]:
# NOTE(review): `episode_windows` is not defined by any visible cell (only
# `episode_windows_train` / `episode_windows_val` are), so this fails on a fresh kernel.
# Confirm which set was meant and whether the orderbook object still offers `.head()`.
display(episode_windows[0][0].head(5))

In [None]:
def _fixed_limit_cost(episode, volume, T, decisionfrequency, limit):
    """Return the summed cost of submitting the same `limit` in each of the T periods."""
    ots = OrderbookTradingSimulator(orderbooks=episode, volume=volume, tradingperiods=T,
                                    decisionfrequency=decisionfrequency)
    for _ in range(T):
        ots.trade(limit = limit)
    return ots.history.cost.sum()


def run_Q(V, H, T, ql, episode_windows):
    """Compare the learned strategy with fixed-limit and market-order baselines.

    Args:
        V: Total volume to trade per episode.
        H: Number of orderbooks per episode; ``H / T`` orderbooks form one period.
        T: Number of trading periods per episode.
        ql: Learned Q-function; must offer ``vol_intervals`` and ``chooseAction(state)``.
        episode_windows: Iterable of episodes (each a sequence of orderbooks).

    Returns:
        A list with one dict per episode, mapping the strategy name
        (``'learned'``, ``'ask*1.001'``, ``'ask*1.002'``, ``'ask*1'``,
        ``'ask*1.005'``, ``'market'``) to the summed cost of that strategy.
    """
    costs_list = []
    decisionfrequency = int(H/T)
    # Width of one discrete volume step. The original rounded to a multiple of
    # `ql.vol_intervals` (the *number* of steps), which is only correct when
    # V / vol_intervals happens to equal vol_intervals (e.g. V=100 with 10 steps).
    volume_step = float(V) / ql.vol_intervals

    # Fixed-limit baselines: result key -> percentage above the episode's first ask.
    fixed_limit_baselines = (
        ('ask*1.001', 0.1),
        ('ask*1.002', 0.2),
        ('ask*1', 0.0),
        ('ask*1.005', 0.5),
    )

    for episode in tqdm(episode_windows):
        costs = {}
        volume = V

        ## Learned strategy
        ots = OrderbookTradingSimulator(orderbooks=episode, volume=volume, tradingperiods=T,
                                        decisionfrequency=decisionfrequency)
        for tt in range(T, 0, -1):
            new_vol = round_custombase(ots.volume, base=volume_step)
            if new_vol > 0:
                state = state_as_string(time_left=tt, volume_left=new_vol/V)
                action = ql.chooseAction(state)

                # Orderbook at the start of the current period.
                obs = episode[decisionfrequency * (T-tt)].copy()

                ask = obs.get_ask()
                # NOTE(review): the action is applied as a percentage above the ask here,
                # while optimal_strategy trains with `limit = center + a`. Confirm which
                # interpretation the Q-function in use expects.
                limit = ask * (1. + (action/100.))
            else:
                # Theoretically done: nothing (after rounding) is left to trade.
                # The original wrote `limit == None`, a comparison without effect, which
                # left `limit`/`action` stale or undefined.
                action = None
                limit = None
            ots.trade(limit = limit, extrainfo={'ACTION':action})
        costs['learned'] = ots.history.cost.sum()

        ## Fixed limits relative to the first ask of the episode
        for label, percent in fixed_limit_baselines:
            lim = episode[0].get_ask() * (1. + (percent/100.))
            costs[label] = _fixed_limit_cost(episode, volume, T, decisionfrequency, lim)

        ## market order
        ots = OrderbookTradingSimulator(orderbooks=episode, volume=volume, tradingperiods=T,
                                        decisionfrequency=decisionfrequency)
        ots.trade(limit = None)
        costs['market'] = ots.history.cost.sum()

        costs_list.append(costs)

    return costs_list
        
# Evaluate the Q-function currently bound to `ql` on both data sets.
# The volume is hard-coded to 100 here rather than taken from the global V.
print("T={}, P={}".format(T, P))
print(ql.q.keys())
costs_list_val = run_Q(V=100, H=T*P, T=T, ql=ql, episode_windows = episode_windows_val)
costs_list_train = run_Q(V=100, H=T*P, T=T, ql=ql, episode_windows = episode_windows_train)

In [None]:
colors=['r','g','b','y','magenta','grey']
# Column order of the box plots. The names must match the keys produced by run_Q;
# the previous list ('ask+1', 'ask+2.5', ..., 'last t') named columns that do not
# exist in the cost dicts and raised a KeyError.
order = ['ask*1', 'ask*1.001', 'ask*1.002', 'ask*1.005', 'learned', 'market']


def plot_cost_boxplot(costs_list, title, outfile, ylim=None):
    """Draw, save and show one box plot of the costs per strategy.

    Args:
        costs_list: List of per-episode cost dicts as returned by run_Q.
        title: Figure title.
        outfile: Path the figure is saved to.
        ylim: Optional (low, high) y-axis limits, applied before saving so that
            the saved file matches the figure shown.
    """
    experiments = pd.DataFrame(costs_list)[order]
    experiments.boxplot()
    # Vertical rules fence off the 'learned' column (5th box) from the baselines.
    plt.axvline(4.5, color='black')
    plt.axvline(5.5, color='black')
    plt.suptitle(title)
    plt.xlabel("Experiments")
    plt.ylabel("Occured costs")
    if ylim is not None:
        plt.ylim(ylim)
    plt.savefig(outfile)
    plt.show()


plot_cost_boxplot(costs_list_train, "Training Set", "boxplot_train.pdf")
plot_cost_boxplot(costs_list_val, "Validation Set", "boxplot_val.pdf", ylim=(0.0, 0.8))

In [None]:
# Mean cost per strategy, shown for the training and the validation run.
# The original cell read an undefined name `costs_list`; the evaluation cell only
# creates `costs_list_train` and `costs_list_val`, so both are plotted here.
for set_name, costs_list in (("Training Set", costs_list_train), ("Validation Set", costs_list_val)):
    keys = costs_list[0].keys()

    avg = {}
    for elem in costs_list:
        for key in keys:
            avg[key] = avg.get(key, 0) + elem[key]/len(costs_list)

    # One marker per strategy; `colors` must provide at least as many entries as strategies.
    for i, key in enumerate(avg):
        plt.scatter(i, avg[key], color=colors[i], label=key)
    plt.title(set_name)
    plt.legend(loc='upper left')
    plt.show()

In [None]:
# states:
# [volume_left, time_left]
I = 5
print("V: {}, T: {}, I: {}".format(V, T, I))
# Integer division: in Python 3, V/I is a float and range() rejects it.
volumes_rounded = [I*x for x in range(V//I+1)]
print("volume_left", volumes_rounded)

# zip() is lazy in Python 3; materialise it so the pairs themselves are printed.
print(list(zip(volumes_rounded, range(T))))

# Enumerate every (volume_left, time_left) combination.
states = []
for i in volumes_rounded:
    for j in range(T):
        states.append([i, j])
display(states)

# One Q-value slot per state.
qtable = np.zeros(len(states))
display(qtable, qtable.shape)

# Bare lookup of entry 4; the value is not displayed because it is not the last expression.
qtable[4, ]

display(qtable, qtable.shape)

In [None]:
def train_RL(V, T, P, epochs, gamma=0.95, DECAY_RATE=0.005, epsilon=1., bufferSize=50, batchSize=30, verbose=False, log=None):
    """Train a Q-network with epsilon-greedy exploration and experience replay.

    Every epoch replays the same episode (``episode_windows[0]``): for each of
    the ``T`` periods an action is chosen, traded through an
    ``OrderbookTradingSimulator``, and the transition is stored in a replay
    memory. As soon as the memory holds ``bufferSize`` transitions, a random
    minibatch is used for one fitting step after every trade.

    Relies on notebook-level names: ``base_model``, ``Memory``, ``actions``,
    ``episode_windows``, ``STATE_DIM``, ``NUM_ACTIONS`` and ``random``.

    Args:
        V: Volume to trade per episode.
        T: Number of trading periods.
        P: Number of orderbooks per period.
        epochs: Number of training episodes.
        gamma: Discount factor of the bootstrap target.
        DECAY_RATE: Exponential decay rate of the exploration rate.
        epsilon: Initial exploration rate.
        bufferSize: Capacity passed to ``Memory``; training starts once it is full.
        batchSize: Number of transitions sampled per fitting step.
        verbose: Print the chosen action of every step.
        log: If truthy, progress is written to a timestamped file below ``logs/``.

    Returns:
        The trained model. (The original returned nothing, so existing callers
        that ignore the result are unaffected.)
    """
    model = base_model()
    log_file = None
    if log:
        log_file = open('logs/RL_train_{}.log'.format(datetime.now().isoformat()[2:-10]), 'w')
    MAX_EXPLORATION_RATE = 1.
    MIN_EXPLORATION_RATE = 0.05

    min_costs = np.inf

    replay = Memory(bufferSize)
    # stores tuples of (S, A, R, S') where A is the *index* into `actions`

    try:
        if log_file is not None:
            log_file.write("RL training started\n")
            log_file.write("Actions: {}\n".format(actions))
            log_file.write("V={}, T={}, P={}\n".format(V, T, P))
            log_file.write("Compiled model!")

        for i_episode in tqdm(range(epochs)):
            obs = episode_windows[0]  # Testcase with always the same, identical episode_window

            ots = OrderbookTradingSimulator(volume=V, tradingperiods=T, decisionfrequency=P)

            time_left = T
            volume = V
            action_history = []
            state = np.array([time_left, volume])

            acc_cost = 0
            for step in range(T):
                qval = model.predict(state.reshape(1, STATE_DIM))

                # Keep the action's index separate from its value: the index addresses
                # the network output, the value is what gets traded. The original stored
                # the value and later used it as an array index.
                if random.random() < epsilon:
                    # choose random action
                    action_idx = random.randrange(len(actions))
                else:
                    # choose best (cheapest) action from Q(s,a) values
                    action_idx = int(np.argmin(qval))
                action = actions[action_idx]
                action_history.append(action)
                if verbose:
                    print("{}: action {}".format(action_idx, action))

                orderbooks = obs[step*P:(step+1)*P]
                info = ots.trade(orderbooks, agression_factor=action, verbose=False, extrainfo={'ACTION':action})

                time_left -= 1
                volume = ots.volume

                new_state = np.array([time_left, volume])
                cost = ots.history.cost.values[-1]
                acc_cost += cost

                replay.add((state, action_idx, cost, new_state))
                if (replay.size() >= bufferSize):
                    # wait for buffer to be filled, before getting started with training
                    minibatch = replay.get_random_samples(batchSize)

                    X_train = []
                    y_train = []
                    for memory in minibatch:
                        state_m, action_idx_m, cost_m, new_state_m = memory

                        qval_old = model.predict(state_m.reshape(1, STATE_DIM))
                        y = np.zeros((1, NUM_ACTIONS))
                        y[:] = qval_old[:]

                        qval_new_m = model.predict(new_state_m.reshape(1, STATE_DIM), batch_size=1)
                        # Q-values are costs and the greedy policy takes the argmin, so the
                        # bootstrap target must use the minimum as well (the original used max).
                        minQ = np.min(qval_new_m)

                        update = cost_m + (gamma*minQ)
                        y[0][action_idx_m] = update  # target output

                        X_train.append(state_m.reshape(STATE_DIM,))
                        y_train.append(y.reshape(NUM_ACTIONS,))
                    X_train = np.array(X_train)
                    y_train = np.array(y_train)

                    model.fit(X_train, y_train, batch_size=batchSize, nb_epoch=1, verbose=0)
                state = new_state
                if volume == 0:
                    break

            # reduce exploration rate
            if epsilon > MIN_EXPLORATION_RATE:
                epsilon = MAX_EXPLORATION_RATE *   math.exp(- DECAY_RATE * i_episode)

            forced_trade = ""
            if info['forced'].values[0]:
                forced_trade = ", forced!"
            summary = "{:4d}/{}: epsilon={:5.3f}, acc_cost: {:0.5f}, steps: {} (t={}) {}\n"\
                    .format(i_episode+1, epochs, epsilon,  acc_cost, step, ots.t, forced_trade)
            if log_file is not None:
                log_file.write(summary)
            if min_costs > acc_cost:
                # New best episode: report it together with its action sequence.
                min_costs = acc_cost
                if log_file is not None:
                    log_file.write("   {}\n".format(action_history))

                print(summary)
                print("   {}".format(action_history))
                display(ots.history)
    finally:
        # Close the log even if training aborts. The original called log.close()
        # unconditionally and crashed with the default log=None.
        if log_file is not None:
            log_file.close()
    return model

In [None]:
# Small test configuration for the neural-network approach.
# Note: this rebinds the notebook-level V, T and P used by the cells above.
V=30
T=5
P=2

# The return value of train_RL is not captured here.
train_RL(V=V, T=T, P=P, epochs=300, verbose=False, log=True)

In [None]:
# NOTE(review): `model` is not assigned at notebook level by any visible cell (train_RL keeps
# it local), so this relies on state from elsewhere - confirm where `model` should come from.
visualize_Q(model, actions, V, T)

In [None]:
def run(V, T, P, epochs=1, overwrite_actions=None):
    """Trade one episode greedily with the notebook-level `model` and return the trade history.

    Relies on notebook-level names: ``model``, ``actions``, ``episode_windows``
    and ``STATE_DIM``.

    Args:
        V: Volume to trade.
        T: Number of trading periods.
        P: Number of orderbooks per period.
        epochs: How often the (identical) episode is replayed.
        overwrite_actions: Optional sequence of action values; entry ``i``
            replaces the model's choice in step ``i``.

    Returns:
        ``ots.history`` of the last replay, or ``None`` when ``epochs`` is 0.
    """
    history = None
    for i_episode in range(epochs):
        obs = episode_windows[0]  # Testcase with always the same, identical episode_window

        ots = OrderbookTradingSimulator(volume=V, tradingperiods=T, decisionfrequency=P)

        # Count the remaining time in periods, exactly as train_RL does when it builds
        # its states. The original started at P*T, which the model never saw in training.
        time_left = T
        volume = V

        state = np.array([time_left, volume])

        acc_cost = 0
        for step in range(T):
            qval = model.predict(state.reshape(1, STATE_DIM))

            # Greedy choice: Q-values are costs, so take the cheapest action.
            action = actions[np.argmin(qval)]

            if overwrite_actions and step < len(overwrite_actions):
                action = overwrite_actions[step]

            orderbooks = obs[step*P:(step+1)*P]
            info = ots.trade(orderbooks, agression_factor=action, verbose=False, extrainfo={'ACTION':action})

            time_left -= 1
            volume = ots.volume
            new_state = np.array([time_left, volume])
            cost = ots.history.cost.values[-1]

            acc_cost += cost

            state = new_state
            if volume == 0:
                break

        info = "{:4d}/{}: acc_cost: {:0.5f}, steps: {} (t={})"\
                .format(i_episode+1, epochs, acc_cost, step, ots.t)
        print(info)
        history = ots.history
    return history
# First run: actions chosen by the model.
hist = run(V=V, T=T, P=P)  #, overwrite_actions=[0,0,0,0,0,0,0,0,0])
display(hist)
# Second run: every step is forced to the fixed action value 0.24 for comparison.
hist = run(V=V, T=T, P=P, overwrite_actions=[0.24]*10)
display(hist)

In [None]:
# NOTE(review): `episode_windows` is not defined by any visible cell; only
# `episode_windows_train` / `episode_windows_val` exist. Confirm which set is meant.
# Index 90 also requires episodes of more than 90 orderbooks.
plot_episode(episode_windows[3], volume=50, figsize=(5,3))
episode_windows[3][90].plot(range_factor=1.015, figsize=(5,3))

In [None]:
def show_plots(x):
    """Widget callback placeholder: hands the selected value back unchanged.

    The per-orderbook plot call is currently disabled:
    episode_windows[3][x].plot(range_factor=1.015, figsize=(5,3))
    """
    selected = x
    return selected

# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from ipywidgets import interact
# Builds an interactive control for `x`, starting at 5, that calls show_plots on every change.
interact(show_plots, x=5)

In [None]:
# Compare, for several order sizes, the price of a market order with the naive
# estimate "volume times best ask" over a sequence of orderbooks.
# NOTE(review): `obs` is not assigned at notebook level in any visible cell (it only exists
# as a local inside functions), so this cell depends on leftover kernel state.
# NOTE(review): the x-limits (-1, 21) presumably date from 20-orderbook windows - verify.
print(len(obs))
for vol in [1, 50, 100]:
    market_order_price = []
    ask_price = []
    for i, ob in enumerate(obs):
        # Price for `vol` shares as reported by the orderbook itself.
        market_order_price.append(ob.get_current_price(volume=vol))
        # Reference: all `vol` shares at the best ask.
        ask_price.append(vol*ob.get_ask())

    plt.plot(market_order_price, color='blue', marker='o', label='market order price')
    plt.plot(ask_price, color='red', marker='*', label='ask price')



    plt.title("Market order price for {} shares".format(vol))
    plt.ylabel("price")
    plt.xlabel("time")
    plt.legend(loc='best')
    plt.xlim((-1,21))
    plt.show()