01_task_NW_load_data.ipynb <<<<< we're here

02_task_NW_strategies.ipynb

03_task_NW_EDA.ipynb

04_task_NW_backtest.ipynb

------

Loading data from text format

------


# init

In [None]:
import numpy as np
import re
import gzip
import pickle
import matplotlib.pyplot as plt

# func

In [None]:
def load_data(PATH, MAX_DEPTH=10):
    """Parse a gzipped text log of order-book snapshots and trades into arrays.

    The file is read twice: once to count records so the arrays can be
    pre-allocated, and once to fill them.

    A snapshot starts at a line matching ``type: OrderbookSnapshot`` and is
    followed by ``Ask price: <p> volume: <v>`` / ``Bid price: <p> volume: <v>``
    level lines. A ``type: Trades`` line carries a bracketed list of
    ``{Bid|Ask <price> <volume>}`` entries; only the FIRST entry of each line
    is stored.

    Args:
        PATH: path to the gzip-compressed, UTF-8 text file.
        MAX_DEPTH: number of best price levels kept per side of the book.

    Returns:
        A 6-tuple ``(arr, receive_ms, server_ms, trades, receive_ms_trades,
        server_ms_trades)``:

        * ``arr`` -- float32, shape ``(N_snap, 2, 2, MAX_DEPTH)``.
          Axis 1: 0 = ask, 1 = bid. Axis 2: 0 = price, 1 = volume.
          Asks are sorted by ascending price, bids by descending price;
          missing levels are NaN.
        * ``receive_ms`` / ``receive_ms_trades`` -- int64 receive times.
          NOTE: despite the ``_ms`` name the value is in MICROSECONDS since
          the epoch (seconds * 1_000_000). The text stamp is parsed as a
          naive datetime, so it is interpreted in the machine's local
          timezone.
        * ``server_ms`` / ``server_ms_trades`` -- int64 ``server_time``
          field copied verbatim from the header line.
        * ``trades`` -- float32, shape ``(N_trd, 3)`` with columns
          ``[side_id, price, volume]`` where side_id is 0 = Ask, 1 = Bid.
          A Trades line without a parsable entry yields a row of NaN.

    Raises:
        ValueError: if a level line or a receive-time stamp is malformed.
    """
    # Imported here because the notebook's import cell does not provide
    # `datetime`; without it the first header line raised NameError.
    import datetime

    # regex patterns for snapshots and trades
    header_re = re.compile(
        r'^(?P<receive_time>\d{2}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+) '
        r'type: OrderbookSnapshot .*? '
        r'server_time: (?P<server_time>\d+) '
    )
    trades_header_re = re.compile(
        r'^(?P<receive_time>\d{2}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+) '
        r'type: Trades .*? '
        r'server_time: (?P<server_time>\d+) '
        r'trades: \[(?P<trades>.*)\]'
    )
    trade_entry_re = re.compile(
        r'\{\s*(?P<side>Bid|Ask)\s+'
        r'(?P<price>\d+\.\d+)\s+'
        r'(?P<volume>\d+\.\d+)\s*\}'
    )

    ASK_PREF = 'Ask price: '
    BID_PREF = 'Bid price: '
    VOL_SEP = ' volume: '

    def _receive_time_us(stamp):
        """Convert a 'yy-mm-dd HH:MM:SS.ffffff' stamp to integer microseconds."""
        dt = datetime.datetime.strptime(stamp, '%y-%m-%d %H:%M:%S.%f')
        return int(dt.timestamp() * 1_000_000)

    # FIRST PASS: count snapshots and trade lines so arrays can be pre-sized
    N_snap = 0
    N_trd = 0
    with gzip.open(PATH, 'rt', encoding='utf-8') as f:
        for line in f:
            if header_re.match(line):
                N_snap += 1
            elif trades_header_re.match(line):
                # each "type: Trades" line may contain several trades,
                # but it is treated as one record per line
                N_trd += 1

    # arrays for snapshots (NaN marks levels that are absent from the book)
    receive_ms = np.empty(N_snap, dtype=np.int64)
    server_ms = np.empty(N_snap, dtype=np.int64)
    arr = np.full((N_snap, 2, 2, MAX_DEPTH), np.nan, dtype=np.float32)

    # arrays for trades
    receive_ms_trades = np.empty(N_trd, dtype=np.int64)
    server_ms_trades = np.empty(N_trd, dtype=np.int64)
    # columns: [side_id, price, volume], where side_id: 0=Ask, 1=Bid.
    # NaN-filled so a line with no parsable entry does not expose
    # uninitialised memory.
    trades = np.full((N_trd, 3), np.nan, dtype=np.float32)

    def _store_side(idx, side, levels, descending):
        """Write the best MAX_DEPTH (price, volume) levels of one book side."""
        best = sorted(levels, key=lambda lv: lv[0], reverse=descending)[:MAX_DEPTH]
        if best:
            # rows of `best` are (price, volume); transpose to (2, n_levels)
            arr[idx, side, :, :len(best)] = np.asarray(best).T

    def _flush_snapshot(idx):
        """Commit the buffered ask/bid levels to snapshot `idx`."""
        _store_side(idx, 0, ask_buf, descending=False)
        _store_side(idx, 1, bid_buf, descending=True)

    # SECOND PASS: fill data while reading the file
    snap_idx = -1
    trd_idx = -1
    ask_buf = []
    bid_buf = []

    with gzip.open(PATH, 'rt', encoding='utf-8') as f:
        for line in f:
            # snapshot header
            m = header_re.match(line)
            if m:
                # levels buffered so far belong to the previous snapshot
                if snap_idx >= 0:
                    _flush_snapshot(snap_idx)

                # start a new snapshot
                snap_idx += 1
                ask_buf.clear()
                bid_buf.clear()

                receive_ms[snap_idx] = _receive_time_us(m.group('receive_time'))
                server_ms[snap_idx] = int(m.group('server_time'))
                continue

            # trade line
            mt = trades_header_re.match(line)
            if mt:
                trd_idx += 1
                receive_ms_trades[trd_idx] = _receive_time_us(mt.group('receive_time'))
                server_ms_trades[trd_idx] = int(mt.group('server_time'))

                # only the first trade entry of the line is stored
                m_entry = trade_entry_re.search(mt.group('trades'))
                if m_entry:
                    side_id = 1 if m_entry.group('side') == 'Bid' else 0
                    trades[trd_idx, :] = (
                        side_id,
                        float(m_entry.group('price')),
                        float(m_entry.group('volume')),
                    )
                continue

            # order book levels for the current snapshot
            if line.startswith(ASK_PREF):
                price_str, vol_str = line[len(ASK_PREF):].split(VOL_SEP)
                ask_buf.append((float(price_str), float(vol_str)))
            elif line.startswith(BID_PREF):
                price_str, vol_str = line[len(BID_PREF):].split(VOL_SEP)
                bid_buf.append((float(price_str), float(vol_str)))

        # the last snapshot has no following header to trigger its flush
        if snap_idx >= 0:
            _flush_snapshot(snap_idx)

    return arr, receive_ms, server_ms, trades, receive_ms_trades, server_ms_trades

# deribit

In [None]:
# Load the Deribit BTC-PERPETUAL log and plot it as a quick sanity check.
MAX_DEPTH = 10
PATH = "/content/drive/MyDrive/Colab Notebooks/data/test01/deribit_BTC-PERPETUAL_20230130_depth10.txt.gz"

# `load_data` is the loader defined in this notebook; the previous call used
# the undefined name `load_data_new` and raised NameError.
snapshots_D, snapshots_receive_ms_D, snapshots_server_ms_D, trades_D, trades_receive_ms_D, trades_server_ms_D = load_data(PATH, MAX_DEPTH=MAX_DEPTH)

plt.figure(figsize=(15,5))
# top panel: best ask price (side 0, price row, level 0) over receive time
plt.subplot(2, 1, 1)
plt.plot(snapshots_receive_ms_D, snapshots_D[:,0,0,0], color='red')
# bottom panel: trade price (column 1) over receive time
plt.subplot(2, 1, 2)
plt.plot(trades_receive_ms_D, trades_D[:,1], color='red')
plt.show()

In [None]:
# Persist every Deribit array to its own pickle file on Drive.
for target, payload in (
    ('/content/drive/MyDrive/Colab Notebooks/data/test01/snapshots_D.pkl', snapshots_D),
    ('/content/drive/MyDrive/Colab Notebooks/data/test01/snapshots_receive_ms_D.pkl', snapshots_receive_ms_D),
    ('/content/drive/MyDrive/Colab Notebooks/data/test01/snapshots_server_ms_D.pkl', snapshots_server_ms_D),
    ('/content/drive/MyDrive/Colab Notebooks/data/test01/trades_D.pkl', trades_D),
    ('/content/drive/MyDrive/Colab Notebooks/data/test01/trades_receive_ms_D.pkl', trades_receive_ms_D),
    ('/content/drive/MyDrive/Colab Notebooks/data/test01/trades_server_ms_D.pkl', trades_server_ms_D),
):
    with open(target, 'wb') as file:
        pickle.dump(payload, file)

# bitmex

In [None]:
# Load the BitMEX XBTUSD log and plot it as a quick sanity check.
MAX_DEPTH = 10
PATH = "/content/drive/MyDrive/Colab Notebooks/data/test01/bitmex_XBTUSD_20230130_depth10.txt.gz"

# `load_data` is the loader defined in this notebook; the previous call used
# the undefined name `load_data_new` and raised NameError.
snapshots_B, snapshots_receive_ms_B, snapshots_server_ms_B, trades_B, trades_receive_ms_B, trades_server_ms_B = load_data(PATH, MAX_DEPTH=MAX_DEPTH)

plt.figure(figsize=(15,5))
# top panel: best ask price (side 0, price row, level 0) over receive time
plt.subplot(2, 1, 1)
plt.plot(snapshots_receive_ms_B, snapshots_B[:,0,0,0], color='red')
# bottom panel: trade price (column 1) over receive time
plt.subplot(2, 1, 2)
plt.plot(trades_receive_ms_B, trades_B[:,1], color='red')
plt.show()

In [None]:
# Persist every BitMEX array to its own pickle file on Drive.
for target, payload in (
    ('/content/drive/MyDrive/Colab Notebooks/data/test01/snapshots_B.pkl', snapshots_B),
    ('/content/drive/MyDrive/Colab Notebooks/data/test01/snapshots_receive_ms_B.pkl', snapshots_receive_ms_B),
    ('/content/drive/MyDrive/Colab Notebooks/data/test01/snapshots_server_ms_B.pkl', snapshots_server_ms_B),
    ('/content/drive/MyDrive/Colab Notebooks/data/test01/trades_B.pkl', trades_B),
    ('/content/drive/MyDrive/Colab Notebooks/data/test01/trades_receive_ms_B.pkl', trades_receive_ms_B),
    ('/content/drive/MyDrive/Colab Notebooks/data/test01/trades_server_ms_B.pkl', trades_server_ms_B),
):
    with open(target, 'wb') as file:
        pickle.dump(payload, file)