In [1]:
import numpy as np
import pandas as pd
import os
import glob
import random
import json
from pandas.io.json import json_normalize

In [2]:
# Directory holding the raw order-book CSV files; it is globbed for a sample
# file and walked recursively by the processing loop.
input_path = './lob-databot/binance-eth_btc/'

# data specification parameters
# max_level: number of levels per side used to build the CSV column names.
max_level = 100  # 0.1% price intervals x 100 -> +-10% price movement, each level has 4 features, bid/ask x price/size
# feature_level: number of levels (starting at level 1) kept for the analysis.
feature_level = 10  # -1% to 1% range

In [3]:
# Choose one file from the input directory at random for a quick look.
candidates = glob.glob(input_path + '/*')
input_file = random.choice(candidates)

# A non-empty override replaces the random pick.
override_input_file = ""
input_file = override_input_file or input_file
input_file

'./lob-databot/binance-eth_btc/binance_dataset_2021-02-24_2565920523.csv'

In [4]:
# Column names of the raw CSV: timestamp, last price, then every bid level
# (price, size) followed by every ask level (price, size).
lob_list = [
    side + field + str(level)
    for side in ('bid', 'ask')
    for level in range(1, max_level + 1)
    for field in ('price', 'size')
]
header_list = ['timestamp', 'last'] + lob_list

# Column order used for analysis: last price first, then level by level with
# bid and ask interleaved, restricted to the first `feature_level` levels.
new_lob_order = ['last'] + [
    side + field + str(level)
    for level in range(1, feature_level + 1)
    for side in ('bid', 'ask')
    for field in ('price', 'size')
]

In [5]:
def define_y_labels(y, prediction_period, band_size = 0.001):
    """Bucket the relative change of ``y`` into three movement classes.

    The change is ``y.pct_change(periods=prediction_period)``, i.e. each row
    is compared with the row ``prediction_period`` positions before it.

    Parameters
    ----------
    y : pandas.Series
        Series of values to label.
    prediction_period : int
        Row offset passed to ``pct_change``.
    band_size : float, optional
        Half-width of the "no movement" band around zero.

    Returns
    -------
    pandas.Series
        Categorical labels aligned with ``y``:
        0 for change <= -band_size, 1 for -band_size < change <= band_size,
        2 for change > band_size. Rows without a defined change (the first
        ``prediction_period`` rows) are NaN.
    """
    relative_change = y.pct_change(periods=prediction_period)
    edges = [-np.inf, -band_size, band_size, np.inf]
    return pd.cut(relative_change, bins=edges, labels=[0, 1, 2])

In [6]:
# Offsets passed as `periods` to pct_change when labelling; these are row
# offsets (presumably seconds if the data has one row per second -- TODO confirm).
delta_t = [600,300,100]
# Results of the processing loop:
# {file name without extension: {str(delta): JSON string of label shares}}
delta_cat_count_json = {}

In [7]:
for subdir, dirs, files in os.walk(input_path):
    for file in files:
        # Only CSV files are processed; skip everything else.
        if not file.endswith(".csv"):
            continue

        data_path = os.path.join(subdir, file)
        print('processing', data_path)

        # Results are keyed by the file name without its extension.
        filename = os.path.splitext(file)[0]
        per_delta = delta_cat_count_json[filename] = {}

        # Load the CSV with the generated headers and order rows by timestamp.
        raw_book = pd.read_csv(
            data_path, names=header_list, index_col='timestamp'
        ).sort_index()

        # Keep the analysis columns in their new order and drop incomplete rows.
        binance_ethbtc = raw_book[new_lob_order].dropna()
        last_price = binance_ethbtc['last']

        for t in delta_t:
            # Label the price change over t rows, then store the normalised
            # label frequencies as a JSON string under the delta's name.
            y = define_y_labels(last_price, t, band_size = 0.002)[t:]
            label_shares = y.value_counts(normalize=True)
            per_delta[str(t)] = label_shares.to_json()

processing ./lob-databot/binance-eth_btc/binance_dataset_2021-02-07_2377073425.csv
processing ./lob-databot/binance-eth_btc/binance_dataset_2021-02-08_2420569459.csv
processing ./lob-databot/binance-eth_btc/binance_dataset_2021-01-20_2206742115.csv
processing ./lob-databot/binance-eth_btc/binance_dataset_2021-02-11_2448256654.csv
processing ./lob-databot/binance-eth_btc/binance_dataset_2021-02-26_2570438093.csv
processing ./lob-databot/binance-eth_btc/binance_dataset_2021-02-20_2472772879.csv
processing ./lob-databot/binance-eth_btc/binance_dataset_2021-02-19_2472772879.csv
processing ./lob-databot/binance-eth_btc/binance_dataset_2021-02-08_2420520732.csv
processing ./lob-databot/binance-eth_btc/binance_dataset_2021-01-21_2230078401.csv
processing ./lob-databot/binance-eth_btc/binance_dataset_2021-02-22_2472772879.csv
processing ./lob-databot/binance-eth_btc/binance_dataset_2021-01-21_2229139167.csv
processing ./lob-databot/binance-eth_btc/binance_dataset_2021-02-17_2472772879.csv
proc

In [8]:
def unpack(x, label='1'):
    """Extract one label's share from a JSON-encoded label distribution.

    Parameters
    ----------
    x : str
        JSON object mapping label names to shares,
        e.g. ``'{"1":0.5,"0":0.3,"2":0.2}'``.
    label : str or int, optional
        Label to look up. Defaults to ``'1'``, the value the single-argument
        form has always returned. It is converted with ``str`` because JSON
        object keys are always strings.

    Returns
    -------
    float or None
        The value stored under ``label``; ``None`` when the JSON value is null.

    Raises
    ------
    KeyError
        If ``label`` is not a key of the decoded object.
    """
    return json.loads(x)[str(label)]

In [9]:
# The outer dict keys (file names) become columns on construction; transposing
# puts one file per row and one delta per column.
delta_cat_count_df = pd.DataFrame(delta_cat_count_json).transpose()

In [10]:
# Inspect the frame: one row per processed file, one column per delta.
delta_cat_count_df

Unnamed: 0,600,300,100
binance_dataset_2021-02-07_2377073425,"{""1"":0.5223407491,""0"":0.2639834374,""2"":0.21367...","{""1"":0.6644054363,""0"":0.1799776208,""2"":0.15561...","{""1"":0.8745028723,""0"":0.066620927,""2"":0.058876..."
binance_dataset_2021-02-08_2420569459,"{""0"":0.3656337279,""1"":0.3196967682,""2"":0.31466...","{""1"":0.4574218235,""0"":0.2867132867,""2"":0.25586...","{""1"":0.7124557028,""0"":0.1498096863,""2"":0.13773..."
binance_dataset_2021-01-20_2206742115,"{""1"":0.4294554615,""0"":0.3062592856,""2"":0.26428...","{""1"":0.5946464191,""0"":0.2093172897,""2"":0.19603...","{""1"":0.8440279828,""2"":0.0807778705,""0"":0.07519..."
binance_dataset_2021-02-11_2448256654,"{""1"":0.5360197818,""0"":0.256760448,""2"":0.207219...","{""1"":0.7013842168,""0"":0.1601941053,""2"":0.13842...","{""1"":0.8880657731,""0"":0.0601778501,""2"":0.05175..."
binance_dataset_2021-02-26_2570438093,"{""1"":0.5169391471,""2"":0.2443119284,""0"":0.23874...","{""1"":0.6602351163,""0"":0.1723390289,""2"":0.16742...","{""1"":0.8626988687,""2"":0.0705327601,""0"":0.06676..."
...,...,...,...
binance_dataset_2021-02-09_2420569459,"{""1"":0.4360518729,""0"":0.2918840681,""2"":0.27206...","{""1"":0.5475218388,""0"":0.2269253586,""2"":0.22555...","{""1"":0.7959406297,""2"":0.1043506516,""0"":0.09970..."
binance_dataset_2021-02-08_2420193864,"{""0"":0.7165354331,""2"":0.2204724409,""1"":0.06299...","{""0"":0.6496815287,""2"":0.225477707,""1"":0.124840...","{""0"":0.5305084746,""1"":0.2813559322,""2"":0.18813..."
binance_dataset_2021-02-10_2420569459,"{""1"":0.5679113281,""2"":0.2281269344,""0"":0.20396...","{""1"":0.7024057543,""2"":0.1583118984,""0"":0.13928...","{""1"":0.8570466448,""2"":0.0752679432,""0"":0.06768..."
binance_dataset_2021-02-25_2570438093,"{""1"":0.6178113014,""0"":0.232404957,""2"":0.149783...","{""1"":0.765674919,""0"":0.1317889796,""2"":0.102536...","{""1"":0.9322299106,""0"":0.0352469372,""2"":0.03252..."


In [11]:
# Rebuild from the raw per-file JSON results rather than transforming
# delta_cat_count_df in place. This keeps the cell safe to re-run: applying
# unpack to values that were already unpacked would fail inside json.loads.
delta_cat_count_df = pd.DataFrame(delta_cat_count_json).transpose().applymap(unpack)

In [12]:
# Summary statistics across files, one column per delta.
delta_cat_count_df.describe()

Unnamed: 0,600,300,100
count,61.0,62.0,63.0
mean,0.545207,0.676948,0.87241
std,0.136384,0.158705,0.101887
min,0.062992,0.0,0.281356
25%,0.443928,0.611331,0.842917
50%,0.560117,0.703667,0.888066
75%,0.638608,0.782084,0.936586
max,0.799641,0.898723,0.971982


In [13]:
# Persist the result. The target directory is created first so the write does
# not fail when ./lob-databot/analysis/ does not exist yet.
# The 0.002 in the file name mirrors the band_size used in the processing loop.
output_file = './lob-databot/analysis/df_0.002_delta.pkl'
os.makedirs(os.path.dirname(output_file), exist_ok=True)
delta_cat_count_df.to_pickle(output_file)