In [1]:
import pandas as pd
import zipfile
from slicer import slice_ticks
import os

PATH = '../../ticks/binance/'
SYMBOLS = ['BTCUSDT', 'ETHUSDT']
FREQ = ['10s', '1min', '5min', '10min']

In [2]:
def read_file(symbol, zip_name, freq):
    filename = PATH + f'{symbol}/{zip_name}'
    print(f'reading \'{filename}\'... ', end='')
    extention = filename[-4:]
    if extention == '.zip':
        csv_name = zip_name.replace('.zip', '.csv')
        zf = zipfile.ZipFile(filename) 
        df = pd.read_csv(zf.open(csv_name), header=None)
    elif extention == '.csv':        
        df = pd.read_csv(filename)
    else:
        print('Unsupported extention')
        return None

    no_idx = len(df.columns) == 6
    if no_idx:
        df.columns = ['id', 'price', 'size', 'quote_size', 'timestamp', 'is_sell']
    else:
        df.columns = ['idx', 'id', 'price', 'size', 'quote_size', 'timestamp', 'is_sell']

    df.drop_duplicates(subset='id', ignore_index=True, inplace=True)
    df.timestamp = pd.to_datetime(df.timestamp, unit='ms')
    df['side'] = 1 - (2 * df.is_sell.astype('int'))
    if no_idx:
        df.drop(columns=['id', 'quote_size', 'is_sell'], inplace=True)
    else:
        df.drop(columns=['idx', 'id', 'quote_size', 'is_sell'], inplace=True)
        
    if isinstance(freq, str):
        res = slice_ticks(df, freq)
        print(res.shape)
        return res
    elif isinstance(freq, list):
        res = {}
        for f in freq:
            res[f] = slice_ticks(df, f)

        print([(f, *res[f].shape) for f in freq])
        return res
    else:
        print('Unsupported freq type')
        return None

In [3]:
# temp = read_file('BTCUSDT', 'BTCUSDT-427085758-427134169.csv', FREQ)
# temp = read_file('BTCUSDT', 'BTCUSDT-trades-2021-12-31.zip', FREQ)

In [4]:
for symbol in SYMBOLS:
    dfs = [read_file(symbol, f, FREQ) for f in os.listdir(PATH + symbol)]
    # files = [
    #     'BTCUSDT-trades-2021-12-31.zip',
    #     'BTCUSDT-trades-2021-12-31.zip'
    # ]
    # dfs = [read_file(symbol, f, FREQ) for f in files]
    for f in FREQ:
        dfs_f = [d1[f] for d1 in dfs]
        df_all = pd.concat(dfs_f, ignore_index=True, axis=0) if len(dfs_f) > 0 else pd.DataFrame()
        df_all.sort_values(by='time', ignore_index=True, inplace=True)
        filename = f'../../bars/binance-{symbol}-{f}.csv'
        print(f'writing output to \'{filename}\'')
        df_all.to_csv(filename, date_format='%Y-%m-%d %H:%M:%S', index=False)

print('done')

reading '../../ticks/binance/BTCUSDT/BTCUSDT-trades-2021-12-30.zip'... [('10s', 8640, 19), ('1min', 1440, 19), ('5min', 288, 19), ('10min', 144, 19)]
reading '../../ticks/binance/BTCUSDT/BTCUSDT-trades-2021-12-31.zip'... [('10s', 8640, 19), ('1min', 1440, 19), ('5min', 288, 19), ('10min', 144, 19)]
reading '../../ticks/binance/BTCUSDT/BTCUSDT-trades-2021-08.zip'... [('10s', 267840, 19), ('1min', 44640, 19), ('5min', 8928, 19), ('10min', 4464, 19)]
reading '../../ticks/binance/BTCUSDT/BTCUSDT-trades-2021-09.zip'... [('10s', 259200, 19), ('1min', 43200, 19), ('5min', 8640, 19), ('10min', 4320, 19)]
reading '../../ticks/binance/BTCUSDT/BTCUSDT-trades-2022-01-27.zip'... [('10s', 8640, 19), ('1min', 1440, 19), ('5min', 288, 19), ('10min', 144, 19)]
reading '../../ticks/binance/BTCUSDT/BTCUSDT-555823694-555823695.csv'... [('10s', 3, 19), ('1min', 2, 19), ('5min', 2, 19), ('10min', 2, 19)]
reading '../../ticks/binance/BTCUSDT/BTCUSDT-trades-2022-01-26.zip'... [('10s', 8640, 19), ('1min', 1440