In [1]:
import re
import s3fs
import pandas as pd
from datetime import datetime, timedelta

In [2]:
endpoint_url = "http://us-east-1.linodeobjects.com"
bucket = "crypto-data"
book1_dir = "book-lvl1"
book2_dir = "book-lvl2-s1"
stat_dir = "stat"
ticker_dir = "ticker"
trade_dir = "trade"
candle_dir = "candle"

In [4]:
fs_client_kwargs = {
    "endpoint_url" : endpoint_url,
    "aws_access_key_id" : bcs_key,
    "aws_secret_access_key" : bcs_secret,
}

In [5]:
fs = s3fs.S3FileSystem(client_kwargs = fs_client_kwargs)

In [6]:
### obtain all filenames

In [7]:
book1_files = fs.ls("{}/{}".format(bucket, book1_dir))

In [8]:
book2_files = fs.ls("{}/{}".format(bucket, book2_dir))

In [9]:
stat_files = fs.ls("{}/{}".format(bucket, stat_dir))

In [10]:
ticker_files = fs.ls("{}/{}".format(bucket, ticker_dir))

In [11]:
trade_files = fs.ls("{}/{}".format(bucket, trade_dir))

In [12]:
#### Utility functions

In [13]:
def print_epoch(epoch):
    return datetime.fromtimestamp(epoch).strftime("%Y-%m-%dT%H:%M:%S")

In [14]:
def print_date_ranges(ranges):
    for r in ranges:
        print("({} - {})".format(print_epoch(r[0]), print_epoch(r[1])))

In [15]:
### find irregular file epoch ranges

In [16]:
def find_file_epoch_dist(files, prefix, dist):
    short_epoch_dist_files = []
    for file in files:
        result = re.match(r'^crypto-data/{}_[0-9]+_([0-9]+)_([0-9]+).parquet$'.format(prefix), file)
        if result is None:
            print("bad filename {}".format(file))
            continue
        se = int(result.group(1))
        ee = int(result.group(2))
        if ee - se != dist:
            short_epoch_dist_files.append(file)
    return short_epoch_dist_files

In [17]:
book1_bad_files = find_file_epoch_dist(book1_files, 'book-lvl1/book_level1', 21600)
book1_bad_files

[]

In [18]:
book2_bad_files = find_file_epoch_dist(book2_files, 'book-lvl2-s1/book_level2', 21600)
book2_bad_files

[]

In [19]:
stat_bad_files = find_file_epoch_dist(stat_files, 'stat/stat_exchange', 21600)
stat_bad_files

[]

In [20]:
ticker_bad_files = find_file_epoch_dist(ticker_files, 'ticker/ticker_exchange', 21600)
ticker_bad_files

[]

In [21]:
trade_bad_files = find_file_epoch_dist(trade_files, 'trade/trade_exchange', 21600)
trade_bad_files

[]

In [22]:
### Check File Time continuity

In [23]:
def get_epoch_pairs(files, prefix, exid):
    epochs = []
    for file in files:
        result = re.match(r'^crypto-data/{}_{}_([0-9]+)_([0-9]+).parquet$'.format(prefix, exid), file)
        if result is None:
            continue
        se = int(result.group(1))
        ee = int(result.group(2))
        epochs.append((se, ee))
    return epochs

In [24]:
def get_coverage_epoch(epochs):
    ranges = []
    if not epochs:
        return ranges
    epochs.sort(key=lambda x: x[0])
    start_epoch = epochs[0][0]
    end_epoch = epochs[0][1]
    for i in range(1, len(epochs)):
        if epochs[i][0] > end_epoch:
            ranges.append((start_epoch, end_epoch))
            start_epoch = epochs[i][0]
            end_epoch = epochs[i][1]
        else:
            end_epoch = epochs[i][1]
    ranges.append((start_epoch, end_epoch))
    return ranges

In [25]:
book10_range = get_coverage_epoch(get_epoch_pairs(book1_files, 'book-lvl1/book_level1', 0))

In [26]:
print_date_ranges(book10_range)

(2022-04-29T20:00:00 - 2022-05-03T02:00:00)
(2022-06-11T08:00:00 - 2022-06-13T20:00:00)
(2022-06-23T14:00:00 - 2022-06-25T08:00:00)
(2022-07-02T14:00:00 - 2022-07-03T14:00:00)
(2022-07-09T20:00:00 - 2022-07-10T02:00:00)
(2022-07-17T14:00:00 - 2022-11-09T19:00:00)


In [27]:
book11_range = get_coverage_epoch(get_epoch_pairs(book1_files, 'book-lvl1/book_level1', 1))

In [28]:
print_date_ranges(book11_range)

(2022-04-29T20:00:00 - 2022-05-03T02:00:00)
(2022-06-11T08:00:00 - 2022-06-12T08:00:00)
(2022-07-02T14:00:00 - 2022-07-04T14:00:00)
(2022-07-17T14:00:00 - 2022-08-12T20:00:00)
(2022-08-13T02:00:00 - 2022-11-09T19:00:00)


In [29]:
book20_range = get_coverage_epoch(get_epoch_pairs(book2_files, 'book-lvl2-s1/book_level2', 0))

In [30]:
print_date_ranges(book20_range)

(2022-10-23T08:00:00 - 2022-10-23T14:00:00)
(2022-10-24T02:00:00 - 2022-10-24T08:00:00)
(2022-10-25T08:00:00 - 2022-10-25T14:00:00)
(2022-10-27T14:00:00 - 2022-10-27T20:00:00)
(2022-10-28T08:00:00 - 2022-11-05T08:00:00)
(2022-11-06T13:00:00 - 2022-11-09T19:00:00)


In [31]:
book21_range = get_coverage_epoch(get_epoch_pairs(book2_files, 'book-lvl2-s1/book_level2', 1))

In [32]:
print_date_ranges(book21_range)

(2022-10-23T08:00:00 - 2022-10-23T14:00:00)
(2022-10-24T02:00:00 - 2022-10-24T08:00:00)
(2022-10-25T08:00:00 - 2022-10-25T14:00:00)
(2022-10-27T14:00:00 - 2022-10-27T20:00:00)
(2022-10-28T08:00:00 - 2022-11-09T19:00:00)


In [33]:
stat0_range = get_coverage_epoch(get_epoch_pairs(stat_files, 'stat/stat_exchange', 0))

In [34]:
print_date_ranges(stat0_range)

(2022-05-01T08:00:00 - 2022-05-09T02:00:00)
(2022-06-11T08:00:00 - 2022-06-14T14:00:00)
(2022-06-23T20:00:00 - 2022-06-29T08:00:00)
(2022-07-02T14:00:00 - 2022-07-06T02:00:00)
(2022-07-09T20:00:00 - 2022-07-13T02:00:00)
(2022-07-17T14:00:00 - 2022-07-26T02:00:00)
(2022-07-31T14:00:00 - 2022-08-04T02:00:00)
(2022-08-07T14:00:00 - 2022-09-27T08:00:00)
(2022-09-28T20:00:00 - 2022-10-27T20:00:00)
(2022-10-28T08:00:00 - 2022-11-09T19:00:00)


In [35]:
stat1_range = get_coverage_epoch(get_epoch_pairs(stat_files, 'stat/stat_exchange', 1))

In [36]:
print_date_ranges(stat1_range)

(2022-05-01T08:00:00 - 2022-05-03T02:00:00)
(2022-06-11T08:00:00 - 2022-06-14T08:00:00)
(2022-06-23T20:00:00 - 2022-06-27T08:00:00)
(2022-07-02T14:00:00 - 2022-07-06T02:00:00)
(2022-07-09T20:00:00 - 2022-07-11T14:00:00)
(2022-07-17T14:00:00 - 2022-10-28T08:00:00)
(2022-10-30T14:00:00 - 2022-11-09T19:00:00)


In [37]:
ticker0_range = get_coverage_epoch(get_epoch_pairs(ticker_files, 'ticker/ticker_exchange', 0))

In [38]:
print_date_ranges(ticker0_range)

(2022-05-01T08:00:00 - 2022-05-02T20:00:00)
(2022-06-11T08:00:00 - 2022-06-13T08:00:00)
(2022-06-23T20:00:00 - 2022-06-25T02:00:00)
(2022-07-02T14:00:00 - 2022-07-06T02:00:00)
(2022-07-09T20:00:00 - 2022-07-13T20:00:00)
(2022-07-17T14:00:00 - 2022-10-05T14:00:00)
(2022-10-09T08:00:00 - 2022-11-09T19:00:00)


In [39]:
ticker1_range = get_coverage_epoch(get_epoch_pairs(ticker_files, 'ticker/ticker_exchange', 1))

In [40]:
print_date_ranges(ticker1_range)

(2022-05-01T08:00:00 - 2022-05-02T08:00:00)
(2022-06-11T08:00:00 - 2022-06-14T08:00:00)
(2022-06-23T20:00:00 - 2022-06-28T08:00:00)
(2022-07-02T14:00:00 - 2022-07-04T20:00:00)
(2022-07-09T20:00:00 - 2022-07-14T08:00:00)
(2022-07-17T14:00:00 - 2022-11-07T13:00:00)


In [41]:
trade0_range = get_coverage_epoch(get_epoch_pairs(trade_files, 'trade/trade_exchange', 0))

In [42]:
print_date_ranges(trade0_range)

(2022-08-13T20:00:00 - 2022-09-12T20:00:00)
(2022-09-18T20:00:00 - 2022-10-16T02:00:00)
(2022-10-17T20:00:00 - 2022-11-08T13:00:00)
(2022-11-08T19:00:00 - 2022-11-09T07:00:00)
(2022-11-09T13:00:00 - 2022-11-09T19:00:00)


In [43]:
trade1_range = get_coverage_epoch(get_epoch_pairs(trade_files, 'trade/trade_exchange', 1))

In [44]:
print_date_ranges(trade1_range)

(2022-08-13T20:00:00 - 2022-09-12T20:00:00)
(2022-09-18T20:00:00 - 2022-11-09T19:00:00)


In [45]:
### Check As Of Time Continuity

In [46]:
#TODO: something is wrong

In [46]:
sequence_datetime_format = '%Y-%m-%dT%H:%M:%S.%f %Z'

In [47]:
def get_file_epoch_pairs(files, prefix, exid):
    ranges = []
    for file in files:
        result = re.match(r'^crypto-data/{}_{}_([0-9]+)_([0-9]+).parquet$'.format(prefix, exid), file)
        if result is None:
            continue
        se = int(result.group(1))
        ee = int(result.group(2))
        ranges.append((file, se, ee))
    return ranges

In [48]:
def get_file_asoftime_ranges(files_epochs):
    files_epochs.sort(key=lambda x : x[1])
    start_datetime = None
    end_datetime = None
    ranges = []
    for (file, se, ee) in files_epochs:
        with fs.open(file, 'rb') as fd:
            df = pd.read_parquet(fd)
            ag = df['sequence_time'].agg(['min', 'max'])
            if start_datetime is None:
                start_datetime = datetime.strptime(ag[0], sequence_datetime_format)
                end_datetime = datetime.strptime(ag[1], sequence_datetime_format)
            elif datetime.strptime(ag[0], sequence_datetime_format) > end_datetime + timedelta(seconds=1):
                ranges.append((start_datetime, end_datetime))
                start_datetime = datetime.strptime(ag[0], sequence_datetime_format)
                end_datetime = datetime.strptime(ag[1], sequence_datetime_format)
            else:
                end_datetime = datetime.strptime(ag[1], sequence_datetime_format)
    return ranges

In [49]:
book20_asoftime_range = get_file_asoftime_ranges(get_file_epoch_pairs(book2_files, 'book-lvl2-s1/book_level2', 0))

In [50]:
book20_asoftime_range

[(datetime.datetime(2022, 10, 23, 15, 20, 21),
  datetime.datetime(2022, 10, 23, 17, 59, 59)),
 (datetime.datetime(2022, 10, 24, 11, 22, 6),
  datetime.datetime(2022, 10, 24, 11, 59, 59)),
 (datetime.datetime(2022, 10, 25, 12, 49, 31),
  datetime.datetime(2022, 10, 25, 17, 59, 59)),
 (datetime.datetime(2022, 10, 27, 19, 57, 42),
  datetime.datetime(2022, 10, 27, 23, 59, 59)),
 (datetime.datetime(2022, 10, 28, 13, 38, 44),
  datetime.datetime(2022, 10, 31, 23, 59, 59))]

In [51]:
book21_asoftime_range = get_file_asoftime_ranges(get_file_epoch_pairs(book2_files, 'book-lvl2-s1/book_level2', 1))

In [52]:
book21_asoftime_range

[(datetime.datetime(2022, 10, 23, 15, 33, 1),
  datetime.datetime(2022, 10, 23, 17, 59, 59)),
 (datetime.datetime(2022, 10, 24, 11, 24, 16),
  datetime.datetime(2022, 10, 24, 11, 59, 59)),
 (datetime.datetime(2022, 10, 25, 12, 51, 45),
  datetime.datetime(2022, 10, 25, 17, 59, 59)),
 (datetime.datetime(2022, 10, 27, 19, 58, 22),
  datetime.datetime(2022, 10, 27, 23, 59, 59)),
 (datetime.datetime(2022, 10, 28, 13, 32, 55),
  datetime.datetime(2022, 10, 28, 17, 59, 59)),
 (datetime.datetime(2022, 10, 28, 22, 0, 9),
  datetime.datetime(2022, 10, 28, 23, 59, 59)),
 (datetime.datetime(2022, 10, 29, 4, 11, 3),
  datetime.datetime(2022, 10, 29, 5, 59, 59)),
 (datetime.datetime(2022, 10, 29, 10, 3, 9),
  datetime.datetime(2022, 10, 29, 11, 59, 59)),
 (datetime.datetime(2022, 10, 29, 15, 45, 42),
  datetime.datetime(2022, 10, 29, 17, 59, 59)),
 (datetime.datetime(2022, 10, 29, 22, 16, 48),
  datetime.datetime(2022, 10, 29, 23, 59, 59)),
 (datetime.datetime(2022, 10, 30, 4, 0, 7),
  datetime.dat

In [53]:
len(book21_asoftime_range)

22

In [54]:
book10_asoftime_range = get_file_asoftime_ranges(get_file_epoch_pairs(book1_files, 'book-lvl1/book_level1', 0))

In [55]:
book10_asoftime_range

[(datetime.datetime(2022, 4, 30, 0, 52),
  datetime.datetime(2022, 5, 3, 5, 59, 59)),
 (datetime.datetime(2022, 6, 11, 16, 3),
  datetime.datetime(2022, 6, 13, 20, 12, 49)),
 (datetime.datetime(2022, 6, 23, 23, 57),
  datetime.datetime(2022, 6, 23, 23, 59, 59)),
 (datetime.datetime(2022, 6, 24, 0, 51),
  datetime.datetime(2022, 6, 25, 11, 59, 59)),
 (datetime.datetime(2022, 7, 2, 19, 56),
  datetime.datetime(2022, 7, 3, 15, 6, 6)),
 (datetime.datetime(2022, 7, 10, 0, 19),
  datetime.datetime(2022, 7, 10, 5, 59, 59)),
 (datetime.datetime(2022, 7, 17, 21, 56),
  datetime.datetime(2022, 7, 19, 5, 59, 41)),
 (datetime.datetime(2022, 7, 19, 6, 1, 21),
  datetime.datetime(2022, 7, 19, 11, 58, 27)),
 (datetime.datetime(2022, 7, 19, 12, 3, 27),
  datetime.datetime(2022, 7, 25, 23, 59, 35)),
 (datetime.datetime(2022, 7, 26, 0, 1, 15),
  datetime.datetime(2022, 7, 29, 23, 58, 20))]

In [56]:
book11_asoftime_range = get_file_asoftime_ranges(get_file_epoch_pairs(book1_files, 'book-lvl1/book_level1', 1))

In [57]:
book11_asoftime_range

[(datetime.datetime(2022, 4, 30, 0, 57),
  datetime.datetime(2022, 5, 1, 5, 58, 46)),
 (datetime.datetime(2022, 5, 1, 6, 0, 26),
  datetime.datetime(2022, 5, 1, 23, 58, 25)),
 (datetime.datetime(2022, 5, 2, 0, 0, 5),
  datetime.datetime(2022, 5, 3, 5, 59, 59)),
 (datetime.datetime(2022, 6, 11, 16, 5),
  datetime.datetime(2022, 6, 12, 11, 59, 59)),
 (datetime.datetime(2022, 7, 2, 19, 58),
  datetime.datetime(2022, 7, 4, 17, 59, 59)),
 (datetime.datetime(2022, 7, 17, 21, 58),
  datetime.datetime(2022, 7, 20, 23, 59, 22)),
 (datetime.datetime(2022, 7, 21, 0, 1, 2),
  datetime.datetime(2022, 7, 22, 17, 59, 16)),
 (datetime.datetime(2022, 7, 22, 18, 0, 56),
  datetime.datetime(2022, 8, 2, 23, 58, 53)),
 (datetime.datetime(2022, 8, 3, 0, 0, 33),
  datetime.datetime(2022, 8, 8, 5, 59, 33)),
 (datetime.datetime(2022, 8, 8, 6, 1, 13),
  datetime.datetime(2022, 8, 12, 23, 59, 59))]

In [None]:
### read each file and do row count statistics

In [61]:
def get_df_row_count(files, prefix, exid):
    rowcounts = []
    for file in files:
        result = re.match(r'^crypto-data/{}_{}_[0-9]+_[0-9]+.parquet$'.format(prefix, exid), file)
        if result is None:
            continue
        with fs.open(file, 'rb') as fd:
            df = pd.read_parquet(fd)
            rowcounts.append((file, len(df)))
    return rowcounts

In [62]:
book20_rowcounts = get_df_row_count(book2_files, 'book-lvl2-s1/book_level2', 0)

In [63]:
book20_rowcounts

[('crypto-data/book-lvl2-s1/book_level2_0_1666526400_1666548000.parquet',
  9557),
 ('crypto-data/book-lvl2-s1/book_level2_0_1666591200_1666612800.parquet',
  2274),
 ('crypto-data/book-lvl2-s1/book_level2_0_1666699200_1666720800.parquet',
  18526),
 ('crypto-data/book-lvl2-s1/book_level2_0_1666893600_1666915200.parquet',
  14532),
 ('crypto-data/book-lvl2-s1/book_level2_0_1666958400_1666980000.parquet',
  15671),
 ('crypto-data/book-lvl2-s1/book_level2_0_1666980000_1667001600.parquet',
  21592),
 ('crypto-data/book-lvl2-s1/book_level2_0_1667001600_1667023200.parquet',
  21577),
 ('crypto-data/book-lvl2-s1/book_level2_0_1667023200_1667044800.parquet',
  21600),
 ('crypto-data/book-lvl2-s1/book_level2_0_1667044800_1667066400.parquet',
  21141),
 ('crypto-data/book-lvl2-s1/book_level2_0_1667066400_1667088000.parquet',
  19632),
 ('crypto-data/book-lvl2-s1/book_level2_0_1667088000_1667109600.parquet',
  21330),
 ('crypto-data/book-lvl2-s1/book_level2_0_1667109600_1667131200.parquet',
  21

In [64]:
book21_rowcounts = get_df_row_count(book2_files, 'book-lvl2-s1/book_level2', 1)

In [65]:
book21_rowcounts

[('crypto-data/book-lvl2-s1/book_level2_1_1666526400_1666548000.parquet',
  8804),
 ('crypto-data/book-lvl2-s1/book_level2_1_1666591200_1666612800.parquet',
  2144),
 ('crypto-data/book-lvl2-s1/book_level2_1_1666699200_1666720800.parquet',
  18487),
 ('crypto-data/book-lvl2-s1/book_level2_1_1666893600_1666915200.parquet',
  14494),
 ('crypto-data/book-lvl2-s1/book_level2_1_1666958400_1666980000.parquet',
  8953),
 ('crypto-data/book-lvl2-s1/book_level2_1_1666980000_1667001600.parquet',
  7191),
 ('crypto-data/book-lvl2-s1/book_level2_1_1667001600_1667023200.parquet',
  6537),
 ('crypto-data/book-lvl2-s1/book_level2_1_1667023200_1667044800.parquet',
  7011),
 ('crypto-data/book-lvl2-s1/book_level2_1_1667044800_1667066400.parquet',
  8058),
 ('crypto-data/book-lvl2-s1/book_level2_1_1667066400_1667088000.parquet',
  6192),
 ('crypto-data/book-lvl2-s1/book_level2_1_1667088000_1667109600.parquet',
  7193),
 ('crypto-data/book-lvl2-s1/book_level2_1_1667109600_1667131200.parquet',
  7173),
 (

In [67]:
len(book21_rowcounts)

22

In [None]:
### read each file and check for NaN

In [70]:
def get_df_nan_count(files, prefix, exid):
    nancounts = []
    for file in files:
        result = re.match(r'^crypto-data/{}_{}_[0-9]+_[0-9]+.parquet$'.format(prefix, exid), file)
        if result is not None:
            continue
        with fs.open(file, 'rb') as fd:
            df = pd.read_parquet(fd)
            nan_count = df.isna().sum().sum()
            nancounts.append((file, nan_count))
    return nancounts

In [71]:
book20_nancounts = get_df_nan_count(book2_files, 'book-lvl2-s1/book_level2', 0)

In [72]:
book20_nancounts

[('crypto-data/book-lvl2-s1/book_level2_1_1666526400_1666548000.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_1_1666591200_1666612800.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_1_1666699200_1666720800.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_1_1666893600_1666915200.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_1_1666958400_1666980000.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_1_1666980000_1667001600.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_1_1667001600_1667023200.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_1_1667023200_1667044800.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_1_1667044800_1667066400.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_1_1667066400_1667088000.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_1_1667088000_1667109600.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_1_1667109600_1667131200.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_1_1667131200_1667152800.

In [73]:
book21_nancounts = get_df_nan_count(book2_files, 'book-lvl2-s1/book_level2', 1)

In [74]:
book21_nancounts

[('crypto-data/book-lvl2-s1/book_level2_0_1666526400_1666548000.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_0_1666591200_1666612800.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_0_1666699200_1666720800.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_0_1666893600_1666915200.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_0_1666958400_1666980000.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_0_1666980000_1667001600.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_0_1667001600_1667023200.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_0_1667023200_1667044800.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_0_1667044800_1667066400.parquet', 3),
 ('crypto-data/book-lvl2-s1/book_level2_0_1667066400_1667088000.parquet', 6),
 ('crypto-data/book-lvl2-s1/book_level2_0_1667088000_1667109600.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_0_1667109600_1667131200.parquet', 0),
 ('crypto-data/book-lvl2-s1/book_level2_0_1667131200_1667152800.