In [120]:
import os
import pandas as pd
import numpy as np
import pprint as pp
import datetime as dt
import pathlib
import re
import json
import math

symbols_list_file_name = "binance_BTC_from_2019_05_01_to_2022_04_30.json"
with open(symbols_list_file_name, "r") as f:
    symbols_list = json.load(f)

exchange = "binance"
data_folder_name = "datasets_by_asset_" + exchange
# dir_name = os.path.join(os.getcwd(), data_folder_name)
extension = ".gz"
regex_match_to_strip = "\.csv.gz$"
returns_folder_name = "returns"
pattern = "(.*?)_(.*)" # date_ticker regex pattern
first_day_first_minute = dt.datetime(2022, 4, 14)
last_day_plus_one_first_minute = dt.datetime(2022, 4, 29)

date_ticker_regex = re.compile(pattern)
theoretical_first_minute = math.floor((first_day_first_minute - initial_time) / dt.timedelta(minutes=1))
theoretical_last_minute_plus_one = math.floor((last_day_plus_one_first_minute - initial_time) / dt.timedelta(minutes=1))


def time_is_in_day(a, timestamp):
    # checks if timestamp (in microseconds) is within 1 day in the future of a (a datetime object)
    b = dt.datetime.utcfromtimestamp(timestamp/1000000)
    return a <= b < a + dt.timedelta(days=1)

def data_integrity_test(df, date_ticker_regex):
    date_ticker_regex_result = date_ticker_regex.match(data_name)
    file_name_date = date_ticker_regex_result.group(1)
    file_name_ticker = date_ticker_regex_result.group(2)
    file_name_datetime_object = dt.datetime.strptime(file_name_date, "%Y-%m-%d")
    
    first_time = df.iloc[0]['timestamp']
    last_time = df.iloc[-1]['timestamp']

    if not pd.Index(df['timestamp']).is_monotonic:
        print("Timestamp is not monotonic for " + data_name)

    if not pd.Index(df['id']).is_monotonic:
        print("ID is not monotonic for " + data_name)

    if not (time_is_in_day(file_name_datetime_object, first_time) and time_is_in_day(file_name_datetime_object, last_time)):
        print("Time is out of range for " + data_name)
    
    if df.iloc[0]['symbol'] != file_name_ticker:
        print("Ticker is wrong for " + data_name)

# data_dict = {}
pathlib.Path(os.path.join(os.getcwd(), returns_folder_name)).mkdir(parents=True, exist_ok=True)
date_ticker_regex = re.compile(pattern)

def data_integrity_test(df, date_ticker_regex):
    date_ticker_regex_result = date_ticker_regex.match(data_name)
    file_name_date = date_ticker_regex_result.group(1)
    file_name_ticker = date_ticker_regex_result.group(2)
    file_name_datetime_object = dt.datetime.strptime(file_name_date, "%Y-%m-%d")
    
    first_time = df.iloc[0]['timestamp']
    last_time = df.iloc[-1]['timestamp']

    if not pd.Index(df['timestamp']).is_monotonic:
        print("Timestamp is not monotonic for " + data_name)

    if not pd.Index(df['id']).is_monotonic:
        print("ID is not monotonic for " + data_name)

    if not (time_is_in_day(file_name_datetime_object, first_time) and time_is_in_day(file_name_datetime_object, last_time)):
        print("Time is out of range for " + data_name)
    
    if df.iloc[0]['symbol'] != file_name_ticker:
        print("Ticker is wrong for " + data_name)
        
symbols_list = ['ethbtc'] # debug only

def keep_latest_trades(df):
    df_latests = df.groupby(['minute', 'side']).timestamp.transform(max)
    df = df[df.timestamp == df_latests]
    df = df.drop(columns=['timestamp'])
    
def compute_best_sells(df):
    sell_df = df[df.side == 'sell'].drop('side', 1)
    sell_latests_priciest = sell_df.groupby(['minute']).price.transform(max)
    sell_df = sell_df[sell_df.price == sell_latests_priciest]
    return sell_df.groupby(['minute']).tail(1) # make each minute have at most one row

def compute_best_buys(df):
    buy_df = df[df.side == 'buy'].drop('side', 1)
    buy_latests_cheapest = buy_df.groupby(['minute']).price.transform(min)
    buy_df = buy_df[buy_df.price == buy_latests_cheapest]
    return buy_df.groupby(['minute']).tail(1) # make each minute have at most one row

def fill_in_missing_trades(df):
    df = df.set_index('minute').reindex(range(theoretical_first_minute, theoretical_last_minute_plus_one), fill_value=np.NaN).reset_index()
    df = df.fillna(method='ffill') # uses best price from previous minute(s)
    df = df.fillna(method='bfill') # only affects the start of the dataset
    
def compute_mid_prices(big_sell_df, big_buy_df):
    big_sell_df = big_sell_df.rename(columns={'price': 'sell'})
    big_buy_df = big_buy_df.drop(columns=['minute'])
    big_buy_df = big_buy_df.rename(columns={'price': 'buy'})
    big_df = pd.concat([big_sell_df, big_buy_df], axis=1)
    big_df['mid'] = (big_df['sell'] + big_df['buy']) / 2
    return big_df.drop(['sell', 'buy'], 1)

def compute_log_returns(big_df):
    big_df['log_return'] = np.log(big_df['mid'] / big_df['mid'].shift(1))
    big_df = big_df.drop(columns=['mid'])
    big_df = big_df.fillna(value=1)

for symbol in symbols_list:
    dir_name = os.path.join(os.getcwd(), data_folder_name, symbol)
    sell_df_list = []
    buy_df_list = []
    for root, _, file_names in os.walk(dir_name):
        for file_name in file_names:
            if file_name.endswith(extension):
                file_path = os.path.join(root, file_name)
                data_name = re.sub(regex_match_to_strip, '', file_name)
#                 print(f"Walked to {file_path}")
                try:
                    df = pd.read_csv(file_path, compression='gzip')
                    data_integrity_test(df, date_ticker_regex)
                    
                    df = df[['timestamp', 'side', 'price']]
                    df.loc[:,'minute'] = df.timestamp.apply(lambda x: math.floor(dt.timedelta(microseconds=x) / dt.timedelta(minutes=1)))
                    
                    keep_latest_trades(df)
                    
                    sell_df = compute_best_sells(df)
                    sell_df_list.append(sell_df)
                    
                    buy_df = compute_best_buys(df)
                    buy_df_list.append(buy_df)
                except:
                    print("No data")
    # write stuff for that symbol
    big_sell_df = pd.concat(sell_df_list, ignore_index=True)
    big_buy_df = pd.concat(buy_df_list, ignore_index=True)
    
    initial_time = dt.datetime.utcfromtimestamp(0)
    fill_in_missing_trades(big_sell_df)
    fill_in_missing_trades(big_buy_df)
    
    big_df = compute_mid_prices(big_sell_df, big_buy_df)
    compute_log_returns(big_df)
    
    # structure: returns_folder_name/symbol/this df as csv.gz
    save_file_name = symbol + '.csv.gz'
    save_path = os.path.join(os.getcwd(), returns_folder_name, save_file_name)
    big_df.to_csv(save_path, compression='gzip')
    print(f"Saved returns for {symbol}")
    



Saved returns for ethbtc


In [121]:
df = pd.read_csv(save_path, compression='gzip')

In [122]:
df

Unnamed: 0.1,Unnamed: 0,timestamp,minute,timestamp.1,mid,log_return
0,0,1649894459799000,27498240,1.649894e+15,0.075807,
1,1,1649894465574000,27498241,1.649895e+15,0.075798,-0.000119
2,2,1649894545263000,27498242,1.649895e+15,0.075746,-0.000680
3,3,1649894623856000,27498243,1.649895e+15,0.075745,-0.000013
4,4,1649894697140000,27498244,1.649895e+15,0.075776,0.000403
...,...,...,...,...,...,...
21547,21547,1651190107706000,27519835,,,
21548,21548,1651190218708000,27519836,,,
21549,21549,1651190254960000,27519837,,,
21550,21550,1651190313017000,27519838,,,


In [22]:
sell_df_list

[           price    minute
 83      0.075828  27498240
 196     0.075769  27498241
 238     0.075743  27498242
 313     0.075749  27498243
 417     0.075774  27498244
 ...          ...       ...
 101786  0.075648  27499674
 101808  0.075655  27499675
 101825  0.075655  27499677
 101843  0.075655  27499678
 101868  0.075657  27499679
 
 [1437 rows x 2 columns],
           price    minute
 71     0.075661  27499680
 133    0.075673  27499681
 312    0.075747  27499682
 461    0.075736  27499683
 503    0.075736  27499684
 ...         ...       ...
 76494  0.074995  27501115
 76510  0.074991  27501116
 76544  0.074998  27501117
 76576  0.075011  27501118
 76634  0.074984  27501119
 
 [1436 rows x 2 columns],
           price    minute
 106    0.075015  27501120
 188    0.074997  27501121
 247    0.074974  27501122
 336    0.074977  27501123
 382    0.075013  27501124
 ...         ...       ...
 59291  0.075770  27502555
 59303  0.075760  27502556
 59387  0.075760  27502557
 59406  0.0757

In [23]:
buy_df_list

[           price    minute
 82      0.075827  27498240
 195     0.075770  27498241
 241     0.075744  27498242
 324     0.075750  27498243
 416     0.075775  27498244
 ...          ...       ...
 101812  0.075656  27499675
 101819  0.075656  27499676
 101826  0.075656  27499677
 101845  0.075656  27499678
 101867  0.075658  27499679
 
 [1439 rows x 2 columns],
           price    minute
 69     0.075662  27499680
 132    0.075674  27499681
 260    0.075740  27499682
 451    0.075742  27499683
 507    0.075726  27499684
 ...         ...       ...
 76505  0.074992  27501115
 76520  0.074992  27501116
 76546  0.074999  27501117
 76575  0.075012  27501118
 76614  0.074993  27501119
 
 [1435 rows x 2 columns],
           price    minute
 103    0.075025  27501120
 187    0.074998  27501121
 261    0.074975  27501122
 335    0.074978  27501123
 380    0.075003  27501124
 ...         ...       ...
 59283  0.075771  27502555
 59367  0.075761  27502556
 59383  0.075761  27502557
 59414  0.0757

In [24]:
big_sell_df = pd.concat(sell_df_list, ignore_index=True)
big_buy_df = pd.concat(buy_df_list, ignore_index=True)

In [25]:
len(big_sell_df)

21552

In [28]:
15*1440

21600

In [27]:
len(big_buy_df)

21514

In [30]:
import numpy as np

In [57]:
initial_time = dt.datetime.utcfromtimestamp(0)

In [53]:
first_day_first_minute = dt.datetime(2022, 4, 14)
last_day_plus_one_first_minute = dt.datetime(2022, 4, 29)

In [63]:
theoretical_first_minute = math.floor((first_day_first_minute - initial_time) / dt.timedelta(minutes=1))

In [64]:
theoretical_last_minute_plus_one = math.floor((last_day_plus_one_first_minute - initial_time) / dt.timedelta(minutes=1))

In [65]:
big_sell_df = big_sell_df.set_index('minute').reindex(range(theoretical_first_minute, theoretical_last_minute_plus_one), fill_value=np.NaN).reset_index()

In [66]:
big_buy_df = big_buy_df.set_index('minute').reindex(range(theoretical_first_minute, theoretical_last_minute_plus_one), fill_value=np.NaN).reset_index()

In [67]:
big_sell_df = big_sell_df.fillna(method='ffill') # uses best price from previous minute(s)
big_sell_df = big_sell_df.fillna(method='bfill') # only affects the start of the dataset
big_buy_df = big_buy_df.fillna(method='ffill') # uses best price from previous minute(s)
big_buy_df = big_buy_df.fillna(method='bfill') # only affects the start of the dataset

In [83]:
big_sell_df = big_sell_df.rename(columns={'price': 'sell'})

In [None]:
big_buy_df = big_buy_df.drop('minute', 1)

In [84]:
big_buy_df = big_buy_df.rename(columns={'price': 'buy'})

In [85]:
big_df = pd.concat([big_sell_df, big_buy_df], axis=1)

In [86]:
big_df

Unnamed: 0,minute,sell,buy
0,27498240,0.075828,0.075827
1,27498241,0.075769,0.075770
2,27498242,0.075743,0.075744
3,27498243,0.075749,0.075750
4,27498244,0.075774,0.075775
...,...,...,...
21595,27519835,0.073837,0.073838
21596,27519836,0.073885,0.073886
21597,27519837,0.073916,0.073917
21598,27519838,0.073899,0.073900


In [90]:
(big_df['sell'] <= big_df['buy']).value_counts()

True     18781
False     2819
dtype: int64

In [92]:
big_df.iloc[0]['buy']

0.075827

In [95]:
big_df['mid'] = (big_df['sell'] + big_df['buy'])/2

In [96]:
big_df

Unnamed: 0,minute,sell,buy,mid
0,27498240,0.075828,0.075827,0.075828
1,27498241,0.075769,0.075770,0.075770
2,27498242,0.075743,0.075744,0.075744
3,27498243,0.075749,0.075750,0.075749
4,27498244,0.075774,0.075775,0.075774
...,...,...,...,...
21595,27519835,0.073837,0.073838,0.073838
21596,27519836,0.073885,0.073886,0.073885
21597,27519837,0.073916,0.073917,0.073916
21598,27519838,0.073899,0.073900,0.073900


In [99]:
big_df = big_df.drop(['sell', 'buy'], 1)

  """Entry point for launching an IPython kernel.


In [110]:
big_df['log_return'] = np.log(big_df['mid'] / big_df['mid'].shift(1))

In [113]:
big_df = big_df.drop('mid', 1)

  """Entry point for launching an IPython kernel.


In [114]:
big_df.set_value()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cacher_needs_updating = self._check_is_chained_assignment_possible()


In [117]:
big_df.at[0, 'log_return'] = 1

In [118]:
big_df

Unnamed: 0,minute,log_return
0,27498240,1.000000
1,27498241,-0.000765
2,27498242,-0.000343
3,27498243,0.000079
4,27498244,0.000330
...,...,...
21595,27519835,-0.000230
21596,27519836,0.000650
21597,27519837,0.000419
21598,27519838,-0.000230


In [34]:
len(big_buy_df)

21600

In [1]:
import os
import pandas as pd
import pprint as pp
import datetime
import gzip
import re
import json

In [2]:
symbols_list_file_name = "binance_BTC_from_2019_05_01_to_2022_04_30.json"
with open(symbols_list_file_name, "r") as f:
    symbols_list = json.load(f)

In [3]:
exchange = "binance"
data_folder_name = "datasets_by_asset_" + exchange
# dir_name = os.path.join(os.getcwd(), data_folder_name)
extension = ".gz"
regex_match_to_strip = "\.csv.gz$"
returns_folder_name = "returns"
pattern = "(.*?)_(.*)" # date_ticker regex pattern

In [4]:
os.getcwd()

'C:\\Users\\wang-\\OneDrive\\Term 2\\Dissertation\\experiments\\scratches'

In [5]:
date_ticker_regex = re.compile(pattern)

In [6]:
def data_integrity_test(df, date_ticker_regex):
    date_ticker_regex_result = date_ticker_regex.match(data_name)
    file_name_date = date_ticker_regex_result.group(1)
    file_name_ticker = date_ticker_regex_result.group(2)
    file_name_datetime_object = datetime.datetime.strptime(file_name_date, "%Y-%m-%d")
    
    first_time = df.iloc[0]['timestamp']
    last_time = df.iloc[-1]['timestamp']

    if not pd.Index(df['timestamp']).is_monotonic:
        print("Timestamp is not monotonic for " + data_name)

    if not pd.Index(df['id']).is_monotonic:
        print("ID is not monotonic for " + data_name)

    if not (time_is_in_day(file_name_datetime_object, first_time) and time_is_in_day(file_name_datetime_object, last_time)):
        print("Time is out of range for " + data_name)
    
    if df.iloc[0]['symbol'] != file_name_ticker:
        print("Ticker is wrong for " + data_name)

In [7]:
symbols_list = ['ethbtc'] # debug only

In [9]:
file_path

'C:\\Users\\wang-\\OneDrive\\Term 2\\Dissertation\\experiments\\scratches\\datasets_by_asset_binance\\ethbtc\\binance\\trades\\2022-04-28_ETHBTC.csv.gz'

In [10]:
for symbol in symbols_list:
    dir_name = os.path.join(os.getcwd(), data_folder_name, symbol)
    sell_df_list = []
    buy_df_list = []
    for root, _, file_names in os.walk(dir_name):
        for file_name in file_names:
            if file_name.endswith(extension):
                file_path = os.path.join(root, file_name)
                data_name = re.sub(regex_match_to_strip, '', file_name)
                print(f"Walked to {file_path}")
                try:
                    df = pd.read_csv(file_path, compression='gzip')
                    data_integrity_test(df, date_ticker_regex)
                    
                    df = df[['timestamp', 'side', 'price']]
                    df.loc[:,'minute'] = df.timestamp.apply(lambda x: math.floor(dt.timedelta(microseconds=x) / dt.timedelta(minutes=1)))
                    
                    df_latests = df.groupby(['minute', 'side']).timestamp.transform(max)
                    df = df[df.timestamp == df_latests]
                    
                    sell_df = df[df.side == 'sell']
                    sell_latests_priciest = sell_df.groupby(['minute', 'side']).price.transform(max)
                    sell_df = sell_df[sell_df.price == sell_latests_priciest]
                    sell_df = sell_df.groupby(['minute']).tail(1) # make each minute have at most one row
                    sell_df_list.append(sell_df)
                    
                    buy_df = df[df.side == 'buy']
                    buy_latests_cheapest = buy_df.groupby(['minute', 'side']).price.transform(min)
                    buy_df = buy_df[buy_df.price == buy_latests_cheapest]
                    buy_df = buy_df.groupby(['minute']).tail(1) # make each minute have at most one row
                    buy_df_list.append(buy_df)
                except:
                    print("No data")

Walked to C:\Users\wang-\OneDrive\Term 2\Dissertation\experiments\scratches\datasets_by_asset_binance\ethbtc\binance\trades\2022-04-14_ETHBTC.csv.gz
No data
Walked to C:\Users\wang-\OneDrive\Term 2\Dissertation\experiments\scratches\datasets_by_asset_binance\ethbtc\binance\trades\2022-04-15_ETHBTC.csv.gz
No data
Walked to C:\Users\wang-\OneDrive\Term 2\Dissertation\experiments\scratches\datasets_by_asset_binance\ethbtc\binance\trades\2022-04-16_ETHBTC.csv.gz
No data
Walked to C:\Users\wang-\OneDrive\Term 2\Dissertation\experiments\scratches\datasets_by_asset_binance\ethbtc\binance\trades\2022-04-17_ETHBTC.csv.gz
No data
Walked to C:\Users\wang-\OneDrive\Term 2\Dissertation\experiments\scratches\datasets_by_asset_binance\ethbtc\binance\trades\2022-04-18_ETHBTC.csv.gz
No data
Walked to C:\Users\wang-\OneDrive\Term 2\Dissertation\experiments\scratches\datasets_by_asset_binance\ethbtc\binance\trades\2022-04-19_ETHBTC.csv.gz
No data
Walked to C:\Users\wang-\OneDrive\Term 2\Dissertation\expe

In [59]:
sell_df_list

[]

In [8]:
trial_path = os.path.join(os.getcwd(), "datasets_by_asset_binance", "ethbtc", "binance", "trades", "2022-04-28_ETHBTC.csv.gz")

In [9]:
df = pd.read_csv(trial_path, compression = 'gzip')

In [10]:
df

Unnamed: 0,exchange,symbol,timestamp,local_timestamp,id,side,price,amount
0,binance,ETHBTC,1651104001009000,1651104001014392,336445620,sell,0.073637,0.5444
1,binance,ETHBTC,1651104001009000,1651104001014407,336445621,sell,0.073637,1.3447
2,binance,ETHBTC,1651104001009000,1651104001014409,336445622,sell,0.073637,7.4486
3,binance,ETHBTC,1651104001009000,1651104001014430,336445623,sell,0.073637,0.1949
4,binance,ETHBTC,1651104001009000,1651104001014431,336445624,sell,0.073637,2.0283
...,...,...,...,...,...,...,...,...
157781,binance,ETHBTC,1651190390835000,1651190390840925,336603401,sell,0.073899,0.1160
157782,binance,ETHBTC,1651190392836000,1651190392841330,336603402,sell,0.073899,0.1161
157783,binance,ETHBTC,1651190393835000,1651190393840714,336603403,sell,0.073899,0.1161
157784,binance,ETHBTC,1651190394661000,1651190394666067,336603404,sell,0.073899,1.0114


In [20]:
import datetime as dt

In [21]:
x = 1651104001009000
td = dt.timedelta(microseconds=x)

In [27]:
import math

In [24]:
td / dt.timedelta(minutes=1)

27518400.01681667

In [28]:
math.floor(td / dt.timedelta(minutes=1))

27518400

In [25]:
df = df[['timestamp', 'side', 'price']]

In [33]:
df.loc[:,'minute'] = df.timestamp.apply(lambda x: math.floor(dt.timedelta(microseconds=x) / dt.timedelta(minutes=1)))

In [35]:
df_latests = df.groupby(['minute', 'side']).timestamp.transform(max)
df = df[df.timestamp == df_latests]

In [40]:
df

Unnamed: 0,timestamp,side,price,minute
174,1651104055912000,buy,0.073572,27518400
175,1651104059645000,sell,0.073571,27518400
241,1651104111148000,buy,0.073615,27518401
264,1651104119665000,sell,0.073609,27518401
348,1651104179326000,buy,0.073613,27518402
...,...,...,...,...
157735,1651190325358000,buy,0.073900,27519838
157737,1651190339882000,sell,0.073899,27519838
157780,1651190389942000,buy,0.073900,27519839
157784,1651190394661000,sell,0.073899,27519839


In [54]:
sell_df = df[df.side == 'sell']
sell_latests = sell_df.groupby(['minute', 'side']).timestamp.transform(max)
sell_df = sell_df[sell_df.timestamp == sell_latests]
sell_latests_priciest = sell_df.groupby(['minute', 'side']).price.transform(max)
sell_df = sell_df[sell_df.price == sell_latests_priciest]
sell_df = sell_df.groupby(['minute']).tail(1) # make each minute have at most one row

In [55]:
sell_df

Unnamed: 0,timestamp,side,price,minute
175,1651104059645000,sell,0.073571,27518400
264,1651104119665000,sell,0.073609,27518401
349,1651104179603000,sell,0.073622,27518402
449,1651104239437000,sell,0.073609,27518403
487,1651104293178000,sell,0.073587,27518404
...,...,...,...,...
157155,1651190149593000,sell,0.073837,27519835
157349,1651190218708000,sell,0.073885,27519836
157687,1651190273271000,sell,0.073916,27519837
157737,1651190339882000,sell,0.073899,27519838


In [52]:
buy_df = df[df.side == 'buy']
buy_latests = buy_df.groupby(['minute', 'side']).timestamp.transform(max)
buy_df = buy_df[buy_df.timestamp == buy_latests]
buy_latests_cheapest = buy_df.groupby(['minute', 'side']).price.transform(min)
buy_df = buy_df[buy_df.price == buy_latests_cheapest]
buy_df = buy_df.groupby(['minute']).tail(1) # make each minute have at most one row

In [53]:
buy_df

Unnamed: 0,timestamp,side,price,minute
174,1651104055912000,buy,0.073572,27518400
241,1651104111148000,buy,0.073615,27518401
348,1651104179326000,buy,0.073613,27518402
416,1651104215033000,buy,0.073617,27518403
493,1651104293758000,buy,0.073583,27518404
...,...,...,...,...
157146,1651190140813000,buy,0.073838,27519835
157341,1651190210096000,buy,0.073886,27519836
157682,1651190260476000,buy,0.073917,27519837
157735,1651190325358000,buy,0.073900,27519838


In [31]:
df = df.drop('minute', 1)

  """Entry point for launching an IPython kernel.


In [32]:
df

Unnamed: 0,timestamp,side,price
0,1651104001009000,sell,0.073637
1,1651104001009000,sell,0.073637
2,1651104001009000,sell,0.073637
3,1651104001009000,sell,0.073637
4,1651104001009000,sell,0.073637
...,...,...,...
157781,1651190390835000,sell,0.073899
157782,1651190392836000,sell,0.073899
157783,1651190393835000,sell,0.073899
157784,1651190394661000,sell,0.073899


In [15]:
def time_is_in_day(a, timestamp):
    # checks if timestamp (in microseconds) is within 1 day in the future of a (a datetime object)
    b = datetime.datetime.utcfromtimestamp(timestamp/1000000)
    return a <= b < a + datetime.timedelta(days=1)

In [18]:


data_name = "2022-04-28_ETHBTC"
date_ticker_regex_result = date_ticker_regex.match(data_name)
file_name_date = date_ticker_regex_result.group(1)
file_name_ticker = date_ticker_regex_result.group(2)
file_name_datetime_object = datetime.datetime.strptime(file_name_date, "%Y-%m-%d")

first_time = df.iloc[0]['timestamp']
last_time = df.iloc[-1]['timestamp']

if not pd.Index(df['timestamp']).is_monotonic:
    print("Timestamp is not monotonic for " + data_name)

if not pd.Index(df['id']).is_monotonic:
    print("ID is not monotonic for " + data_name)

if not (time_is_in_day(file_name_datetime_object, first_time) and time_is_in_day(file_name_datetime_object, last_time)):
    print("Time is out of range for " + data_name)

if df.iloc[0]['symbol'] != file_name_ticker:
    print("Ticker is wrong for " + data_name)