In [60]:
import pandas as pd
# Small demo frame: rows labelled 4, 5 and 6, integer columns A, B and C.
df = pd.DataFrame(
    {'A': [0, 0, 10], 'B': [2, 4, 20], 'C': [3, 1, 30]},
    index=[4, 5, 6],
)

In [61]:
df

Unnamed: 0,A,B,C
4,0,2,3
5,0,4,1
6,10,20,30


In [62]:
# df.drop(df[df.B < 10].index, inplace=True)

In [63]:
def my_drop(df):
    """Delete, in place, every row of ``df`` whose ``B`` value is below 10.

    The frame passed in is modified directly and nothing is returned.
    """
    below_threshold = df.B < 10
    labels_to_remove = df.loc[below_threshold].index
    df.drop(index=labels_to_remove, inplace=True)

In [64]:
# my_drop works in place and returns None, so this cell produces no output;
# the effect only becomes visible when `df` is displayed afterwards.
my_drop(df)

In [65]:
df

Unnamed: 0,A,B,C
6,10,20,30


In [77]:
import os
import pandas as pd
import numpy as np
import pprint as pp
import datetime as dt
import pathlib
import re
import json
import math

# Read the symbols file; the path is relative to the current working directory.
symbols_list_file_name = "binance_BTC_from_2019_05_01_to_2022_04_30.json"
with open(symbols_list_file_name, mode="r") as f:
    symbols_list = json.loads(f.read())

# --- Run configuration ------------------------------------------------------
exchange = "binance"
data_folder_name = "datasets_by_asset_" + exchange
extension = ".gz"
# Raw string with BOTH dots escaped, so only a literal ".csv.gz" suffix is
# stripped (the unescaped "." before "gz" used to match any character).
regex_match_to_strip = r"\.csv\.gz$"
returns_folder_name = "returns"
pattern = "(.*?)_(.*)" # date_ticker regex pattern
first_day_first_minute = dt.datetime(2022, 4, 14)
last_day_plus_one_first_minute = dt.datetime(2022, 4, 29)

# Naive Unix epoch. Same value as datetime.utcfromtimestamp(0), which is
# deprecated since Python 3.12.
initial_time = dt.datetime(1970, 1, 1)
date_ticker_regex = re.compile(pattern)
# Whole minutes elapsed since the epoch. timedelta // timedelta is an exact
# integer floor division, so no float round-trip is involved.
theoretical_first_minute = (
    (first_day_first_minute - initial_time) // dt.timedelta(minutes=1)
)
theoretical_last_minute_plus_one = (
    (last_day_plus_one_first_minute - initial_time) // dt.timedelta(minutes=1)
)


def time_is_in_day(a, timestamp):
    """Return True if ``timestamp`` falls within the 24 hours starting at ``a``.

    Parameters
    ----------
    a : datetime.datetime
        Naive datetime marking the start of the window. It is compared
        against a naive datetime derived from the epoch, i.e. it is treated
        as UTC.
    timestamp : int
        Microseconds since the Unix epoch. Any value accepted by ``int()``
        works, including numpy integer scalars.

    Returns
    -------
    bool
        True when ``a <= timestamp < a + 1 day`` (start inclusive, end
        exclusive).
    """
    # Exact integer microsecond arithmetic from the epoch. This avoids both the
    # float division and datetime.utcfromtimestamp(), which is deprecated since
    # Python 3.12. int() is needed because timedelta rejects numpy integers.
    b = dt.datetime(1970, 1, 1) + dt.timedelta(microseconds=int(timestamp))
    return a <= b < a + dt.timedelta(days=1)

def data_integrity_test(df, date_ticker_regex, data_name=None):
    """Print one line for every integrity check that a trade file fails.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows read from a single file. Must contain the columns ``timestamp``,
        ``local_timestamp``, ``id`` and ``symbol``. Both timestamp columns are
        handed to ``time_is_in_day`` (microseconds since the epoch).
    date_ticker_regex : re.Pattern
        Compiled pattern matched against ``data_name``; group 1 must be the
        ``YYYY-MM-DD`` date and group 2 the ticker.
    data_name : str, optional
        File name without its extension, e.g. ``"2022-04-14_BTCUSDT"``. When
        omitted, the module-level ``data_name`` is used, which keeps existing
        two-argument calls working.

    Returns
    -------
    None
        Findings are reported with ``print`` only.

    Raises
    ------
    NameError
        If ``data_name`` is not passed and no module-level ``data_name`` exists.
    ValueError
        If ``data_name`` does not match ``date_ticker_regex`` or its date part
        is not in ``YYYY-MM-DD`` form.
    """
    if data_name is None:
        # Backward compatibility: earlier callers set a global `data_name`
        # before calling instead of passing it in.
        try:
            data_name = globals()["data_name"]
        except KeyError:
            raise NameError("name 'data_name' is not defined") from None

    name_match = date_ticker_regex.match(data_name)
    if name_match is None:
        # Fail with a readable message rather than an AttributeError on None.
        raise ValueError(
            "File name does not match the date_ticker pattern: " + data_name
        )
    file_name_date, file_name_ticker = name_match.group(1), name_match.group(2)
    file_name_datetime_object = dt.datetime.strptime(file_name_date, "%Y-%m-%d")

    if df.empty:
        # Nothing to inspect; iloc[0] below would raise IndexError.
        print("No rows for " + data_name)
        return

    first_row = df.iloc[0]
    last_row = df.iloc[-1]

    # Index.is_monotonic was removed in pandas 2.0; is_monotonic_increasing is
    # the same check under its current name.
    if not pd.Index(df['timestamp']).is_monotonic_increasing:
        print("Timestamp is not monotonic for " + data_name)

    if not pd.Index(df['id']).is_monotonic_increasing:
        print("ID is not monotonic for " + data_name)

    # NOTE(review): the saved output shows a first row whose exchange
    # `timestamp` is just before midnight while its `local_timestamp` is just
    # after. The files appear to be split by local_timestamp, so this check
    # may fire on boundary trades -- confirm.
    if not (time_is_in_day(file_name_datetime_object, first_row['timestamp']) and
            time_is_in_day(file_name_datetime_object, last_row['timestamp'])):
        print("Time is out of range for " + data_name)

    if not (time_is_in_day(file_name_datetime_object, first_row['local_timestamp']) and
            time_is_in_day(file_name_datetime_object, last_row['local_timestamp'])):
        print("Local time is out of range for " + data_name)

    # Only the first row's symbol is compared with the ticker in the file name.
    if first_row['symbol'] != file_name_ticker:
        print("Ticker is wrong for " + data_name)

# data_dict = {}
# Make sure the output folder exists under the current working directory.
(pathlib.Path.cwd() / returns_folder_name).mkdir(parents=True, exist_ok=True)

In [78]:
# symbols_list = ['btcusdt'] # debug only
# Walk every symbol's data folder and run the integrity checks on each
# gzip-compressed CSV found there.
for symbol in symbols_list:
    dir_name = os.path.join(os.getcwd(), data_folder_name, symbol)
    sell_df_list, buy_df_list = [], []
    for root, _, file_names in os.walk(dir_name):
        for file_name in file_names:
            if not file_name.endswith(extension):
                continue
            file_path = os.path.join(root, file_name)
            # data_integrity_test reads the module-level `data_name`, so it
            # has to be assigned before the call.
            data_name = re.sub(regex_match_to_strip, '', file_name)
            df = pd.read_csv(file_path, compression='gzip')
            data_integrity_test(df, date_ticker_regex)

Time is out of range for 2022-04-14_BTCUSDT
Time is out of range for 2022-04-15_BTCUSDT
Time is out of range for 2022-04-16_BTCUSDT
Time is out of range for 2022-04-17_BTCUSDT
Time is out of range for 2022-04-18_BTCUSDT
Time is out of range for 2022-04-19_BTCUSDT
Time is out of range for 2022-04-20_BTCUSDT
Time is out of range for 2022-04-21_BTCUSDT
Time is out of range for 2022-04-22_BTCUSDT
Time is out of range for 2022-04-23_BTCUSDT
Time is out of range for 2022-04-24_BTCUSDT
Time is out of range for 2022-04-26_BTCUSDT
Time is out of range for 2022-04-27_BTCUSDT
Time is out of range for 2022-04-28_BTCUSDT


In [70]:
df

Unnamed: 0,exchange,symbol,timestamp,local_timestamp,id,side,price,amount
0,binance,BTCUSDT,1651103999993000,1651104000001359,1339109769,buy,39235.73,0.00061
1,binance,BTCUSDT,1651103999994000,1651104000002480,1339109770,buy,39235.73,0.00068
2,binance,BTCUSDT,1651103999996000,1651104000003014,1339109771,buy,39235.73,0.00194
3,binance,BTCUSDT,1651103999997000,1651104000006761,1339109772,sell,39235.72,0.00086
4,binance,BTCUSDT,1651104000000000,1651104000008147,1339109773,sell,39235.72,0.00082
...,...,...,...,...,...,...,...,...
1135229,binance,BTCUSDT,1651190399989000,1651190399995335,1340245001,buy,39742.07,0.00065
1135230,binance,BTCUSDT,1651190399990000,1651190399996775,1340245002,buy,39742.07,0.00187
1135231,binance,BTCUSDT,1651190399990000,1651190399996898,1340245003,sell,39742.06,0.00158
1135232,binance,BTCUSDT,1651190399992000,1651190399999220,1340245004,sell,39742.06,0.00200


In [80]:
# Remove the exchange `timestamp` column from `df` in place.
df.drop('timestamp', axis=1, inplace=True)

In [82]:
# Relabel `local_timestamp` as `timestamp`, again modifying `df` in place.
df.rename({'local_timestamp': 'timestamp'}, axis='columns', inplace=True)

In [83]:
df

Unnamed: 0,exchange,symbol,timestamp,id,side,price,amount
0,binance,REPBTC,1651104293514017,10767481,sell,0.000334,0.36
1,binance,REPBTC,1651104383936349,10767482,sell,0.000335,0.32
2,binance,REPBTC,1651104384913320,10767483,sell,0.000335,0.66
3,binance,REPBTC,1651104385919510,10767484,sell,0.000335,1.66
4,binance,REPBTC,1651104386904322,10767485,sell,0.000335,0.46
...,...,...,...,...,...,...,...
1413,binance,REPBTC,1651187191283618,10768894,sell,0.000337,0.50
1414,binance,REPBTC,1651187191297376,10768895,sell,0.000337,1.40
1415,binance,REPBTC,1651188411569387,10768896,sell,0.000336,1.99
1416,binance,REPBTC,1651188455365006,10768897,buy,0.000337,5.23


In [71]:
# Exchange `timestamp` of the first BTCUSDT row displayed earlier
# (microseconds since the epoch), converted to a UTC datetime.
x = 1651103999993000
dt.datetime.utcfromtimestamp(x / 1e6)

datetime.datetime(2022, 4, 27, 23, 59, 59, 993000)

In [74]:
# `local_timestamp` of the same first BTCUSDT row (microseconds since the
# epoch), converted to a UTC datetime.
y = 1651104000001359
dt.datetime.utcfromtimestamp(y / 1e6)

datetime.datetime(2022, 4, 28, 0, 0, 0, 1359)

In [None]:
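# TODO: run the day-range check separately for `local_timestamp` and the exchange `timestamp`,
# and find out why the exchange `timestamp` is reported as out of range. In the sample above the
# first row's exchange timestamp is 2022-04-27 23:59:59.993, while its local timestamp is just after midnight.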