In [None]:
import pandas as pd
import datetime as dt
import time
from os import listdir
from os.path import isfile, join
import numpy as np

In [None]:
# Directory that holds the raw trade CSV files processed by this notebook.
folder = "../gateio_data/football_trade_data"
# Keep regular files only (sub-directories are skipped). os.listdir returns
# entries in arbitrary order, so sort them to make every run reproducible.
files = sorted(f for f in listdir(folder) if isfile(join(folder, f)))

# To process only a subset of months, filter `files` here, for example:
# files = [f for f in files if any(m in f for m in ('202307', '202308', '202309'))]

In [None]:
def date_processing(df):
    """Return the trade columns of ``df`` indexed by an ``hour`` key.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``timestamp``, ``dealid``, ``price``,
        ``amount`` and ``side``. ``timestamp`` values must support slicing
        (e.g. strings).

    Returns
    -------
    pandas.DataFrame
        A new frame holding only those five columns, sorted by ``timestamp``
        and indexed by ``hour``, where ``hour`` is each ``timestamp`` with its
        last three characters removed. The input frame is left untouched.
    """
    # Work on an explicit copy: assigning a column on a column-subset slice
    # triggers pandas' SettingWithCopyWarning.
    trades = df[["timestamp", "dealid", "price", "amount", "side"]].copy()
    # presumably strips a trailing "-MM" minute suffix -- confirm against the
    # timestamp format of the data passed in
    trades['hour'] = [stamp[:-3] for stamp in trades['timestamp']]
    return trades.set_index("hour").sort_values(by='timestamp')

def vwap_vanilla(df):
    """Append a running volume-weighted average price column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric ``price`` and ``amount`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an extra ``vwap`` column: at each row, the
        cumulative sum of ``price * amount`` divided by the cumulative sum of
        ``amount``, both taken in the frame's current row order.
    """
    volume = df["amount"].values
    notional = df["price"].values * volume
    running_vwap = notional.cumsum() / volume.cumsum()
    return df.assign(vwap=running_vwap)

def vwap2(df):
    """Compute the running VWAP separately for each calendar date.

    Rows are grouped by ``df.index.date`` (so the index must expose ``.date``,
    as a DatetimeIndex does) and ``vwap_vanilla`` is applied to every group,
    which makes the cumulative sums restart at each new date.

    Returns the per-group results combined into one DataFrame; group keys are
    not added to the index.
    """
    calendar_days = df.index.date
    per_day = df.groupby(calendar_days, group_keys=False)
    return per_day.apply(vwap_vanilla)

In [None]:
def each_assetmonth_sort(each_assetmonth):
    """Load one trade file and return its price, running daily VWAP and ticker.

    Parameters
    ----------
    each_assetmonth : str
        Name (not a full path) of a CSV file inside the module-level
        ``folder``. The file must provide the columns ``timestamp``,
        ``dealid``, ``price``, ``amount`` and ``side``; ``timestamp`` must be
        formatted as ``%Y-%m-%d-%H-%M`` (minute resolution).

    Returns
    -------
    pandas.DataFrame
        Indexed by the parsed ``timestamp`` in ascending order, with columns
        ``price``, ``vwap`` (as produced by ``vwap2``) and ``ticker``.
    """
    # Read and sort the trades of this file.
    trades = pd.read_csv(join(folder, each_assetmonth))
    # Explicit copy so the column assignment below does not write to a slice.
    trades = trades[["timestamp", "dealid", "price", "amount", "side"]].copy()
    trades['timestamp'] = pd.to_datetime(trades['timestamp'], format='%Y-%m-%d-%H-%M')
    # Several trades can share one minute-resolution timestamp. A stable sort
    # keeps them in file order, so the cumulative VWAP path is deterministic.
    trades = trades.sort_values('timestamp', kind='mergesort').set_index('timestamp')

    # Calculate the VWAP (restarting on every calendar date).
    with_vwap = vwap2(trades)

    # The ticker is the file name minus its last seven characters.
    # NOTE(review): assumes a fixed-length file-name suffix -- confirm against
    # the actual file naming scheme.
    return with_vwap[['price', 'vwap']].assign(ticker=each_assetmonth[:-7])

# Build one VWAP frame per trade file. Hidden entries (names starting with
# ".", such as macOS ".DS_Store") are not trade data and are skipped.
batched_data = []
for each_assetmonth in files:
    if each_assetmonth.startswith("."):
        continue
    batched_data.append(each_assetmonth_sort(each_assetmonth))


In [None]:
# Stack the per-file frames row-wise into a single frame, keeping each
# frame's own index.
agg_df = pd.concat(objs=batched_data, axis=0)

In [None]:
# Persist the combined frame (index included) for downstream use.
agg_df.to_csv(path_or_buf="../gateio_data/minutely_data/aggregated_minutely_data.csv")

In [None]:
# BTC is handled separately because it has to be processed in batches.
# The file loaded here is expected to come from re-running the VWAP steps
# above for BTC only.
# NOTE(review): the step that writes btc_vwap_data.csv is not part of the
# cells shown here -- confirm how this file is produced.
btc_df = pd.read_csv(filepath_or_buffer="../gateio_data/btc_vwap_data.csv")


In [None]:
# Parse the timestamps, index by them and keep only the VWAP column, so that
# `btc_df` becomes a timestamp-indexed Series from here on.
# (Unused alternative source: the rows of agg_df whose ticker is "BTC_USDT".)
btc_df = (
    btc_df
    .assign(timestamp=pd.to_datetime(btc_df['timestamp']))
    .set_index('timestamp')['vwap']
)

In [None]:
batch_size = 100000  # Number of data points per batch

# Ceiling division: the final, possibly shorter, batch must be processed too.
# Floor division would silently drop the last len(btc_df) % batch_size rows,
# and everything when the series is shorter than one batch.
num_batches = (len(btc_df) + batch_size - 1) // batch_size

# Per-batch daily sums of the row-to-row percentage changes
batched_data = []

# Loop through the data and create batches
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = start_idx + batch_size
    # Every batch after the first borrows the last row of the previous batch
    # so the change across the batch boundary is computed; the borrowed row
    # is dropped again once pct_change has run.
    lookback = 1 if start_idx > 0 else 0
    batch = btc_df.iloc[start_idx - lookback:end_idx]
    batch_rets = batch.pct_change().iloc[lookback:]
    # NOTE(review): simple returns are summed here, not compounded -- confirm
    # that this approximation of the daily return is intended.
    batched_data.append(batch_rets.resample('D').sum())

In [None]:
# Join the per-batch daily sums end to end into one Series. A date that spans
# two batches appears once per batch at this point.
btc_daily_rets = pd.concat(objs=batched_data)

In [None]:
# Re-aggregate by calendar day so dates that were split across batches are
# merged into a single daily total. 'D' is the canonical daily alias; the
# lowercase 'd' spelling is deprecated in recent pandas releases.
btc_daily_rets = btc_daily_rets.resample('D').sum()

In [None]:
# Write the daily BTC figures as CSV text.
# NOTE(review): the target name has no ".csv" extension, unlike the other
# output of this notebook -- confirm what downstream readers expect.
btc_daily_rets.to_csv(path_or_buf="../gateio_data/minutely_data/btc_daily_rets")