## Pipeline to process additional data

I downloaded additional data from Binance website for the missing period between Aug 2023 to end of Feb 2024. Therefore, I need to reformat the data to the same format with train.csv for convenience.

In [33]:
import os
import zipfile
import pandas as pd
import numpy as np
ADDITIONAL_FOLDER = r'C:\Users\e0817820\Desktop\tokka\data\additional_test'
RAW_FOLDER = r'C:\Users\e0817820\Desktop\tokka\data\raw'

In [34]:
def unzip_file():
    for file in os.listdir(ADDITIONAL_FOLDER):
        if file.endswith(".zip"):
            file_path = os.path.join(ADDITIONAL_FOLDER, file)
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                zip_ref.extractall(ADDITIONAL_FOLDER)

In [35]:
unzip_file()

In [36]:
list_cryptocurrencies = ['AVAX', 'ADA', 'SOL', 'BNB', 'TRX', 'DOGE', 'LINK', 'XRP', 'BTC', 'ETH']

In [37]:
columns = ['open_time', 'open', 'high', 'low', 'close', 'volume', 'close_time', 'quote_volume', 'count', 'taker_buy_volume', 'taker_buy_quote_volume', 'ignore', 'symbol', 'log_return']

In [38]:
def load_data():
    combine_df = pd.DataFrame()
    for currency in list_cryptocurrencies:
        combine_curr_df = pd.DataFrame()
        for file in os.listdir(ADDITIONAL_FOLDER):
            if file.endswith(".csv") and 'combine_df' not in file and currency in file:
                print(f'Processing: {file}')
                file_path = os.path.join(ADDITIONAL_FOLDER, file)
                df = pd.read_csv(file_path)
                df.columns = columns[:12]
                df['symbol'] = [currency for i in range(len(df))]
                combine_curr_df = pd.concat([combine_curr_df.reset_index(drop=True), df], ignore_index=True)
                combine_curr_df.columns = columns[:13]
        combine_curr_df['log_return'] = np.log(combine_curr_df['close'].shift(-10) / combine_curr_df['close'])
        combine_df = pd.concat([combine_df, combine_curr_df], ignore_index=True)
    combine_df.columns = columns
    combine_df = combine_df.dropna()
    combine_df.to_csv(os.path.join(ADDITIONAL_FOLDER, 'combine_df_test.csv'))    
    print(combine_df.shape) 

In [39]:
load_data()

Processing: AVAXUSDT-1m-2024-02-29.csv
Processing: AVAXUSDT-1m-2024-03-01.csv
Processing: AVAXUSDT-1m-2024-03-02.csv
Processing: AVAXUSDT-1m-2024-03-03.csv
Processing: AVAXUSDT-1m-2024-03-04.csv
Processing: AVAXUSDT-1m-2024-03-05.csv
Processing: AVAXUSDT-1m-2024-03-06.csv
Processing: AVAXUSDT-1m-2024-03-07.csv
Processing: AVAXUSDT-1m-2024-03-08.csv
Processing: AVAXUSDT-1m-2024-03-09.csv
Processing: AVAXUSDT-1m-2024-03-10.csv
Processing: ADAUSDT-1m-2024-02-29.csv
Processing: ADAUSDT-1m-2024-03-01.csv
Processing: ADAUSDT-1m-2024-03-02.csv
Processing: ADAUSDT-1m-2024-03-03.csv
Processing: ADAUSDT-1m-2024-03-04.csv
Processing: ADAUSDT-1m-2024-03-05.csv
Processing: ADAUSDT-1m-2024-03-06.csv
Processing: ADAUSDT-1m-2024-03-07.csv
Processing: ADAUSDT-1m-2024-03-08.csv
Processing: ADAUSDT-1m-2024-03-09.csv
Processing: ADAUSDT-1m-2024-03-10.csv
Processing: SOLUSDT-1m-2024-02-29.csv
Processing: SOLUSDT-1m-2024-03-01.csv
Processing: SOLUSDT-1m-2024-03-02.csv
Processing: SOLUSDT-1m-2024-03-03.csv
P

In [40]:
df = pd.read_csv(os.path.join(ADDITIONAL_FOLDER, 'combine_df_test.csv'))
print(df.columns)

Index(['Unnamed: 0', 'open_time', 'open', 'high', 'low', 'close', 'volume',
       'close_time', 'quote_volume', 'count', 'taker_buy_volume',
       'taker_buy_quote_volume', 'ignore', 'symbol', 'log_return'],
      dtype='object')


In [41]:
new_columns = ['id', 'timestamp', 'open', 'high', 'low', 'close', 'volume',
       'close_time', 'quote_asset_volume', 'number_of_trades', 'taker_buy_volume',
       'taker_sell_volume', 'ignore', 'symbol', 'log_return']

In [42]:
df.columns = new_columns

In [43]:
df = df.drop(columns=['close_time', 'ignore', 'id'])

In [44]:
print(df.shape)

(158190, 12)


#### Calculate log return of the asset over next 10 minutes 

In [45]:
df = df.sort_values(by='timestamp')

In [46]:
lower_timestamp = 1709222400000
upper_timestamp = 1710086400000
df = df[df['timestamp'] >= lower_timestamp]
df = df[df['timestamp'] <= upper_timestamp]


In [47]:
df.to_csv(os.path.join(ADDITIONAL_FOLDER, 'add_test.csv'), index=True)

In [48]:
print(df.shape)

(143910, 12)
