In [10]:
import sys
!{sys.executable} -m pip install tables

import os
from populator import aggregate_data

data_dir = 'data'
exchange = 'test'
write_csv = False
write_hdf = True
# Intervals other than '1Min' over which to downsample data and store on disk
downsampling_intervals = ['5Min', '15Min', '1h', '1d']



In [11]:
pairs = []
for filename in os.listdir(os.path.join(data_dir, exchange)):
    currencies = filename.split('-')[:2]
    pairs.append('/'.join(currencies))
print('Found following pairs for exchange {}: {}'.format(exchange, pairs))

Found following pairs for exchange test: ['EOS/BNB', '.DS_Store', 'NEO/BNB', 'AE/BNB']


In [12]:
df_1m = aggregate_data('data', {exchange: pairs}, '1m', '2000-01-01T00:00:00Z')
print('Assembled aggregate dataframe')

Loaded pair EOS/BNB on test (2018-05-28 05:00:00 to 2019-03-14 00:17:00)
Loaded pair NEO/BNB on test (2017-11-20 03:41:00 to 2019-03-14 00:17:00)
Loaded pair AE/BNB on test (2018-02-07 11:59:00 to 2019-03-14 00:17:00)
Assembled aggregate dataframe


In [13]:
df_1m.head()

Unnamed: 0_level_0,open_test_EOS_BNB,high_test_EOS_BNB,low_test_EOS_BNB,close_test_EOS_BNB,volume_test_EOS_BNB,open_test_NEO_BNB,high_test_NEO_BNB,low_test_NEO_BNB,close_test_NEO_BNB,volume_test_NEO_BNB,open_test_AE_BNB,high_test_AE_BNB,low_test_AE_BNB,close_test_AE_BNB,volume_test_AE_BNB
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2017-11-20 03:41:00,,,,,,24.0,24.0,24.0,24.0,0.5,,,,,
2017-11-20 03:42:00,,,,,,24.0,24.0,24.0,24.0,0.0,,,,,
2017-11-20 03:43:00,,,,,,100.0,100.0,100.0,100.0,4.166,,,,,
2017-11-20 03:44:00,,,,,,100.0,100.0,100.0,100.0,0.0,,,,,
2017-11-20 03:45:00,,,,,,69.0,69.0,69.0,69.0,9.25,,,,,


In [16]:
import numpy as np

def downsample(df, interval):
    # Sum over all volumes and average over all prices in each interval
    agg_fns = {col: np.sum if col.startswith('volume') else np.mean for col in df}
    return df.resample(interval).agg(agg_fns)

In [17]:
dfs = [(df_1m, '1m')]
for interval in downsampling_intervals:
    print('Downsampling {} data to {}... '.format(exchange, interval), end='')
    df_down = downsample(df_1m, interval)
    print('Done')
    dfs.append((df_down, interval))

Downsampling test data to 5Min... Done
Downsampling test data to 15Min... Done
Downsampling test data to 1h... Done
Downsampling test data to 1d... Done


In [18]:
for df, interval in dfs:
    path_prefix = os.path.join(data_dir, exchange, 'all-' + interval)
    if write_csv:
        df.to_csv(path_prefix + '.csv', index_label='timestamp')
        print('Wrote {}'.format(path_prefix + '.csv'))
    if write_hdf:
        df.to_hdf(path_prefix + '.h5', key = exchange + '_' + interval, mode='w')
        print('Wrote {}'.format(path_prefix + '.h5'))

Wrote data/test/all-1m.h5
Wrote data/test/all-5Min.h5
Wrote data/test/all-15Min.h5
Wrote data/test/all-1h.h5
Wrote data/test/all-1d.h5
