In [1]:
# Dependencies used throughout the notebook.
import pandas as pd
import numpy as np
import yfinance as yf
import os
import sys

# Extend the module search path with the current working directory minus its last
# 9 characters, so that the `utils` package below can be imported.
# NOTE(review): presumably the slice strips a 9-character notebook folder name to reach
# the project root - confirm; this depends on the directory the kernel was started in.
sys.path.append(os.getcwd()[:-9])
# NOTE(review): wildcard import; `download_historical_bar_data`, called later in the
# notebook, presumably comes from this module - verify.
from utils.historical_downloader import *
from datetime import datetime, time

### In this notebook, we will be resampling data into volume and dollar bars.

The general idea behind volume bars is to sample every time a pre-defined amount of the security’s units (shares, futures contracts, etc.) has been exchanged. For example, we could sample prices every time 1,000 units of a futures contract have been exchanged.

In similar fashion, dollar bars are sampled every time a pre-defined market value is exchanged. The pros and cons of both methods are discussed by M. López de Prado.

But how do we set this pre-defined amount? 1,000 is an arbitrary number, and for highly liquid securities many bars will be created, while few bars will be created for illiquid ones. We could instead reference the security's past trading activity to determine what size to set for each bar.

As an example, we will download 1 month of historical data, aggregate the volume (resp. market value) of the security, and divide it by the number of trading minutes in the 1 month period. This estimates how much volume (resp. market value) is traded in an average minute, which we use as the size of a "one-minute" volume (resp. dollar) bar.

#### Assume today is 2020/02/01. We will use the past 1 month (2020/01/01 - 2020/01/31) of trading activity to determine the sampling size.

In [2]:
# 1 month of historical daily bars for AAPL, covering 2020/01/01 to 2020/01/31.
# `end` is given as 2020-02-01 because yfinance documents the end date as exclusive.
#
# auto_adjust=False is passed explicitly: the sampling-size calculation later in the
# notebook reads the separate 'Adj Close' column, and newer yfinance releases default to
# auto_adjust=True, which folds the adjustment into 'Close' and drops 'Adj Close'.
historical_data = yf.download(
    tickers='AAPL',
    start='2020-01-02',
    end='2020-02-01',
    auto_adjust=False,
)
historical_data.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-02,74.059998,75.150002,73.797501,75.087502,73.988472,135480400
2020-01-03,74.287498,75.144997,74.125,74.357498,73.26915,146322800
2020-01-06,73.447502,74.989998,73.1875,74.949997,73.852974,118387200
2020-01-07,74.959999,75.224998,74.370003,74.597504,73.505653,108872000
2020-01-08,74.290001,76.110001,74.290001,75.797501,74.68808,132079200


In [3]:
# Trading hours for AAPL stock, as three back-to-back (start, end) intervals
# running from 04:00 to 20:00.
TRADING_INTERVALS = [
    (time(4, 0), time(9, 30)),
    (time(9, 30), time(16, 0)),
    (time(16, 0), time(20, 0)),
]

# Add up the length of every interval to get the number of trading minutes in a day.
minutes_in_a_day = 0
for session_start, session_end in TRADING_INTERVALS:
    start_minute = 60 * session_start.hour + session_start.minute
    end_minute = 60 * session_end.hour + session_end.minute
    session_minutes = end_minute - start_minute
    if session_end < session_start:
        # The interval bleeds over into the next day, so add a full day of minutes.
        session_minutes += 24 * 60
    minutes_in_a_day += session_minutes

# One row of daily history per trading day in the lookback period.
number_of_days = len(historical_data)
total_minutes = number_of_days * minutes_in_a_day

# Market value exchanged per day, taken as adjusted close times volume.
historical_data['market_value'] = historical_data['Adj Close'] * historical_data['Volume']
total_volume = historical_data['Volume'].sum()
total_market_value = historical_data['market_value'].sum()

# Average volume (resp. market value) per trading minute, truncated to an integer.
volume_sampling_size = int(total_volume / total_minutes)
dollar_sampling_size = int(total_market_value / total_minutes)

print(f"Volume sampling size is: {volume_sampling_size}")
print(f"Dollar sampling size is: {dollar_sampling_size}")

Volume sampling size is: 145554
Dollar sampling size is: 11191279


Now that we have calculated the volume (dollar) sampling sizes, we can resample our data into volume (dollar) bars. The actual procedure will require an event-driven system, where trades are fed one at a time, the volumes recorded and a bar is created once the volume crosses the pre-defined amount.

We may also resample time bars into volume bars during vectorised backtests, albeit with some estimation errors due to lower granularity of the data.

In [4]:
# 1-minute time bars for Apple stock, from 2020/02/01 to 2020/02/28.
# NOTE(review): 'year2month12' is an opaque slice identifier handled by the project's
# downloader - confirm that it maps to the period stated above.
bar_data = download_historical_bar_data(
    symbol='AAPL',
    barsize='1min',
    lookback='year2month12',
)
bar_data

Unnamed: 0_level_0,open,high,low,close,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-02-03 04:01:00,76.42,76.49,76.41,76.49,22792.0
2020-02-03 04:03:00,76.52,76.55,76.52,76.55,2196.0
2020-02-03 04:04:00,76.54,76.54,76.45,76.52,3512.0
2020-02-03 04:07:00,76.47,76.47,76.47,76.47,2044.0
2020-02-03 04:08:00,76.44,76.44,76.41,76.41,2660.0
...,...,...,...,...,...
2020-02-28 19:56:00,67.91,67.93,67.91,67.93,4200.0
2020-02-28 19:57:00,67.93,67.96,67.93,67.96,7844.0
2020-02-28 19:58:00,67.96,67.97,67.92,67.92,8288.0
2020-02-28 19:59:00,67.97,67.97,67.92,67.92,7616.0


### Resampling into volume bars

In [5]:
# Resample the 1-minute time bars into volume bars.
# Each time bar is treated as a single "trade" priced at its close and sized by its volume.
prices = bar_data['close'].values
volumes = bar_data['volume'].values

# Reuse the sampling size computed earlier from the previous month's trading activity
# rather than re-typing its printed value, so the threshold cannot drift from it.
volume_sampling_threshold = volume_sampling_size
if volume_sampling_threshold <= 0:
    raise ValueError("volume_sampling_threshold must be positive")

volume_counter = 0  # volume accumulated so far; a bar is created once it crosses the threshold
start_index = 0  # first time bar of the bar being built; OHLC spans start_index..current_index

# stores volume sampled bars
volume_bar_columns = ['open', 'high', 'low', 'close', 'volume']
volume_sampled_bars = []

for current_index in range(len(prices)):
    volume_counter += volumes[current_index]

    number_of_bars_created = int(volume_counter / volume_sampling_threshold)
    if number_of_bars_created > 0:
        # Slice the price window once and derive every field from it.
        window = prices[start_index: current_index + 1]
        bar = {'open': window[0],
               'high': np.max(window),
               'low': np.min(window),
               'close': window[-1],
               'volume': volume_sampling_threshold}
        # One time bar may carry several thresholds' worth of volume; the same OHLC
        # values are then recorded once per threshold crossed.
        volume_sampled_bars.extend(dict(bar) for _ in range(number_of_bars_created))

        # Once a bar is created, the next bar starts at the following data point.
        # This is equivalent to "clearing" the list of trades that is collected one at a
        # time for bar creation in the events-based version. Volume in excess of the
        # threshold is carried over into the next bar's counter.
        start_index = current_index + 1
        volume_counter -= number_of_bars_created * volume_sampling_threshold

# Passing the columns explicitly keeps the schema stable even if no bar was created.
volume_bar_df = pd.DataFrame(volume_sampled_bars, columns=volume_bar_columns)

print('Time bars resampled into "1min volume bars"')
volume_bar_df

Time bars resampled into "1min volume bars"


Unnamed: 0,open,high,low,close,volume
0,76.49,76.55,75.64,75.68,145554
1,75.56,75.76,75.13,75.76,145554
2,75.74,75.74,75.60,75.67,145554
3,75.64,75.64,75.26,75.28,145554
4,75.27,75.48,75.26,75.48,145554
...,...,...,...,...,...
19519,67.93,67.96,67.91,67.96,145554
19520,67.91,67.91,67.81,67.90,145554
19521,67.93,68.07,67.87,68.07,145554
19522,68.07,68.21,68.03,68.14,145554


### Resampling into dollar bars

In [6]:
# Resample the 1-minute time bars into dollar bars.
# Each time bar is treated as a single "trade" priced at its close; its market value is
# approximated as close price times volume.
prices = bar_data['close'].values
market_value = prices * bar_data['volume'].values

# Reuse the sampling size computed earlier from the previous month's trading activity
# rather than re-typing its printed value, so the threshold cannot drift from it.
dollar_sampling_threshold = dollar_sampling_size
if dollar_sampling_threshold <= 0:
    raise ValueError("dollar_sampling_threshold must be positive")

dollar_counter = 0  # market value accumulated so far; a bar is created once it crosses the threshold
start_index = 0  # first time bar of the bar being built; OHLC spans start_index..current_index

# stores dollar sampled bars
dollar_bar_columns = ['open', 'high', 'low', 'close', 'dollar']
dollar_sampled_bars = []

for current_index in range(len(prices)):
    dollar_counter += market_value[current_index]

    number_of_bars_created = int(dollar_counter / dollar_sampling_threshold)
    if number_of_bars_created > 0:
        # Slice the price window once and derive every field from it.
        window = prices[start_index: current_index + 1]
        bar = {'open': window[0],
               'high': np.max(window),
               'low': np.min(window),
               'close': window[-1],
               'dollar': dollar_sampling_threshold}
        # One time bar may carry several thresholds' worth of market value; the same
        # OHLC values are then recorded once per threshold crossed.
        dollar_sampled_bars.extend(dict(bar) for _ in range(number_of_bars_created))

        # Once a bar is created, the next bar starts at the following data point.
        # This is equivalent to "clearing" the list of trades that is collected one at a
        # time for bar creation in the events-based version. Market value in excess of
        # the threshold is carried over into the next bar's counter.
        start_index = current_index + 1
        dollar_counter -= number_of_bars_created * dollar_sampling_threshold

# Passing the columns explicitly keeps the schema stable even if no bar was created.
dollar_bar_df = pd.DataFrame(dollar_sampled_bars, columns=dollar_bar_columns)

print('Time bars resampled into "1min dollar bars"')
dollar_bar_df

Time bars resampled into "1min dollar bars"


Unnamed: 0,open,high,low,close,dollar
0,76.49,76.55,75.56,75.56,11191279
1,75.44,75.76,75.13,75.76,11191279
2,75.74,75.74,75.60,75.64,11191279
3,75.53,75.58,75.26,75.27,11191279
4,75.26,75.50,75.26,75.50,11191279
...,...,...,...,...,...
18959,67.93,67.96,67.91,67.96,11191279
18960,67.96,67.96,67.81,67.90,11191279
18961,67.93,68.09,67.87,68.08,11191279
18962,68.10,68.21,68.07,68.15,11191279
