# Description

The goal of this notebook is to demonstrate the various approaches of working with `dataflow`.

# Imports

In [1]:
import logging
import os

import pandas as pd

import core.config.config_ as cconconf
import core.finance as cofinanc
import core.finance.resampling as cfinresa
import core.finance.returns as cfinretu
import dataflow.core as dtfcore
import dataflow.system.source_nodes as dtfsysonod
import helpers.hdbg as hdbg
import helpers.hprint as hprint
import im_v2.ccxt.data.client as icdcl

  from tqdm.autonotebook import tqdm


In [2]:
hdbg.init_logger(verbosity=logging.INFO)

_LOG = logging.getLogger(__name__)

hprint.config_notebook()

[0m[36mINFO[0m: > cmd='/venv/lib/python3.8/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-c5273168-8c30-4397-a861-2edc975561de.json'


# Config

In [3]:
def get_gallery_dataflow_example_config() -> cconconf.Config:
    """
    Get config, that specifies params for getting raw data.
    """
    config = cconconf.Config()
    # Load parameters.
    config.add_subconfig("load")
    config["load"]["aws_profile"] = "ck"
    config["load"]["data_dir"] = os.path.join(
        "s3://cryptokaizen-data", "historical"
    )
    config["load"]["data_snapshot"] = "latest"
    config["load"]["partition_mode"] = "by_year_month"
    # Data parameters.
    config.add_subconfig("data")
    config["data"]["start_date"] = pd.Timestamp("2021-09-01", tz="UTC")
    config["data"]["end_date"] = pd.Timestamp("2021-09-15", tz="UTC")
    config["data"]["resampling_rule"] = "5T"
    return config

In [4]:
config = get_gallery_dataflow_example_config()
print(config)

load:
  aws_profile: ck
  data_dir: s3://cryptokaizen-data/historical
  data_snapshot: latest
  partition_mode: by_year_month
data:
  start_date: 2021-09-01 00:00:00+00:00
  end_date: 2021-09-15 00:00:00+00:00
  resampling_rule: 5T


# Load historical data

In [5]:
# Specify params.
resample_1min = True
root_dir = config["load"]["data_dir"]
partition_mode = config["load"]["partition_mode"]
data_snapshot = config["load"]["data_snapshot"]
aws_profile = config["load"]["aws_profile"]

# Initiate the client.
historical_client = icdcl.CcxtHistoricalPqByTileClient(
    resample_1min,
    root_dir,
    partition_mode,
    data_snapshot=data_snapshot,
    aws_profile=aws_profile,
)

### Data Loader

In [6]:
# Specify time period.
full_symbols = ["binance::ADA_USDT", "binance::AVAX_USDT"]
start_date = config["data"]["start_date"]
end_date = config["data"]["end_date"]

# Load the data.
data_hist = historical_client.read_data(full_symbols, start_date, end_date)
display(data_hist.shape)
display(data_hist.head(3))

(40322, 6)

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-09-01 00:00:00+00:00,binance::ADA_USDT,2.768,2.77,2.762,2.762,307378.2
2021-09-01 00:00:00+00:00,binance::AVAX_USDT,39.51,39.54,39.3,39.32,2483.93
2021-09-01 00:01:00+00:00,binance::ADA_USDT,2.763,2.765,2.761,2.764,74199.3


# Task description

The goal of this exercise is to implement the following transformations to the historical data:
- resampling
- VWAP, TWAP computation
- Calculation of returns

While using the different approaches to working with `dataflow` methods.
The main feature that these methods are trying to overcome is the fact that when the raw data consists of two and more `full_symbols`, then one needs to be careful to apply transformations that needs to be implemented specifically to each `full_symbol`.

These three approaches are:
- 1) Use the "low level" functions and do loops
- 2) Use pandas Multi-index
- 3) Use Dataflow nodes

The general rule is to use the third and second approach when possible, while keeping the first approach as a bacjup.

In [7]:
# The resampling frequency is the same for all approaches.
resampling_freq = config["data"]["resampling_rule"]

# Approach 1 - Use the "low level" functions and do loops

This approach does both resampling and computation of metrics and applied them individually to each `full_symbol` using the loop.

In [8]:
def resample_calculate_twap_vwap_and_returns(df, resampling_freq):
    result = []
    full_symbol_list = df["full_symbol"].unique()
    for cc in full_symbol_list:
        # DataFrame with a specific `full_symbol`
        cc_df = df[df["full_symbol"] == cc]
        # Resample OHLCV data inside `full_symbol`-specific DataFrame.
        resampled_cc_df = cfinresa.resample_ohlcv_bars(
            cc_df, rule=resampling_freq
        )
        # Attach VWAP, TWAP.
        resampled_cc_df[["vwap", "twap"]] = cfinresa.compute_twap_vwap(
            cc_df, resampling_freq, price_col="close", volume_col="volume"
        )
        # Calculate returns.
        resampled_cc_df["vwap_rets"] = cfinretu.compute_ret_0(
            resampled_cc_df[["vwap"]], "pct_change"
        )
        resampled_cc_df["twap_rets"] = cfinretu.compute_ret_0(
            resampled_cc_df[["twap"]], "pct_change"
        )
        resampled_cc_df["log_rets"] = cfinretu.compute_ret_0(
            resampled_cc_df[["close"]], "log_rets"
        )
        # Add a column with `full_symbol` indication.
        resampled_cc_df["full_symbol"] = cc
        # Omit unnecesary columns.
        resampled_cc_df = resampled_cc_df.drop(columns=["open", "high", "low"])
        result.append(resampled_cc_df)
    final_df = pd.concat(result)
    return final_df

In [9]:
df_approach_1 = resample_calculate_twap_vwap_and_returns(data_hist, resampling_freq)
df_approach_1.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  price.loc[nan_idx] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  volume.loc[nan_idx] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  price.loc[nan_idx] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  volume.loc[nan_idx] = np.nan


Unnamed: 0_level_0,close,volume,vwap,twap,vwap_rets,twap_rets,log_rets,full_symbol
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-09-01 00:00:00+00:00,2.762,307378.2,2.762,2.762,,,,binance::ADA_USDT
2021-09-01 00:05:00+00:00,2.776,971520.4,2.761713,2.7624,-0.000104,0.000145,0.005056,binance::ADA_USDT
2021-09-01 00:10:00+00:00,2.764,1057998.3,2.769847,2.7694,0.002945,0.002534,-0.004332,binance::ADA_USDT


# Approach 2 - Use pandas Multi-index

In [10]:
# Convert historical data to multiindex format.
converted_data = dtfsysonod._convert_to_multiindex(data_hist, "full_symbol")
converted_data.head(3)

Unnamed: 0_level_0,close,close,high,high,low,low,open,open,volume,volume
Unnamed: 0_level_1,binance::ADA_USDT,binance::AVAX_USDT,binance::ADA_USDT,binance::AVAX_USDT,binance::ADA_USDT,binance::AVAX_USDT,binance::ADA_USDT,binance::AVAX_USDT,binance::ADA_USDT,binance::AVAX_USDT
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2021-09-01 00:00:00+00:00,2.762,39.32,2.77,39.54,2.762,39.3,2.768,39.51,307378.2,2483.93
2021-09-01 00:01:00+00:00,2.764,39.19,2.765,39.37,2.761,39.15,2.763,39.31,74199.3,4215.74
2021-09-01 00:02:00+00:00,2.757,39.13,2.764,39.26,2.756,39.0,2.764,39.19,199012.1,6472.69


In [11]:
# Resampling VWAP (besides potential errors). This implies hardcoded formula in a mix with resampling functions.
vwap_approach_2 = (converted_data["close"] * converted_data["volume"]).resample(
    resampling_freq
).mean() / converted_data["volume"].resample(resampling_freq).sum()
vwap_approach_2.head(3)

Unnamed: 0_level_0,binance::ADA_USDT,binance::AVAX_USDT
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-09-01 00:00:00+00:00,0.551679,7.840504
2021-09-01 00:05:00+00:00,0.554445,7.964266
2021-09-01 00:10:00+00:00,0.551208,7.946542


In [12]:
# Compute the ret_0 on all assets. You don't need a loop! But the data needs to be in the "right" format
# (the variable one wants to loop on needs to be the outermost in the levels, so one needs to do swaplevel).
rets_approach_2 = converted_data.swaplevel(axis=1).pct_change()
rets_approach_2.head(3)

Unnamed: 0_level_0,binance::ADA_USDT,binance::AVAX_USDT,binance::ADA_USDT,binance::AVAX_USDT,binance::ADA_USDT,binance::AVAX_USDT,binance::ADA_USDT,binance::AVAX_USDT,binance::ADA_USDT,binance::AVAX_USDT
Unnamed: 0_level_1,close,close,high,high,low,low,open,open,volume,volume
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2021-09-01 00:00:00+00:00,,,,,,,,,,
2021-09-01 00:01:00+00:00,0.000724,-0.003306,-0.001805,-0.004299,-0.000362,-0.003817,-0.001806,-0.005062,-0.758606,0.697206
2021-09-01 00:02:00+00:00,-0.002533,-0.001531,-0.000362,-0.002794,-0.001811,-0.003831,0.000362,-0.003053,1.682129,0.535363


In [13]:
# To go back to a flat index representation.
rets_approach_2.columns = ["_".join(col) for col in rets_approach_2.columns]
rets_approach_2.head(3)

Unnamed: 0_level_0,binance::ADA_USDT_close,binance::AVAX_USDT_close,binance::ADA_USDT_high,binance::AVAX_USDT_high,binance::ADA_USDT_low,binance::AVAX_USDT_low,binance::ADA_USDT_open,binance::AVAX_USDT_open,binance::ADA_USDT_volume,binance::AVAX_USDT_volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-09-01 00:00:00+00:00,,,,,,,,,,
2021-09-01 00:01:00+00:00,0.000724,-0.003306,-0.001805,-0.004299,-0.000362,-0.003817,-0.001806,-0.005062,-0.758606,0.697206
2021-09-01 00:02:00+00:00,-0.002533,-0.001531,-0.000362,-0.002794,-0.001811,-0.003831,0.000362,-0.003053,1.682129,0.535363


# Approach 3 - Use Dataflow nodes

One node does resampling and VWAP, TWAP calculations, the other does returns.

In [14]:
# Configure the node to do the TWAP / VWAP resampling.
node_resampling_config = {
    "in_col_groups": [
        ("close",),
        ("volume",),
    ],
    "out_col_group": (),
    "transformer_kwargs": {
        "rule": resampling_freq,
        "resampling_groups": [
            ({"close": "close"}, "last", {}),
            (
                {
                    "close": "twap",
                },
                "mean",
                {},
            ),
            (
                {
                    "volume": "volume",
                },
                "sum",
                {"min_count": 1},
            ),
        ],
        "vwap_groups": [
            ("close", "volume", "vwap"),
        ],
    },
    "reindex_like_input": False,
    "join_output_with_input": False,
}
# Put the data in the DataFlow format (which is multi-index).
converted_data = dtfsysonod._convert_to_multiindex(data_hist, "full_symbol")
# Create the node.
nid = "resample"
node = dtfcore.GroupedColDfToDfTransformer(
    nid,
    transformer_func=cofinanc.resample_bars,
    **node_resampling_config,
)
# Compute the node on the data.
vwap_twap = node.fit(converted_data)
# Save the result.
vwap_twap_approach_3 = vwap_twap["df_out"]
vwap_twap_approach_3.head(3)

Unnamed: 0_level_0,close,close,twap,twap,volume,volume,vwap,vwap
Unnamed: 0_level_1,binance::ADA_USDT,binance::AVAX_USDT,binance::ADA_USDT,binance::AVAX_USDT,binance::ADA_USDT,binance::AVAX_USDT,binance::ADA_USDT,binance::AVAX_USDT
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2021-09-01 00:00:00+00:00,2.762,39.32,2.762,39.32,307378.2,2483.93,2.762,39.32
2021-09-01 00:05:00+00:00,2.776,39.77,2.7624,39.322,971520.4,35482.15,2.761713,39.396582
2021-09-01 00:10:00+00:00,2.764,39.87,2.7694,39.83,1057998.3,30734.79,2.769847,39.847694


In [15]:
# Configure the node to calculate the returns.
node_returns_config = {
    "in_col_groups": [
        ("close",),
        ("vwap",),
        ("twap",),
    ],
    "out_col_group": (),
    "transformer_kwargs": {
        "mode": "pct_change",
    },
    "col_mapping": {
        "close": "close.ret_0",
        "vwap": "vwap.ret_0",
        "twap": "twap.ret_0",
    },
}
# Create the node that computes ret_0.
nid = "ret0"
node = dtfcore.GroupedColDfToDfTransformer(
    nid,
    transformer_func=cofinanc.compute_ret_0,
    **node_returns_config,
)
# Compute the node on the data.
rets = node.fit(vwap_twap_approach_3)
# Save the result.
vwap_twap_rets_approach_3 = rets["df_out"]
vwap_twap_rets_approach_3.head(3)

Unnamed: 0_level_0,close.ret_0,close.ret_0,twap.ret_0,twap.ret_0,vwap.ret_0,vwap.ret_0,close,close,twap,twap,volume,volume,vwap,vwap
Unnamed: 0_level_1,binance::ADA_USDT,binance::AVAX_USDT,binance::ADA_USDT,binance::AVAX_USDT,binance::ADA_USDT,binance::AVAX_USDT,binance::ADA_USDT,binance::AVAX_USDT,binance::ADA_USDT,binance::AVAX_USDT,binance::ADA_USDT,binance::AVAX_USDT,binance::ADA_USDT,binance::AVAX_USDT
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
2021-09-01 00:00:00+00:00,,,,,,,2.762,39.32,2.762,39.32,307378.2,2483.93,2.762,39.32
2021-09-01 00:05:00+00:00,0.005069,0.011445,0.000145,5.1e-05,-0.000104,0.001948,2.776,39.77,2.7624,39.322,971520.4,35482.15,2.761713,39.396582
2021-09-01 00:10:00+00:00,-0.004323,0.002514,0.002534,0.012919,0.002945,0.011451,2.764,39.87,2.7694,39.83,1057998.3,30734.79,2.769847,39.847694
