# Imports

In [1]:
import logging
import os

import pandas as pd

import helpers.dbg as hdbg
import helpers.env as henv
import helpers.printing as hprint
import helpers.s3 as hs3
import im.cryptodatadownload.data.load.loader as imcdalolo
import im_v2.ccxt.data.client.clients as imvcdclcl
import im_v2.ccxt.universe.universe as imvccunun
import im_v2.common.data.client.clients as ivcdclcl

In [2]:
# Initialize logging through the project helper at INFO verbosity.
hdbg.init_logger(verbosity=logging.INFO)

# Logger used by the cells of this notebook.
_LOG = logging.getLogger(__name__)

# Log the first element of the system signature (environment and Git info).
_LOG.info("%s", henv.get_system_signature()[0])

# Apply the project's notebook configuration helper.
hprint.config_notebook()

[0m[36mINFO[0m: > cmd='/venv/lib/python3.8/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-02f7a15b-3d5b-45e7-955e-d0e32da951ea.json'
>>ENV<<: is_inside_container=True: code_version=cmamp-1.0.3, container_version=cmamp-1.0.3, is_inside_docker=True, is_inside_ci=False, CI_defined=True, CI=''
>>ENV<<: AM_AWS_PROFILE=True AM_ECR_BASE_PATH=True AM_S3_BUCKET=True AM_TELEGRAM_TOKEN=True AWS_ACCESS_KEY_ID=False AWS_DEFAULT_REGION=False AWS_SECRET_ACCESS_KEY=False GH_ACTION_ACCESS_TOKEN=True
generated new fontManager
# Git
    branch_name='master'
    hash='8774fb65b'
    # Last commits:
      * 8774fb65b cryptomtc Amp task1955 lint 20211215 (#771)                                 (  10 hours ago) Thu Dec 16 01:22:06 2021  (HEAD -> master, origin/master, origin/HEAD)
      * c5ca50025 cryptomtc Amp task1786 integrate 20211214 (#763)                            (  11 hours ago) Thu Dec 16 00:14:21 2021           
      * 87b5f8ada Nikola Jašek CMTask767: Add si

# Load the data universe

## CCXT

In [3]:
# Load version "v03" of the vendor universe; no `vendor` argument is passed,
# so the function's default vendor is used (treated as CCXT in this notebook).
ccxt_universe = imvccunun.get_vendor_universe(version="v03")

## CDD

In [4]:
# Load version "v01" of the CDD vendor universe.
universe_cdd = imvccunun.get_vendor_universe(version="v01", vendor="CDD")

# Compare universes

In [5]:
# Report how many full symbols each vendor universe contains.
_LOG.info("Number of full symbols in CCXT: %s", len(ccxt_universe))
_LOG.info("Number of full symbols in CDD: %s", len(universe_cdd))

Number of full symbols in CCXT: 38
Number of full symbols in CDD: 117


In [6]:
# Full symbols that are present in both the CCXT and the CDD universes.
currency_pair_intersection = set(ccxt_universe) & set(universe_cdd)
_LOG.info(
    "Number of similar full symbols: %s",
    len(currency_pair_intersection),
)
display(currency_pair_intersection)

Number of similar full symbols: 18


{'binance::ADA_USDT',
 'binance::BNB_USDT',
 'binance::BTC_USDT',
 'binance::EOS_USDT',
 'binance::ETH_USDT',
 'binance::LINK_USDT',
 'binance::SOL_USDT',
 'ftx::BNB_USDT',
 'ftx::BTC_USDT',
 'ftx::ETH_USDT',
 'ftx::LINK_USDT',
 'ftx::XRP_USDT',
 'kucoin::ADA_USDT',
 'kucoin::BNB_USDT',
 'kucoin::BTC_USDT',
 'kucoin::EOS_USDT',
 'kucoin::ETH_USDT',
 'kucoin::XRP_USDT'}

In [7]:
# Full symbols that CCXT provides and CDD does not.
ccxt_and_not_cdd = set(ccxt_universe) - set(universe_cdd)
_LOG.info(
    "Number of full symbols that are included in CCXT but not in CDD: %s",
    len(ccxt_and_not_cdd),
)
display(ccxt_and_not_cdd)

Number of full symbols that are included in CCXT but not in CDD: 20


{'binance::AVAX_USDT',
 'binance::DOGE_USDT',
 'ftx::DOGE_USDT',
 'ftx::SOL_USDT',
 'gateio::ADA_USDT',
 'gateio::AVAX_USDT',
 'gateio::BNB_USDT',
 'gateio::BTC_USDT',
 'gateio::DOGE_USDT',
 'gateio::EOS_USDT',
 'gateio::ETH_USDT',
 'gateio::FIL_USDT',
 'gateio::LINK_USDT',
 'gateio::SOL_USDT',
 'gateio::XRP_USDT',
 'kucoin::AVAX_USDT',
 'kucoin::DOGE_USDT',
 'kucoin::FIL_USDT',
 'kucoin::LINK_USDT',
 'kucoin::SOL_USDT'}

In [8]:
# Full symbols that CDD provides and CCXT does not.
cdd_and_not_ccxt = set(universe_cdd) - set(ccxt_universe)
_LOG.info(
    "Number of full symbols that are included in CDD but not in CCXT: %s",
    len(cdd_and_not_ccxt),
)
display(cdd_and_not_ccxt)

Number of full symbols that are included in CDD but not in CCXT: 99


{'binance::AAVE_USDT',
 'binance::BAT_USDT',
 'binance::BTT_USDT',
 'binance::CELR_USDT',
 'binance::CVC_USDT',
 'binance::DAI_USDT',
 'binance::DASH_USDT',
 'binance::DOT_USDT',
 'binance::ETC_USDT',
 'binance::FIL_USDT',
 'binance::ICP_USDT',
 'binance::ICX_USDT',
 'binance::LRC_USDT',
 'binance::LTC_USDT',
 'binance::MATIC_USDT',
 'binance::MKR_USDT',
 'binance::NEO_USDT',
 'binance::ONE_USDT',
 'binance::PAX_USDT',
 'binance::QTUM_USDT',
 'binance::SCU_USDT',
 'binance::TRX_USDT',
 'binance::TUSD_USDT',
 'binance::UNI_USDT',
 'binance::USDC_USDT',
 'binance::VET_USDT',
 'binance::XLM_USDT',
 'binance::XMR_USDT',
 'binance::XRP_USDT',
 'binance::ZEC_USDT',
 'bitfinex::AID_USD',
 'bitfinex::BAT_USD',
 'bitfinex::BTC_EUR',
 'bitfinex::BTC_GBR',
 'bitfinex::BTC_JPY',
 'bitfinex::BTC_USD',
 'bitfinex::DAI_BTC',
 'bitfinex::DAI_USD',
 'bitfinex::DASH_BTC',
 'bitfinex::DASH_USD',
 'bitfinex::EDO_USD',
 'bitfinex::EOS_BTC',
 'bitfinex::EOS_EUR',
 'bitfinex::EOS_GBR',
 'bitfinex::EOS_JPY',


# Compare close prices / returns from Binance

## Load the data

The code below can be used to load all the existing data from two vendors, CDD and CCXT. The current version is restricted to Binance only; however, even for a single exchange there is too much data to work with. That is why the output is limited to the intersection of currency pairs between the two universes: only the currency pairs present in both vendors can be compared.

In [10]:
# Load Binance-specific universe for CCXT.
# Match on the full "binance::" prefix (full symbols look like
# "exchange::pair") so that an exchange id that merely starts with "binance"
# is not picked up by mistake.
ccxt_binance_universe = [
    element for element in ccxt_universe if element.startswith("binance::")
]
# Load Binance-specific universe for CDD (before any clean-up).
cdd_binance_universe_initial = [
    element for element in universe_cdd if element.startswith("binance::")
]
# SCU_USDT has incorrect columns, so can not be downloaded.
# See CMTask244 - Cannot load CDD - binance - SCU/USDT from s3 for the reference.
# Filter the symbol out instead of calling `list.remove()`, which raises
# `ValueError` when the symbol is not in the universe.
cdd_binance_universe = [
    element
    for element in cdd_binance_universe_initial
    if element != "binance::SCU_USDT"
]
# The intersection of Binance currency pairs from two universes.
# Intersect with the cleaned CDD universe so that the symbol that cannot be
# loaded is never selected for loading.
currency_pair_intersection_binance = set(ccxt_binance_universe).intersection(
    cdd_binance_universe
)

In [None]:
# Root directory of the data: the "data" folder under the path returned by
# `hs3.get_path()`. It is passed to both the CDD loader and the CCXT client.
root_dir = os.path.join(hs3.get_path(), "data")

In [11]:
# Collect one CDD OHLCV dataframe per Binance full symbol shared by both vendors.
cdd_data = []
cdd_loader = imcdalolo.CddLoader(root_dir=root_dir, aws_profile="am")

for full_symbol in currency_pair_intersection_binance:
    # Only the currency pair part is needed: the exchange is fixed to
    # "binance" in the loader call below.
    _, currency_pair = ivcdclcl.parse_full_symbol(full_symbol)
    cur_data = cdd_loader.read_data_from_filesystem(
        exchange_id="binance", currency_pair=currency_pair, data_type="ohlcv"
    )
    cdd_data.append(cur_data)
# Stack the per-symbol dataframes into a single dataframe.
cdd_binance_df = pd.concat(cdd_data)

Reading CDD data for exchange id='binance', currencies='SOL_USDT', from file='s3://alphamatic-data/data/cryptodatadownload/20210924/binance/SOL_USDT.csv.gz'...
Processing CDD data for exchange id='binance', currencies='SOL_USDT'...
Index length increased by 270 = 129252 - 128982
Reading CDD data for exchange id='binance', currencies='ETH_USDT', from file='s3://alphamatic-data/data/cryptodatadownload/20210924/binance/ETH_USDT.csv.gz'...
Processing CDD data for exchange id='binance', currencies='ETH_USDT'...
Index length increased by 5288 = 948591 - 943303
Reading CDD data for exchange id='binance', currencies='BTC_USDT', from file='s3://alphamatic-data/data/cryptodatadownload/20210924/binance/BTC_USDT.csv.gz'...
Processing CDD data for exchange id='binance', currencies='BTC_USDT'...
Index length increased by 5289 = 1063179 - 1057890
Reading CDD data for exchange id='binance', currencies='EOS_USDT', from file='s3://alphamatic-data/data/cryptodatadownload/20210924/binance/EOS_USDT.csv.gz'

In [12]:
# Preview the first rows of the CDD data and show its dimensions.
display(cdd_binance_df.head(3), cdd_binance_df.shape)

Unnamed: 0,open,high,low,close,volume,epoch,currency_pair,exchange_id
2021-06-06 06:57:00+00:00,39.654,39.709,39.592,39.649,3053.129,1622963000000.0,SOL/USDT,binance
2021-06-06 06:58:00+00:00,39.646,39.719,39.633,39.634,874.508,1622963000000.0,SOL/USDT,binance
2021-06-06 06:59:00+00:00,39.634,39.686,39.624,39.658,906.122,1622963000000.0,SOL/USDT,binance


(5599808, 8)

In [13]:
# Client that reads CCXT OHLCV data from under `root_dir`.
ccxt_client = imvcdclcl.CcxtCsvFileSystemClient(
    data_type="ohlcv", root_dir=root_dir, aws_profile="am"
)
# Wrap the client so that several full symbols can be read in one call.
# NOTE(review): `mode="concat"` presumably stacks the per-symbol data into a
# single dataframe - confirm against `MultipleSymbolsClient`.
multiple_symbols_client = ivcdclcl.MultipleSymbolsClient(
    class_=ccxt_client, mode="concat"
)
# Read the CCXT data for the Binance full symbols shared by both vendors.
ccxt_binance_df = multiple_symbols_client.read_data(
    currency_pair_intersection_binance
)

Reading CCXT data for exchange id='binance', currencies='ADA_USDT' from file='s3://alphamatic-data/data/ccxt/20210924/binance/ADA_USDT.csv.gz'...
Processing CCXT data for exchange id='binance', currencies='ADA_USDT'...
Index length increased by 4520 = 1622000 - 1617480
Reading CCXT data for exchange id='binance', currencies='BNB_USDT' from file='s3://alphamatic-data/data/ccxt/20210924/binance/BNB_USDT.csv.gz'...
Processing CCXT data for exchange id='binance', currencies='BNB_USDT'...
Index length increased by 4520 = 1622295 - 1617775
Reading CCXT data for exchange id='binance', currencies='BTC_USDT' from file='s3://alphamatic-data/data/ccxt/20210924/binance/BTC_USDT.csv.gz'...
Processing CCXT data for exchange id='binance', currencies='BTC_USDT'...
Index length increased by 4519 = 1619641 - 1615122
Reading CCXT data for exchange id='binance', currencies='EOS_USDT' from file='s3://alphamatic-data/data/ccxt/20210924/binance/EOS_USDT.csv.gz'...
Processing CCXT data for exchange id='binanc

In [14]:
# Sort the rows of the CCXT data by index.
ccxt_binance_df = ccxt_binance_df.sort_index()

In [15]:
# Preview the first rows of the CCXT data and show its dimensions.
display(ccxt_binance_df.head(3), ccxt_binance_df.shape)

Unnamed: 0,full_symbol,open,high,low,close,volume,epoch,currency_pair,exchange_id
2018-08-17 00:00:00+00:00,binance::ADA_USDT,0.0946,0.0948,0.09442,0.09479,41334.2,1534464000000.0,ADA_USDT,binance
2018-08-17 00:00:00+00:00,binance::BTC_USDT,6316.0,6319.04,6310.32,6311.64,9.967395,1534464000000.0,BTC_USDT,binance
2018-08-17 00:00:00+00:00,binance::BNB_USDT,9.7779,9.7791,9.7538,9.7778,520.66,1534464000000.0,BNB_USDT,binance


(10084929, 9)

## Calculate returns and correlation

In [16]:
# CDD names cleaning.
cdd_binance_df["currency_pair"] = cdd_binance_df["currency_pair"].str.replace(
    "/", "_"
)

In [17]:
def resample_close_price(df: pd.DataFrame, resampling_freq: str) -> pd.Series:
    """
    Transform OHLCV data to the grouped series with resampled frequency and
    last close prices.

    :param df: OHLCV data indexed by timestamp, with `currency_pair` and
        `close` columns
    :param resampling_freq: pandas frequency string to resample to,
        e.g. "1D" or "5min"
    :return: last close price per currency pair and resampling period,
        indexed by (`currency_pair`, `stamp`)
    """
    # Move the DateTime index into a `stamp` column, since `pd.Grouper(key=...)`
    # operates on columns. The index is named explicitly first, so this works
    # both for an unnamed index and for an index that already has a name.
    df = df.rename_axis("stamp").reset_index()
    # Group by currency pairs and simultaneously resample to the desired frequency.
    resampler = df.groupby(
        ["currency_pair", pd.Grouper(key="stamp", freq=resampling_freq)]
    )
    # Take the last close value from each resampling period.
    close_series = resampler["close"].last()
    return close_series

In [18]:
def calculate_correlations(
    ccxt_close_price: pd.Series, cdd_close_price: pd.Series, compute_returns: bool
) -> pd.DataFrame:
    """
    Take two series with close prices (i.e. CDD and CCXT data) and calculate
    the correlations for each specific currency pair.

    Both series must be named and share the same index, whose first level is
    `currency_pair`.

    :param ccxt_close_price: grouped and resampled close prices for CCXT
    :param cdd_close_price: grouped and resampled close prices for CDD
    :param compute_returns: if True - compare returns, if False - compare close prices
    :return: correlation matrix per currency pair, with columns
        `cdd_returns`, `ccxt_returns` or `cdd_close`, `ccxt_close`
    """
    if compute_returns:
        # Group by currency pairs in order to calculate the percentage returns
        # within each pair and not across pair boundaries.
        cdd_close_price = cdd_close_price.groupby("currency_pair").pct_change()
        ccxt_close_price = ccxt_close_price.groupby("currency_pair").pct_change()
    # Combine on the shared index. CDD is the left operand and CCXT the right
    # one, so the first resulting column is CDD and the second is CCXT.
    combined = pd.merge(
        cdd_close_price, ccxt_close_price, left_index=True, right_index=True
    )
    # Rename the columns, keeping the same (CDD, CCXT) order as in the merge.
    suffix = "returns" if compute_returns else "close"
    combined.columns = [f"cdd_{suffix}", f"ccxt_{suffix}"]
    # Group by currency pair (index level 0) to calculate the correlation
    # for each currency pair separately.
    corr_matrix = combined.groupby(level=0).corr()
    return corr_matrix

In [19]:
# Corresponding resampled Series.
ccxt_binance_series_1d = resample_close_price(ccxt_binance_df, "1D")
cdd_binance_series_1d = resample_close_price(cdd_binance_df, "1D")

ccxt_binance_series_5min = resample_close_price(ccxt_binance_df, "5min")
cdd_binance_series_5min = resample_close_price(cdd_binance_df, "5min")

### 1-day returns

In [20]:
# Correlation between CCXT and CDD returns per currency pair, computed from
# the daily resampled close prices.
returns_corr_1day = calculate_correlations(
    ccxt_binance_series_1d, cdd_binance_series_1d, compute_returns=True
)
display(returns_corr_1day)

Unnamed: 0_level_0,Unnamed: 1_level_0,ccxt_returns,cdd_returns
currency_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ADA_USDT,ccxt_returns,1.0,0.997807
ADA_USDT,cdd_returns,0.997807,1.0
BNB_USDT,ccxt_returns,1.0,0.99849
BNB_USDT,cdd_returns,0.99849,1.0
BTC_USDT,ccxt_returns,1.0,0.997763
BTC_USDT,cdd_returns,0.997763,1.0
EOS_USDT,ccxt_returns,1.0,0.998294
EOS_USDT,cdd_returns,0.998294,1.0
ETH_USDT,ccxt_returns,1.0,0.995563
ETH_USDT,cdd_returns,0.995563,1.0


### 5-min returns

In [21]:
# Correlation between CCXT and CDD returns per currency pair, computed from
# the 5-minute resampled close prices.
returns_corr_5min = calculate_correlations(
    ccxt_binance_series_5min, cdd_binance_series_5min, compute_returns=True
)
display(returns_corr_5min)

Unnamed: 0_level_0,Unnamed: 1_level_0,ccxt_returns,cdd_returns
currency_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ADA_USDT,ccxt_returns,1.0,0.986912
ADA_USDT,cdd_returns,0.986912,1.0
BNB_USDT,ccxt_returns,1.0,0.987292
BNB_USDT,cdd_returns,0.987292,1.0
BTC_USDT,ccxt_returns,1.0,0.990746
BTC_USDT,cdd_returns,0.990746,1.0
EOS_USDT,ccxt_returns,1.0,0.993959
EOS_USDT,cdd_returns,0.993959,1.0
ETH_USDT,ccxt_returns,1.0,0.974821
ETH_USDT,cdd_returns,0.974821,1.0


## Compare close prices

In [22]:
# Correlation between CCXT and CDD close prices (not returns) per currency
# pair, computed from the daily resampled close prices.
close_corr_1day = calculate_correlations(
    ccxt_binance_series_1d, cdd_binance_series_1d, compute_returns=False
)
display(close_corr_1day)

Unnamed: 0_level_0,Unnamed: 1_level_0,cdd_close,ccxt_close
currency_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ADA_USDT,cdd_close,1.0,0.999995
ADA_USDT,ccxt_close,0.999995,1.0
BNB_USDT,cdd_close,1.0,0.999999
BNB_USDT,ccxt_close,0.999999,1.0
BTC_USDT,cdd_close,1.0,1.0
BTC_USDT,ccxt_close,1.0,1.0
EOS_USDT,cdd_close,1.0,0.999906
EOS_USDT,ccxt_close,0.999906,1.0
ETH_USDT,cdd_close,1.0,0.999994
ETH_USDT,ccxt_close,0.999994,1.0
