In [1]:
import sys

# Add the parent directory to the module search path.
# NOTE(review): presumably this is what lets the `stats_arb` import further
# down resolve — confirm against the repository layout.
sys.path.append("../")

In [5]:
import requests
import time
import pandas as pd
import numpy as np
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from stats_arb.tests import adf_test, kpss_test, cal_half_life, pp_test
from datetime import datetime, timedelta
from ta.volatility import BollingerBands
from datetime import datetime
import seaborn as sns
from IPython.display import clear_output
import statsmodels.api as sm

from bokeh.layouts import column
from bokeh.plotting import figure, output_notebook, show, output_file

from bokeh.models import (  # type: ignore
    ColumnDataSource,
    Span,
    HoverTool,
)

import matplotlib.pyplot as plt


# Default size (width, height) applied to every matplotlib figure created after this line.
plt.rcParams["figure.figsize"] = (18,7)


In [3]:
# Base URL that the candle requests are built on (the endpoint name is appended).
API_BASE = 'https://fapi.binance.com/fapi/v1/'

# Default candle interval used when a caller does not pass one.
TIMEFRAME = '1h'

# Column names assigned, in order, to each row of the klines JSON response.
LABELS = [
    'open_time', 'open', 'high', 'low', 'close', 'volume',
    'close_time', 'quote_asset_volume', 'number_of_trades',
    'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume',
    'ignore',
]

# Columns removed from the downloaded frame: everything except the
# timestamp, the OHLC prices and the volume.
DROP_COLUMNS = [
    label for label in LABELS
    if label not in ('open_time', 'open', 'high', 'low', 'close', 'volume')
]


def get_batch(symbol, interval='1m', start_time=0, limit=1000, timeout=30):
    """Retrieve one batch of candlesticks with a GET request.

    The request goes to ``API_BASE + 'klines'``. Connection errors and
    timeouts are retried indefinitely, sleeping five minutes between
    attempts.

    Args:
        symbol: Sent as the ``symbol`` query parameter.
        interval: Sent as the ``interval`` query parameter.
        start_time: Sent as the ``startTime`` query parameter.
        limit: Sent as the ``limit`` query parameter.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        A DataFrame whose columns are ``LABELS`` when the response status is
        200; otherwise an empty DataFrame (the failure is printed).
    """
    params = {
        'symbol': symbol,
        'interval': interval,
        'startTime': start_time,
        'limit': limit
    }
    cooldown_seconds = 5 * 60

    # Retry with a loop instead of recursion so that a long outage cannot
    # grow the call stack without bound.
    while True:
        try:
            response = requests.get(f'{API_BASE}klines', params, timeout=timeout)
        except requests.exceptions.ConnectionError:
            print('Connection error, Cooling down for 5 mins...')
            time.sleep(cooldown_seconds)
        except requests.exceptions.Timeout:
            print('Timeout, Cooling down for 5 min...')
            time.sleep(cooldown_seconds)
        else:
            break

    if response.status_code == 200:
        return pd.DataFrame(response.json(), columns=LABELS)

    print(f'Got erroneous response back {symbol}: {response}. {response.text}')
    return pd.DataFrame([])


def get_candles(base, quote, start_date: datetime, interval='1m'):
    """Download candles for ``base + quote`` from ``start_date`` until now.

    Batches are requested repeatedly through ``get_batch``, each one starting
    at the newest ``open_time`` seen so far, until the cursor reaches the
    current time, a batch comes back empty, or a batch brings nothing newer.

    Args:
        base: First part of the symbol (concatenated with ``quote``).
        quote: Second part of the symbol.
        start_date: Where to start; converted with ``start_date.timestamp()``.
        interval: Candle interval forwarded to ``get_batch``.

    Returns:
        A DataFrame indexed by ``open_time`` (as datetimes) without the
        ``DROP_COLUMNS`` columns, or ``None`` when no batch was collected.
    """
    batches = []

    # Cursor in epoch milliseconds; it advances to the newest open_time seen.
    last_timestamp = int(start_date.timestamp()) * 1000
    previous_timestamp = None

    while previous_timestamp != last_timestamp:
        # Stop once the cursor reaches the present. Compare epoch milliseconds
        # directly: mixing datetime.fromtimestamp (local time) with
        # datetime.utcnow (UTC) is off by the machine's UTC offset.
        if last_timestamp >= time.time() * 1000:
            break

        previous_timestamp = last_timestamp

        new_batch = get_batch(
            symbol=base + quote,
            interval=interval,
            start_time=last_timestamp
        )

        # requesting candles from the future returns empty
        # also stop in case response code was not 200
        if new_batch.empty:
            break

        last_timestamp = new_batch['open_time'].max()

        # sometimes no new trades took place yet;
        # in this case the batch is nothing new
        if previous_timestamp == last_timestamp:
            break

        batches.append(new_batch)
        last_datetime = datetime.fromtimestamp(last_timestamp / 1000)

        covering_spaces = 20 * ' '
        print(datetime.now(), base, quote, interval, str(last_datetime) + covering_spaces, end='\r', flush=True)

    if not batches:
        return None

    # Every request starts at the previous batch's newest open_time, so that
    # candle can show up in two consecutive batches. Keep only the most recent
    # copy so the resulting index has no duplicate timestamps.
    return (
        pd.concat(batches, ignore_index=True)
        .drop_duplicates(subset='open_time', keep='last')
        .drop(columns=DROP_COLUMNS)
        .assign(open_time=lambda frame: pd.to_datetime(frame['open_time'], unit='ms'))
        .set_index(keys=['open_time'])
    )



In [8]:
def cal_hedge_ratio(Y, X):
    """Return the slope of an ordinary-least-squares fit of ``Y`` on ``X``.

    A constant column is added to ``X`` before fitting, so the fitted
    parameters are expected to be (intercept, slope); the second one is
    returned.

    Args:
        Y: Dependent series.
        X: Explanatory series.

    Returns:
        The fitted coefficient at position 1 of the model parameters.
    """
    # Look into using Kalman Filter to calculate the hedge ratio
    exog = sm.add_constant(X)
    model = sm.OLS(Y, exog).fit()
    # With pandas inputs `params` is a label-indexed Series, and an integer key
    # on it relies on the deprecated positional fallback. Converting to an
    # array makes the positional lookup explicit and also covers array inputs.
    return np.asarray(model.params)[1]

In [9]:
def check_pair(symbols, hedge_ratio, half_life, timeframe=TIMEFRAME, nb_symbols=2, zscore_lookback=500, zscore_range=None):
    """Build the log-price spread of a pair and print its ADF test result.

    For each symbol, candles against ``USDT`` are downloaded and the natural
    log of the close price is taken. The hedge ratio is re-estimated with
    ``cal_hedge_ratio`` and the spread is
    ``log(symbols[0]) - hedge * log(symbols[1])``.

    Args:
        symbols: Base symbols; the first two define the spread.
        hedge_ratio: Not used by the active code.
        half_life: Only referenced by the commented-out plotting call.
        timeframe: Candle interval; ``'1h'`` and ``'15m'`` look back 120 days,
            anything else 30 days.
        nb_symbols: Not used by the active code.
        zscore_lookback: Only referenced by the commented-out plotting calls.
        zscore_range: Only referenced by the commented-out plotting call.

    Returns:
        The spread as a pandas Series.
    """
    # The lookback depends only on the timeframe, so compute it (and the start
    # date) once; every symbol is then requested from the same instant.
    lookback_days = 4 * 30 if timeframe in ('1h', '15m') else 30
    start_date = datetime.utcnow() - timedelta(days=lookback_days)

    data = []
    for symbol in symbols:
        candles = get_candles(base=symbol, quote='USDT', start_date=start_date, interval=timeframe)
        if candles is None:
            continue

        # Log close price, labelled with the symbol so it becomes the column name.
        log_close = np.log(candles['close'].astype(np.float32)).rename(symbol)
        data.append(log_close)

    df = pd.concat(data, axis=1)
    df = df.dropna(axis=1, how='all')
    # Timestamps missing for one symbol would put NaN into the regression and
    # the spread; keep only rows where every symbol has a value.
    df = df.dropna(axis=0, how='any')

    hedge_r = cal_hedge_ratio(df[symbols[0]], df[symbols[1]])
    spread = df[symbols[0]] - hedge_r * df[symbols[1]]
    # plot_spread(spread, lookback=zscore_lookback)
    # plot_zscore(spread, half_life, zscore_range=zscore_range, lookback=zscore_lookback, plot_name='-'.join(symbols))
    print(adf_test(spread, verbose=True))
    return spread

In [10]:
# Inputs for the pair under test.
symbols = ['ETC', 'UNI']
hedge_ratio = [19.544813797954433, -24.40562001819398]
half_life = 75

# Compute the spread on hourly candles; the ADF result is printed by check_pair.
spread = check_pair(
    symbols=symbols,
    hedge_ratio=hedge_ratio,
    half_life=half_life,
    timeframe='1h',
    zscore_lookback=500,
    zscore_range=[2.5, 4, -2.5, -4],
)

Results of Dickey-Fuller Test: USDT 1h 2023-04-12 08:00:00                    
Result: The series is  not  stationary
0.14060343538215003


  x = pd.concat(x[::order], 1)
