In [6]:
import sys

# Add the parent directory (resolved relative to the current working
# directory) to the module search path.
# NOTE(review): no import in the visible cells relies on this entry -
# confirm it is still needed.
sys.path.append("../")

In [94]:
import pandas as pd
import numpy as np
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.tsa.stattools import adfuller
from datetime import datetime, timedelta

# Directory holding the per-symbol price files; each symbol is read from
# `{DATA_PATH}/{symbol}.csv`. Presumably 5-minute bars, judging by the
# directory name - TODO confirm.
DATA_PATH = '../../data/5min'

In [97]:
# Restrict the universe to the first 20 symbols listed in symbols.csv.
symbols = pd.read_csv('../../data/symbols.csv')['symbol'].values.tolist()[:20]

data = []
# The stored history is too long, so only the most recent 30 days are kept.
start_date = (datetime.utcnow() - timedelta(days=30)).strftime('%Y-%m-%d')

for symbol in symbols:
    try:
        symbol_df = pd.read_csv(f'{DATA_PATH}/{symbol}.csv',
                                parse_dates=['open_time'],
                                index_col=['open_time'])

        # Name the close column after the symbol so the series can be
        # concatenated side by side below.
        closes = symbol_df.rename(columns={'close': symbol})
        data.append(closes.loc[closes.index > start_date, symbol])
    except (OSError, ValueError, KeyError) as err:
        # Loading stays best-effort (a missing or malformed file must not
        # abort the run), but the skip is reported instead of being silently
        # swallowed, and KeyboardInterrupt is no longer caught.
        print(f'Skipping {symbol}: {type(err).__name__}: {err}')

# One column per successfully loaded symbol, aligned on open_time.
df = pd.concat(data, axis=1)
# Drop symbols with no data at all in the window, then keep only the
# timestamps at which every remaining symbol has a price.
df = df.dropna(axis=1, how='all')
df = df.dropna(how='any')

df.tail()

Unnamed: 0_level_0,1INCH-USDT,AAVE-USDT,ACM-USDT,ADA-USDT,AION-USDT,AKRO-USDT,ALGO-USDT,ALICE-USDT,ALPACA-USDT,ALPHA-USDT,ANKR-USDT,ANT-USDT,AR-USDT,ARDR-USDT,ARPA-USDT,ASR-USDT,ATA-USDT,ATM-USDT,ATOM-USDT,AUD-USDT
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2021-08-30 00:40:00,3.124,361.5,8.35,2.871,0.1868,0.03464,1.0151,20.27,1.3394,1.095,0.10644,4.88,42.8,0.2803,0.07363,7.391,1.58,16.53,20.8,0.7294
2021-08-30 00:45:00,3.094,359.6,8.337,2.854,0.186,0.03444,1.0126,20.08,1.3366,1.0895,0.106,4.846,41.97,0.2805,0.07325,7.4,1.565,16.5,20.69,0.7295
2021-08-30 00:50:00,3.098,358.9,8.319,2.847,0.1863,0.03442,1.0125,20.14,1.3365,1.0843,0.10598,4.853,41.98,0.2808,0.07325,7.38,1.5737,16.5,20.72,0.7297
2021-08-30 00:55:00,3.104,358.7,8.317,2.851,0.1856,0.03435,1.0113,20.15,1.3349,1.0854,0.10598,4.84,42.15,0.2799,0.07334,7.429,1.5675,16.45,20.72,0.7295
2021-08-30 01:00:00,3.103,358.9,8.357,2.853,0.1859,0.03445,1.0143,20.2,1.334,1.0864,0.10616,4.846,42.09,0.2793,0.07337,7.413,1.5766,16.49,20.78,0.7295


In [98]:
# Confidence level (in percent) a basket must reach to count as cointegrated.
COINTEGRATION_CONFIDENCE_LEVEL = 90

# Lag value handed to coint_johansen as its third argument.
p = 1

# cvt (trace statistic) and cvm (maximum eigenvalue statistic) store their
# critical values column-wise: column 0 is the 90% level, column 1 the 95%
# level and column 2 the 99% level.
confidence_level_cols = dict(zip((90, 95, 99), range(3)))
confidence_level_col = confidence_level_cols[COINTEGRATION_CONFIDENCE_LEVEL]


def test_johansen(symbol_pairs):
    """Run the Johansen cointegration test on a basket of symbols.

    Reads the columns named in ``symbol_pairs`` from the notebook-level
    ``df`` and uses the notebook-level ``p`` and ``confidence_level_col``.
    When the basket passes the test, a record is appended to the
    notebook-level ``cointegrating_pairs`` list and is also returned.

    Parameters
    ----------
    symbol_pairs : list of str
        Column names of ``df`` to test together.

    Returns
    -------
    dict or None
        ``{'hedge_ratio': <first eigenvector>, 'sid_1': ..., 'sid_2': ...}``
        when every trace and maximum-eigenvalue statistic reaches its
        critical value at the configured confidence level, else ``None``.
    """
    df_t = df[symbol_pairs]

    # det_order=0 selects a constant term; k_ar_diff=p is the number of
    # lagged differences used by the test.
    result = coint_johansen(df_t, det_order=0, k_ar_diff=p)

    trace_crit_value = result.cvt[:, confidence_level_col]
    eigen_crit_value = result.cvm[:, confidence_level_col]

    # The trace statistic and maximum eigenvalue statistic are stored in lr1
    # and lr2; the basket is accepted only if all of them reach the threshold.
    # NOTE(review): this requires every rank hypothesis to be rejected, which
    # is stricter than rejecting r = 0 alone - confirm this is intended.
    passes_trace = np.all(result.lr1 >= trace_crit_value)
    passes_eigen = np.all(result.lr2 >= eigen_crit_value)
    if not (passes_trace and passes_eigen):
        return None

    # The first (leftmost) column of the eigenvector matrix holds the weights
    # of the strongest cointegrating relation. It is stored as-is, i.e. not
    # normalised to any one symbol.
    coint_pair = dict(hedge_ratio=result.evec[:, 0])
    for i, s in enumerate(symbol_pairs, start=1):
        coint_pair[f'sid_{i}'] = s

    cointegrating_pairs.append(coint_pair)
    return coint_pair



In [100]:

import itertools as it

# Number of symbols tested together in each basket.
BASKET_SIZE = 4

# Accumulator that test_johansen() appends to; resetting it here keeps the
# cell idempotent when it is re-run.
cointegrating_pairs = []

# Build the baskets from the columns actually present in df rather than from
# `symbols`: a symbol whose file failed to load, or whose column was dropped
# as all-NaN, is not in df and would raise KeyError inside test_johansen().
pairs = list(it.combinations(df.columns, BASKET_SIZE))

for pair in pairs:
    test_johansen(list(pair))

coint_df = pd.DataFrame(cointegrating_pairs)
coint_df.head()

('1INCH-USDT', 'AAVE-USDT', 'ACM-USDT', 'ADA-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ACM-USDT', 'AION-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ACM-USDT', 'AKRO-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ACM-USDT', 'ALGO-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ACM-USDT', 'ALICE-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ACM-USDT', 'ALPACA-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ACM-USDT', 'ALPHA-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ACM-USDT', 'ANKR-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ACM-USDT', 'ANT-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ACM-USDT', 'AR-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ACM-USDT', 'ARDR-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ACM-USDT', 'ARPA-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ACM-USDT', 'ASR-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ACM-USDT', 'ATA-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ACM-USDT', 'ATM-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ACM-USDT', 'ATOM-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ACM-USDT', 'AUD-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ADA-USDT', 'AION-USDT')
('1INCH-USDT', 'AAVE-USDT', 'ADA-USDT', 'AKRO-USDT

Unnamed: 0,hedge_ratio,sid_1,sid_2,sid_3,sid_4
0,"[0.46248276881633904, 709.6431556537798, -35.3...",1INCH-USDT,AKRO-USDT,ALGO-USDT,AUD-USDT
1,"[7.366773005363346, -16.02887614668137, -342.1...",1INCH-USDT,ALGO-USDT,ANKR-USDT,ARDR-USDT
2,"[0.16239878599410026, 173.56447910065663, 303....",ADA-USDT,AKRO-USDT,ANKR-USDT,ARDR-USDT
3,"[38.191182376590504, -2.6725510020982703, 346....",AION-USDT,ALGO-USDT,ANKR-USDT,ARDR-USDT
4,"[797.6590531465251, -18.303245533658288, 2.541...",AKRO-USDT,ALGO-USDT,ALPACA-USDT,ANKR-USDT
