In [2]:
import sys

sys.path.append("../")

In [6]:
import pandas as pd
import numpy as np
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.tsa.stattools import adfuller
from datetime import datetime, timedelta

# Directory the loader reads from: one CSV per symbol, at f'{DATA_PATH}/{symbol}.csv'.
# Presumably 5-minute candles, going by the directory name — TODO confirm.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
DATA_PATH = '/mnt/d/Trading/trading-agent/crypto-pair-trading/data/5min'

In [9]:
# Build one DataFrame of prices (one column per symbol, indexed by open_time)
# restricted to the last 30 days.
# symbols = pd.read_csv(f'../../data/symbols.csv')['symbol'].values.tolist()
symbols = ['BTC', 'ETH', 'ADA', 'BNB', 'BCH', 'XRP', 'SOL', 'DOGE', 'LTC', 'LINK', 'XLM', 'NEAR']
symbols = [f'{s}-USDT' for s in symbols]

data = []
# the data is too long, just limit to recent period
start_date = (datetime.utcnow() - timedelta(days=30)).strftime('%Y-%m-%d')

for symbol in symbols:
    file = f'{DATA_PATH}/{symbol}.csv'
    print(file)
    try:
        prices = pd.read_csv(file,
                             parse_dates=['open_time'],
                             index_col=['open_time'])
        # Name the close column after the symbol so the columns stay
        # distinguishable once all symbols are concatenated side by side.
        prices = prices.rename(columns={'close': symbol})
        recent = prices.loc[prices.index > start_date, symbol]
    except (OSError, KeyError, ValueError) as exc:
        # A missing/unreadable file (OSError), a missing 'close' column
        # (KeyError) or a malformed CSV / missing 'open_time' (ValueError)
        # skips this symbol only; report it instead of hiding it.
        print(f'  skipped {symbol}: {exc!r}')
        continue
    data.append(recent)

if not data:
    # pd.concat would raise a ValueError here anyway; make the cause explicit.
    raise ValueError(f'no price data could be loaded from {DATA_PATH}')

df = pd.concat(data, axis=1)
# Drop symbols with no data at all in the window, then keep only the
# timestamps for which every remaining symbol has a price.
df = df.dropna(axis=1, how='all')
df = df.dropna(how='any')

df.tail()

/mnt/d/Trading/trading-agent/crypto-pair-trading/data/5min/BTC-USDT.csv
/mnt/d/Trading/trading-agent/crypto-pair-trading/data/5min/ETH-USDT.csv
/mnt/d/Trading/trading-agent/crypto-pair-trading/data/5min/ADA-USDT.csv
/mnt/d/Trading/trading-agent/crypto-pair-trading/data/5min/BNB-USDT.csv
/mnt/d/Trading/trading-agent/crypto-pair-trading/data/5min/BCH-USDT.csv
/mnt/d/Trading/trading-agent/crypto-pair-trading/data/5min/XRP-USDT.csv
/mnt/d/Trading/trading-agent/crypto-pair-trading/data/5min/SOL-USDT.csv
/mnt/d/Trading/trading-agent/crypto-pair-trading/data/5min/DOGE-USDT.csv
/mnt/d/Trading/trading-agent/crypto-pair-trading/data/5min/LTC-USDT.csv
/mnt/d/Trading/trading-agent/crypto-pair-trading/data/5min/LINK-USDT.csv
/mnt/d/Trading/trading-agent/crypto-pair-trading/data/5min/XLM-USDT.csv
/mnt/d/Trading/trading-agent/crypto-pair-trading/data/5min/NEAR-USDT.csv


Unnamed: 0_level_0,BTC-USDT,ETH-USDT,ADA-USDT,BNB-USDT,BCH-USDT,XRP-USDT,SOL-USDT,DOGE-USDT,LTC-USDT,LINK-USDT,XLM-USDT,NEAR-USDT
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-08-30 00:40:00,48563.4,3206.72,2.871,476.7,655.5,1.1372,96.26,0.2792,173.6,25.51,0.3436,5.691
2021-08-30 00:45:00,48410.54,3187.69,2.854,474.4,654.7,1.1317,95.05,0.2781,172.9,25.39,0.3424,5.79
2021-08-30 00:50:00,48418.46,3188.74,2.847,473.3,653.8,1.1296,95.94,0.2785,172.4,25.31,0.3425,5.731
2021-08-30 00:55:00,48400.01,3190.53,2.851,473.3,654.2,1.1312,96.83,0.2785,172.6,25.37,0.3426,5.729
2021-08-30 01:00:00,48427.73,3190.47,2.853,473.6,654.6,1.1321,97.46,0.279,172.8,25.42,0.3424,5.709


In [None]:
# Lag setting handed to the Johansen test.
p = 1
# Confidence level, in percent, at which the test statistics are judged.
COINTEGRATION_CONFIDENCE_LEVEL = 90

# cvt (trace statistic) and cvm (maximum eigenvalue statistic) hold their
# critical values column-wise: first column 90%, second 95%, third 99%.
confidence_level_cols = {level: column for column, level in enumerate((90, 95, 99))}
confidence_level_col = confidence_level_cols[COINTEGRATION_CONFIDENCE_LEVEL]


def test_johansen(symbol_pairs):
    """Run the Johansen test on a group of symbols and record it if it passes.

    Relies on notebook globals: reads prices from ``df``, the lag setting
    ``p`` and the critical-value column ``confidence_level_col``; appends to
    the list ``cointegrating_pairs``.

    Parameters
    ----------
    symbol_pairs : list of str
        Column names of ``df`` to test together.

    Returns
    -------
    dict or None
        When the test passes, the record that was appended to
        ``cointegrating_pairs``: ``hedge_ratio`` (the first column of the
        eigenvector matrix, not normalised) plus ``sid_1`` .. ``sid_N`` with
        the symbols in the order given. ``None`` when the test does not pass.

    Raises
    ------
    KeyError
        If any requested symbol is not a column of ``df``.
    """
    df_t = df[symbol_pairs].copy()

    # The second and third parameters indicate constant term, with a lag of p.
    result = coint_johansen(df_t, 0, p)

    trace_crit_value = result.cvt[:, confidence_level_col]
    eigen_crit_value = result.cvm[:, confidence_level_col]

    # The trace statistic and maximum eigenvalue statistic are stored in lr1 and lr2;
    # see if they exceeded the confidence threshold.
    # NOTE(review): this requires *every* entry of lr1/lr2 to exceed its
    # critical value, which is stricter than only checking the first entry
    # (the "no cointegration" hypothesis) — confirm this is intended.
    trace_passed = np.all(result.lr1 >= trace_crit_value)
    eigen_passed = np.all(result.lr2 >= eigen_crit_value)
    if not (trace_passed and eigen_passed):
        return None

    # The first i.e. leftmost column of eigenvectors matrix, result.evec, contains the best weights.
    # It is stored as-is; the spread is the weighted sum of the price columns
    # using these weights, in the order of symbol_pairs.
    coint_pair = dict(hedge_ratio=result.evec[:, 0])
    for i, s in enumerate(symbol_pairs):
        coint_pair[f'sid_{i+1}'] = s

    cointegrating_pairs.append(coint_pair)
    return coint_pair



In [13]:

import itertools as it

# Results are collected here by test_johansen (it appends to this list).
cointegrating_pairs = []

# Only symbols that still have a column in df can be tested. Filter them up
# front and say which ones are missing, rather than swallowing a KeyError for
# every combination that contains one.
available_symbols = [s for s in symbols if s in df.columns]
missing_symbols = [s for s in symbols if s not in df.columns]
if missing_symbols:
    print(f'no price column for {missing_symbols}; combinations containing them are not tested')

# get symbol combinations (groups of three, despite the variable name)
pairs = list(it.combinations(available_symbols, 3))

for pair in pairs:
    test_johansen(list(pair))

coint_df = pd.DataFrame(cointegrating_pairs)
coint_df.head()

Unnamed: 0,hedge_ratio,sid_1,sid_2,sid_3
0,"[0.0016846619402943984, 0.09584478737200776, -...",ETH-USDT,BCH-USDT,XLM-USDT
1,"[0.010319176444773542, -0.3660049313883571, 93...",ETH-USDT,LTC-USDT,XLM-USDT
2,"[0.07647061999180493, 0.11507398929309798, -13...",BCH-USDT,LTC-USDT,XLM-USDT


In [14]:
# Show the full weight vector stored for the first row of coint_df.
coint_df.iloc[0].loc['hedge_ratio']

array([ 1.68466194e-03,  9.58447874e-02, -1.31904159e+02])

In [15]:
# Save the results table to the working directory (row index included).
coint_df.to_csv(path_or_buf='coint_df.csv')