In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import tqdm
import pandas as pd
import pathlib
import itertools
import warnings
import collections

from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import coint
from statsmodels.tsa.vector_ar.vecm import coint_johansen

In [11]:
DATA_PATH = pathlib.Path("/Users/borsden/Projects/crypto_research/data")
def get_data(interval: int):
    """Use cached version of data instead of influxdb"""
    data_path = DATA_PATH / interval
    pairs_paths = data_path.glob("*.csv")
    for pair_path in tqdm.tqdm(pairs_paths):
        data = pd.read_csv(pair_path, index_col=['time', 'pair'], parse_dates=['time'])
        
        yield pair_path.stem, data
        
        
data = dict(get_data('1h'))
number_of_records = {pair: len(df) for pair, df in data.items()}

# There are enomorous pairs with not enough data. Let skip them
# all_records_size = collections.Counter(number_of_records.values()).most_common()[0][0]
# data = {key: value for key, value in data.items() if len(value)==all_records_size}
# number_of_records = {pair: len(df) for pair, df in data.items()}

50it [00:04, 11.45it/s]


In [12]:
number_of_records

{'runeusdt': 17520,
 'blzusdt': 17520,
 'nearusdt': 17520,
 'sushiusdt': 16520,
 'aaveusdt': 17520,
 'dotusdt': 17520,
 'dogeusdt': 17520,
 'filusdt': 17520,
 'trbusdt': 17520,
 'avaxusdt': 17520,
 'ognusdt': 17520,
 'linkusdt': 17520,
 'zrxusdt': 17520,
 'solusdt': 17520,
 'dydxusdt': 17520,
 'bchusdt': 17520,
 'bnbusdt': 17520,
 'bakeusdt': 17520,
 'ltcusdt': 17520,
 'stmxusdt': 17520,
 'compusdt': 17520,
 'atomusdt': 17520,
 'snxusdt': 17520,
 'linausdt': 17520,
 'mtlusdt': 17520,
 'crvusdt': 17520,
 'trxusdt': 17520,
 'uniusdt': 17520,
 'ftmusdt': 17520,
 '1000shibusdt': 17520,
 'ethusdt': 17520,
 'storjusdt': 17520,
 'btcusdt': 17520,
 'tomousdt': 17520,
 'hbarusdt': 17520,
 'eosusdt': 17520,
 'wavesusdt': 17520,
 'sandusdt': 17520,
 'etcusdt': 17520,
 'mkrusdt': 17520,
 'lptusdt': 17520,
 'unfiusdt': 17520,
 'adausdt': 17520,
 'maskusdt': 17520,
 'kncusdt': 17520,
 'xrpusdt': 17520,
 'xlmusdt': 17520,
 'axsusdt': 17520,
 'galausdt': 17520,
 'maticusdt': 17520}

In [13]:
def get_IS_OS(data: pd.DataFrame, ratio: float):
    """
    Split data into in-sample and out-of-sample datasets based on a specified ratio.
    """
    split_index = int(len(data) * ratio)
    in_sample_data = data.iloc[:split_index]
    out_of_sample_data = data.iloc[split_index:]
    return in_sample_data, out_of_sample_data

IS_DATA = []
OS_DATA = []
ratio = 0.8

for pair_data in data.values():
    IS_data, OS_data = get_IS_OS(pair_data, ratio=ratio)
    IS_DATA.append(IS_data)
    OS_DATA.append(OS_data)

IS_DATA = pd.concat(IS_DATA)
OS_DATA = pd.concat(OS_DATA)

In [14]:
trading_pairs = IS_DATA.index.get_level_values('pair').unique()
trading_pairs

Index(['runeusdt', 'blzusdt', 'nearusdt', 'sushiusdt', 'aaveusdt', 'dotusdt',
       'dogeusdt', 'filusdt', 'trbusdt', 'avaxusdt', 'ognusdt', 'linkusdt',
       'zrxusdt', 'solusdt', 'dydxusdt', 'bchusdt', 'bnbusdt', 'bakeusdt',
       'ltcusdt', 'stmxusdt', 'compusdt', 'atomusdt', 'snxusdt', 'linausdt',
       'mtlusdt', 'crvusdt', 'trxusdt', 'uniusdt', 'ftmusdt', '1000shibusdt',
       'ethusdt', 'storjusdt', 'btcusdt', 'tomousdt', 'hbarusdt', 'eosusdt',
       'wavesusdt', 'sandusdt', 'etcusdt', 'mkrusdt', 'lptusdt', 'unfiusdt',
       'adausdt', 'maskusdt', 'kncusdt', 'xrpusdt', 'xlmusdt', 'axsusdt',
       'galausdt', 'maticusdt'],
      dtype='object', name='pair')

In [17]:
TEST_COINTEGRATION_COLUMNS = [
    'pair1', 'pair2', 
    'eg1', 'eg2', 
    'trace0', 'trace1', 
    'eig0', 'eig1', 
    'w1', 'w2'
]


def test_cointegration(data):
    """
    Test for cointegration among trading pairs.
    Assumes that the data provided is already in-sample.
    """
    # Assuming the latest data in crypto_data is the end of the in-sample period
    trading_pairs = data.index.get_level_values('pair').unique()
    results = []
    
    trading_pairs_combinations = list(itertools.combinations(trading_pairs, 2))

    for pair1, pair2 in tqdm.tqdm(trading_pairs_combinations):
        # Some trading pairs can have 
        close_pair1 = data.xs(pair1, level='pair')['close'].ffill()
        close_pair2 = data.xs(pair2, level='pair')['close'].ffill()


        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            try:
                result = [pair1, pair2]
                result += [coint(close_pair1, close_pair2)[1], coint(close_pair2, close_pair1)[1]]
    
                df_merged = pd.concat([close_pair1, close_pair2], axis=1).dropna()
                if len(df_merged) < 100:
                    continue
    
                order = VAR(df_merged).select_order().selected_orders['aic']
                cj = coint_johansen(df_merged, det_order=0, k_ar_diff=order)
                result += list(cj.lr1) + list(cj.lr2) + list(cj.evec[:, cj.ind[0]])
                results.append(result)
            except Exception as e:
                print(e, pair1, pair2)

    return pd.DataFrame(results, columns=TEST_COINTEGRATION_COLUMNS)

In [20]:
test_cointegration_result = test_cointegration(IS_DATA)


In [21]:
test_cointegration_result.to_csv("./data/test_cointegration_result.csv", index=False)

In [1]:
test_cointegration_result

NameError: name 'test_cointegration_result' is not defined