In [1]:
import os
import time
import pickle
import pandas as pd
import numpy as np
import fathon
from fathon import fathonUtils as utils
from multiprocessing import Pool
from scipy.stats import linregress
# Placeholder for the data location; not read by any cell visible in this notebook.
data_folder = 'path/to/data/folder'
# Symbols are kept sorted, so a symbol_index always maps to the same symbol.
symbol_list = sorted(['FB1', 'JB1', 'XM1', 'TY1', 'TU1', 'RX1', 'YM1', 'G_1', 'KE1', 'FV1', 'US1', 'DU1'])  # symbols for T71 USB Drive
# Window sizes handed to the DCCA computations (range 10..1000 built with step 20).
window_sizes = utils.linRangeByStep(10, 1000, step=20)
# Passed as `polOrd` to the fathon DCCA calls.
polynomial_order = 2

num_processes = 4  # Set the number of processes for parallelization


In [2]:
# Root folder of the experiment data.
base_path = '/media/ak/T71/August11th2022Experiments'

# The files in exp_input_path carry additional variables, for example:
#   'micro_price', 'price_imbalance', 'pct_change_micro_price',
#   'weighted_activity_spread', 'GK_vol', 'arrival_rates',
#   'returns_normalised', 'returns_mix_max', 'skew', 'kurt',
#   'median_traded_volume'
exp_input_path = os.path.join(base_path, 'ExperimentInputFiles')

# The symbol folders in exp_one_path contain files with
# ['micro_price_change', 'arrival_rates', 'gk_vol', 'median_traded_volume', 'tick'].
exp_one_path = os.path.join(base_path, 'ExperimentOne')
symbols = os.listdir(exp_one_path)

# Mixed-symbol files, picked out by a substring of their file name.
hurstFiles = [name for name in os.listdir(exp_input_path) if 'HurstD' in name]
rhoFiles = [name for name in os.listdir(exp_input_path) if 'Rho' in name]


In [3]:
# Show the entries found under exp_one_path (one folder per symbol).
os.listdir(exp_one_path)

['FB1',
 'JB1',
 'XM1',
 'TY1',
 'TU1',
 'RX1',
 'YM1',
 'G_1',
 'KE1',
 'FV1',
 'US1',
 'DU1']

In [4]:
# os.listdir(exp_input_path)
# path = os.path.join(exp_input_path,'G_1')
# files = os.listdir(path)
# fileIdx = 1
# pd.read_pickle(os.path.join(path, files[fileIdx])).columns.values

In [5]:
# fb1_path = os.path.join(exp_one_path, 'FB1')
# fb1_files= os.listdir(fb1_path)
# idx_file = 4

# pd.read_pickle(os.path.join(fb1_path, fb1_files[idx_file])).keys()

In [66]:
class CrossCorrelation:
    """
    Per-symbol access to pickled bar files plus DCCA helpers built on fathon.

    An instance points at ``<path>/<symbol>`` and serves the files in that folder
    whose name contains the chosen bar. The static ``compute_*`` methods run
    fathon's DCCA on a pair of series and do not use instance state.
    """

    def __init__(self, path, symbols, symbol_index, bar_choice):
        """
        Initialize CrossCorrelation object.

        :param path: str, path to the data folder (one sub-folder per symbol)
        :param symbols: list, list of symbols
        :param symbol_index: int, index for the symbol
        :param bar_choice: str, bar for information clock
        """
        self._symbol_index = symbol_index
        self._symbols = symbols
        self._bar = bar_choice
        self._symbol = self._symbols[self._symbol_index]
        self._symbol_filepath = os.path.join(path, str(self._symbol))
        # os.listdir returns entries in arbitrary order; sorting makes a given
        # file_index refer to the same file on every run and machine.
        self._list_of_files = sorted(os.listdir(self._symbol_filepath))

    def get_data_and_path(self):
        """
        Get list of files and path to those files.

        :return: tuple, (file names containing the bar name, folder holding them)
        """
        bar_name = str(self._bar)
        files_with_bar = [file for file in self._list_of_files if bar_name in file]
        return files_with_bar, self._symbol_filepath

    def get_all_data_from_file(self, file_index):
        """
        Get all data from file.

        :param file_index: int, index of file in the list returned by get_data_and_path
        :return: the object stored in the pickle file
        :raises IndexError: if file_index is outside the list of matching files
        """
        files_with_bar, symbol_filepath = self.get_data_and_path()
        file_to_get = os.path.join(symbol_filepath, files_with_bar[file_index])
        # NOTE: unpickling can execute arbitrary code; only use on trusted files.
        return pd.read_pickle(file_to_get)

    def get_microvar_data(self, file_index, variable):
        """
        Get microvar data.

        :param file_index: int, index of file in the list
        :param variable: str, micro-structure market variable
        :return: the entry stored under ``variable`` in the loaded file
        """
        data_dict = self.get_all_data_from_file(file_index)
        return data_dict[str(variable)]

    @staticmethod
    def _prepare_dcca_inputs(var_a, var_b, window_sizes):
        """
        Aggregate both series and keep only the window sizes that fit them.

        :param var_a: first input series
        :param var_b: second input series
        :param window_sizes: iterable of window sizes
        :return: tuple, (aggregated a, aggregated b, int64 array of usable window sizes)
        """
        aggregated_var_a = utils.toAggregated(var_a)
        aggregated_var_b = utils.toAggregated(var_b)
        min_input_length = min(len(aggregated_var_a), len(aggregated_var_b))
        # Always hand fathon an int64 array, whether or not anything is trimmed.
        sizes = np.asarray(window_sizes, dtype=np.int64)
        # Windows must be strictly shorter than the shorter series.
        return aggregated_var_a, aggregated_var_b, sizes[sizes < min_input_length]

    @staticmethod
    def _nan_fallback(window_sizes, reason):
        """
        Warn about a skipped DCCA computation and build placeholder results.

        :param window_sizes: int64 array of the window sizes that were going to be used
        :param reason: str, short explanation included in the warning
        :return: tuple, (window_sizes, array of NaN with the same shape)
        """
        import warnings  # local import: not part of the notebook's import cell

        warnings.warn("DCCA computation skipped: " + reason, RuntimeWarning, stacklevel=3)
        return window_sizes, np.full(window_sizes.shape, np.nan)

    @staticmethod
    def compute_n_rho(var_a, var_b, window_sizes, polynomial_order):
        """
        Compute n and Rho for the given variables.

        If no window size fits the data, or fathon raises ZeroDivisionError, a
        RuntimeWarning is issued and rho is returned as NaN for every usable
        window size instead of failing.

        :param var_a: Series, first variable for the dCCa rho
        :param var_b: Series, second variable for the dCCa rho
        :param window_sizes: list, window sizes used for the computation
        :param polynomial_order: int, polynomial order
        :return: tuple, (n, Rho)
        """
        aggregated_var_a, aggregated_var_b, sizes = CrossCorrelation._prepare_dcca_inputs(
            var_a, var_b, window_sizes
        )
        if sizes.size == 0:
            return CrossCorrelation._nan_fallback(sizes, "no window size is smaller than the input length")
        try:
            dcca = fathon.DCCA(aggregated_var_a, aggregated_var_b)
            n, _ = dcca.computeFlucVec(sizes, polOrd=polynomial_order)
            _, rho = dcca.computeRho(sizes, polOrd=polynomial_order)
        except ZeroDivisionError:
            # Previously swallowed with `pass`, which left n and rho unbound.
            return CrossCorrelation._nan_fallback(sizes, "ZeroDivisionError raised by fathon")
        return n, rho

    @staticmethod
    def compute_n_F_h_h_intercept_dcca(var_a, var_b, window_sizes, polynomial_order):
        """
        Compute the DCCA fluctuation vector and fit its exponent and intercept.

        If no window size fits the data, or fathon raises ZeroDivisionError, a
        RuntimeWarning is issued and F, H and H_intercept are returned as NaN.

        :param var_a: Series, first variable
        :param var_b: Series, second variable
        :param window_sizes: list, window sizes used for the computation
        :param polynomial_order: int, polynomial order
        :return: tuple, (n, F, H, H_intercept)
        """
        aggregated_var_a, aggregated_var_b, sizes = CrossCorrelation._prepare_dcca_inputs(
            var_a, var_b, window_sizes
        )
        if sizes.size == 0:
            n, F = CrossCorrelation._nan_fallback(sizes, "no window size is smaller than the input length")
            return n, F, np.nan, np.nan
        try:
            dcca = fathon.DCCA(aggregated_var_a, aggregated_var_b)
            n, F = dcca.computeFlucVec(sizes, polOrd=polynomial_order)
            # Fit on the object created above (the original referenced an undefined `pydcca`).
            H, H_intercept = dcca.fitFlucVec()
        except ZeroDivisionError:
            n, F = CrossCorrelation._nan_fallback(sizes, "ZeroDivisionError raised by fathon")
            return n, F, np.nan, np.nan
        return n, F, H, H_intercept


In [45]:
# symbol_list is sorted, so index 3 selects 'G_1'; only files whose name contains 'tick' are used.
cc3 = CrossCorrelation(path= exp_one_path, symbols=symbol_list, symbol_index= 3, bar_choice= 'tick')

In [46]:
# File names matching the chosen bar, and the folder that holds them.
list_of_files, symbol_path = cc3.get_data_and_path()
# Load the matching file at index 3.
data_from_file = cc3.get_all_data_from_file(3)
# Pull the entry stored under the bar name out of the loaded object.
mfdfa_data_for_file = data_from_file['tick'] # requires bar input

In [51]:
# Two microstructure variables taken from the matching file at index 2 of the selected symbol.
var_a = cc3.get_microvar_data(2, 'arrival_rates')
var_b = cc3.get_microvar_data(2, 'median_traded_volume')
# NOTE(review): `index` is not read by any cell visible in this notebook.
index=0
# Aggregated series, the form CrossCorrelation passes to fathon.DCCA.
aggregated_var_a = utils.toAggregated(var_a)
aggregated_var_b = utils.toAggregated(var_b)
# # Check if the last value in window_sizes is smaller than the input vector lengths
# min_input_length = min(len(aggregated_var_a), len(aggregated_var_b))
# if window_sizes[-1] >= min_input_length:
#     # Update the window_sizes array
#         window_sizes = [size for size in window_sizes if size < min_input_length]
#         window_sizes = np.asanyarray([int(size) for size in window_sizes],dtype=np.int64)
# try:
#     dcca = fathon.DCCA(aggregated_var_a, aggregated_var_b)
#     n, _ = dcca.computeFlucVec(window_sizes, polOrd=polynomial_order)
#     _, rho = dcca.computeRho(window_sizes, polOrd=polynomial_order)
# except ZeroDivisionError:
#     pass

In [42]:
# Display the rho vector.
# NOTE(review): read top to bottom, `rho` is first assigned by the compute_n_rho call in a
# later cell, so this cell only works after out-of-order execution.
rho

array([-0.00194493,  0.01077823,  0.00190351, -0.00715906, -0.01056668,
       -0.01356149, -0.01729795, -0.0207053 , -0.02158909, -0.01991595,
       -0.01682543, -0.01300241, -0.00847855, -0.00313096,  0.00273418,
        0.00856487,  0.01387941,  0.01855371,  0.02274335,  0.02668775,
        0.03048691,  0.03411193,  0.03743176,  0.04033947,  0.04284276,
        0.04505329,  0.04710566,  0.04909785,  0.05110951,  0.05317847,
        0.05531206,  0.05751641,  0.0597705 ,  0.06202193,  0.06424143,
        0.06641752,  0.06852925,  0.07056299,  0.07252554,  0.07444849,
        0.07636047,  0.07826817,  0.08015647,  0.08199829,  0.08376719,
        0.08544432,  0.08701286,  0.08845927,  0.08978884,  0.09101205])

In [48]:
# Use the series loaded above as var_a / var_b; the names `arrival_rates` and
# `median_traded_volume` are never assigned anywhere in this notebook.
n, rho = cc3.compute_n_rho(
    var_a=var_a,
    var_b=var_b,
    window_sizes=window_sizes,
    polynomial_order=polynomial_order,
)

In [49]:
# NOTE(review): CrossCorrelation as defined in this notebook has no method named
# `compute_h_h_intercept_dcca` (only `compute_n_F_h_h_intercept_dcca`), so this call fails
# on a fresh run; the output below presumably came from an earlier version of the class.
cc3.compute_h_h_intercept_dcca(n, rho)



(nan, nan)

In [None]:
def compute_h_h_intercept_dcca(args):
    """
    Fit the DCCA fluctuation function for one pair of series.

    All inputs arrive packed in a single tuple so the function can be handed
    to map-style callers that pass exactly one argument per task.

    :param args: tuple, (var_a, var_b, idx, win_sizes, pol_ord) where var_a and
        var_b are indexable collections of series, idx selects the series to use
        from both, win_sizes are the window sizes and pol_ord the polynomial order
    :return: tuple, (n, h, h_intercept)
    """
    var_a, var_b, idx, win_sizes, pol_ord = args
    a = utils.toAggregated(var_a[idx])
    b = utils.toAggregated(var_b[idx])

    # The fluctuation vector has to be computed before it can be fitted.
    dcca = fathon.DCCA(a, b)
    n, _ = dcca.computeFlucVec(win_sizes, polOrd=pol_ord)
    h, h_intercept = dcca.fitFlucVec()
    return n, h, h_intercept

In [58]:
# Build the DCCA object from the aggregated series; no cell in this notebook creates
# `pydcca`, so the calls below failed on a fresh kernel.
pydcca = fathon.DCCA(aggregated_var_a, aggregated_var_b)

n, F = pydcca.computeFlucVec(winSizes=window_sizes, polOrd=polynomial_order)

# Fit the fluctuation vector computed just above.
H, H_intercept = pydcca.fitFlucVec()

In [62]:
# Display the intercept returned by fitFlucVec() in the previous cell.
H_intercept

-0.3116545522780598

In [67]:
# Multifractal DCCA on the same pair of series.
# NOTE(review): the run recorded below failed with "module 'fathon' has no attribute
# 'MFDCCA'", i.e. the installed fathon did not provide this class.
a = utils.toAggregated(var_a)
b = utils.toAggregated(var_b)
pymfdcca = fathon.MFDCCA(a, b)

# fathonUtils is imported as `utils` in this notebook; there is no `fu` alias.
winSizes = utils.linRangeByStep(10, 2000)
qs = np.arange(-3, 4, 0.1)
revSeg = True
polOrd = 1

n, F = pymfdcca.computeFlucVec(winSizes, qs, revSeg=revSeg, polOrd=polOrd)

list_H, list_H_intercept = pymfdcca.fitFlucVec()





AttributeError: module 'fathon' has no attribute 'MFDCCA'

In [73]:
# %pip installs into the environment of the running kernel; `!pip` can hit a different one.
# Restart the kernel after upgrading so the new fathon is actually imported.
%pip install --upgrade fathon

