In [105]:
import numpy as np
import pandas as pd
from scipy.stats.stats import pearsonr # used to calculate correlation coefficient
from pymongo import MongoClient
import json
from datetime import datetime, timedelta
from pprint import pprint
from enum import Enum
import copy
from collections import deque

# Read timeseries of benchmark and coin

In [119]:
filename_benchmark = 'testing/BTC_Bitfinex_USD.csv'
filename_coin      = 'testing/XRP_Bitfinex_USD.csv'


def _read_price_history(filename):
    """Read one semicolon-separated, header-less price file into a DataFrame.

    The file is expected to hold the columns
    date;timestamp;open;high;low;close;volFrom;volTo in that order.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents, indexed by the parsed 'date' column.
    """
    df_history = pd.read_csv(filename, header=None, sep=";",
                             names=['date', 'timestamp', 'open', 'high', 'low', 'close', 'volFrom', 'volTo'])

    # index by date so rows can be looked up with .loc[<datetime>]
    df_history = df_history.set_index(['date'])           # index: string
    df_history.index = pd.to_datetime(df_history.index)   # index: datetime
    return df_history


# --- read benchmark ---
df_benchmark = _read_price_history(filename_benchmark)

pprint(df_benchmark)

# --- read coin ---
df_coin = _read_price_history(filename_coin)

                        timestamp      open      high       low     close  \
date                                                                        
2018-01-01 00:00:00  1.514765e+09   9830.51   9830.51   9830.51   9830.51   
2018-01-01 01:00:00  1.514768e+09   9830.42   9830.42   9830.42   9830.42   
2018-01-01 02:00:00  1.514772e+09   9874.93   9874.93   9874.93   9874.93   
2018-01-01 03:00:00  1.514776e+09   9894.21   9894.21   9894.21   9894.21   
2018-01-01 04:00:00  1.514779e+09   9900.78   9900.78   9900.78   9900.78   
2018-01-01 05:00:00  1.514783e+09  10032.37  10032.37  10032.37  10032.37   
2018-01-01 06:00:00  1.514786e+09  10120.35  10120.35  10120.35  10120.35   
2018-01-01 07:00:00  1.514790e+09  10120.25  10120.25  10120.25  10120.25   
2018-01-01 08:00:00  1.514794e+09  10070.72  10070.72  10070.72  10070.72   
2018-01-01 09:00:00  1.514797e+09  10091.04  10091.04  10091.04  10091.04   
2018-01-01 10:00:00  1.514801e+09  10170.63  10170.63  10170.63  10170.63   

# Calculate Multiplier and Correlation

If no pre-defined library is available to compute the (Pearson) correlation, it can also be calculated from its definition: <br>
https://en.wikipedia.org/wiki/Pearson_correlation_coefficient

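This relies on calculating the covariance between the two vectors and the variance of each vector individually: $\rho_{X,Y} = \operatorname{cov}(X, Y) / (\sigma_X \, \sigma_Y)$, where $\sigma_X = \sqrt{\operatorname{var}(X)}$ and $\sigma_Y = \sqrt{\operatorname{var}(Y)}$. <br>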
If you need further clarification please ask.

In [4]:
# Build the per-period return series of benchmark and coin over a fixed window,
# then derive the multiplier (slope of a linear regression of coin returns on
# benchmark returns) and the Pearson correlation of the two series.
#
# Relies on df_benchmark / df_coin (datetime-indexed, with a 'close' column)
# having been loaded by an earlier cell.

# --- input variables ---

# input parameter
# NOTE: the notebook imports `from datetime import datetime, timedelta`, so the
# class is called directly (datetime.datetime.strptime would raise AttributeError).
dt_benchmark_startTime    = datetime.strptime("2018-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")

# always current time
dt_benchmark_endTime      = datetime.strptime("2018-01-31 23:00:00", "%Y-%m-%d %H:%M:%S")

# input parameter
ReturnFrequency = "daily"

# list of currencies

# --- END input variables ---

# Resolve the step size once instead of re-testing the frequency on every iteration.
if ReturnFrequency == "hourly":
    dt_step = timedelta(hours=1)
elif ReturnFrequency == "daily":
    dt_step = timedelta(days=1)
else:
    print('ERROR. Need to implment other frequencies')
    # explicit raise: a bare `assert False` disappears when Python runs with -O
    raise AssertionError("unsupported ReturnFrequency: %r" % (ReturnFrequency,))

# first interval is [start, start + step]
dt_previousTime = dt_benchmark_startTime
dt_currentTime  = dt_previousTime + dt_step

#-----------------------------------------#
#          calculate return timeseries    #
#-----------------------------------------#

# Collect into Python lists and convert once at the end; np.append copies the
# whole array on every call.
list_PnL_benchmark = []
list_PnL_coin      = []

while dt_currentTime <= dt_benchmark_endTime:
    # return of benchmark in period [t-1, t]
    list_PnL_benchmark.append(df_benchmark.loc[dt_currentTime, 'close'] /
                              df_benchmark.loc[dt_previousTime, 'close'] - 1.0)

    # return of coin in period [t-1, t]
    list_PnL_coin.append(df_coin.loc[dt_currentTime, 'close'] /
                         df_coin.loc[dt_previousTime, 'close'] - 1.0)

    # move to next timepoint
    dt_previousTime += dt_step
    dt_currentTime  += dt_step

arr_PnL_benchmark = np.array(list_PnL_benchmark, dtype=float)
arr_PnL_coin      = np.array(list_PnL_coin, dtype=float)

#-----------------------------------------#
#          calculate multiplier           #
#-----------------------------------------#
arr_x = arr_PnL_benchmark
arr_y = arr_PnL_coin

# least square regression (linear): y = alpha + beta*x
linReg = np.polyfit(x=arr_x, y=arr_y, deg=1)

alpha = linReg[1] # this is the y-intercept, not needed
beta  = linReg[0] # this is the slope, which also is the multiplier
multiplier = beta
print("multiplier            : ", multiplier)

#-----------------------------------------#
#          calculate correlation          #
#-----------------------------------------#
# pearsonr returns (coefficient, p-value); only the coefficient is reported
correlation = pearsonr(arr_x, arr_y)
print("correlation            :", correlation[0])

multiplier            :  -0.6945408340863843
correlation            : -0.15598094157469428


# Multiplier and Correlation class calculator

The class below calculates the multiplier and the correlation for every pair of currencies in a given list.


In [140]:
class MultiplierCorellationCalculator:
    """Compute multiplier and correlation for every pair of currencies.

    For each unordered pair taken from ``currencies_list`` (earlier entry is
    the benchmark, later entry is the coin) the per-period returns of both
    currencies are computed between ``start_time`` and ``end_time``. The
    multiplier is the slope of a linear least-squares fit of the coin returns
    on the benchmark returns; the correlation is the Pearson coefficient of
    the two return series.

    Price histories are read from the MongoDB database 'darqube_db',
    collection 'currencies_collection', one document per currency.
    """

    class RequestFrequency(Enum):
        DAILY  = 0
        HOURLY = 1
    FREQUENCY_LIST = RequestFrequency.__members__.items()

    def __init__(self,
                 start_time,
                 end_time,
                 currencies_list,
                 return_frequency='daily'):
        """Store the calculation window, the currencies and the return frequency.

        Parameters
        ----------
        start_time : datetime
            First timestamp of the window.
        end_time : datetime
            Last timestamp (inclusive) for which a return is calculated.
        currencies_list : iterable of str
            Currency tags; every pair of them is evaluated.
        return_frequency : str
            'daily' or 'hourly'.
        """
        self.start_time       = start_time
        self.end_time         = end_time
        self.currencies_list  = currencies_list
        self.return_frequency = return_frequency

    def calculate_aggregated_pairs(self):
        """Return multiplier and correlation for every currency pair.

        Returns
        -------
        dict
            Maps "<benchmark>/<coin>" to
            ``{'multiplier': ..., 'correlation': ...}``.
        """
        currencies = list(self.currencies_list)
        pairs_multiplier_correlation = {}
        for position, benchmark_currency in enumerate(currencies):
            # pair the benchmark only with the currencies that follow it
            for coin_currency in currencies[position + 1:]:
                pair_tag = "%s/%s" % (benchmark_currency, coin_currency)
                multiplier, correlation = self.calculation_for_pair(benchmark_currency, coin_currency)
                pairs_multiplier_correlation[pair_tag] = {'multiplier': multiplier,
                                                          'correlation': correlation}
        return pairs_multiplier_correlation

    def calculation_for_pair(self, benchmark_tag, coin_tag):
        """Return ``(multiplier, correlation)`` of ``coin_tag`` against ``benchmark_tag``."""
        # The tags must be handed down explicitly; relying on same-named
        # notebook globals only works by accident of kernel state.
        arr_PnL_benchmark, arr_PnL_coin = self._calculate_timeseries(benchmark_tag, coin_tag)
        return self._calculate_multiplier_and_correlation(arr_PnL_benchmark, arr_PnL_coin)

    def _calculate_multiplier_and_correlation(self, arr_PnL_benchmark, arr_PnL_coin):
        """Fit coin returns on benchmark returns and correlate the two series.

        Prints both results and returns ``(multiplier, correlation)``.

        Raises
        ------
        ValueError
            If fewer than two return observations are available.
        """
        if len(arr_PnL_benchmark) < 2 or len(arr_PnL_coin) < 2:
            raise ValueError("need at least two return observations to compute "
                             "multiplier and correlation")

        # least square regression (linear): y = alpha + beta*x
        # index 0 is the slope (beta), index 1 the y-intercept (alpha, not needed)
        linReg = np.polyfit(x=arr_PnL_benchmark, y=arr_PnL_coin, deg=1)
        multiplier = linReg[0]
        print("multiplier            : ", multiplier)

        # pearsonr returns (coefficient, p-value); only the coefficient is used
        correlation = pearsonr(arr_PnL_benchmark, arr_PnL_coin)
        print("correlation            :", correlation[0])
        return (multiplier, correlation[0])

    #-----------------------------------------#
    #          calculate return timeseries    #
    #-----------------------------------------#
    def _calculate_timeseries(self, benchmark_ccy, coin_ccy):
        """Build the per-period return arrays of benchmark and coin.

        Parameters
        ----------
        benchmark_ccy, coin_ccy : str
            Currency tags whose histories are loaded.

        Returns
        -------
        tuple of numpy.ndarray
            ``(arr_PnL_benchmark, arr_PnL_coin)``, one entry per interval
            ending at or before ``self.end_time``.
        """
        # datetimes are immutable, so no copy is needed
        dt_previousTime = self.start_time
        # add first interval (also rejects an unsupported frequency before any I/O)
        dt_currentTime, = self._increment_interval(dt_previousTime)

        df_benchmark = self._retrieve_currency_history(benchmark_ccy)
        df_coin      = self._retrieve_currency_history(coin_ccy)

        # Collect into lists and convert once; np.append copies the array on every call.
        returns_benchmark = []
        returns_coin      = []

        while dt_currentTime <= self.end_time:
            # return of each series in period [t-1, t]
            returns_benchmark.append(self._period_return(df_benchmark, dt_currentTime, dt_previousTime))
            returns_coin.append(self._period_return(df_coin, dt_currentTime, dt_previousTime))
            # move to next timepoint
            dt_previousTime, dt_currentTime = self._increment_interval(dt_previousTime,
                                                                       dt_currentTime)
        return (np.array(returns_benchmark, dtype=float), np.array(returns_coin, dtype=float))

    def _period_return(self, df_data, dt_currentTime, dt_previousTime):
        """Return close(t) / close(t-1) - 1 taken from ``df_data``."""
        return df_data.loc[dt_currentTime, 'close'] / df_data.loc[dt_previousTime, 'close'] - 1.0

    def _calculate_PnL(self, arr_PnL, df_data, dt_currentTime, dt_previousTime):
        """Return ``arr_PnL`` with the return of period [t-1, t] appended."""
        return np.append(arr_PnL, self._period_return(df_data, dt_currentTime, dt_previousTime))

    def _increment_interval(self, *date_time_fields):
        """Advance every given datetime by one interval of ``self.return_frequency``.

        Returns a tuple with one shifted datetime per argument.

        Raises
        ------
        AssertionError
            If the frequency is neither 'daily' nor 'hourly'.
        """
        if self.return_frequency == 'daily':
            step = timedelta(days=1)
        elif self.return_frequency == 'hourly':
            step = timedelta(hours=1)
        else:
            print('ERROR. Need to implment other frequencies')
            # explicit raise: a bare `assert False` disappears when Python runs with -O
            raise AssertionError("unsupported return_frequency: %r" % (self.return_frequency,))
        return tuple(dt + step for dt in date_time_fields)

    # --- connect and preprocess utilities for mongo collection ---
    def _reconstruct_currency_date(self, cur):
        """Add a formatted 'date' string to every entry of ``cur['history']``.

        The string is derived from the entry's 'time' value via
        ``datetime.fromtimestamp``, i.e. it is expressed in the local timezone
        of the machine running the code. ``cur`` is modified in place and
        returned.
        """
        for entry in cur['history']:
            entry['date'] = "{:%Y-%m-%d %H:%M:%S}".format(datetime.fromtimestamp(entry['time']))
        return cur

    def _mongo_connect(self, db_name):
        """Return a handle to database ``db_name`` on the default MongoDB server."""
        mongo_c = MongoClient()
        db = mongo_c[db_name]
        # PyMongo database handles must be compared with None; recent PyMongo
        # versions raise NotImplementedError on truth-value testing.
        if db is None:
            raise Exception("database or server not found")
        return db

    def _preprocess_collection(self, collection_name, filter_params):
        """Fetch the first document matching ``filter_params`` and add date strings.

        Raises
        ------
        Exception
            If the collection handle is missing or no document matches.
        """
        db = self._mongo_connect('darqube_db')
        collection = db[collection_name]
        # same as for databases: compare with None instead of testing truthiness
        if collection is None:
            raise Exception('collection not found')
        document = collection.find_one(filter_params)
        if document is None:
            # fail here with a clear message instead of a TypeError further down
            raise Exception('no document matching %r found in collection %r'
                            % (filter_params, collection_name))
        return self._reconstruct_currency_date(document)

    def _retrieve_currency_history(self, currency):
        """Return the history of ``currency`` as a DataFrame indexed by datetime."""
        collection = self._preprocess_collection('currencies_collection', {'Ccy': currency})
        df_data = pd.DataFrame(collection['history'])
        # index by date so rows can be looked up with .loc[<datetime>]
        df_data = df_data.set_index(['date'])         # index: string
        df_data.index = pd.to_datetime(df_data.index) # index: datetime
        return df_data


In [141]:
# Run the calculator for every pair in currencies_list and print the result.

# first timestamp of the window
dt_benchmark_startTime    = datetime.strptime("2015-08-07 03:00:00", "%Y-%m-%d %H:%M:%S")

# always current time
dt_benchmark_endTime      = datetime.strptime("2018-10-08 03:00:00", "%Y-%m-%d %H:%M:%S")

currencies_list = ['BTC', 'ETH']
return_frequency = 'daily'

# Pass return_frequency explicitly; otherwise the variable above is silently
# ignored and the constructor default is used.
new_compare = MultiplierCorellationCalculator(dt_benchmark_startTime,
                                              dt_benchmark_endTime,
                                              currencies_list,
                                              return_frequency=return_frequency)
print(new_compare.calculate_aggregated_pairs())

multiplier            :  0.6886096420205688
correlation            : 0.35501628723680734
{'BTC/ETH': {'multiplier': 0.6886096420205688, 'correlation': 0.35501628723680734}}
