In [228]:
import numpy as np
import pandas as pd
from scipy.stats.stats import pearsonr # used to calculate correlation coefficient
from pymongo import MongoClient
import json
from datetime import datetime, timedelta, date
from pprint import pprint
from enum import Enum
import copy
from collections import deque
from time import time

# Read timeseries of benchmark and coin

In [119]:
filename_benchmark = 'testing/BTC_Bitfinex_USD.csv'
filename_coin      = 'testing/XRP_Bitfinex_USD.csv'

# --- read benchmark ---
df_benchmark = pd.read_csv(filename_benchmark, header=None, sep=";", 
                            names=['date', 'timestamp', 'open', 'high', 'low', 'close', 'volFrom', 'volTo'])
    
# this makes indexing via date faster
df_benchmark = df_benchmark.set_index(['date'])         # index: string
df_benchmark.index = pd.to_datetime(df_benchmark.index) # index: datetime

pprint(df_benchmark)

# --- read coin ---
df_coin = pd.read_csv(filename_coin, header=None, sep=";", 
                      names=['date', 'timestamp', 'open', 'high', 'low', 'close', 'volFrom', 'volTo'])
    
# this makes indexing via date faster
df_coin = df_coin.set_index(['date'])         # index: string
df_coin.index = pd.to_datetime(df_coin.index) # index: datetime

                        timestamp      open      high       low     close  \
date                                                                        
2018-01-01 00:00:00  1.514765e+09   9830.51   9830.51   9830.51   9830.51   
2018-01-01 01:00:00  1.514768e+09   9830.42   9830.42   9830.42   9830.42   
2018-01-01 02:00:00  1.514772e+09   9874.93   9874.93   9874.93   9874.93   
2018-01-01 03:00:00  1.514776e+09   9894.21   9894.21   9894.21   9894.21   
2018-01-01 04:00:00  1.514779e+09   9900.78   9900.78   9900.78   9900.78   
2018-01-01 05:00:00  1.514783e+09  10032.37  10032.37  10032.37  10032.37   
2018-01-01 06:00:00  1.514786e+09  10120.35  10120.35  10120.35  10120.35   
2018-01-01 07:00:00  1.514790e+09  10120.25  10120.25  10120.25  10120.25   
2018-01-01 08:00:00  1.514794e+09  10070.72  10070.72  10070.72  10070.72   
2018-01-01 09:00:00  1.514797e+09  10091.04  10091.04  10091.04  10091.04   
2018-01-01 10:00:00  1.514801e+09  10170.63  10170.63  10170.63  10170.63   

# Calculate Multiplier and Correlation

If there is no pre-defined libraries to compute (Pearson) correlation, it can also be calculated by its definition: <br>
https://en.wikipedia.org/wiki/Pearson_correlation_coefficient

This relies on calculating the covariance between two vectors, and the variance of each vector indidually. <br>
If you need further clarification please ask.

In [4]:
#

# input variables

#input parameter
dt_benchmark_startTime    = datetime.datetime.strptime("2018-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")

#always current time
dt_benchmark_endTime      = datetime.datetime.strptime("2018-01-31 23:00:00", "%Y-%m-%d %H:%M:%S")

#input parameter
ReturnFrequency = "daily"

#list of currencies

#END input variables
    
dt_currentTime = dt_benchmark_startTime

# add first interval
dt_previousTime = dt_currentTime
if ReturnFrequency == "hourly":
    dt_currentTime += datetime.timedelta(hours=1)
elif ReturnFrequency == "daily":
    dt_currentTime += datetime.timedelta(days=1)
else:
    print('ERROR. Need to implment other frequencies')
    assert(False)


arr_PnL_benchmark  = np.array([])
arr_PnL_coin       = np.array([])
   
#-----------------------------------------#
#          calculate return timeseries    #   
#-----------------------------------------#

while (dt_currentTime <= dt_benchmark_endTime):
    # calculate return of benchmark in period [t-1, t]
    PnL_benchmark = df_benchmark.loc[dt_currentTime]['close'] / \
                    df_benchmark.loc[dt_previousTime]['close'] -1.0
    arr_PnL_benchmark = np.append(arr_PnL_benchmark, PnL_benchmark)
    
    # calculate return of strategy in period [t-1, t] (based on equity, i.e. MtM value of positions)
    PnL_coin = df_coin.loc[dt_currentTime]['close']  / \
               df_coin.loc[dt_previousTime]['close'] -1.0
    arr_PnL_coin = np.append(arr_PnL_coin, PnL_coin)
     
    # move to next timepoint
    if ReturnFrequency == "hourly":
        dt_previousTime += datetime.timedelta(hours=1)
        dt_currentTime  += datetime.timedelta(hours=1)
    elif ReturnFrequency == "daily":
        dt_previousTime += datetime.timedelta(days=1)
        dt_currentTime += datetime.timedelta(days=1)
    else:
        print('ERROR. Need to implment other frequencies')
        assert(False)

#-----------------------------------------#
#          calculate multiplier           #   
#-----------------------------------------#
arr_x = arr_PnL_benchmark
arr_y = arr_PnL_coin

# least square regression (linear): y = alpha + beta*x
linReg = np.polyfit(x=arr_PnL_benchmark, y=arr_PnL_coin, deg=1)

alpha = linReg[1] # this is the y-intercept, not needed
beta  = linReg[0] # this is the slope, which also is the multiplier
multiplier = beta
print("multiplier            : ", multiplier)

#-----------------------------------------#
#          calculate correlation          #   
#-----------------------------------------#
correlation = pearsonr(arr_PnL_benchmark, arr_PnL_coin)
print("correlation            :", correlation[0])

multiplier            :  -0.6945408340863843
correlation            : -0.15598094157469428


# Multiplier and Correlation class calculator

Class calculates multiplier and correlation matrix


In [361]:
from flask import Flask
from flask import request
import numpy as np
import pandas as pd
from scipy.stats.stats import pearsonr # used to calculate correlation coefficient
from pymongo import MongoClient
import json
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from calendar import monthrange
from pprint import pprint
from enum import Enum
import copy
from collections import deque
import time

class MultiplierCorellationCalculator:
    class RequestFrequency(Enum):
        DAILY  = 0
        HOURLY = 1
    
    class HourlyTimeIntervals(Enum):
        A_DAY       = 1
        FIVE_DAYS   = 5
        WEEK        = 7
        TEN_DAYS    = 10
        TWO_WEEKS   = 14
    
    class DailyTimeIntervals(Enum):
        A_MONTH      = 1
        THREE_MONTHS = 3
        HALF_YEAR    = 6
        NINE_MONTHS  = 9
        A_YEAR       = 12
    
    FREQUENCY_LIST        = RequestFrequency.__members__.keys()
    HOURLY_TIME_INTERVALS = list(map(lambda x: x.value, HourlyTimeIntervals.__members__.values()))
    DAILY_TIME_INTERVALS  = list(map(lambda x: x.value, DailyTimeIntervals.__members__.values()))
    TIME_INTERVALS_DICT   = {
        'hourly': HOURLY_TIME_INTERVALS, 
        'daily': DAILY_TIME_INTERVALS,
    }
    
    def __init__(self,
                 db_name='darqube_db',
                 time_interval=1,
                 currencies_list='all',
                 return_frequency='daily'):
        if return_frequency.upper() not in self.FREQUENCY_LIST:
            raise Exception('Only [daily, hourly] values supports for return_frequency parameter yet...')
        self.mongo_c = None
        self.db_name = db_name
        self.mongo_c = self._mongo_connect()
        self.db = self.mongo_c[db_name]
        self.currencies_list  = currencies_list
        if currencies_list == 'all':
            self.currencies_list = self.mongo_c['coin_pairs'].find()['name']
        self.time_points =  self.TIME_INTERVALS_DICT[return_frequency]
        if time_interval not in self.time_points:
            msg = 'Only %s values supports for %s collection' % (','.join(self.time_points), 
                                                                 return_frequency)
            raise Exception(msg)
        self.return_frequency = "%s_data" % return_frequency
        self.currencies       = {}
    
    
    def _preprocess_time_intervals(self): 
        if return_frequency == 'daily':
            interval_days_sum  = 0
            now                = datetime.now()
            month_days         = lambda t: monthrange(t.year, t.month)[1]
            new_time_intervals = []
            prev_interval      = self.time_points[0]
            for interval in self.time_points[1:]:
                for month_delta in range(prev_interval, interval):
                    current = now - relativedelta(months=month_delta)
                    if current < self.start_time:
                        break
                    interval_days_sum += month_days(current)
                new_time_intervals.append(interval_days_sum)
                prev_interval = interval
            return new_time_intervals
        else:
            return time_points
    
    
    def recalculate_pairs(self):
        """
        Example for daily:
            
        'm_and_c_matrix': {
            '1': [  {   ccy: 'ETH',
                        multiplier: 0.5,
                        correlation: 0.93 },
                    {   ccy: 'LTC',
                        mult: 0.5,
                        corr: 0.93 }
            ],
            '5': [  {   ccy: 'ETH',
                        multiplier: 0.5,
                        correlation: 0.93 },
                    {   ccy: 'LTC',
                        mult: 0.5,
                        corr: 0.93 }
            ],
            ......
            '14': [  {   ccy: 'ETH',
                        multiplier: 0.5,
                        correlation: 0.93 },
                    {   ccy: 'LTC',
                        mult: 0.5,
                        corr: 0.93 }
            ]
        }
        """
        currencies_list  = deque(self.currencies_list)
        self._fix_currencies_time_bounds()
        time_intervals   = self._preprocess_time_intervals()
        pairs_multiplier_correlation = {}
        for benchmark_currency in currencies_list:
            matrix = {}
            for n_times, point in zip(time_intervals, self.time_points):
                pairs = []
                for coin_currency in [x for x in currencies_list if x != benchmark_currency]:
                    multiplier, correlation = self.calculate_for_pair(
                                                benchmark_currency, 
                                                coin_currency, 
                                                n_times)
                    pair = { 'ccy': coin_currency,
                             'multiplier': multiplier, 
                             'correlation': correlation }
                    pairs.append(pair)
                matrix[str(point)] = pairs
                
            self.db[self.return_frequency].update(
               { 'Ccy': benchmark_currency },
               { 
                 '$set': { "m_and_c_matrix": matrix}
               },
                upsert=True
            )


    def _fix_currencies_time_bounds(self):
        to_datetime = lambda x: datetime.fromtimestamp(x)
        self.start_time, self.end_time = map(to_datetime, self._return_time_bounds())
        if self.return_frequency == 'daily_data':
            self.start_time = self.start_time.replace(hour=0,minute=0,second=0)
            self.end_time   = self.end_time.replace(hour=0,minute=0,second=0)
    
    
    def _return_time_bounds(self):
        collection_data = self.db[self.return_frequency]
        minln = 0
        maxln = time.time()
        for data in collection_data.find({ 'Ccy': { '$in' : self.currencies_list } }):
            try:
                hist = data["history"]    
                history = list(map(lambda x: x['time'], hist))
                if min(history) > minln:
                    minln = min(history)
                if max(history) < maxln:
                    maxln = max(history)
            except:
                next
        return (minln, maxln)
    
    
    def calculate_for_pair(self, benchmark_ccy, coin_ccy, last_n_times):
        # --- read coin ---
        arr_PnL_benchmark, arr_PnL_coin = self._timeseries(benchmark_ccy, 
                                                           coin_ccy, 
                                                           last_n_times)
        multiplier, correlation         = self._multiplier_and_correlation(arr_PnL_benchmark, 
                                                                           arr_PnL_coin)
        return (multiplier, correlation)
    
    def _multiplier_and_correlation(self, arr_PnL_benchmark, arr_PnL_coin):
        #          calculate multiplier
        # least square regression (linear): y = alpha + beta*x
        linReg = np.polyfit(x=arr_PnL_benchmark, y=arr_PnL_coin, deg=1)
        alpha = linReg[1] # this is the y-intercept, not needed
        beta  = linReg[0] # this is the slope, which also is the multiplier
        multiplier = beta
        print("multiplier            : ", multiplier)
        #          calculate correlation          #   
        correlation = pearsonr(arr_PnL_benchmark, arr_PnL_coin)
        print("correlation            :", correlation[0])
        return (multiplier, correlation[0])
    
    #-----------------------------------------#
    #          calculate return timeseries    # 
    #-----------------------------------------#
    def _timeseries(self, benchmark_ccy, coin_ccy, last_n_times):
        df_benchmark = self._retrieve_currency_history(benchmark_ccy)
        df_benchmark = df_benchmark.pct_change()
        print(df_benchmark)
        df_benchmark = df_benchmark['close'].values[1:last_n_times]
        print(df_benchmark)
        
        df_coin      = self._retrieve_currency_history(coin_ccy).pct_change()
        df_coin      = df_coin['close'].values[1:last_n_times]
        
        return (df_benchmark, df_coin)

    
    def _increment_interval(self, *date_time_fields):
        if self.return_frequency == 'daily':
            return map(lambda dt: dt + timedelta(days=1), date_time_fields)
        elif self.return_frequency == 'hourly':
            return map(lambda dt: dt + timedelta(hours=1), date_time_fields)
        else:
            print('ERROR. Need to implment other frequencies')
            assert(False)

    # --- connect and preprocess utilities for mongo collection --- 
    def _reconstruct_currency_date(self, cur):
        frmt = "{:%Y-%m-%d}"
        if self.return_frequency == 'hourly':
            frmt = "{:%Y-%m-%d %H:%M:%S}"
        for cur_value, index in zip(cur['history'], range(len(cur['history']))):
            #  cur['history'][index]['date'] = datetime.fromtimestamp(cur_value['time'])
            cur['history'][index]['date'] = frmt.format(datetime.fromtimestamp(cur_value['time']))
        return cur

    
    def _mongo_connect(self):
        if not self.mongo_c: 
            self.mongo_c = MongoClient('localhost',
                    authSource=self.db_name)
        return self.mongo_c
        
        
    def _preprocess_collection(self, collection_name, filter_params):
        collection = self.db[collection_name]
        if not collection:
            raise Exception('collection not found')
        return self._reconstruct_currency_date(collection.find_one(filter_params))

    
    def _retrieve_currency_history(self, currency):
        if currency not in self.currencies:
            collection_schema = self.return_frequency # return frequency points to the name of collection
            df_data = self._preprocess_collection(collection_schema, {'Ccy': currency})
            df_data = pd.DataFrame(df_data['history'])
            # this makes indexing via date faster
            df_data = df_data.set_index(['date'])         # index: string
            df_data.index = pd.to_datetime(df_data.index)
            self.currencies[currency] = df_data
        return self.currencies[currency]

    


In [362]:
time_interval = 1

currencies_list = ['BTC', 'ETH', 'BCH', 'XRP', 'EOS']
return_frequency = 'daily'

new_compare = MultiplierCorellationCalculator(
    time_interval=time_interval,
    currencies_list=currencies_list,
    return_frequency=return_frequency)
print(new_compare.recalculate_pairs())



               close      high       low      open      time  volumefrom  \
date                                                                       
2010-07-17       NaN       NaN       NaN       NaN       NaN         NaN   
2010-07-18  0.733791  0.733993  0.199960  0.000000  0.000068    2.750500   
2010-07-19 -0.058714  0.084100  0.299950  0.733791  0.000068    6.652313   
2010-07-20 -0.075000 -0.120984 -0.038457 -0.058714  0.000068   -0.543554   
2010-07-21  0.059807 -0.031781 -0.106652 -0.075000  0.000068    1.194656   
2010-07-22 -0.362454  0.032824 -0.238770  0.059807  0.000068    2.756522   
2010-07-23  0.240000 -0.172840  0.000000 -0.362454  0.000068    0.112269   
2010-07-24 -0.129032 -0.089552 -0.000198  0.240000  0.000068   -0.793415   
2010-07-25 -0.074074 -0.035708  0.000198 -0.129032  0.000068    2.125967   
2010-07-26  0.108911 -0.057398 -0.009901 -0.074074  0.000067   -0.434733   
2010-07-27  0.071429  0.080357  0.060000  0.108911  0.000067    2.846853   
2010-07-28 -

TypeError: 'NoneType' object is not subscriptable

In [356]:
class MultiplierCorrelationRetriever:
    def __init__(self,
                 horizon,
                 currencies_list,
                 return_frequency='daily'):
        self.mongo_c          = None
        self.db_name          = 'darqube_db'
        self._mongo_connect()
        self.return_frequency = "%s_data" % return_frequency
        self.db               = self.mongo_c[self.db_name]
        self._mongo_connect()
        self.horizon          = horizon
        self.currencies_list  = currencies_list
        
        
    def retrieve_data(self):
        currencies_list  = deque(self.currencies_list)
        pairs_multiplier_correlation = {}
        while len(currencies_list) > 1:
            benchmark_currency  = currencies_list.popleft()
            pair = self._retrieve_multiplier_correlation(benchmark_currency, currencies_list)
            pairs_multiplier_correlation = {**pairs_multiplier_correlation, **pair}
        return pairs_multiplier_correlation
        
    def _mongo_connect(self):
        if not self.mongo_c: 
            self.mongo_c = MongoClient('localhost',
                    authSource=self.db_name)
        return self.mongo_c
        
        
    def _retrieve_collection(self, filter_params):
        collection = self.db[self.return_frequency]
        if not collection:
            raise Exception('collection not found')
        return collection.find_one(filter_params)

    
    def _retrieve_multiplier_correlation(self, benchmark, coins):
        df_data = self._retrieve_collection({'Ccy': benchmark})
        df_data = df_data['m_and_c_matrix'][str(self.horizon)]
        df_data = [x for x in df_data if x['ccy'] in coins]
        df_data = {"%s/%s" %(benchmark, obj['ccy']): {'multiplier': obj['multiplier'], 'correlation': obj['correlation']} for obj in df_data}
        return df_data
        

In [357]:
new_retriever = MultiplierCorrelationRetriever(horizon=1, 
                                               currencies_list=coins,
                                              return_frequency='daily')
print(new_retriever.retrieve_data())


KeyError: '1'