In [2]:
import statsmodels.api as sm
import statsmodels.tsa.stattools as ts 
from statsmodels.tsa.stattools import coint
import numpy as np
import pandas as pd

In [118]:
#https://medium.com/@bart.chr/pairs-trading-for-algorithmic-trading-breakdown-d8b709f59372
#https://github.com/aconstandinou/mean-reversion
"""
Augmented Dickey–Fuller (ADF) unit root test
"""
class DickeyFuller(object):
    """Thin wrapper around ``ts.adfuller`` that remembers the last result.

    After ``check`` has run, the instance exposes the p-value, the test
    statistic and the stationarity verdict of the most recent series.
    """

    def __init__(self, significance=.05):
        # Threshold the p-value is compared against in ``check``.
        self.significance_level = significance
        # Outcome of the most recent ``check`` call; ``None`` until then.
        self.p_value = None
        self.perc_stat = None
        self.is_stationary = None

    def check(self, time_series):
        """Run the ADF test on ``time_series`` and store the outcome.

        Parameters
        ----------
        time_series : array-like
            Series forwarded to ``ts.adfuller`` together with ``1`` as the
            second positional argument.

        Returns
        -------
        bool
            ``True`` when the p-value is strictly below
            ``significance_level``, ``False`` otherwise.
        """
        adf_result = ts.adfuller(time_series, 1)
        # Element 0 is stored as the statistic, element 1 as the p-value.
        self.perc_stat, self.p_value = adf_result[0], adf_result[1]
        self.is_stationary = True if self.p_value < self.significance_level else False
        return self.is_stationary

"""
Half-life of mean reversion, estimated from a discretised Ornstein-Uhlenbeck process
"""
class HalfLife(object):
    """Half-life of mean reversion of a series.

    The one-step change of the series is regressed on its lagged level
    (with an intercept) and the slope is converted into a half-life with
    ``-ln(2) / slope``.
    """

    def __init__(self):
        # Half-life computed by the most recent ``check`` call; ``None`` until then.
        self.half_life = None

    def check(self, time_series):
        """Estimate the half-life of ``time_series`` in whole periods.

        Parameters
        ----------
        time_series : array-like
            One-dimensional numeric series (list, ndarray or pandas Series).
            The input is not modified.

        Returns
        -------
        int
            Rounded half-life, never smaller than 1. A non-negative slope
            (no mean reversion detected) also yields 1.

        Raises
        ------
        ValueError
            If fewer than two observations are supplied or the regression
            slope is NaN.
        """
        # Work on a plain float array so that the positional writes below never
        # depend on the index labels of a pandas Series.
        series = np.asarray(time_series, dtype=float)
        if series.size < 2:
            raise ValueError("HalfLife.check needs at least two observations")

        # Lagged level and one-step change; the first observation has no
        # predecessor, so both entries are padded with zero.
        lagged = np.roll(series, 1)
        lagged[0] = 0
        delta = series - lagged
        delta[0] = 0

        # Regress the change on the lagged level, with an intercept term.
        design = sm.add_constant(lagged)
        fit = sm.OLS(delta, design).fit()

        # Read the slope by position: params may be an ndarray or a
        # label-indexed Series, where an integer key is not positional.
        slope = float(np.asarray(fit.params)[1])
        if np.isnan(slope):
            raise ValueError("HalfLife.check: regression slope is NaN (check the input for NaN/inf)")

        if slope >= 0:
            # Zero or positive slope: -ln(2)/slope would be infinite or
            # negative, so apply the same floor as for any non-positive value.
            self.half_life = 1
        else:
            self.half_life = max(1, int(round(-np.log(2) / slope, 0)))
        return self.half_life

"""
A Hurst exponent below 0.5 (the value expected for a random walk) indicates that the series is mean-reverting.
"""
class HurstExponent():
    """Hurst exponent estimated from the scaling of lagged differences.

    For every lag in ``range(2, lag_max)`` the square root of the standard
    deviation of the lagged differences is computed; the exponent is twice
    the slope of a straight line fitted to ``log(tau)`` against ``log(lag)``.
    """

    def __init__(self, lag_max=20):
        # h_min / h_max / look_back are not used by this class.
        # NOTE(review): presumably thresholds/window consumed elsewhere - confirm.
        self.h_min = 0.0
        self.h_max = 0.4
        self.look_back = 126
        # https://robotwealth.com/demystifying-the-hurst-exponent-part-1/
        # Exclusive upper bound of the lags used in the fit (was 100 before).
        self.lag_max = lag_max
        # Result of the most recent ``check`` call; ``None`` until then.
        self.h_value = None

    def check(self, time_series):
        """Estimate the Hurst exponent of ``time_series``.

        Parameters
        ----------
        time_series : array-like
            One-dimensional numeric series (list, ndarray or pandas Series).

        Returns
        -------
        float
            The estimated exponent, also stored in ``h_value``.

        Raises
        ------
        ValueError
            If ``lag_max`` leaves fewer than two lags to fit, or the series
            is too short for the largest lag.
        """
        lags = range(2, self.lag_max)
        if len(lags) < 2:
            raise ValueError("lag_max must be at least 4 to fit a line through two lags")

        # Plain ndarray: subtracting two slices of a pandas Series would align
        # them on index labels instead of pairing x[t + lag] with x[t].
        series = np.asarray(time_series, dtype=float)
        if series.size < lags[-1] + 2:
            raise ValueError(
                "time_series needs at least %d observations for lag_max=%d"
                % (lags[-1] + 2, self.lag_max)
            )

        tau = [np.sqrt(np.std(series[lag:] - series[:-lag])) for lag in lags]
        slope = np.polyfit(np.log(lags), np.log(tau), 1)[0]

        # tau is the square root of the dispersion, hence the factor of two.
        self.h_value = slope * 2.0
        return self.h_value

def model_ols(y, x):
    """Fit an ordinary least squares regression of ``y`` on ``x`` plus a constant.

    Returns the fitted results object produced by ``sm.OLS(...).fit()``.
    """
    design = sm.add_constant(x)
    return sm.OLS(y, design).fit()

# beta / slope coefficient
def beta(y, x):
    """Return the slope coefficient of the OLS regression of ``y`` on ``x``.

    The regression is fitted by ``model_ols``, which places the constant
    first, so the slope is the second fitted parameter.
    """
    model = model_ols(y, x)
    # Read by position: with pandas input the params are a label-indexed
    # Series, where an integer key in ``[]`` is not guaranteed to be positional.
    return np.asarray(model.params)[1]

# find cointegrated pairs among the columns of a DataFrame
def find_cointegrated_pairs(data, num_pairs=0, noStationary=False):
    """Screen every ordered pair of columns of ``data`` with a residual ADF test.

    For each (dependent, independent) pair of distinct columns, the dependent
    column is regressed on the independent one with ``model_ols`` and the
    regression residuals are tested with ``DickeyFuller``.

    Parameters
    ----------
    data : pandas.DataFrame
        One series per column.
    num_pairs : int, default 0
        Stop as soon as this many rows have been collected; values ``<= 0``
        mean no limit.
    noStationary : bool, default False
        When false, keep the pairs whose residuals are flagged stationary;
        when true, keep the pairs whose residuals are not.

    Returns
    -------
    pandas.DataFrame
        One row per selected pair with columns ``Dependente``,
        ``Independente``, ``pValue``, ``ADFStatistic`` and ``Beta``.
    """
    columns = ['Dependente', 'Independente', 'pValue', 'ADFStatistic', 'Beta']
    adf = DickeyFuller()
    rows = []

    for col_depen in data.columns:
        for col_indepen in data.columns:
            if col_depen == col_indepen:
                continue

            model = model_ols(data[col_depen], data[col_indepen])
            adf.check(model.resid)

            # Keep the stationary pairs by default, the others on request.
            selected = (not adf.is_stationary) if noStationary else adf.is_stationary
            if not selected:
                continue

            # Slope read by position: the params are label-indexed for pandas
            # input, so an integer key in ``[]`` is not reliably positional.
            slope = np.asarray(model.params)[1]
            rows.append([col_depen, col_indepen, adf.p_value, adf.perc_stat, slope])

            # Optional cap on the number of pairs returned.
            if num_pairs > 0 and len(rows) >= num_pairs:
                return pd.DataFrame(rows, columns=columns)

    return pd.DataFrame(rows, columns=columns)

def apply_halflife(data, pairs):
    """Add a ``HalfLife`` column to ``pairs`` and return it.

    For every row, the columns of ``data`` named in ``Dependente`` and
    ``Independente`` are passed to ``check_halflife``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source series, one per column.
    pairs : pandas.DataFrame
        Must contain the columns ``Dependente`` and ``Independente``.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``pairs`` object, with the ``HalfLife`` column set.
    """
    half_lives = [
        check_halflife(data[dependent], data[independent])
        for dependent, independent in zip(pairs['Dependente'], pairs['Independente'])
    ]
    # Assign the whole column at once, in row order. Writing through
    # ``pairs['HalfLife'].iloc[i]`` is chained assignment and would also treat
    # the index label ``i`` as a position.
    pairs['HalfLife'] = np.asarray(half_lives, dtype='int64')
    return pairs

def check_halflife(y, x):
    """Half-life of the residuals of the OLS regression of ``y`` on ``x``."""
    estimator = HalfLife()
    residuals = model_ols(y, x).resid
    return estimator.check(residuals)

def apply_hurst(data, pairs):
    """Add a ``Hurst`` column to ``pairs`` and return it.

    For every row, the columns of ``data`` named in ``Dependente`` and
    ``Independente`` are passed to ``check_hurst``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source series, one per column.
    pairs : pandas.DataFrame
        Must contain the columns ``Dependente`` and ``Independente``.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``pairs`` object, with the ``Hurst`` column set.
    """
    exponents = [
        check_hurst(data[dependent], data[independent])
        for dependent, independent in zip(pairs['Dependente'], pairs['Independente'])
    ]
    # Assign the whole column at once, in row order and as floats. Writing
    # through ``pairs['Hurst'].iloc[i]`` is chained assignment, treats the
    # index label ``i`` as a position, and stores floats in an int column.
    pairs['Hurst'] = np.asarray(exponents, dtype='float64')
    return pairs

def check_hurst(y, x):
    """Hurst exponent of the residuals of the OLS regression of ``y`` on ``x``.

    The residuals are handed to ``HurstExponent.check`` as a plain NumPy
    array rather than as a pandas object.
    """
    hurst = HurstExponent()
    model = model_ols(y, x)
    # ``Series.as_matrix()`` no longer exists in current pandas; ``np.asarray``
    # gives the same values and also accepts residuals that are already arrays.
    return hurst.check(np.asarray(model.resid))

In [119]:
# Read the CSV and keep every column except the one named 'data'.
# NOTE(review): 'data' is presumably the date column - confirm against the file.
df = pd.read_csv('datasets/data.csv')
data = df.loc[:, df.columns.difference(['data'])]

In [124]:
# Screen all ordered column pairs; num_pairs=0 places no cap on the rows returned.
pairs = find_cointegrated_pairs(data, num_pairs=0)
pairs.head(3)

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [121]:
# Attach the residual half-life of every pair as the 'HalfLife' column.
pairs = apply_halflife(data=data, pairs=pairs)
pairs.head(3)

Unnamed: 0,Dependente,Independente,pValue,ADFStatistic,Beta,HalfLife
0,AALR3,ABCB4,0.023039,-3.150434,1.308303,3
1,AALR3,ALSO3,0.005086,-3.637314,0.424785,3
2,AALR3,ALUP11,0.012351,-3.361289,1.681917,3
3,AALR3,ANIM3,0.038146,-2.966476,0.66003,5
4,AALR3,ARZZ3,0.009324,-3.451473,0.476037,3


In [122]:
# Attach the residual Hurst exponent of every pair as the 'Hurst' column.
pairs = apply_hurst(data=data, pairs=pairs)
pairs.head(3)



Unnamed: 0,Dependente,Independente,pValue,ADFStatistic,Beta,HalfLife,Hurst
0,AALR3,ABCB4,0.023039,-3.150434,1.308303,3,0.254748
1,AALR3,ALSO3,0.005086,-3.637314,0.424785,3,0.208671
2,AALR3,ALUP11,0.012351,-3.361289,1.681917,3,0.199234
3,AALR3,ANIM3,0.038146,-2.966476,0.66003,5,0.214762
4,AALR3,ARZZ3,0.009324,-3.451473,0.476037,3,0.172292
