In [4]:
#https://medium.com/techbloghotmart/o-que-s%C3%A3o-s%C3%A9ries-temporais-e-como-aplicar-em-machine-learning-6ea5d94bec78
#https://machinelearningmastery.com/time-series-data-stationary-python/

import numpy as np
import pandas as pd
import os
import statsmodels.api as sm
import statsmodels.tsa.stattools as ts

In [113]:
# http://www.insightsbot.com/augmented-dickey-fuller-test-in-python/

#p-value > 0.05: Fail to reject the null hypothesis (H0), the data has a unit root and is non-stationary.
#p-value <= 0.05: Reject the null hypothesis (H0), the data does not have a unit root and is stationary.
class StationarityTests:
    def __init__(self, significance=.05):
        self.SignificanceLevel = significance
        self.pValue = None
        self.isStationary = None
        self.ADFTestStatistic = None
        
    def CheckStationarityYX(self, y, x, printResults = False):
        ols_result = sm.OLS(y, x).fit()
        self.CheckStationarity(ols_result.resid, printResults)
        
    def CheckStationarity(self, timeseries, printResults = False):
        #Dickey-Fuller test:
        adfTest = ts.adfuller(timeseries)
        
        self.pValue = adfTest[1]
        self.ADFTestStatistic = adfTest[0:1][0]
        
        if (self.pValue < self.SignificanceLevel):
            self.isStationary = True
        else:
            self.isStationary = False
        
        if printResults:
            print('Augmented Dickey-Fuller Test Results:')
            print('ADF Test Statistic    ', adfTest[0:1][0])
            print('P-Value               ', adfTest[1:2][0])
            print('# Lags Used           ', adfTest[2:3][0])
            print('# Observations Used   ', adfTest[3:4][0])
            
            dfResults = pd.Series()
            #Add Critical Values
            for key,value in adfTest[4].items():
                dfResults['Critical Value (%s)'%key] = value
            print(dfResults)
            
            print("Is the time series stationary? {0}".format(sTest.isStationary))

In [119]:
def namefile(file):
    return os.path.splitext(file)[0]

# check cointegrated pairs from dataframe
def find_cointegrated_pairs(data, num_pairs=0):
    sTest = StationarityTests()
    rows = []
    isBreak = False
        
    for numerador in data.columns:
        for denominador in data.columns:
            if (numerador == denominador):
                continue

            sTest.CheckStationarityYX(data[numerador], data[denominador])
            if(sTest.isStationary):
                rows.append([numerador,denominador,sTest.pValue, sTest.ADFTestStatistic])
        
            # break for two
            isBreak = (num_pairs > 0 and len(rows) >= num_pairs)
            if (isBreak == True): break
        
        # break for one
        if (isBreak == True): break
            
    df_pairs = pd.DataFrame(rows, columns=['Independente', 'Dependente', 'pValue', 'ADFTestStatistic'])
    return df_pairs

# check cointegrated pairs from directory of csv's
def find_cointegrated_pairs_foldercsv(path, num_pairs=0):
    test = StationarityTests()
    rows = []
    isBreak = False
    
    # for each all files csv in directory 'datasets', read
    for fnum in os.listdir(path):
        # read the first csv        
        num = pd.read_csv(path + '/'+fnum)['Fechamento']
        
        for fden in os.listdir(path):
            csv_name1 = namefile(fnum)
            csv_name2 = namefile(fden)

            # continue if it is the same csv
            if(csv_name1 == csv_name2):
                continue

            # read then second csv
            den = pd.read_csv(path+'/'+fden)['Fechamento']
            test.CheckStationarityYX(num, den)

            if(test.isStationary):
                rows.append([csv_name1,csv_name2,test.pValue, test.ADFTestStatistic])
            
            # break for two
            isBreak = (num_pairs > 0 and len(rows) >= num_pairs)
            if (isBreak == True): break
        
        # break for one
        if (isBreak == True): break
            
    df_pairs = pd.DataFrame(rows, columns=['Independente', 'Dependente', 'pValue', 'ADFTestStatistic'])
    return df_pairs

In [110]:
# test with csv
y = pd.read_csv('datasets-b3/SMAL11.csv')['Fechamento']
x = pd.read_csv('datasets-b3/ABCB4.csv')['Fechamento']

par = StationarityTests()
ols_result = sm.OLS(y, x).fit()
par.CheckStationarity(ols_result.resid, printResults = True)

Augmented Dickey-Fuller Test Results:
ADF Test Statistic     -2.5653204206718687
P-Value                0.10037872938026032
# Lags Used            2
# Observations Used    83
Critical Value (1%)    -3.511712
Critical Value (5%)    -2.897048
Critical Value (10%)   -2.585713
dtype: float64
Is the time series stationary? False


In [59]:
data = pd.read_csv('datasets/data.csv', index_col=0)
del data['Data']
data.shape

(86, 266)

In [116]:
pairs = find_cointegrated_pairs(data, 10)
pairs.shape
pairs

Unnamed: 0,Independente,Dependente,pValue,ADFTestStatistic
0,AALR3,ALSO3,0.021677,-3.17179
1,AALR3,BBRK3,0.046411,-2.89095
2,AALR3,BPAN4,0.003101,-3.781608
3,AALR3,BSEV3,3.1e-05,-4.923971
4,AALR3,CEAB3,0.011712,-3.378566
5,AALR3,COGN3,0.016862,-3.258029
6,AALR3,CTNM4,0.016089,-3.273843
7,AALR3,CYRE3,0.001434,-3.995629
8,AALR3,DIRR3,0.042224,-2.927663
9,AALR3,EMBR3,0.024565,-3.127768


In [120]:
pairs2 = find_cointegrated_pairs_foldercsv('datasets-b3', 10)
pairs2.shape
pairs2

Unnamed: 0,Independente,Dependente,pValue,ADFTestStatistic
0,AALR3,ALSO3,0.021677,-3.17179
1,AALR3,BBRK3,0.046411,-2.89095
2,AALR3,BPAN4,0.003101,-3.781608
3,AALR3,BSEV3,3.1e-05,-4.923971
4,AALR3,CEAB3,0.011712,-3.378566
5,AALR3,COGN3,0.016862,-3.258029
6,AALR3,CTNM4,0.016089,-3.273843
7,AALR3,CYRE3,0.001434,-3.995629
8,AALR3,DIRR3,0.042224,-2.927663
9,AALR3,EMBR3,0.024565,-3.127768


In [103]:
data[['AALR3','ALSO3']]
test = StationarityTests()
test.CheckStationarityYX(data['AALR3'], data['ALSO3'], True)

Augmented Dickey-Fuller Test Results:
ADF Test Statistic     -3.1717902360769403
P-Value                0.021676748203255828
# Lags Used            0
# Observations Used    85
Critical Value (1%)    -3.509736
Critical Value (5%)    -2.896195
Critical Value (10%)   -2.585258
dtype: float64
Is the time series stationary? False


In [104]:
ALSO3 = pd.read_csv('datasets-b3/ALSO3.csv')['Fechamento']
AALR3 = pd.read_csv('datasets-b3/AALR3.csv')['Fechamento']
test = StationarityTests()
test.CheckStationarityYX(AALR3, ALSO3, True)

Augmented Dickey-Fuller Test Results:
ADF Test Statistic     -3.1717902360769403
P-Value                0.021676748203255828
# Lags Used            0
# Observations Used    85
Critical Value (1%)    -3.509736
Critical Value (5%)    -2.896195
Critical Value (10%)   -2.585258
dtype: float64
Is the time series stationary? False
