In [8]:
import numpy as np
import pandas as pd
import os

### Function/Methods

In [101]:
#https://stackoverflow.com/questions/3949226/calculating-pearson-correlation-and-significance-in-python

# 0.9 or more (positive or negative) indicates a very strong correlation.
# 0.7 to 0.9 (positive or negative) indicates a strong correlation.
# 0.5 to 0.7 (positive or negative) indicates a moderate correlation.
# 0.3 to 0.5 (positive or negative) indicates a weak correlation.
# 0 to 0.3 (positive or negative) indicates a negligible correlation.
def corr_pearson(a, b):
    """Return the Pearson correlation coefficient between ``a`` and ``b``.

    The coefficient is the sum of the products of the mean-centred values
    divided by ``std(a) * std(b) * len(a)``, where ``std`` is ``np.std``
    with its default settings.

    Parameters
    ----------
    a, b : sequences of numbers (lists, NumPy arrays or pandas Series)

    Returns
    -------
    The correlation coefficient as a floating-point scalar.
    """
    mean_a = np.average(a)
    mean_b = np.average(b)

    # Normalisation term: product of both standard deviations times the sample count.
    scale = np.std(a) * np.std(b) * len(a)

    # Sum of the element-wise products of the deviations from each mean.
    centred_product = np.multiply(a - mean_a, b - mean_b)
    return np.sum(centred_product) / scale

def namefile(file):
    """Return ``file`` with its last extension stripped (any directory part is kept)."""
    stem, _extension = os.path.splitext(file)
    return stem

def calcvar(data):
    """Add a ``'Var'`` column with the row-to-row relative change of ``'Fechamento'``.

    For every row position ``i >= 1`` the value written is
    ``Fechamento[i-1] / Fechamento[i] - 1``; row 0 has no preceding row and
    is set to 0. Rows are taken by position, not by index label.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a numeric ``'Fechamento'`` column. An existing ``'Var'``
        column is overwritten.
    """
    close = np.asarray(data['Fechamento'], dtype=float)
    variation = np.zeros(len(close), dtype=float)

    if len(close) > 1:
        # The change between row i-1 and row i is stored on row i, i.e. one
        # row below the numerator (the "lag" of the original implementation).
        # NOTE(review): this orientation presumably matches how the CSVs are
        # sorted by date — confirm.
        variation[1:] = close[:-1] / close[1:] - 1

    # Assign the whole column at once. Writing element by element through
    # data['Var'].iloc[...] is chained assignment, which pandas warns about
    # and which does not update the frame when Copy-on-Write is enabled.
    data['Var'] = variation

# Find pairs of DataFrame columns whose variation series are strongly correlated
def find_correlation_pairs(data, perc_correl = 0.8, num_pairs=0):
    """Return the column pairs of ``data`` whose variations are correlated.

    Each column is treated as a ``'Fechamento'`` series, converted to its
    ``'Var'`` series with ``calcvar`` and compared with every other column
    using ``corr_pearson``. Both orderings of a pair are visited, so a match
    can appear as (A, B) and as (B, A).

    Parameters
    ----------
    data : pandas.DataFrame
        One column per series to compare.
    perc_correl : float, default 0.8
        A pair is kept when ``abs(p_coef) >= perc_correl``.
    num_pairs : int, default 0
        When greater than 0, the search stops as soon as this many pairs
        have been collected; 0 means no limit.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Depen'``, ``'Indepen'`` and ``'p_coef'``, one row per pair.
    """
    var_by_column = {}

    def _variation(column):
        # Build each column's 'Var' series only once, on first use, instead
        # of recomputing it for every pair the column takes part in.
        if column not in var_by_column:
            frame = pd.DataFrame(data[column])
            frame.columns = ['Fechamento']
            calcvar(frame)
            var_by_column[column] = frame['Var']
        return var_by_column[column]

    rows = []
    limit_reached = False

    for col_depen in data.columns:
        for col_indepen in data.columns:
            if col_depen == col_indepen:
                continue

            p_coef = corr_pearson(_variation(col_depen), _variation(col_indepen))
            if abs(p_coef) >= perc_correl:
                rows.append([col_depen, col_indepen, p_coef])
                # The row count only changes here, so this is the only place
                # where the limit needs to be checked.
                if num_pairs > 0 and len(rows) >= num_pairs:
                    limit_reached = True
                    break

        if limit_reached:
            break

    return pd.DataFrame(rows, columns=['Depen', 'Indepen', 'p_coef'])

# Find pairs of CSV files in a folder whose variation series are strongly correlated
def find_correlation_pairs_foldercsv(path, perc_correl = 0.8, num_pairs=0):
    """Return the pairs of CSV files in ``path`` whose variations are correlated.

    Every ``.csv`` file in ``path`` is read, its ``'Var'`` column is computed
    with ``calcvar`` and it is compared with every other file using
    ``corr_pearson``. Both orderings of a pair are visited, so a match can
    appear as (A, B) and as (B, A). Files are processed in sorted name order
    so that results (in particular with ``num_pairs``) are reproducible;
    directories and non-CSV entries are ignored.

    Parameters
    ----------
    path : str
        Directory containing the CSV files. Each file must have the
        ``'Fechamento'`` column that ``calcvar`` reads.
    perc_correl : float, default 0.8
        A pair is kept when ``abs(p_coef) >= perc_correl``.
    num_pairs : int, default 0
        When greater than 0, the search stops as soon as this many pairs
        have been collected; 0 means no limit.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Depen'``, ``'Indepen'`` and ``'p_coef'``; the first two
        hold the file names without extension.
    """
    # List the folder once. os.listdir returns entries in arbitrary order and
    # also returns sub-directories (e.g. '.ipynb_checkpoints').
    csv_files = sorted(
        entry for entry in os.listdir(path)
        if entry.lower().endswith('.csv')
        and os.path.isfile(os.path.join(path, entry))
    )

    var_by_file = {}

    def _variation(filename):
        # Read and process each file only once, on first use, instead of
        # re-reading it for every pair it takes part in.
        if filename not in var_by_file:
            frame = pd.read_csv(os.path.join(path, filename))
            calcvar(frame)
            var_by_file[filename] = frame['Var']
        return var_by_file[filename]

    rows = []
    limit_reached = False

    for fdepen in csv_files:
        csv_name1 = namefile(fdepen)

        for findepen in csv_files:
            csv_name2 = namefile(findepen)

            # skip comparing a file with itself
            if csv_name1 == csv_name2:
                continue

            # NOTE(review): the two 'Var' series are passed as-is; files with
            # different row counts are not aligned here — confirm the inputs
            # cover the same dates.
            p_coef = corr_pearson(_variation(fdepen), _variation(findepen))
            if abs(p_coef) >= perc_correl:
                rows.append([csv_name1, csv_name2, p_coef])
                # The row count only changes here, so this is the only place
                # where the limit needs to be checked.
                if num_pairs > 0 and len(rows) >= num_pairs:
                    limit_reached = True
                    break

        if limit_reached:
            break

    return pd.DataFrame(rows, columns=['Depen', 'Indepen', 'p_coef'])

In [90]:
# Sanity check: two identical sequences should give a coefficient of 1.0
corr_pearson([1,2,3], [1,2,3])

1.0

In [91]:
# test with csv: compare the correlation of the 'Fechamento' values with the
# correlation of the 'Var' values for two sample files
mat1 = pd.read_csv('datasets-b3/APER3.csv')
mat2 = pd.read_csv('datasets-b3/BEES3.csv')

# calcvar adds the 'Var' column to each frame in place
calcvar(mat1)
calcvar(mat2)

# Labels are Portuguese: 'Pelo Preço' = "by price", 'Pela Var %' = "by % variation"
print('Pelo Preço: ', corr_pearson(mat1['Fechamento'], mat2['Fechamento']))
print('Pela Var %: ', corr_pearson(mat1['Var'], mat2['Var']))

Pelo Preço:  0.8873236737317411
Pela Var %:  0.6638785706386569


### Read CSV

In [83]:
# Load the combined dataset
df = pd.read_csv('datasets/data.csv')
# Keep every column except 'Data' (presumably the date column — confirm).
# Index.difference returns the remaining labels sorted by default, so the
# column order of `data` may differ from the file's.
data = df[df.columns.difference(['Data'])]

### Correlated Pairs

In [102]:
# Scan the files in 'datasets-b3' and stop after the first 10 pairs whose
# |p_coef| reaches the default threshold (perc_correl=0.8).
# NOTE(review): the saved output below shows 'Independente'/'Dependente'
# headers, but the function builds columns 'Depen'/'Indepen' — re-run this
# cell to refresh the output.
pairs = find_correlation_pairs_foldercsv('datasets-b3', num_pairs=10)
pairs

Unnamed: 0,Independente,Dependente,p_coef
0,AAPL34,IVVB11,0.879038
1,AAPL34,SPXI11,0.881377
2,ABCB4,ARZZ3,0.819815
3,ABCB4,AZUL4,0.819255
4,ABCB4,BBAS3,0.848229
5,ABCB4,BBSD11,0.828725
6,ABCB4,BOVA11,0.843962
7,ABCB4,BOVB11,0.864243
8,ABCB4,BOVV11,0.859047
9,ABCB4,BRAX11,0.842594


In [99]:
# Same search on the columns of `data` (built in the "Read CSV" cell),
# stopping after the first 10 pairs at the default threshold (perc_correl=0.8).
# NOTE(review): the saved output below shows 'Independente'/'Dependente'
# headers, but the function builds columns 'Depen'/'Indepen' — re-run this
# cell to refresh the output.
pairs_ = find_correlation_pairs(data, num_pairs=10)
pairs_

Unnamed: 0,Independente,Dependente,p_coef
0,AAPL34,IVVB11,0.879038
1,AAPL34,SPXI11,0.881377
2,ABCB4,ARZZ3,0.819815
3,ABCB4,AZUL4,0.819255
4,ABCB4,BBAS3,0.848229
