In [8]:
# Import Modules
import os
import warnings
warnings.simplefilter("ignore", UserWarning)

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.signal as ss
import numpy as np

In [9]:
# Directories and files

# Relative path of the folder that is scanned for input files; each file in it
# is read with pandas.read_csv by the analysis cells below.
DATA_DIR = "../../data/kospi/combined/"  # Using kospi data

In [10]:
# Load Data File List
# os.listdir returns entries in arbitrary order; sorting makes every list built
# from file_list (and each `.head()` preview) reproducible across runs/machines.
file_list = sorted(os.listdir(DATA_DIR))
# The code is the file name without its extension; splitext does not assume
# the extension is exactly four characters long.
codes_list = [os.path.splitext(f)[0] for f in file_list]

In [11]:
# Preview the first five codes to check that the file names were parsed as expected
codes_list[:5]

['000020', '000040', '000050', '000060', '000070']

## 1. Calculate Pearson Correlation

In [12]:
# Calculate Correlation Data
# Builds one Pearson correlation per file, in the same order as file_list.

correlation_list = []

for file in file_list:
    # Load File
    filepath = os.path.join(DATA_DIR, file)
    df = pd.read_csv(filepath, index_col=0)

    if df.empty:
        # Report the file, but still record a placeholder: correlation_list is
        # zipped with codes_list by position afterwards, so skipping an entry
        # would pair every following code with another file's correlation.
        # The NaN row is removed by the dropna() applied to the result frame.
        print(file)
        correlation_list.append(np.nan)
        continue

    # Scale every column to the [0, 1] range (min-max)
    df -= df.min()
    df /= df.max()

    # Calculate Pearson Correlation
    # NOTE(review): the pair is selected by position (row 1, column 2 of the
    # correlation matrix) - presumably the volume/ratio columns; confirm
    # against the CSV column layout.
    correlation_value = df.corr().iloc[1,2]
    correlation_list.append(correlation_value)

In [13]:
# Convert To Dataframe
# Pair each code with its correlation by position, then discard rows that
# contain a missing value.
code_corr_pairs = list(zip(codes_list, correlation_list))
pearson_df = pd.DataFrame(code_corr_pairs, columns=['code', 'corr']).dropna()

In [None]:
# Draw Histogram
# Distribution of the per-file Pearson correlations, split into 20 bins
corr_values = pearson_df['corr']
plt.hist(corr_values, bins=20)
plt.show()

In [None]:
# Draw Boxplot
# Apply the seaborn style before creating the figure: the style is delivered
# through matplotlib rcParams, which an Axes picks up when it is created, so
# setting it after plt.subplots() would not reach this figure.
sns.set_style('whitegrid')
fig, ax = plt.subplots(1,1,figsize=(3,5))
# Draw on the Axes created above explicitly instead of relying on the implicit current Axes
sns.boxplot(pearson_df['corr'],color='red',orient='v',ax=ax)
plt.tight_layout()

In [None]:
# See Outliers
# Rows whose correlation lies beyond +0.4 or below -0.4; only the first rows
# of each selection are printed.
corr_col = pearson_df['corr']
positive_outliers = pearson_df.loc[corr_col > 0.4]
negative_outliers = pearson_df.loc[corr_col < -0.4]
print(positive_outliers.head())
print(negative_outliers.head())

## 2. Normalized Cross Correlation With Time Shift

In [None]:
# Function to calculate normalized cross correlation
def ccf(x, y, lag_max = 100):
    """Return the normalized cross-correlation of ``y`` against ``x`` for lags -lag_max..lag_max.

    Both series are mean-centred, cross-correlated with
    ``scipy.signal.correlate(..., method='direct')`` and divided by
    ``std(y) * std(x) * len(y)``.

    Parameters
    ----------
    x, y : numpy.ndarray (or anything supporting NumPy arithmetic)
        The two series. NOTE(review): lag 0 is taken to be the midpoint of the
        full correlation and only ``len(y)`` enters the normalization, so the
        inputs are assumed to have equal length - confirm with callers.
    lag_max : int, default 100
        Largest shift, in samples, reported on either side of lag 0.

    Returns
    -------
    numpy.ndarray
        Exactly ``2 * lag_max + 1`` values. Index ``lag_max`` holds lag 0 and
        index ``i`` holds lag ``i - lag_max``. Lags that exceed what the series
        length allows have no overlapping samples and are reported as 0.
    """
    result = ss.correlate(y - np.mean(y), x - np.mean(x), method='direct') / (np.std(y) * np.std(x) * len(y))
    length = (len(result) - 1) // 2  # index of lag 0 in the full correlation
    lo = length - lag_max
    hi = length + (lag_max + 1)

    # When lag_max reaches past the ends of `result`, a negative `lo` would be
    # read by slicing as "count from the end", returning a short window whose
    # centre is no longer at index lag_max. Clip the slice to the valid range
    # and zero-pad the missing lags so the output layout never changes.
    pad_before = max(0, -lo)
    pad_after = max(0, hi - len(result))
    window = result[max(lo, 0):min(hi, len(result))]
    if pad_before or pad_after:
        window = np.pad(window, (pad_before, pad_after), mode='constant')

    return window

In [None]:
# For every file, find the strongest (largest absolute) normalized
# cross-correlation between 'volume' and 'ratio' within +/- `lag` shifts,
# and the shift at which it occurs.
ncc_list = []
lag_list = []
lag = 10

for file in file_list:
    # Load File
    filepath = os.path.join(DATA_DIR, file)
    df = pd.read_csv(filepath, index_col=0)

    if df.empty:
        # A file without rows has nothing to correlate. Record NaN placeholders
        # rather than skipping, so ncc_list/lag_list stay aligned by position
        # with codes_list; the rows are removed by dropna() on the result frame.
        print(file)
        ncc_list.append(np.nan)
        lag_list.append(np.nan)
        continue

    # Scale every column to the [0, 1] range (min-max)
    df -= df.min()
    df /= df.max()

    # Calculate Normalized Cross Correlation With Different Time Shift(Lag=10)
    ncc = ccf(df['volume'].values, df['ratio'].values, lag_max=lag)

    # Find index of max absolute correlation
    max_idx = np.argmax(abs(ncc))

    # Find value of max absolute correlation (sign preserved)
    max_ncc = ncc[max_idx]

    # Find lag for max value: the window is centred on lag 0 at index `lag`
    max_lag = max_idx - lag

    ncc_list.append(max_ncc)
    lag_list.append(max_lag)

In [None]:
# Convert To Dataframe
# One row per code: the peak cross-correlation and the lag where it occurs;
# rows with a missing value are discarded.
ncc_rows = list(zip(codes_list, ncc_list, lag_list))
ncc_df = pd.DataFrame(ncc_rows, columns=['code', 'ncc', 'lag']).dropna()

In [None]:
# Draw Histogram
# Distribution of the per-file peak cross-correlations, split into 20 bins
ncc_values = ncc_df['ncc']
plt.hist(ncc_values, bins=20)
plt.show()

In [None]:
# Draw Boxplot
# Apply the seaborn style before creating the figure: the style is delivered
# through matplotlib rcParams, which an Axes picks up when it is created, so
# setting it after plt.subplots() would not reach this figure.
sns.set_style('whitegrid')
fig, ax = plt.subplots(1,1, figsize=(3,5))
# Draw on the Axes created above explicitly instead of relying on the implicit current Axes
sns.boxplot(ncc_df['ncc'],color='red',orient='v',ax=ax)
plt.tight_layout()

In [None]:
# See Outliers
# Rows whose peak cross-correlation lies beyond +0.4 or below -0.4; only the
# first rows of each selection are printed.
ncc_col = ncc_df['ncc']
positive_outliers = ncc_df.loc[ncc_col > 0.4]
negative_outliers = ncc_df.loc[ncc_col < -0.4]
print(positive_outliers.head())
print(negative_outliers.head())

In [None]:
# Histogram (20 bins) of the lag at which each file's absolute cross-correlation peaks
lag_values = ncc_df['lag']
plt.hist(lag_values, bins=20)
plt.show()

## 3. Normalized Cross Correlation With Time Shift (Using Percentage Changes)

In [None]:
# Same peak cross-correlation search as the previous section, but computed on
# the absolute percentage changes of each column instead of the scaled levels.
ncc_pct_list = []
lag_pct_list = []
lag = 10

for file in file_list:
    # Load File
    filepath = os.path.join(DATA_DIR, file)
    df = pd.read_csv(filepath, index_col=0)

    # Row-over-row percentage change; infinite values are turned into NaN so
    # that dropna() removes them together with the undefined changes.
    df = df.pct_change()
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.dropna()
    df = df.abs()

    if df.empty:
        # Nothing is left to correlate (empty file, or every row was dropped).
        # Record NaN placeholders rather than skipping, so the lists stay
        # aligned by position with codes_list; the rows are removed by
        # dropna() on the result frame.
        print(file)
        ncc_pct_list.append(np.nan)
        lag_pct_list.append(np.nan)
        continue

    # Calculate Normalized Cross Correlation With Different Time Shift(Lag=10)
    ncc = ccf(df['volume'].values, df['ratio'].values,lag_max=lag)

    # Find max index (largest absolute correlation)
    max_idx = np.argmax(abs(ncc))
    # Find max correlation (sign preserved)
    max_ncc = ncc[max_idx]
    # Find max lag: the window is centred on lag 0 at index `lag`
    max_lag = max_idx - lag

    ncc_pct_list.append(max_ncc)
    lag_pct_list.append(max_lag)

In [None]:
# Convert To Dataframe
# One row per code: the peak cross-correlation of the percentage changes and
# the lag where it occurs; rows with a missing value are discarded.
pct_rows = list(zip(codes_list, ncc_pct_list, lag_pct_list))
pct_df = pd.DataFrame(pct_rows, columns=['code', 'ncc', 'lag']).dropna()

In [None]:
# Draw Histogram
# Distribution of the peak cross-correlations of the percentage changes, 20 bins
pct_ncc_values = pct_df['ncc']
plt.hist(pct_ncc_values, bins=20)
plt.show()

In [None]:
# Draw Boxplot
# Apply the seaborn style before creating the figure: the style is delivered
# through matplotlib rcParams, which an Axes picks up when it is created, so
# setting it after plt.subplots() would not reach this figure.
sns.set_style('whitegrid')
fig, ax = plt.subplots(1,1, figsize=(3,5))
# Draw on the Axes created above explicitly instead of relying on the implicit current Axes
sns.boxplot(pct_df['ncc'],color='red',orient='v',ax=ax)
plt.tight_layout()

In [None]:
# See Outliers
# Rows whose peak cross-correlation (percentage changes) lies beyond +0.4 or
# below -0.4; only the first rows of each selection are printed.
pct_ncc_col = pct_df['ncc']
positive_outliers = pct_df.loc[pct_ncc_col > 0.4]
negative_outliers = pct_df.loc[pct_ncc_col < -0.4]
print(positive_outliers.head())
print(negative_outliers.head())