In [39]:
import yfinance as yf
import pandas as pd
from statsmodels.tsa.stattools import coint
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm

tickers = ['XLB', 'XLE', 'XLF', 'XLI', 'XLK', 'XLP', 'XLV', 'XLY', 'XLU', 'XLRE', 'XLC']
prices = yf.download(tickers, start = '2010-01-01', end = '2020-12-31')['Close']

returns = prices.pct_change().dropna()

correlation_matrix = returns.corr()
correlation_matrix = correlation_matrix.rename_axis(None).rename_axis(None, axis = 1)
correlation_matrix = correlation_matrix.stack().reset_index()
correlation_matrix.columns = ['ETF', 'Pair', 'Correlation']

correlation_matrix =  correlation_matrix[correlation_matrix['ETF'] != correlation_matrix['Pair']]
correlation_matrix = correlation_matrix.sort_values(by = 'Correlation', ascending = False)
correlation_matrix = correlation_matrix.drop_duplicates('Correlation')

potential_pairs = correlation_matrix[correlation_matrix['Correlation'] > 0.8]

print("Potential Correlated Pairs:")
potential_pairs

  prices = yf.download(tickers, start = '2010-01-01', end = '2020-12-31')['Close']
[*********************100%***********************]  11 of 11 completed


Potential Correlated Pairs:


Unnamed: 0,ETF,Pair,Correlation
4,XLB,XLI,0.915227
37,XLF,XLI,0.910969
56,XLK,XLC,0.902712
115,XLY,XLK,0.900094
21,XLC,XLY,0.88307
33,XLF,XLB,0.871237
114,XLY,XLI,0.868811
10,XLB,XLY,0.84558
85,XLRE,XLU,0.838442
104,XLV,XLK,0.83122


In [40]:
tickers = prices.columns
cointegrated_pairs = []
prices = prices.dropna()
candidate_pairs = []

for i in range(len(tickers)):
    for j in range(i + 1, len(tickers)):
        candidate_pairs.append([tickers[i], tickers[j]])

for i, j in candidate_pairs:
        etf1 = i
        etf2 = j

        score, pvalue, _ = coint(prices[etf1], prices[etf2])

        if pvalue < 0.1:
                cointegrated_pairs.append((etf1, etf2, pvalue))
                
cointegrated_pairs_df = pd.DataFrame(cointegrated_pairs, columns = ['ETF', 'Pair', 'P-Value'])
cointegrated_pairs_df = cointegrated_pairs_df.sort_values('P-Value').reset_index(drop=True)

print("Cointegration Test Results:")
cointegrated_pairs_df

Cointegration Test Results:


Unnamed: 0,ETF,Pair,P-Value
0,XLK,XLV,0.007507
1,XLC,XLY,0.014095
2,XLE,XLV,0.043761
3,XLE,XLK,0.079956
4,XLC,XLV,0.089709


In [41]:
results = []

def zscore_calc(series):
    return (series - series.mean()) / series.std()

def adf_test(series):
    test_res = adfuller(series)
    return {'stat': test_res[0], 'p-value': test_res[1]}

def hedge_ratio_calc(series1, series2):
    x = sm.add_constant(series2)
    model = sm.OLS(series2, x).fit()

    return model.params.iloc[1]

for etf1, etf2 in candidate_pairs:
    series1 = prices[etf1]
    series2 = prices[etf2]

    hedge_ratio = hedge_ratio_calc(series1, series2)

    spread = series1 - (hedge_ratio * series2)

    zscore_spread = zscore_calc(spread)

    adf_res = adf_test(spread)

    results.append(
        {'ETF1': etf1,
         'ETF2': etf2,
         'adf_value': adf_res['stat'],
         'p-value': adf_res['p-value'],
         'mean': zscore_spread.mean(),
         'std': zscore_spread.std()}
    )

results = pd.DataFrame(results)
results = results.sort_values('p-value', ascending = True)
results = results[results['p-value'] < 0.1]

print("ADF Test Results:")
results

ADF Test Results:


Unnamed: 0,ETF1,ETF2,adf_value,p-value,mean,std
17,XLC,XLV,-3.477369,0.008588,-1.067482e-15,1.0
