In [60]:
import yfinance as yf
import pandas as pd
from statsmodels.tsa.stattools import coint
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm

tickers = [
    # Sector ETFs
    'XLB', 'XLE', 'XLF', 'XLI', 'XLK', 'XLP', 'XLV', 'XLY', 'XLU', 'XLRE', 'XLC',
    
    # Style/Factor ETFs
    'VTV', 'VUG', 'MTUM', 'QUAL', 'SPLV',
    
    # Major Index ETFs
    'SPY', 'VOO', 'DIA', 'IWM', 'QQQ',
    
    # Commodity ETFs
    'USO', 'GLD', 'DBC',
    
    # Thematic/Global ETFs
    'ARKK', 'EFA', 'EEM',
    
    'USMV', 'VLUE', 'SIZE', 'RPV', 'RPG',
    'VEU', 'EWJ', 'FXI', 'EWZ',
    'VNQ', 'SLV',
    'TLT', 'IEF', 'BOTZ', 'TAN', 'LIT', 'XRT', 'IYT'
]

prices = yf.download(tickers, start = '2010-01-01', end = '2020-12-31')['Close']

# first_frame = ['2010-01-01', '2020-12-31']
# second_frame = ['2015-01-01, 2020-12-31']
# third_frame = ['2018-01-01', '2023-12-31']

returns = prices.pct_change().dropna()

correlation_matrix = returns.corr()
correlation_matrix = correlation_matrix.rename_axis(None).rename_axis(None, axis = 1)
correlation_matrix = correlation_matrix.stack().reset_index()
correlation_matrix.columns = ['ETF', 'Pair', 'Correlation']

correlation_matrix =  correlation_matrix[correlation_matrix['ETF'] != correlation_matrix['Pair']]
correlation_matrix = correlation_matrix.sort_values(by = 'Correlation', ascending = False)
correlation_matrix = correlation_matrix.drop_duplicates('Correlation')

potential_pairs = correlation_matrix[correlation_matrix['Correlation'] > 0.8]

print("Potential Correlated Pairs:")
potential_pairs

  prices = yf.download(tickers, start = '2010-01-01', end = '2020-12-31')['Close']
[*********************100%***********************]  45 of 45 completed


Potential Correlated Pairs:


Unnamed: 0,ETF,Pair,Correlation
1020,SPY,VOO,0.998873
742,QUAL,SPY,0.990761
750,QUAL,VOO,0.989718
1220,VEU,EFA,0.988393
1829,XLRE,VNQ,0.986561
...,...,...,...
1929,XLV,XLP,0.801948
956,SPLV,IWM,0.801691
1274,VLUE,MTUM,0.801499
1714,XLK,EEM,0.800861


In [None]:
tickers = prices.columns
cointegrated_pairs = []
prices = prices.dropna()
candidate_pairs = []

for i in range(len(tickers)):
    for j in range(i + 1, len(tickers)):
        candidate_pairs.append([tickers[i], tickers[j]])

for i, j in candidate_pairs:
        etf1 = i
        etf2 = j

        score, pvalue, _ = coint(prices[etf1], prices[etf2])

        if pvalue < 0.05:
                cointegrated_pairs.append((etf1, etf2, pvalue))
                
cointegrated_pairs_df = pd.DataFrame(cointegrated_pairs, columns = ['ETF', 'Pair', 'P-Value'])
cointegrated_pairs_df = cointegrated_pairs_df.sort_values('P-Value').reset_index(drop=True)

print("Cointegration Test Results:")
cointegrated_pairs_df

Cointegration Test Results:


Unnamed: 0,ETF,Pair,P-Value
0,RPG,XLC,0.000645
1,FXI,XLB,0.000654
2,FXI,SIZE,0.001232
3,MTUM,XLC,0.001732
4,SPY,VOO,0.002682
...,...,...,...
66,EWJ,TAN,0.098037
67,EFA,VTV,0.098338
68,SLV,VOO,0.098538
69,GLD,XLE,0.098778


In [None]:
rolling_cointegrated_pairs = []
prices = prices.dropna()

window_size = 504
min_passes = 0.6
step = 30

for etf1, etf2 in candidate_pairs:
        series1 = prices[etf1]
        series2 = prices[etf2]

        df = pd.concat([series1, series2], axis = 1)

        if df[etf1].empty:
            print(f"{etf1} does not have sufficient data")
            continue
        elif df[etf2].empty:
            print(f"{etf2} does not have sufficient data")  

        series1 = df.iloc[:, 0]
        series2 = df.iloc[:, 1]

        cointegrated_windows = 0
        total_windows = 0

        for start in range(0, len(df) - window_size + 1, step):
              end = start + window_size

              window_s1 = series1.iloc[start:end]
              window_s2 = series2.iloc[start:end]
              
              score, pvalue, _ = coint(window_s1, window_s2)
              total_windows += 1
              
              if pvalue < 0.05:
                   cointegrated_windows += 1

        if cointegrated_windows / total_windows >= min_passes:
             rolling_cointegrated_pairs.append({'ETF1': etf1,
                                        'ETF2': etf2,
                                        'Pass %': cointegrated_windows / total_windows})
                
rolling_cointegrated_pairs_df = pd.DataFrame(rolling_cointegrated_pairs)
rolling_cointegrated_pairs_df = rolling_cointegrated_pairs_df.sort_values('Pass %', ascending = False).reset_index(drop=True)

print("Rolling Cointegration Test Results:")
rolling_cointegrated_pairs_df

Rolling Cointegration Test Results:


Unnamed: 0,ETF1,ETF2,Pass %
0,SPY,VOO,1.0
1,EFA,XLI,1.0
2,MTUM,XLC,1.0
3,FXI,XLB,1.0
4,FXI,IWM,1.0
5,FXI,IYT,1.0
6,RPG,XLC,1.0
7,FXI,SIZE,1.0
8,FXI,SPY,0.8
9,FXI,XLI,0.8


In [64]:
results = []

def zscore_calc(series):
    return (series - series.mean()) / series.std()

def adf_test(series):
    test_res = adfuller(series)
    return {'stat': test_res[0], 'p-value': test_res[1]}

def hedge_ratio_calc(series1, series2):
    x = sm.add_constant(series2)
    model = sm.OLS(series1, x).fit()

    return model.params.iloc[1]

for etf1, etf2 in candidate_pairs:
    series1 = prices[etf1]
    series2 = prices[etf2]

    hedge_ratio = hedge_ratio_calc(series1, series2)

    spread = series1 - (hedge_ratio * series2)

    zscore_spread = zscore_calc(spread)

    adf_res = adf_test(spread)

    results.append(
        {'ETF1': etf1,
         'ETF2': etf2,
         'adf_value': adf_res['stat'],
         'p-value': adf_res['p-value'],
         'mean': zscore_spread.mean(),
         'std': zscore_spread.std()}
    )

results = pd.DataFrame(results)
results = results.sort_values('p-value', ascending = True)
results = results[results['p-value'] < 0.05]

print("ADF Test Results:")
results

ADF Test Results:


Unnamed: 0,ETF1,ETF2,adf_value,p-value,mean,std
628,RPG,XLC,-4.662230,0.000099,4.447842e-17,1.0
348,FXI,XLB,-4.658834,0.000100,2.001529e-16,1.0
334,FXI,SIZE,-4.495183,0.000201,-4.447842e-16,1.0
544,MTUM,XLC,-4.404014,0.000292,-2.001529e-16,1.0
744,SPY,VOO,-4.284639,0.000472,-1.089721e-15,1.0
...,...,...,...,...,...,...
590,QUAL,TAN,-2.883504,0.047300,-2.090486e-15,1.0
606,QUAL,XLP,-2.881219,0.047576,1.000764e-16,1.0
139,DIA,MTUM,-2.876774,0.048116,-2.557509e-16,1.0
140,DIA,QQQ,-2.862267,0.049915,-7.450135e-16,1.0
