In [42]:
import yfinance as yf
import pandas as pd
from statsmodels.tsa.stattools import coint
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm

sector_etfs = ['XLE', 'XLF', 'XLK', 'XLI', 'XLB', 'XLV', 'XLY', 'XLP', 'XLU', 'XLC', 'XLRE']
factor_etfs = ['VTV', 'VUG', 'MTUM', 'QUAL', 'SPLV', 'USMV', 'VLUE', 'SIZE', 'RPV', 'RPG']
commodity_etfs = ['GLD', 'SLV', 'DBC', 'USO']
international_etfs = ['EFA', 'EEM', 'VEU', 'EWJ', 'FXI', 'EWZ', 'EPA']
broad_index_etfs = ['SPY', 'VOO', 'DIA', 'IWM', 'QQQ']
thematic_etfs = ['ARKK', 'BOTZ', 'TAN', 'LIT']
industry_etfs = ['XRT', 'IYT', 'VNQ', 'IQR', 'GLX']

etf_categories = {
    'sector': sector_etfs,
    'factor': factor_etfs,
    'commodity': commodity_etfs,
    'international': international_etfs,
    'broad_index': broad_index_etfs,
    'thematic': thematic_etfs,
    'industry': industry_etfs
}

flattened_etfs = [etf for etfs in etf_categories.values() for etf in etfs]

prices = yf.download(flattened_etfs, start = '2010-01-01', end = '2020-12-31')['Close']

for etf in prices.columns:
    data = prices[etf]

    if data.empty:
        print(f"{etf} does not contain data")

# first_frame = ['2010-01-01', '2020-12-31']
# second_frame = ['2015-01-01, 2020-12-31']
# third_frame = ['2018-01-01', '2023-12-31']

for category, etfs in etf_categories.items():
    desired_etfs = [etf for etf in etfs]

    category_prices = prices[desired_etfs]

    returns = category_prices.pct_change().dropna()

    correlation_matrix = returns.corr()
    correlation_matrix = correlation_matrix.rename_axis(None).rename_axis(None, axis = 1)
    correlation_matrix = correlation_matrix.stack().reset_index()
    correlation_matrix.columns = ['ETF', 'Pair', 'Correlation']

    correlation_matrix =  correlation_matrix[correlation_matrix['ETF'] != correlation_matrix['Pair']]
    correlation_matrix = correlation_matrix.sort_values(by = 'Correlation', ascending = False)
    correlation_matrix = correlation_matrix.drop_duplicates('Correlation')

    potential_pairs = correlation_matrix[correlation_matrix['Correlation'] > 0.8]

    print(f"Potential Correlated Pairs from {category}:")
    print(potential_pairs)

  prices = yf.download(flattened_etfs, start = '2010-01-01', end = '2020-12-31')['Close']
[*********************100%***********************]  46 of 46 completed
  returns = category_prices.pct_change().dropna()


Potential Correlated Pairs from sector:
     ETF  Pair  Correlation
37   XLI   XLB     0.915227
34   XLI   XLF     0.910968
31   XLK   XLC     0.902712
28   XLK   XLY     0.900094
105  XLC   XLY     0.883070
45   XLB   XLF     0.871236
69   XLY   XLI     0.868810
50   XLB   XLY     0.845580
98   XLU  XLRE     0.838442
57   XLV   XLK     0.831221
17   XLF   XLY     0.826989
11   XLF   XLE     0.825928
85   XLP   XLU     0.817952
33   XLI   XLE     0.810708
71   XLY   XLV     0.803545
82   XLP   XLV     0.801949
Potential Correlated Pairs from factor:
     ETF  Pair  Correlation
54  USMV  SPLV     0.968808
91   RPG   VUG     0.966530
3    VTV  QUAL     0.954643
12   VUG  MTUM     0.953323
31  QUAL   VUG     0.949628
39  QUAL   RPG     0.949147
6    VTV  VLUE     0.947858
29  MTUM   RPG     0.945649
80   RPV   VTV     0.943162
35  QUAL  USMV     0.939103
86   RPV  VLUE     0.936319
5    VTV  USMV     0.935943
63  VLUE  QUAL     0.913488
32  QUAL  MTUM     0.912546
67  VLUE  SIZE     0.902

  returns = category_prices.pct_change().dropna()


In [43]:
for category, etfs in etf_categories.items():

        candidate_pairs = []

        desired_etfs = [etf for etf in etfs]

        category_prices = prices[desired_etfs]

        for i in range(len(desired_etfs)):
                for j in range(i + 1, len(desired_etfs)):
                        candidate_pairs.append([desired_etfs[i], desired_etfs[j]])

        cointegrated_pairs = []

        for etf1, etf2 in candidate_pairs:
                s_etf1 = category_prices[etf1].dropna()
                s_etf2 = category_prices[etf2].dropna()

                common_idx = s_etf1.index.intersection(s_etf2.index)
                s_etf1_aligned = s_etf1.loc[common_idx]
                s_etf2_aligned = s_etf2.loc[common_idx]

                score, pvalue, _ = coint(s_etf1_aligned, s_etf2_aligned)

                if pvalue < 0.05:
                        cointegrated_pairs.append((etf1, etf2, pvalue))
                        
        cointegrated_pairs_df = pd.DataFrame(cointegrated_pairs, columns = ['ETF', 'Pair', 'P-Value'])
        cointegrated_pairs_df = cointegrated_pairs_df.sort_values('P-Value').reset_index(drop=True)

        print(f"Cointegration Test Results for {category}:")
        print(cointegrated_pairs_df)

Cointegration Test Results for sector:
   ETF  Pair   P-Value
0  XLU  XLRE  0.000503
1  XLY   XLC  0.016411
2  XLI   XLU  0.017895
3  XLV   XLC  0.036852
4  XLV   XLP  0.043850
Cointegration Test Results for factor:
    ETF  Pair   P-Value
0  MTUM   RPG  0.007179
1  MTUM  QUAL  0.022807
Cointegration Test Results for commodity:
Empty DataFrame
Columns: [ETF, Pair, P-Value]
Index: []
Cointegration Test Results for international:
   ETF Pair   P-Value
0  EWJ  FXI  0.031978
1  VEU  EWJ  0.034174
Cointegration Test Results for broad_index:
Empty DataFrame
Columns: [ETF, Pair, P-Value]
Index: []
Cointegration Test Results for thematic:
Empty DataFrame
Columns: [ETF, Pair, P-Value]
Index: []
Cointegration Test Results for industry:
   ETF Pair   P-Value
0  IQR  GLX  0.020831


In [48]:
window_size = 504
min_passes = 0.6
step = 30

for category, etfs in etf_categories.items():

    candidate_pairs = []
    rolling_cointegrated_pairs = []

    desired_etfs = [etf for etf in etfs]

    category_prices = prices[desired_etfs]

    for i in range(len(desired_etfs)):
        for j in range(i + 1, len(desired_etfs)):
                candidate_pairs.append([desired_etfs[i], desired_etfs[j]])

    for etf1, etf2 in candidate_pairs:
            
        df = category_prices[[etf1, etf2]].dropna()

        s_etf1 = df[etf1]
        s_etf2 = df[etf2]

        df = pd.concat([s_etf1, s_etf2], axis = 1)

        if len(df[etf1]) == 0:
            print(f"{etf1} does not have sufficient data")
            continue
        elif len(df[etf2]) == 0:
            print(f"{etf2} does not have sufficient data")
            continue
            
        series1 = df.iloc[:, 0]
        series2 = df.iloc[:, 1]

        cointegrated_windows = 0
        total_windows = 0

        for start in range(0, len(df) - window_size + 1, step):
            end = start + window_size

            window_s1 = series1.iloc[start:end]
            window_s2 = series2.iloc[start:end]
                
            score, pvalue, _ = coint(window_s1, window_s2)
            total_windows += 1
                
            if pvalue < 0.05:
                cointegrated_windows += 1

        if cointegrated_windows / total_windows >= min_passes:
            rolling_cointegrated_pairs.append({'ETF1': etf1,
                                            'ETF2': etf2,
                                            'Pass %': cointegrated_windows / total_windows})


    rolling_cointegrated_pairs_df = pd.DataFrame(rolling_cointegrated_pairs)

    if rolling_cointegrated_pairs_df.empty:
        print(f"{category} has no rolling cointegrated pairs.")
        continue
    else:
        rolling_cointegrated_pairs_df = rolling_cointegrated_pairs_df.sort_values('Pass %', ascending = False).reset_index(drop=True)
        print("Rolling Cointegration Test Results:")
        rolling_cointegrated_pairs_df

sector has no rolling cointegrated pairs.
factor has no rolling cointegrated pairs.
commodity has no rolling cointegrated pairs.
international has no rolling cointegrated pairs.
broad_index has no rolling cointegrated pairs.
thematic has no rolling cointegrated pairs.


ZeroDivisionError: division by zero

In [None]:
results = []

def zscore_calc(series):
    return (series - series.mean()) / series.std()

def adf_test(series):
    test_res = adfuller(series)
    return {'stat': test_res[0], 'p-value': test_res[1]}

def hedge_ratio_calc(series1, series2):
    x = sm.add_constant(series2)
    model = sm.OLS(series1, x).fit()

    return model.params.iloc[1]

for etf1, etf2 in candidate_pairs:
    series1 = prices[etf1]
    series2 = prices[etf2]

    hedge_ratio = hedge_ratio_calc(series1, series2)

    spread = series1 - (hedge_ratio * series2)

    zscore_spread = zscore_calc(spread)

    adf_res = adf_test(spread)

    results.append(
        {'ETF1': etf1,
         'ETF2': etf2,
         'adf_value': adf_res['stat'],
         'p-value': adf_res['p-value'],
         'mean': zscore_spread.mean(),
         'std': zscore_spread.std()}
    )

results = pd.DataFrame(results)
results = results.sort_values('p-value', ascending = True)
results = results[results['p-value'] < 0.05]

print("ADF Test Results:")
results

ADF Test Results:


Unnamed: 0,ETF1,ETF2,adf_value,p-value,mean,std
628,RPG,XLC,-4.662230,0.000099,4.447842e-17,1.0
348,FXI,XLB,-4.658834,0.000100,2.001529e-16,1.0
334,FXI,SIZE,-4.495183,0.000201,-4.447842e-16,1.0
544,MTUM,XLC,-4.404014,0.000292,-2.001529e-16,1.0
744,SPY,VOO,-4.284639,0.000472,-1.089721e-15,1.0
...,...,...,...,...,...,...
590,QUAL,TAN,-2.883504,0.047300,-2.090486e-15,1.0
606,QUAL,XLP,-2.881219,0.047576,1.000764e-16,1.0
139,DIA,MTUM,-2.876774,0.048116,-2.557509e-16,1.0
140,DIA,QQQ,-2.862267,0.049915,-7.450135e-16,1.0
