In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import sklearn as sk
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from hmmlearn import hmm , vhmm
import scipy.fftpack as fftpack
import warnings
warnings.filterwarnings('ignore')
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.optim as optim
import itertools
from ta.trend import sma_indicator
from ta.momentum import rsi
from ta.momentum import stoch
from ta.trend import ema_indicator
from ta.trend import adx
from ta.trend import macd
from ta.volume import on_balance_volume
import random
import itertools
from collections import deque
import copy
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.vector_ar.vecm import coint_johansen
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
# Seed every random number generator imported above (NumPy, the stdlib
# `random` module and PyTorch) so a fresh "Restart & Run All" is repeatable.
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)
# Kept as the last statement so the cell shows no output; use this value for
# any `random_state=` argument further down.
random_state=42

# Generate Features and Dataset

In [None]:

class data_processor:
    """Download two tickers and derive rolling pairs-trading features.

    Typical use: ``get_data()`` fetches and aligns the two price frames, then
    ``windows()`` (two look-back lengths on the close) or ``clopen()`` (close
    versus open at one look-back) stores the feature table in ``self.data``.

    Parameters
    ----------
    tickers : sequence of str
        Only the first two entries are used.
    start_date, end_date : str
        Passed straight to ``yf.download``.
    window_sizes : sequence of int
        Short and long look-back (in bars) used by ``windows()``.
    interval : str
        Bar size passed to ``yf.download``.
        NOTE(review): Yahoo limits how far back intraday bars go, so the
        default ``start_date`` is probably only usable with a daily
        interval -- confirm against the yfinance documentation.
    """

    # Per-bar columns that do not depend on the regression window/price column;
    # the copy coming from the second frame of a join is redundant.
    _SHARED_COLUMNS = ['Return1', 'Return2', 'rsi2', 'macd2', 'obv2', 'rsi1', 'macd1', 'obv1']
    # Columns that are recomputed for every rolling window.
    _STAT_COLUMNS = ['Hedge Ratio', 'Half Life', 'Spread', 'ADF']

    def __init__(self, tickers, start_date='1980-01-01', end_date='2025-04-30',
                 window_sizes=(24, 24*3), interval='1h'):
        self.ticker = tickers
        self.start_date = start_date
        self.end_date = end_date
        # Copy so instances never share (or mutate) the default sequence.
        self.window_sizes = list(window_sizes)
        self.interval = interval

        # Populated by get_data().
        self.df1 = None
        self.df2 = None
        # Populated by windows() / clopen().
        self.data = None

    def _require_data(self):
        """Raise a clear error when a method is used before ``get_data()``."""
        if getattr(self, 'df1', None) is None or getattr(self, 'df2', None) is None:
            raise RuntimeError("Price data not loaded; call get_data() first.")

    def _download(self, ticker):
        """Fetch one ticker and return a frame with flat column names."""
        frame = yf.download(ticker, interval=self.interval,
                            start=self.start_date, end=self.end_date)
        if frame is None or frame.empty:
            raise ValueError(f"No price data returned for {ticker!r}")
        # Some yfinance versions return (field, ticker) MultiIndex columns even
        # for a single ticker; others return flat columns. Handle both.
        if isinstance(frame.columns, pd.MultiIndex):
            frame.columns = frame.columns.droplevel(1)
        return frame

    @staticmethod
    def _add_features(frame, return_col):
        """Add the log return (as ``return_col``), RSI(14), MACD and OBV columns."""
        close = frame['Close']
        frame[return_col] = np.log(close).diff()
        frame['rsi'] = rsi(close, 14)
        frame['macd'] = macd(close)
        frame['obv'] = on_balance_volume(close, frame['Volume'])
        return frame

    @staticmethod
    def _half_life(spread):
        """Mean-reversion half-life (in bars) of ``spread``.

        Fits ``delta_t = a + lam * spread_{t-1}`` by least squares and returns
        ``-ln(2) / lam``. Returns NaN when the fit is not mean-reverting
        (``lam >= 0``) or cannot be estimated (too few points, constant input).
        """
        values = np.asarray(spread, dtype=float)
        if values.size < 3:
            return np.nan
        lagged = values[:-1]
        delta = np.diff(values)
        centered = lagged - lagged.mean()
        denom = np.dot(centered, centered)
        if denom == 0:
            return np.nan
        speed = np.dot(centered, delta - delta.mean()) / denom
        # `not speed < 0` also catches NaN.
        if not speed < 0:
            return np.nan
        return -np.log(2) / speed

    @classmethod
    def _combine(cls, primary, secondary, suffix):
        """Left-join two feature tables, drop incomplete rows and duplicate shared columns."""
        merged = primary.join(secondary, rsuffix=suffix).dropna()
        return merged.drop([name + suffix for name in cls._SHARED_COLUMNS], axis=1)

    def get_data(self):
        """Download both tickers, add per-ticker features and align on common timestamps.

        Results are stored in ``self.df1`` and ``self.df2``.

        Raises
        ------
        ValueError
            If a download comes back empty.
        """
        df1 = self._add_features(self._download(self.ticker[0]), 'Return1')
        df2 = self._add_features(self._download(self.ticker[1]), 'Return2')

        # Keep only timestamps present for both tickers.
        df1, df2 = df1.align(df2, join='inner', axis=0)

        self.df1 = df1
        self.df2 = df2

    def get_hedge_ratio(self, window_size, clopen='Close'):
        """Compute rolling pair statistics over windows of ``window_size`` bars.

        For each window the ``clopen`` price of ticker 1 is regressed on that of
        ticker 2 (no intercept). The slope is the hedge ratio and the residual
        ``price1 - hedge_ratio * price2`` is the spread.

        Returns
        -------
        pandas.DataFrame
            Indexed by the last timestamp of each window ('Date'). Columns are
            the per-bar features of both tickers plus 'Hedge Ratio',
            'Half Life' (see ``_half_life``; may be NaN), 'Spread' (z-score of
            the window's last spread value) and 'ADF' (second element returned
            by ``adfuller`` on the spread, i.e. the p-value).

        Raises
        ------
        RuntimeError
            If ``get_data()`` has not been called.
        """
        self._require_data()
        prices1 = self.df1[clopen]
        prices2 = self.df2[clopen]

        pairs_stats = []
        # "+ 1" so the final window, which ends on the newest bar, is included.
        for start in range(len(prices1) - window_size + 1):
            stop = start + window_size
            window1 = prices1.iloc[start:stop]
            window2 = prices2.iloc[start:stop]
            last_index = window1.index[-1]
            bar1 = self.df1.iloc[stop - 1]
            bar2 = self.df2.iloc[stop - 1]

            # endog = ticker 1, exog = ticker 2  ->  price1 ~ hedge_ratio * price2
            model = sm.OLS(window1, window2).fit()
            hedge_ratio = np.asarray(model.params)[0]
            # Residual of the regression above (same orientation as the fit).
            spread = window1 - hedge_ratio * window2
            spread_z_score = (spread - spread.mean()) / spread.std()
            half_life = self._half_life(spread)

            adf = adfuller(spread)[1]

            pairs_stats.append([
                last_index,
                bar1['Return1'], bar2['Return2'],
                bar2['rsi'], bar2['macd'], bar2['obv'],
                bar1['rsi'], bar1['macd'], bar1['obv'],
                hedge_ratio, half_life, spread_z_score.iloc[-1], adf,
            ])

        data = pd.DataFrame(data=pairs_stats,
                            columns=['Date'] + self._SHARED_COLUMNS + self._STAT_COLUMNS)
        return data.set_index('Date')

    def windows(self):
        """Store in ``self.data`` the close-price statistics for the short and long look-backs.

        Long-window statistics carry the suffix ' Long'.
        """
        short_stats = self.get_hedge_ratio(self.window_sizes[0])
        long_stats = self.get_hedge_ratio(self.window_sizes[1])
        self.data = self._combine(short_stats, long_stats, ' Long')

    def clopen(self, window_size=252):
        """Store in ``self.data`` the close-based and open-based statistics at one look-back.

        Open-based statistics carry the suffix ' Open'. 'Adj Close' is used when
        both frames have it; otherwise 'Close' is used.
        NOTE(review): yfinance appears to omit 'Adj Close' when prices are
        auto-adjusted, in which case 'Close' is already adjusted -- confirm.
        """
        self._require_data()
        has_adjusted = 'Adj Close' in self.df1.columns and 'Adj Close' in self.df2.columns
        close_col = 'Adj Close' if has_adjusted else 'Close'

        close_stats = self.get_hedge_ratio(window_size, close_col)
        open_stats = self.get_hedge_ratio(window_size, 'Open')
        self.data = self._combine(close_stats, open_stats, ' Open')

In [None]:
# Build the two-window feature table for the DG / DLTR pair and show it.
processor = data_processor(
    tickers=['DG', 'DLTR'],
    start_date='2023-05-27',
    end_date='2025-05-25',
)
processor.get_data()   # download + per-ticker features
processor.windows()    # rolling statistics -> processor.data
print(processor.data)

# Find Cointegrated Pairs

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from itertools import combinations
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from datetime import datetime, timedelta

def download_stock_data(tickers, start_date, end_date):
    """Download hourly close prices for each ticker.

    Parameters
    ----------
    tickers : iterable of str
    start_date, end_date : str
        Passed straight to ``yf.download``.

    Returns
    -------
    dict
        Maps ticker -> ``pandas.Series`` of 'Close' prices named after the
        ticker. Tickers that return no data or fail to download are reported
        on stdout and left out.
    """
    stock_data = {}

    for ticker in tickers:
        # Best effort: one bad ticker must not abort the whole scan.
        try:
            data = yf.download(ticker, interval='1h', start=start_date, end=end_date)
            if data.empty:
                print(f"No data found for {ticker}")
                continue

            # droplevel returns a new index, so it has to be assigned back;
            # only do so when the columns really are (field, ticker) pairs.
            if isinstance(data.columns, pd.MultiIndex):
                data.columns = data.columns.droplevel(1)

            # Store the 'Close' column as a Series labelled with the ticker.
            stock_data[ticker] = data['Close'].rename(ticker)

            print(f"Downloaded data for {ticker}: {len(data)} records")
        except Exception as e:
            print(f"Error downloading {ticker}: {e}")

    return stock_data

def perform_johansen_test(price_series1, price_series2, det_order=0, k_ar_diff=1):
    """Run the Johansen test on two price series for the r=0 hypothesis.

    The series are concatenated column-wise and rows with any NaN are dropped.
    Returns None when fewer than 20 rows remain or when the test raises
    (the error is printed). Otherwise returns a dict holding the trace and
    max-eigenvalue statistics, their 90/95/99% critical values, the list of
    levels each statistic exceeds (strongest first) and the row count used.
    """
    levels = ('90%', '95%', '99%')

    def levels_exceeded(statistic, critical):
        # Walk from the 99% column down to the 90% column.
        return [levels[k] for k in (2, 1, 0) if statistic > critical[k]]

    def as_level_map(critical):
        return {label: critical[k] for k, label in enumerate(levels)}

    aligned = pd.concat([price_series1, price_series2], axis=1).dropna()
    n_rows = len(aligned)

    # Too little overlap to say anything useful.
    if n_rows < 20:
        return None

    try:
        johansen = coint_johansen(aligned.values, det_order=det_order, k_ar_diff=k_ar_diff)

        # First row of each result array corresponds to r=0 (no cointegration).
        trace_stat, trace_crit = johansen.lr1[0], johansen.cvt[0]
        eigen_stat, eigen_crit = johansen.lr2[0], johansen.cvm[0]

        trace_hits = levels_exceeded(trace_stat, trace_crit)
        eigen_hits = levels_exceeded(eigen_stat, eigen_crit)

        return {
            'trace_statistic': trace_stat,
            'max_eigen_statistic': eigen_stat,
            'trace_critical_values': as_level_map(trace_crit),
            'max_eigen_critical_values': as_level_map(eigen_crit),
            'trace_significance': trace_hits,
            'max_eigen_significance': eigen_hits,
            'data_points': n_rows,
        }

    except Exception as e:
        print(f"Error in Johansen test: {e}")
        return None

def analyze_cointegration(tickers, start_date, end_date):
    """
    Main function to analyze cointegration between all pairs of tickers.

    Downloads prices via ``download_stock_data``, runs
    ``perform_johansen_test`` on every unordered pair of tickers that
    returned data, and tabulates the outcomes.

    Returns
    -------
    pandas.DataFrame or None
        One row per successfully tested pair, sorted by 'Trace_Statistic'
        descending. The frame is empty (but has all columns) when no pair
        could be tested. None is returned when fewer than two tickers
        yielded data.
    """
    result_columns = [
        'Pair',
        'Trace_Statistic', 'Max_Eigen_Statistic',
        'Trace_90%_Critical', 'Trace_95%_Critical', 'Trace_99%_Critical',
        'Max_Eigen_90%_Critical', 'Max_Eigen_95%_Critical', 'Max_Eigen_99%_Critical',
        'Trace_Significance', 'Max_Eigen_Significance',
        'Data_Points',
    ]

    # Download data
    print("Downloading stock data...")
    stock_data = download_stock_data(tickers, start_date, end_date)

    if len(stock_data) < 2:
        print("Need at least 2 valid tickers for cointegration analysis")
        return None

    # Generate all pairs (dict keys keep download order)
    pairs = list(combinations(stock_data, 2))

    print(f"\nTesting {len(pairs)} pairs for cointegration...")

    # Store results
    results = []

    for ticker1, ticker2 in pairs:
        print(f"Testing {ticker1} - {ticker2}")

        result = perform_johansen_test(stock_data[ticker1], stock_data[ticker2])

        # None means too few overlapping rows or a failed test; skip the pair.
        if not result:
            continue

        trace_crit = result['trace_critical_values']
        eigen_crit = result['max_eigen_critical_values']
        results.append({
            'Pair': f"{ticker1} - {ticker2}",
            'Trace_Statistic': result['trace_statistic'],
            'Max_Eigen_Statistic': result['max_eigen_statistic'],
            'Trace_90%_Critical': trace_crit['90%'],
            'Trace_95%_Critical': trace_crit['95%'],
            'Trace_99%_Critical': trace_crit['99%'],
            'Max_Eigen_90%_Critical': eigen_crit['90%'],
            'Max_Eigen_95%_Critical': eigen_crit['95%'],
            'Max_Eigen_99%_Critical': eigen_crit['99%'],
            'Trace_Significance': ', '.join(result['trace_significance']) or 'Not Significant',
            'Max_Eigen_Significance': ', '.join(result['max_eigen_significance']) or 'Not Significant',
            'Data_Points': result['data_points'],
        })

    # Passing the column list keeps the sort key present even when `results`
    # is empty, so sort_values cannot raise KeyError.
    df_results = pd.DataFrame(results, columns=result_columns)
    df_results = df_results.sort_values('Trace_Statistic', ascending=False)

    return df_results

In [None]:
# Universe to screen; every unordered pair of these tickers is tested.
tickers = ['DG','DLTR',"CPT","EQR"]
# Alternative universe (disabled): first 250 S&P 500 symbols from Wikipedia.
# payload=pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
# first_table = payload[0]
# second_table = payload[1]

# df = first_table
# tickers = df.Symbol.tolist()[:250]

# Date range handed to the downloader
start_date = '2023-05-27'
end_date = '2025-05-25'

print(f"Analyzing cointegration from {start_date} to {end_date}")

# Run the pairwise Johansen scan
results_df = analyze_cointegration(tickers, start_date, end_date)

if results_df is None or results_df.empty:
    print("No results to display.")
else:
    banner = "=" * 100

    # --- Summary table: one line per tested pair ---
    print("\n" + banner)
    print("JOHANSEN COINTEGRATION TEST RESULTS")
    print(banner)
    print(" ".join([
        f"{'Pair':<15}",
        f"{'Trace Stat':<12}",
        f"{'Max Eigen':<12}",
        f"{'Trace Signif':<15}",
        f"{'Max Eigen Signif':<15}",
        f"{'Data Points':<12}",
    ]))
    print("-" * 100)

    for _, row in results_df.iterrows():
        print(" ".join([
            f"{row['Pair']:<15}",
            f"{row['Trace_Statistic']:<12.4f}",
            f"{row['Max_Eigen_Statistic']:<12.4f}",
            f"{row['Trace_Significance']:<15}",
            f"{row['Max_Eigen_Significance']:<15}",
            f"{row['Data_Points']:<12}",
        ]))

    # --- Detail section: pairs significant under either statistic ---
    print("\n" + banner)
    print("DETAILED RESULTS")
    print(banner)

    trace_flagged = results_df['Trace_Significance'].ne('Not Significant')
    eigen_flagged = results_df['Max_Eigen_Significance'].ne('Not Significant')
    cointegrated_pairs = results_df[trace_flagged | eigen_flagged]

    if cointegrated_pairs.empty:
        print("\nNo significantly cointegrated pairs found at 90% confidence level or higher.")
    else:
        print("\nCOINTEGRATED PAIRS:")
        for _, row in cointegrated_pairs.iterrows():
            print(f"\n{row['Pair']}:")
            print(f"  Trace Statistic: {row['Trace_Statistic']:.4f}")
            print(
                f"  Trace Critical Values: 90%={row['Trace_90%_Critical']:.4f}, "
                f"95%={row['Trace_95%_Critical']:.4f}, "
                f"99%={row['Trace_99%_Critical']:.4f}"
            )
            print(f"  Trace Significance: {row['Trace_Significance']}")
            print(f"  Max Eigenvalue Statistic: {row['Max_Eigen_Statistic']:.4f}")
            print(
                f"  Max Eigen Critical Values: 90%={row['Max_Eigen_90%_Critical']:.4f}, "
                f"95%={row['Max_Eigen_95%_Critical']:.4f}, "
                f"99%={row['Max_Eigen_99%_Critical']:.4f}"
            )
            print(f"  Max Eigen Significance: {row['Max_Eigen_Significance']}")
            print(f"  Data Points: {row['Data_Points']}")

    # Persist the full table (all tested pairs, not just the significant ones)
    results_df.to_csv('johansen_cointegration_results.csv', index=False)
    print("\nResults saved to 'johansen_cointegration_results.csv'")