In [132]:
from FlagEmbedding import FlagModel
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
import sklearn
import plotly.express as px
from tqdm import tqdm

In [None]:
model = FlagModel('BAAI/bge-base-en-v1.5')

Collect all tickers from the NASDAQ

In [3]:
# Read the NASDAQ ticker file
url = "ftp://ftp.nasdaqtrader.com/SymbolDirectory/nasdaqlisted.txt"
nasdaq_data = pd.read_csv(url, sep="|")
nasdaq_data['Symbol'] = nasdaq_data['Symbol'].apply(lambda x: str(x).replace('.','-'))
# Extract the Tickers column
nasdaq_tickers = nasdaq_data["Symbol"].tolist()

Next, go through all possible stocks on the NASDAQ and check if they have 1 year of historical data then save the stock's description from Yahoo! Finance alongside the historical data

In [7]:
from collections import defaultdict
from time import sleep

# Dataframe containing Ticker name and description
StockData = pd.DataFrame(None, columns=['Ticker', 'Description'])
# Dictionary with the ticker name as the key and historical data as the value for cointegration testing later on
priceDict = defaultdict()
counter = 0
for ticker in tqdm(nasdaq_tickers):
    counter += 1
    # To keep API call rate low enough to continue a sleep function is needed
    if counter % 5 == 0:
        sleep(4)
    try:
        stock = yf.Ticker(ticker)
        if '1y' in stock.history_metadata.get('validRanges', []):
            description = stock.info.get('longBusinessSummary',None)
            if not description:
                continue
            StockData.loc[len(StockData)] = {'Ticker': ticker, 'Description': description}
            priceDict[ticker] = stock.history(period='1y')['Close']
    except:
        continue

  2%|▏         | 101/4809 [02:03<1:51:38,  1.42s/it]$AFJKR: possibly delisted; no price data found  (period=5d)
  3%|▎         | 138/4809 [02:47<1:10:15,  1.11it/s]$AIMDW: possibly delisted; no price data found  (period=5d)
  3%|▎         | 156/4809 [03:11<1:45:05,  1.36s/it]$AITRU: possibly delisted; no price data found  (period=5d)
  4%|▎         | 171/4809 [03:30<1:54:09,  1.48s/it]$ALCYU: possibly delisted; no price data found  (period=5d)
  4%|▍         | 198/4809 [04:01<1:12:23,  1.06it/s]$ALSAR: possibly delisted; no price data found  (period=5d)
  4%|▍         | 199/4809 [04:01<53:07,  1.45it/s]  $ALSAU: possibly delisted; no price data found  (period=5d)
  6%|▌         | 293/4809 [06:00<1:11:44,  1.05it/s]$AQUNR: possibly delisted; no price data found  (period=5d)
  8%|▊         | 368/4809 [07:31<1:08:41,  1.08it/s]$ATMCR: possibly delisted; no price data found  (period=5d)
  8%|▊         | 369/4809 [07:31<50:55,  1.45it/s]  $ATMCU: possibly delisted; no price data found  (per

It is highly recommended to save the data afterwards as this process takes 40+ minutes

In [8]:
StockData.to_csv('StockDataDescrpts')

In [None]:
StockData = pd.read_csv('StockDataDescriptions.csv')

In [12]:
StockData['Encoded'] = StockData['Description'].apply(lambda x: model.encode(x))

Using the method outlined by the original authors of DBScan for finding an optimal Eps by finding the knee/elbow from the paper [here](https://www.google.com/url?sa=t&rct=j&q=&source=web&uact=8&url=https%3A%2F%2Fwww.ccs.neu.edu%2Fhome%2Fvip%2Fteach%2FDMcourse%2F2_cluster_EM_mixt%2Fnotes_slides%2FrevisitofrevisitDBSCAN.pdf)     

In [186]:
from kneed import KneeLocator
from sklearn.neighbors import NearestNeighbors

def findOptimalEps(data):
    nn = NearestNeighbors(n_neighbors=2)
    nn.fit(data)
    distances, _ = nn.kneighbors(data)
    distances = np.sort(distances, axis=0)
    distances = distances[:,1]
    kl = KneeLocator(x=range(len(distances)), y = distances, curve='concave', interp_method='interp1d', online=True)
    knee = kl.elbow
    print(kl.elbow_y)
    fig = px.scatter(
    distances, 
    title='Distance from nearest neighbor curve')
    fig.update_xaxes(title_text='Neighbors')
    fig.add_vline(x=knee, line_width=3, line_dash="dash", line_color="green")
    fig.update_yaxes(title_text='Distance from Neighbors (Eps)')
    fig.show()

In [187]:
RawEps = findOptimalEps(StockData['Encoded'].tolist())

0.8580353171474902


In [47]:
import pacmap
import matplotlib.pyplot as plt
import numpy as np
encode_array = StockData['Encoded'].to_numpy()
X_raw = np.hstack(encode_array)
X_raw = X_raw.reshape((4032,768))
embedding = pacmap.PaCMAP(n_components=300, n_neighbors=10)
X_transformed = embedding.fit_transform(X_raw)

In [188]:
findOptimalEps(X_transformed)


0.6991355462891019


In [None]:
from sklearn.decomposition import PCA
PCAEncoded = PCA(n_components=300).fit_transform(StockData['Encoded'].tolist())

In [189]:
PCAEps = findOptimalEps(PCAEncoded)
print(f'Optimal Eps for PCA labels is {PCAEps}')

0.8123682597146755


Optimal Eps for PCA labels is None


In [198]:
from sklearn.cluster import DBSCAN
StockData['RawLabels'] = DBSCAN(min_samples=2, eps=0.63).fit_predict(X_raw)
StockData['DRLabels'] = DBSCAN(min_samples=2, eps=0.52).fit_predict(X_transformed)
StockData['PCALabels'] = DBSCAN(min_samples=2, eps=0.56).fit_predict(PCAEncoded)

Many leveraged ETFs of a stock or stocks with different class types differe their ticker name by 1 letter. To remove these, as they would obviously be cointegrated, pairs with an edit distance $\leq$ 1 will be disregarded

In [263]:
from statsmodels.tsa.stattools import coint
from nltk.metrics import edit_distance

seen_pairs = set()
coint_pairs = []
pairs = list(StockData.groupby('DRLabels')['Ticker'].filter(lambda x: len(x) <= 40).groupby(StockData['DRLabels']).apply(tuple))
counter = 0
for pair in pairs:
    counter += (len(pair) * (len(pair) - 1)) // 2
print(f'{counter} comparions made among {len(pairs)} different clusters')
for pair in pairs:
    for i in range(0, len(pair) - 1):
        for j in range(i+1, len(pair)):
            top, bot = pair[i], pair[j]
            seen_pairs.add((top, bot))
            if edit_distance(top,bot) == 1:
                continue
            top_data = priceDict[top]
            bot_data = priceDict[bot]

            aligned_data = pd.concat([top_data, bot_data], axis=1, join="inner")
            aligned_data.dropna(inplace=True)
            top_series = aligned_data.iloc[:, 0]
            bot_series = aligned_data.iloc[:, 1]
            if len(top_series) != len(bot_series) or len(top_series) <= 30 or len(bot_series) <= 30:
                continue
            if coint(top_series, bot_series)[1] <= 0.05:
                coint_pairs.append((top,bot))
total_pairs = len(coint_pairs)
print(f'{total_pairs} possible cointegrated pairs have been found by PaCMAP')

2032 comparions made among 45 different clusters
198 possible cointegrated pairs have been found by PaCMAP


In [264]:
pairs = list(StockData.groupby('PCALabels')['Ticker'].filter(lambda x: len(x) <= 40).groupby(StockData['PCALabels']).apply(tuple))
counter = 0

for pair in pairs:
    counter += (len(pair) * (len(pair) - 1)) // 2
print(f'{counter} comparions made among {len(pairs)} different clusters')
for pair in pairs:
    for i in range(0, len(pair) - 1):
        for j in range(i+1, len(pair)):
            top, bot = pair[i], pair[j]
            if (top, bot) in seen_pairs or (bot, top) in seen_pairs:
                continue
            seen_pairs.add((top, bot))
            if edit_distance(top,bot) == 1:
                continue
            top_data = priceDict[top]
            bot_data = priceDict[bot]

            aligned_data = pd.concat([top_data, bot_data], axis=1, join="inner")
            aligned_data.dropna(inplace=True)
            top_series = aligned_data.iloc[:, 0]
            bot_series = aligned_data.iloc[:, 1]
            if len(top_series) != len(bot_series) or len(top_series) <= 30 or len(bot_series) <= 30:
                continue
            if coint(top_series, bot_series)[1] <= 0.05:
                coint_pairs.append((top, bot))

print(f'{len(coint_pairs) - total_pairs} new possible cointegrated pairs have been found by PCA')
total_pairs = len(coint_pairs)

1158 comparions made among 227 different clusters
102 new possible cointegrated pairs have been found by PCA


In [265]:
pairs = list(StockData.groupby('DRLabels')['Ticker'].filter(lambda x: len(x) <= 40).groupby(StockData['DRLabels']).apply(tuple))
counter = 0

for pair in pairs:
    counter += (len(pair) * (len(pair) - 1)) // 2
print(f'{counter} comparions made among {len(pairs)} different clusters')
for pair in pairs:
    for i in range(0, len(pair) - 1):
        for j in range(i+1, len(pair)):
            top, bot = pair[i], pair[j]
            if (top, bot) in seen_pairs or (bot, top) in seen_pairs:
                continue
            seen_pairs.add((top, bot))
            if edit_distance(top,bot) == 1:
                continue
            top_data = priceDict[top]
            bot_data = priceDict[bot]

            aligned_data = pd.concat([top_data, bot_data], axis=1, join="inner")
            aligned_data.dropna(inplace=True)
            top_series = aligned_data.iloc[:, 0]
            bot_series = aligned_data.iloc[:, 1]
            if len(top_series) != len(bot_series) or len(top_series) <= 30 or len(bot_series) <= 30:
                continue
            if coint(top_series, bot_series)[1] <= 0.05:
                coint_pairs.append((top,bot))

print(f'{len(coint_pairs) - total_pairs} new possible cointegrated pairs have been found by raw embeddings')
total_pairs = len(coint_pairs)

2032 comparions made among 45 different clusters
0 new possible cointegrated pairs have been found by raw embeddings


In [266]:
print(coint_pairs)

[('BLDE', 'SKYW'), ('BLDE', 'UAL'), ('MESA', 'SNCY'), ('AAPB', 'TSDD'), ('AAPB', 'TSLZ'), ('AAPU', 'TSDD'), ('AMDL', 'SKRE'), ('AMDS', 'BABX'), ('AMDS', 'CONL'), ('AMDS', 'GGLL'), ('AMDS', 'GGLS'), ('AMDS', 'NVDL'), ('AMDS', 'NVDU'), ('AMDS', 'SKRE'), ('AMDS', 'TARK'), ('AMDS', 'TSDD'), ('AMDS', 'TSL'), ('AMDS', 'TSLL'), ('AMDS', 'TSLQ'), ('AMDS', 'TSLR'), ('AMDS', 'TSLS'), ('AMDS', 'TSLT'), ('AMDS', 'TSLZ'), ('MSFD', 'SKRE'), ('MSFD', 'TARK'), ('NVD', 'SKRE'), ('NVD', 'TARK'), ('NVD', 'TSDD'), ('NVD', 'TSLQ'), ('NVD', 'TSLS'), ('NVD', 'TSLZ'), ('NVDD', 'SKRE'), ('NVDD', 'TSL'), ('NVDD', 'TSLL'), ('NVDS', 'SKRE'), ('NVDS', 'TSL'), ('NVDS', 'TSLL'), ('SKRE', 'TSDD'), ('SKRE', 'TSLS'), ('SKRE', 'TSLZ'), ('TSDD', 'TSLZ'), ('ABTS', 'ANY'), ('ABTS', 'ARBK'), ('ABTS', 'BTM'), ('ANY', 'BTM'), ('ANY', 'GREE'), ('ANY', 'GRYP'), ('ANY', 'MIGI'), ('ANY', 'RIOT'), ('APLD', 'BTBT'), ('ARBK', 'BTM'), ('ARBK', 'BTOG'), ('ARBK', 'GRYP'), ('ARBK', 'SLNHP'), ('BITF', 'BTM'), ('BITF', 'GREE'), ('BTBT', '

In [271]:
mono_pairs = []
pairDict = defaultdict(list)
for pair in coint_pairs:
    x, y = pair
    if len(pairDict[x]) >= 1 or len(pairDict[y]) >= 1:
        continue
    pairDict[x].append(y)
    pairDict[y].append(x)
    mono_pairs.append(pair)

print(f'Out of {len(coint_pairs)} possible pairs. {len(mono_pairs)} monogamous possible pairs have been found')


Out of 300 possible pairs. 89 monogamous possible pairs have been found


In [272]:
print(mono_pairs)

[('BLDE', 'SKYW'), ('MESA', 'SNCY'), ('AAPB', 'TSDD'), ('AMDL', 'SKRE'), ('AMDS', 'BABX'), ('MSFD', 'TARK'), ('NVD', 'TSLQ'), ('NVDD', 'TSL'), ('NVDS', 'TSLL'), ('ABTS', 'ANY'), ('APLD', 'BTBT'), ('ARBK', 'BTM'), ('BITF', 'GREE'), ('BTCS', 'BTDR'), ('BTCT', 'DGHI'), ('BTOG', 'CORZ'), ('GRYP', 'MIGI'), ('MARA', 'RIOT'), ('SLNHP', 'WULF'), ('EBAY', 'ETSY'), ('SWAG', 'TACT'), ('APEI', 'LOPE'), ('AIRE', 'PDYN'), ('AIRTP', 'FTAI'), ('ASLE', 'FTAIO'), ('AMCX', 'PARA'), ('CMCSA', 'IHRT'), ('MDIA', 'WMG'), ('NWS', 'SGA'), ('NXST', 'PARAA'), ('RSVR', 'SIRI'), ('RSVRW', 'WBD'), ('SSP', 'UONEK'), ('ARRY', 'CNTM'), ('BSCO', 'BSJP'), ('BSCP', 'BSJQ'), ('BSCQ', 'BSJR'), ('BSCR', 'BSJV'), ('BSJS', 'BSMO'), ('BSJT', 'BSMQ'), ('BSMU', 'BSSX'), ('CMPR', 'KRNT'), ('CNSL', 'FYBR'), ('EDUC', 'EEIQ'), ('INV', 'IZEA'), ('FAB', 'FEX'), ('FNK', 'FTA'), ('FNX', 'FTC'), ('FBYD', 'MGOL'), ('FDTS', 'FEP'), ('FJP', 'FLN'), ('MNY', 'MNYWW'), ('GOOD', 'LANDM'), ('HIHO', 'RAY'), ('MGIH', 'PMEC'), ('ALSAU', 'ATMV'), ('