In [1]:
import numpy as np
import pandas as pd

from scipy import linalg
from scipy import sparse
from scipy.sparse.linalg import svds
import ot


from numpy.linalg import matrix_rank
import itertools
import copy
import time as time
from sklearn.cluster import AgglomerativeClustering
from matplotlib.pyplot import figure
from scipy.cluster.hierarchy import dendrogram, fcluster, cophenet
from scipy.spatial.distance import pdist
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import hdbscan
from scipy.linalg import orthogonal_procrustes

from tqdm import tqdm
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.cluster import adjusted_rand_score, fowlkes_mallows_score
from scipy.stats import rankdata,kendalltau,sem


import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from sklearn.manifold import TSNE
from random import choices

from matplotlib.lines import Line2D

from textblob import Word
from nltk.corpus import stopwords
import nltk
import re
from pprint import pprint
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ag16115/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import random

# Seed both global generators. Seeding only Python's `random` leaves NumPy's
# global RandomState unseeded, so any NumPy-based randomness used later in the
# notebook would differ from run to run.
random.seed(1)
np.random.seed(1)

In [3]:
import warnings
# Install a catch-all "ignore" filter: every warning raised after this cell is hidden.
# NOTE(review): this also hides deprecation notices from the libraries used below;
# consider restricting the filter to specific categories or modules.
warnings.filterwarnings("ignore")

# Functions 

## General

In [4]:
def signif(x, p):
    """Round ``x`` (scalar or array-like) to ``p`` significant figures.

    Zero and non-finite entries are given a placeholder magnitude of
    ``10**(p-1)`` so that the logarithm below is well defined for them.
    """
    values = np.asarray(x)
    usable = np.isfinite(values) & (values != 0)
    magnitude = np.where(usable, np.abs(values), 10**(p-1))
    # Power of ten that moves the p-th significant digit to the units place.
    scale = 10 ** (p - 1 - np.floor(np.log10(magnitude)))
    return np.round(values * scale) / scale

def flatten_list(l):
    """Concatenate the sub-iterables of ``l`` into one flat list (one level deep)."""
    flat_list = []
    for sublist in l:
        flat_list.extend(sublist)
    return flat_list

def check_symmetric(a, rtol=1e-05, atol=1e-08):
    """Return True when ``a`` equals its transpose within the given tolerances."""
    transposed = a.T
    return np.allclose(a, transposed, rtol=rtol, atol=atol)

def is_pos_def(x):
    """Return whether every eigenvalue of the square matrix ``x`` is strictly positive."""
    eigenvalues = np.linalg.eigvals(x)
    return np.all(eigenvalues > 0)

def reverse_dict(dic):
    """Invert a ``key -> value(s)`` mapping into ``value -> key``.

    Each value of ``dic`` may be a list of items or a single non-list item.
    Every item becomes a key of the result, mapped to the key it was stored
    under. If an item appears under several keys, the last one (in iteration
    order of ``dic``) wins.

    The input dictionary is left untouched; scalar values are wrapped in a
    temporary list instead of being rewritten in place.
    """
    inverse = {}
    for key, values in dic.items():
        if not isinstance(values, list):
            values = [values]
        for item in values:
            inverse[item] = key
    return inverse

def pc_scores(X, r):
    """Project the rows of ``X`` onto its ``r`` leading right singular vectors.

    The singular triplets come from ``svds`` and are reordered by decreasing
    singular value before projecting, so column 0 of the result corresponds to
    the largest singular value.
    """
    _, sing_vals, right_vecs = svds(X, k=r)
    order = sing_vals.argsort()[::-1]
    leading = right_vecs[order, :]
    return X @ leading.T

def embed_cov(X, r):
    """Return ``U @ diag(sqrt(s))`` built from singular triplets of ``X``.

    A dense full SVD is used when ``r`` equals the number of rows of ``X``;
    otherwise ``svds`` supplies the ``r`` leading triplets. Columns are ordered
    by decreasing singular value.
    """
    if r == X.shape[0]:
        left, spectrum, _ = np.linalg.svd(X, full_matrices=True)
    else:
        left, spectrum, _ = svds(X, k=r)
    order = spectrum.argsort()[::-1]
    left = left[:, order]
    spectrum = spectrum[order]
    # Square root taken because the singular values are treated as eigenvalues here.
    return left @ np.diag(np.sqrt(spectrum))

def find_rows_to_merge(F):
    """Return ``[i, j]``, the position of the largest off-diagonal entry of ``F``.

    Works on a copy of ``F`` whose diagonal is set to ``-inf``, so ``F`` itself
    is not modified and diagonal entries can never be selected.
    """
    masked = copy.deepcopy(F)
    np.fill_diagonal(masked, -np.inf)
    flat_position = masked.argmax()
    row, col = np.unravel_index(flat_position, masked.shape)
    return [row, col]

def inner_products(X):
    """Return the matrix of inner products between the rows of ``X`` (``X X^T``)."""
    gram = X.dot(X.T)
    return gram

## Dimension selection

In [5]:
def wasserstein_dim_select(Y, split = 0.5, rmin = 1, rmax = 50):
    """Score candidate embedding dimensions by an optimal-transport cost.

    The first ``round(n * split)`` rows of ``Y`` form the training set and the
    remaining rows the test set. For each rank ``r`` the training rows are
    projected onto the span of the ``r`` leading right singular vectors of the
    training set, and the cost returned by ``ot.emd2`` (uniform weights,
    Euclidean cost matrix) between the projected training rows and the test
    rows is recorded.

    Parameters
    ----------
    Y : array or scipy sparse matrix, shape (n, p)
    split : float
        Fraction of rows used for training.
    rmin, rmax : int
        Smallest and largest rank to try (inclusive).

    Returns
    -------
    list of float
        ``ws[k]`` is the cost for rank ``rmin + k``. The largest rank tried is
        ``min(train, rmax, min(Ytrain.shape) - 1)`` because ``svds`` cannot
        return more than ``min(shape) - 1`` singular triplets.

    Raises
    ------
    ValueError
        If the training split is too small to compute a single singular triplet.
    """
    n = Y.shape[0]
    train = round(n * split)
    if sparse.issparse(Y):
        # toarray() gives a plain ndarray; todense() would give np.matrix.
        Y = Y.toarray()
    Ytrain = Y[:train,:]
    Ytest = Y[train:n,:]

    # Request exactly as many components as the largest rank that will be
    # evaluated, so every r in the loop has r genuine singular vectors.
    rtry = int(min(train, rmax, min(Ytrain.shape) - 1))
    if rtry < 1:
        raise ValueError(
            "training split of shape {} is too small for a truncated SVD".format(Ytrain.shape)
        )
    U, s, Vh = sparse.linalg.svds(Ytrain, k=rtry)
    idx = s.argsort()[::-1]
    Vh = Vh[idx,:]

    # Uniform weights do not depend on r, so build them once.
    n1 = Ytrain.shape[0]
    n2 = Ytest.shape[0]
    w_train = np.repeat(1/n1, n1)
    w_test = np.repeat(1/n2, n2)

    ws = []
    for r in tqdm(range(rmin, rtry+1)):
        basis = Vh[:r,:]
        # Same as Ytrain @ (basis.T @ basis) without forming the p x p projector.
        Yproj = (Ytrain @ basis.T) @ basis
        M = ot.dist(Yproj, Ytest, metric='euclidean')
        ws.append(ot.emd2(w_train, w_test, M))
    return ws

## Document data: text functions

In [6]:
def del_email_address(text):
    """Remove e-mail-like tokens from ``text``.

    A token is any run of non-whitespace characters containing ``@``; one
    trailing whitespace character, if present, is removed with it.
    """
    # Raw string: '\S' and '\s' are regex classes, not Python string escapes.
    pattern = re.compile(r'\S*@\S*\s?')
    return pattern.sub('', text)

def clean_text(text):
    """Normalise ``text`` into a space-separated string of lemmatised tokens.

    Runs of characters other than letters and digits become single spaces, the
    result is lower-cased and split on whitespace, tokens found in the global
    ``stopword`` collection are dropped, and the remaining tokens are
    lemmatised with ``Word(...).lemmatize()``.

    NOTE(review): ``stopword`` is not defined in the visible part of this
    notebook; it must exist as a global before this function is called.
    """
    tokens = re.sub("[^A-Za-z0-9]+", " ", text).lower().split()
    lemmas = []
    for token in tokens:
        if token in stopword:
            continue
        lemmas.append(Word(token).lemmatize())
    return " ".join(lemmas)

## IP HC

In [7]:
def ip_metric(X,Y):
    """Return the sum of the element-wise product of ``X`` and ``Y``."""
    products = X * Y
    return np.sum(products)

def ip_affinity(X):
    """Turn pairwise inner products of the rows of ``X`` into dissimilarities.

    Computes the Gram matrix ``G = X X^T`` and returns ``max(G) - G``, so the
    pair of rows with the largest inner product gets dissimilarity 0 and every
    entry is non-negative.

    The Gram matrix is computed with one matrix product instead of calling a
    Python metric once per pair of rows.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)

    Returns
    -------
    ndarray, shape (n_samples, n_samples)

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError("ip_affinity expects a 2D array, got {}D".format(X.ndim))
    ips = X @ X.T
    return np.max(ips) - ips

In [8]:
def clusters_to_labels(clusters):
    """Convert a list of clusters into a flat label list.

    ``clusters[k]`` is an iterable of item ids belonging to cluster ``k``. The
    result lists, for item 0, 1, 2, ... in that order, the index of every
    cluster containing that item. Item ids are looked up as ``0 .. N-1`` where
    ``N`` is the number of distinct items seen, so ids are expected to be
    consecutive integers starting at 0.
    """
    # Imported locally: the notebook's import cell does not provide defaultdict.
    from collections import defaultdict

    memberships = defaultdict(list)
    for index, sublist in enumerate(clusters):
        for item in sublist:
            memberships[item].append(index)
    n_items = len(memberships)
    return [label for item in range(n_items) for label in memberships[item]]

## Ranking

In [9]:
def find_ancestors(model, target):
    """Return the node ids on the path from ``target`` up to the root.

    Row ``i`` of ``model.children_`` is taken to create node
    ``len(model.labels_) + i``. The path starts with ``target`` itself.

    Results for internal nodes are memoised in the global dict ``ances``
    (keyed by node id), which the caller must create beforehand and reset
    whenever a different model is used.
    """
    global ances
    n_samples = len(model.labels_)
    for ind, merge in enumerate(model.children_):
        if target not in merge:
            continue
        parent = n_samples + ind
        if parent not in ances:
            ances[parent] = find_ancestors(model, parent)
        return [target] + ances[parent]
    # No merge contains target: report the node created by the last merge.
    return [ind+n_samples]

def find_descendents(model,node):
    """Return the leaf ids found beneath ``node`` in the merge tree.

    Ids below ``len(model.labels_)`` are leaves and are returned as
    ``[node]``. For an internal node the leaves of its first child come
    before those of its second child.

    Results for internal nodes are memoised in the global dict ``desc``,
    which the caller must create beforehand and reset per model.
    """
    global desc
    leaf_count = len(model.labels_)
    if node not in desc:
        if node < leaf_count:
            return [node]
        pair = model.children_[node - leaf_count]
        first = find_descendents(model, pair[0])
        second = find_descendents(model, pair[1])
        desc[node] = first + second
    return desc[node]

def get_ranking(model, target):
    """Rank every leaf by how far up the tree it first joins ``target``.

    Walking the ancestors of ``target`` from the leaf to the root, the leaves
    that appear for the first time at the ``k``-th ancestor get rank ``k``.
    ``target`` itself keeps rank 0. Returns a float array with one entry per
    leaf.
    """
    rank = np.zeros(len(model.labels_))
    to_root = [find_descendents(model, cl) for cl in find_ancestors(model, target)]
    for level, (inner, outer) in enumerate(zip(to_root[:-1], to_root[1:]), start=1):
        newcomers = list(set(outer) - set(inner))
        rank[newcomers] = level
    return rank

In [10]:
def find_ancestors_v1(children,n, target):
    """Return the node ids on the path from ``target`` up to the root.

    Variant of ``find_ancestors`` that takes the merge table ``children`` and
    the number of leaves ``n`` directly. Row ``i`` of ``children`` is taken to
    create node ``n + i``.

    Results are memoised in the global dict ``ances``, which the caller must
    create beforehand and reset whenever a different tree is used.
    """
    global ances
    for ind, merge in enumerate(children):
        if target not in merge:
            continue
        parent = n + ind
        if parent not in ances:
            ances[parent] = find_ancestors_v1(children, n, parent)
        return [target] + ances[parent]
    # No merge contains target: report the node created by the last merge.
    return [ind+n]

def find_descendents_v1(children,n ,node):
    """Return the leaf ids found beneath ``node`` in the merge tree.

    Variant of ``find_descendents`` that takes the merge table ``children``
    and the number of leaves ``n`` directly. Ids below ``n`` are leaves and
    are returned as ``[node]``. Row ``node - n`` of ``children`` holds the two
    children of an internal node.

    Results for internal nodes are memoised in the global dict ``desc``,
    which the caller must create beforehand and reset per tree.
    """
    global desc
    if node in desc:
        return desc[node]
    if node < n:
        return [node]
    pair = children[node - n]
    # Both children are expanded downwards; the second child previously went
    # through find_ancestors_v1, which walks up the tree instead.
    desc[node] = find_descendents_v1(children, n, pair[0]) + find_descendents_v1(children, n, pair[1])
    return desc[node]

def get_ranking_v1(children,n, target):
    """Rank every leaf by how far up the tree it first joins ``target``.

    Variant of ``get_ranking`` working from the merge table ``children`` and
    the leaf count ``n``. Leaves first appearing at the ``k``-th ancestor of
    ``target`` get rank ``k``; ``target`` keeps rank 0.
    """
    rank = np.zeros(n)
    to_root = [find_descendents_v1(children,n, cl) for cl in find_ancestors_v1(children,n, target)]
    for level, (inner, outer) in enumerate(zip(to_root[:-1], to_root[1:]), start=1):
        newcomers = list(set(outer) - set(inner))
        rank[newcomers] = level
    return rank

## Plotting dendrogram

In [11]:
def plot_dendrogram(model, rescale = False, size = (10,10), **kwargs):
    """Draw the dendrogram of a fitted hierarchical clustering model.

    A linkage matrix ``[child_a, child_b, distance, n_leaves]`` is assembled
    from ``model.children_``, ``model.distances_`` and ``model.labels_`` and
    handed to ``dendrogram``.

    Parameters
    ----------
    model : object exposing ``children_``, ``distances_`` and ``labels_``
    rescale : bool
        If true, merge distances are min-max scaled to [0, 1]. When all
        distances are equal they are mapped to 0 instead of dividing by zero.
    size : tuple
        Figure size passed to ``plt.figure``.
    **kwargs
        Forwarded to ``dendrogram``.
    """
    # Number of leaves under each merge node; children with an id below
    # n_samples are leaves, larger ids refer to earlier rows of the table.
    n_samples = len(model.labels_)
    counts = np.zeros(model.children_.shape[0])
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    distances = model.distances_
    if rescale:
        d_min = np.min(distances)
        d_range = np.max(distances) - d_min
        if d_range > 0:
            distances = (distances - d_min) / d_range
        else:
            # Constant heights: avoid 0/0 producing NaNs in the linkage matrix.
            distances = np.zeros_like(distances, dtype=float)

    linkage = np.column_stack(
        [model.children_, distances, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    plt.figure(figsize = size)
    dendrogram(linkage, **kwargs)

In [12]:
def linkage_matrix(model, rescale = False):
    """Build a SciPy-style linkage matrix from a fitted clustering model.

    Parameters
    ----------
    model : object exposing ``children_``, ``distances_`` and ``labels_``
    rescale : bool
        If true, merge distances are min-max scaled to [0, 1]. When all
        distances are equal they are mapped to 0 instead of dividing by zero.

    Returns
    -------
    ndarray of float, shape (n_merges, 4)
        Columns are ``child_a, child_b, distance, n_leaves``.
    """
    # Number of leaves under each merge node; children with an id below
    # n_samples are leaves, larger ids refer to earlier rows of the table.
    n_samples = len(model.labels_)
    counts = np.zeros(model.children_.shape[0])
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    distances = model.distances_
    if rescale:
        d_min = np.min(distances)
        d_range = np.max(distances) - d_min
        if d_range > 0:
            distances = (distances - d_min) / d_range
        else:
            # Constant heights: avoid 0/0 producing NaNs in the linkage matrix.
            distances = np.zeros_like(distances, dtype=float)

    return np.column_stack(
        [model.children_, distances, counts]
    ).astype(float)

# input data

In [13]:
# Constituent table scraped from Wikipedia; the first table on the page is used.
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
comp_df = tables[0]
# rename symbol to escape symbol error: dots in tickers become dashes
comp_df["Symbol"] = [symbol.replace(".", "-") for symbol in comp_df["Symbol"]]

In [14]:
# Lookup tables built column-against-column from the constituent table.
industry_dict = dict(zip(comp_df['Symbol'], comp_df['GICS Sector']))          # ticker -> sector
sub_industry_dict = dict(zip(comp_df['Symbol'], comp_df['GICS Sub-Industry']))  # ticker -> sub-industry
security_dict = dict(zip(comp_df['Security'], comp_df['Symbol']))             # company name -> ticker

In [15]:
df = pd.read_csv('sp500_data.csv')
# Keep only columns whose name is a listed ticker, in the CSV's own column
# order. Selecting through a set intersection would order the columns by string
# hash, which changes between interpreter sessions and so reorders the companies.
sp500_symbols = set(comp_df['Symbol'])
df = df[[col for col in df.columns if col in sp500_symbols]]
# Drop every ticker that has at least one missing value.
df = df.dropna(axis = 1, how = 'any')

# One row per company, one column per row of the CSV.
Y = np.array(df).T
comps = df.columns

In [16]:
# Number of companies kept after filtering (one per column of df).
n = len(comps)

In [17]:
# true ranking of data 
hier_df = pd.DataFrame()
hier_df['comp'] = comps
hier_df['ind'] = [industry_dict[i] for i in comps]
hier_df['sub_ind'] = [sub_industry_dict[i] for i in comps]

id_2cats = {i:list(hier_df[['ind','sub_ind']].iloc[i]) for i in range(n)}
pairs = np.array(list(itertools.combinations(list(range(n)), 2)))
n_inter =  [len(list(set(id_2cats[pairs[i][0]]) & set(id_2cats[pairs[i][1]]))) for i in range(pairs.shape[0])]

upper = np.zeros((n, n))
upper[np.triu_indices(n, 1)] = np.max(n_inter) - n_inter
true_ranking = upper + upper.T

In [18]:
# Data matrix with one row per company (transpose of df).
# NOTE(review): repeats the assignment already made when df was loaded.
Y = np.array(df).T

In [19]:
# n = number of rows (companies), p = number of columns of Y; rebinds n.
(n,p) = Y.shape
print(n,p)

368 1259


In [20]:
# rmin = 1
# rmax = 50
# ws = wasserstein_dim_select(Y,rmin = rmin, rmax = rmax)
# dim = np.argmin(ws) + rmin
# print(f'Dimension selected: {dim}')

In [21]:
# Embedding dimension, fixed by hand (the automatic selection cell above is commented out).
dim = 10

In [22]:
# Rank-`dim` principal component scores of Y, scaled by 1/sqrt(p).
zeta = p**-.5 * pc_scores(Y, dim)
# Force a plain ndarray for the clustering calls below.
zeta = np.array(zeta)

# method comparison

In [23]:
# Empty frame; one column per method/input combination is added further down.
method_comparison_df = pd.DataFrame()

## dot product

### PCA 

In [24]:
%%time
# Average-linkage clustering of the PCA embedding, with ip_affinity supplying the
# pairwise dissimilarities. distance_threshold=0 with n_clusters=None is presumably
# there so the full merge tree is built.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric` — confirm the installed version.
ip_clust = AgglomerativeClustering(affinity = ip_affinity, linkage = 'average',distance_threshold=0, n_clusters=None)
ip_clust.fit(zeta);

CPU times: user 446 ms, sys: 500 ms, total: 946 ms
Wall time: 342 ms


AgglomerativeClustering(affinity=<function ip_affinity at 0x7fdab08ed430>,
                        distance_threshold=0, linkage='average',
                        n_clusters=None)

In [25]:
# plot_dendrogram(ward,labels = list(industry_list), size = (10,20), orientation='left',truncate_mode="level",color_threshold = 0)

In [26]:
%%time
ances = {}; desc = {}
ip_ranking = np.array([get_ranking(ip_clust,t) for t in range(n)])

ip_kt_z = [kendalltau(ip_ranking[i], true_ranking[i]).correlation for i in range(ip_ranking.shape[0])]
np.mean(ip_kt_z)

CPU times: user 608 ms, sys: 5.19 ms, total: 613 ms
Wall time: 601 ms


0.357447079797884

### Y

In [27]:
%%time
# Same inner-product clustering as for the PCA embedding, now on the raw data matrix Y.
# Rebinds ip_clust, replacing the model fitted on zeta.
ip_clust = AgglomerativeClustering(affinity = ip_affinity, linkage = 'average',distance_threshold=0, n_clusters=None)
ip_clust.fit(Y);

CPU times: user 377 ms, sys: 0 ns, total: 377 ms
Wall time: 376 ms


AgglomerativeClustering(affinity=<function ip_affinity at 0x7fdab08ed430>,
                        distance_threshold=0, linkage='average',
                        n_clusters=None)

In [28]:
%%time
ances = {}; desc = {}
ip_ranking = np.array([get_ranking(ip_clust,t) for t in range(n)])

ip_kt_y = [kendalltau(ip_ranking[i], true_ranking[i]).correlation for i in range(ip_ranking.shape[0])]
np.mean(ip_kt_y)

CPU times: user 548 ms, sys: 1.98 ms, total: 550 ms
Wall time: 540 ms


0.3378807319669298

## ward

### PCA

In [29]:
%%time
# Ward-linkage clustering of the PCA embedding; distance_threshold=0 with
# n_clusters=None is presumably there so the full merge tree is built.
ward = AgglomerativeClustering(linkage="ward", distance_threshold=0, n_clusters = None)
ward.fit(zeta)

CPU times: user 3.22 ms, sys: 0 ns, total: 3.22 ms
Wall time: 2.67 ms


AgglomerativeClustering(distance_threshold=0, n_clusters=None)

In [30]:
%%time
ances = {}; desc = {}
w_ranking = np.array([get_ranking(ward,t) for t in range(n)])

w_kt_z = [kendalltau(w_ranking[i], true_ranking[i]).correlation for i in range(w_ranking.shape[0])]
np.mean(w_kt_z)

CPU times: user 469 ms, sys: 1.71 ms, total: 471 ms
Wall time: 460 ms


0.391970663145865

### Y

In [31]:
%%time
# Ward-linkage clustering on the raw data matrix Y; rebinds `ward`,
# replacing the model fitted on zeta.
ward = AgglomerativeClustering(linkage="ward", distance_threshold=0, n_clusters = None)
ward.fit(Y)

CPU times: user 65.9 ms, sys: 750 µs, total: 66.7 ms
Wall time: 65.4 ms


AgglomerativeClustering(distance_threshold=0, n_clusters=None)

In [32]:
%%time
ances = {}; desc = {}
w_ranking = np.array([get_ranking(ward,t) for t in range(n)])

w_kt_y = [kendalltau(w_ranking[i], true_ranking[i]).correlation for i in range(w_ranking.shape[0])]
np.mean(w_kt_y)

CPU times: user 481 ms, sys: 0 ns, total: 481 ms
Wall time: 477 ms


0.347953606262907

## UPGMA

### PCA 

In [33]:
%%time
# Average-linkage clustering of the PCA embedding; no affinity is passed, so the
# estimator's default pairwise measure is used.
average = AgglomerativeClustering(linkage="average", distance_threshold=0, n_clusters = None)
average.fit(zeta)

CPU times: user 2.44 ms, sys: 755 µs, total: 3.2 ms
Wall time: 2.63 ms


AgglomerativeClustering(distance_threshold=0, linkage='average',
                        n_clusters=None)

In [34]:
%%time
ances = {}; desc  = {}
a_ranking = np.array([get_ranking(average,t) for t in range(n)])

a_kt_z = [kendalltau(a_ranking[i], true_ranking[i]).correlation for i in range(a_ranking.shape[0])]
np.mean(a_kt_z)

CPU times: user 485 ms, sys: 6.4 ms, total: 491 ms
Wall time: 480 ms


0.390738922457119

### Y 

In [35]:
%%time
# Average-linkage clustering on the raw data matrix Y; rebinds `average`,
# replacing the model fitted on zeta.
average = AgglomerativeClustering(linkage="average", distance_threshold=0, n_clusters = None)
average.fit(Y)

CPU times: user 52.7 ms, sys: 159 µs, total: 52.9 ms
Wall time: 51.9 ms


AgglomerativeClustering(distance_threshold=0, linkage='average',
                        n_clusters=None)

In [36]:
%%time
ances = {}; desc  = {}
a_ranking = np.array([get_ranking(average,t) for t in range(n)])

a_kt_y = [kendalltau(a_ranking[i], true_ranking[i]).correlation for i in range(a_ranking.shape[0])]
np.mean(a_kt_y)

CPU times: user 561 ms, sys: 6.08 ms, total: 568 ms
Wall time: 554 ms


0.3444095037157435

## cosine 

### PCA 

In [37]:
%%time
# Average-linkage clustering of the PCA embedding with the built-in 'cosine' affinity.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric` — confirm the installed version.
cosine = AgglomerativeClustering(affinity = 'cosine', linkage = 'average', distance_threshold=0, n_clusters = None)
cosine.fit(zeta)

CPU times: user 3.69 ms, sys: 0 ns, total: 3.69 ms
Wall time: 3.02 ms


AgglomerativeClustering(affinity='cosine', distance_threshold=0,
                        linkage='average', n_clusters=None)

In [38]:
%%time
ances = {}; desc  = {}
cs_ranking = np.array([get_ranking(cosine,t) for t in range(n)])

cs_kt_z = [kendalltau(cs_ranking[i], true_ranking[i]).correlation for i in range(cs_ranking.shape[0])]
np.mean(cs_kt_z)

CPU times: user 481 ms, sys: 788 µs, total: 482 ms
Wall time: 472 ms


0.41507580466588084

### Y

In [39]:
%%time
# Cosine-affinity average-linkage clustering on the raw data matrix Y; rebinds
# `cosine`, replacing the model fitted on zeta.
cosine = AgglomerativeClustering(affinity = 'cosine', linkage = 'average', distance_threshold=0, n_clusters = None)
cosine.fit(Y)

CPU times: user 61.1 ms, sys: 211 µs, total: 61.3 ms
Wall time: 60.3 ms


AgglomerativeClustering(affinity='cosine', distance_threshold=0,
                        linkage='average', n_clusters=None)

In [40]:
%%time
ances = {}; desc  = {}
cs_ranking = np.array([get_ranking(cosine,t) for t in range(n)])

cs_kt_y = [kendalltau(cs_ranking[i], true_ranking[i]).correlation for i in range(cs_ranking.shape[0])]
np.mean(cs_kt_y)

CPU times: user 548 ms, sys: 5.2 ms, total: 553 ms
Wall time: 540 ms


0.3378807319669298

## save

In [41]:
# One column per method/input pair; row 0 is the mean Kendall tau, row 1 its standard error.
kt_by_method = {
    'ip_z': ip_kt_z,
    'ip_y': ip_kt_y,
    'w_z': w_kt_z,
    'w_y': w_kt_y,
    'a_z': a_kt_z,
    'a_y': a_kt_y,
    'cs_z': cs_kt_z,
    'cs_y': cs_kt_y,
}
for column, taus in kt_by_method.items():
    method_comparison_df[column] = [np.mean(taus), sem(taus)]

# method_comparison_df.to_csv('method_comparison_df.csv', index=False)

## hdbscan

### PCA 

In [42]:
# HDBSCAN on the PCA embedding with the smallest allowed cluster size set to 2.
hdbscan_ = hdbscan.HDBSCAN(min_cluster_size=2)
hdbscan_.fit(zeta)

# Bare attribute access; the value is discarded.
hdbscan_.single_linkage_tree_;
# Attach a `children_` array so the ranking helpers, which read model.children_
# and model.labels_, can walk this tree.
# NOTE(review): assumes row i of the table describes node n + i — confirm.
hdbscan_.children_ = np.array(hdbscan_.single_linkage_tree_.to_pandas()[['left_child','right_child']], dtype=int)

In [43]:
%%time
ances = {}; desc = {}
hdbscan_ranking = np.array([get_ranking(hdbscan_,t) for t in range(n)]) 

hdbscan_kt_z = [kendalltau(hdbscan_ranking[i], true_ranking[i]).correlation for i in range(hdbscan_ranking.shape[0])]
np.mean(hdbscan_kt_z)

CPU times: user 734 ms, sys: 0 ns, total: 734 ms
Wall time: 731 ms


0.3317329944737204

### Y

In [44]:
# HDBSCAN on the raw data matrix Y; rebinds hdbscan_, replacing the model fitted on zeta.
hdbscan_ = hdbscan.HDBSCAN(min_cluster_size=2)
hdbscan_.fit(Y)

# Bare attribute access; the value is discarded.
hdbscan_.single_linkage_tree_;
# Attach a `children_` array so the ranking helpers, which read model.children_
# and model.labels_, can walk this tree.
# NOTE(review): assumes row i of the table describes node n + i — confirm.
hdbscan_.children_ = np.array(hdbscan_.single_linkage_tree_.to_pandas()[['left_child','right_child']], dtype=int)

In [45]:
%%time
ances = {}; desc = {}
hdbscan_ranking = np.array([get_ranking(hdbscan_,t) for t in range(n)]) # range(n)

hdbscan_kt_y = [kendalltau(hdbscan_ranking[i], true_ranking[i]).correlation for i in range(hdbscan_ranking.shape[0])]
np.mean(hdbscan_kt_y)

CPU times: user 1.15 s, sys: 7.66 ms, total: 1.16 s
Wall time: 1.16 s


0.1381112863212358

In [46]:
# Add the HDBSCAN columns: row 0 is the mean Kendall tau, row 1 its standard error.
method_comparison_df['hdbscan_z'] = [np.mean(hdbscan_kt_z),sem(hdbscan_kt_z)]
method_comparison_df['hdbscan_y'] = [np.mean(hdbscan_kt_y),sem(hdbscan_kt_y)]

## save 

In [47]:
# method_comparison_df.to_csv('method_comparison_df.csv', index=False)

In [48]:
# Display the summary table (suffix _z: PCA embedding, suffix _y: raw data matrix).
method_comparison_df

Unnamed: 0,ip_z,ip_y,w_z,w_y,a_z,a_y,cs_z,cs_y,hdbscan_z,hdbscan_y
0,0.357447,0.337881,0.391971,0.347954,0.390739,0.34441,0.415076,0.337881,0.331733,0.138111
1,0.009416,0.010131,0.010665,0.011098,0.010717,0.01004,0.01084,0.010131,0.012766,0.009307
