In [1]:
import numpy as np
import pandas as pd

from scipy import linalg
from scipy import sparse
from scipy.sparse.linalg import svds
import ot


from numpy.linalg import matrix_rank
import itertools
import copy
import time as time
from sklearn.cluster import AgglomerativeClustering
from matplotlib.pyplot import figure
from scipy.cluster.hierarchy import dendrogram, fcluster, cophenet
from scipy.spatial. distance import pdist
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import hdbscan
from scipy.linalg import orthogonal_procrustes

from tqdm import tqdm
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.cluster import adjusted_rand_score, fowlkes_mallows_score
from scipy.stats import rankdata,kendalltau,sem

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from sklearn.manifold import TSNE
from random import choices

from matplotlib.lines import Line2D

from textblob import Word
from nltk.corpus import stopwords
import nltk
import re
from pprint import pprint
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ag16115/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import random
# Fix the seeds up front so that randomised steps repeat across kernel restarts.
random.seed(1)
# NumPy keeps its own global generator, separate from the stdlib one seeded above,
# so it has to be seeded explicitly as well.
np.random.seed(1)

# Functions 

## General

In [3]:
def signif(x, p):
    """Round every entry of ``x`` to ``p`` significant figures.

    ``x`` is converted with ``np.asarray``; the result has the same shape.
    Zero and non-finite entries are given a placeholder magnitude of
    ``10**(p-1)`` so that the logarithm below is always defined.
    """
    values = np.asarray(x)
    usable = np.isfinite(values) & (values != 0)
    magnitudes = np.where(usable, np.abs(values), 10**(p-1))
    # Power of ten that moves the p-th significant digit to the units place.
    scale = 10 ** (p - 1 - np.floor(np.log10(magnitudes)))
    return np.round(values * scale) / scale

def flatten_list(l):
    """Concatenate the items of every iterable in ``l`` into one flat list.

    Only one level of nesting is removed.
    """
    merged = []
    for chunk in l:
        merged.extend(chunk)
    return merged

def check_symmetric(a, rtol=1e-05, atol=1e-08):
    """Return True when ``a`` equals its transpose within the given tolerances.

    ``rtol`` and ``atol`` are forwarded unchanged to ``np.allclose``.
    """
    mirrored = a.T
    matches = np.allclose(a, mirrored, rtol=rtol, atol=atol)
    return matches

def is_pos_def(x):
    """Return True when every eigenvalue of ``x`` is strictly positive.

    Only the eigenvalues (``np.linalg.eigvals``) are inspected; symmetry of
    ``x`` is not checked here.
    """
    spectrum = np.linalg.eigvals(x)
    return np.all(spectrum > 0)

def reverse_dict(dic):
    """Invert a ``key -> value(s)`` mapping into ``value -> key``.

    Each value of ``dic`` is either a list of items or a single non-list item
    (treated as a one-element list). Every item becomes a key of the result,
    mapped to the key it was found under. If an item occurs under several
    keys, the key that comes last in ``dic``'s iteration order wins.

    The input dictionary is left untouched (scalar values are no longer
    rewritten into lists on the caller's object).
    """
    inverse = {}
    for key, values in dic.items():
        # Wrap locally instead of writing the wrapped list back into ``dic``.
        if not isinstance(values, list):
            values = [values]
        for value in values:
            inverse[value] = key
    return inverse

def pc_scores(X, r):
    """Project the rows of ``X`` onto its ``r`` leading right singular vectors.

    ``svds`` computes ``r`` singular triplets; the right singular vectors are
    reordered by decreasing singular value and ``X`` is multiplied by them,
    giving one column of scores per retained component.
    """
    _, sing_vals, right_vecs = svds(X, k=r)
    # svds does not promise descending order, so sort explicitly.
    order = sing_vals.argsort()[::-1]
    loadings = right_vecs[order, :]
    return X @ loadings.T

def embed_cov(X, r):
    """Return the left singular vectors of ``X`` scaled by sqrt(singular value).

    A full ``np.linalg.svd`` is used when ``r`` equals the number of rows of
    ``X``; otherwise ``svds`` computes ``r`` singular triplets. Columns are
    ordered by decreasing singular value.
    """
    if r == X.shape[0]:
        left_vecs, sing_vals, _ = np.linalg.svd(X, full_matrices=True)
    else:
        left_vecs, sing_vals, _ = svds(X, k=r)
    order = sing_vals.argsort()[::-1]
    ## square root because the singular values are being used as eigenvalues
    ## (presumably X is a symmetric PSD, covariance-like matrix -- confirm)
    scaling = np.diag(np.sqrt(sing_vals[order]))
    return left_vecs[:, order] @ scaling

def find_rows_to_merge(F):
    """Return ``[i, j]``, the position of the largest off-diagonal entry of ``F``.

    The search runs on a copy whose diagonal is set to ``-inf``, so ``F``
    itself is not modified.
    """
    scores = copy.deepcopy(F)
    np.fill_diagonal(scores, -np.inf)
    flat_best = scores.argmax()
    row, col = np.unravel_index(flat_best, scores.shape)
    return [row, col]

def inner_products(X):
    """Return the matrix of inner products between the rows of ``X`` (``X X^T``)."""
    transposed = X.T
    return X.dot(transposed)

## Dimension selection

In [4]:
def wasserstein_dim_select(Y, split = 0.5, rmin = 1, rmax = 50):
    """Score candidate embedding dimensions with a train/test transport cost.

    The first ``round(n * split)`` rows of ``Y`` form the training part and
    the remaining rows the test part (rows are taken in order, no shuffling).
    For each candidate rank ``r`` the training rows are projected onto the
    span of the leading ``r`` right singular vectors of the training part,
    and ``ot.emd2`` is evaluated between the projected training rows and the
    test rows, with uniform weights on both sides and a Euclidean cost matrix.

    Parameters
    ----------
    Y : array-like or scipy sparse matrix
        One observation per row; sparse input is densified.
    split : float
        Fraction of rows used as the training part.
    rmin, rmax : int
        Smallest candidate rank, and a cap on the largest candidate rank.

    Returns
    -------
    list
        One ``ot.emd2`` value per candidate rank from ``rmin`` to
        ``min(train, rmax)``; entry ``k`` belongs to rank ``rmin + k``.
    """
    n = Y.shape[0]
    train = round(n * split)
    # Largest rank tried: never more than the number of training rows.
    rtry = int(np.min((train, rmax)))
    if sparse.issparse(Y):
        # NOTE(review): todense() returns an np.matrix rather than an ndarray;
        # confirm the ot routines below accept it.
        Y = Y.todense()
    Ytrain = Y[:train,:]
    Ytest = Y[train:n,:]
    # NOTE(review): only rtry-1 singular triplets are requested, yet the loop
    # below runs up to r = rtry. Slicing Vh[:r] silently stops at rtry-1 rows,
    # so r = rtry reuses the same projection as r = rtry-1 -- confirm intent.
    U, s, Vh = sparse.linalg.svds(Ytrain,k=rtry-1)
    # Reorder so the largest singular value comes first.
    idx = s.argsort()[::-1] 
    s = s[idx]
    Vh = Vh[idx,:]
    ws = []
    for r in tqdm(range(rmin,rtry+1)):
        # Orthogonal projector onto the span of the first r right singular vectors.
        P = Vh.T[:,:r] @ Vh[:r,:]
        # P is symmetric, so P.T is the same matrix as P.
        Yproj = Ytrain @ P.T
        n1 = Yproj.shape[0]
        n2 = Ytest.shape[0]
        # Pairwise Euclidean costs between projected training rows and test rows.
        M = ot.dist(Yproj,Ytest, metric='euclidean')
        # Transport cost between the two uniformly weighted point clouds.
        W1 = ot.emd2(np.repeat(1/n1,n1),np.repeat(1/n2,n2),M)
        ws.append(W1)
    return ws

## Document data: text functions

In [5]:
def del_email_address(text):
    """Remove every whitespace-delimited token containing ``@`` from ``text``.

    A match is any run of non-space characters around an ``@`` plus at most
    one whitespace character that follows it; each match is replaced by the
    empty string.
    """
    # Raw string: \S and \s are regex classes, not Python string escapes.
    # In a plain string literal they are invalid escape sequences.
    pattern = re.compile(r'\S*@\S*\s?')
    return pattern.sub('', text)

def clean_text(text):
    """Normalise a document into a space-joined string of lemmatised tokens.

    Runs of characters other than letters and digits become single spaces,
    the text is lower-cased and split on whitespace, tokens found in the
    module-level ``stopword`` collection are dropped, and each remaining
    token is passed through ``Word(...).lemmatize()``.
    """
    normalised = re.sub("[^A-Za-z0-9]+", " ", text).lower()
    kept = [Word(token).lemmatize() for token in normalised.split() if token not in stopword]
    return " ".join(kept)

## Inner-product hierarchical clustering (IP HC)

In [6]:
def ip_metric(X,Y):
    """Inner product of ``X`` and ``Y``: the sum of their elementwise products."""
    elementwise = X * Y
    return np.sum(elementwise)

def ip_affinity(X):
    """Turn pairwise inner products of the rows of ``X`` into dissimilarities.

    ``pairwise_distances`` is called with ``ip_metric`` as the metric, and
    each value is subtracted from the largest one, so the pair with the
    largest inner product gets dissimilarity 0.
    """
    gram = pairwise_distances(X, metric = ip_metric)
    largest = np.max(gram)
    return largest - gram

In [7]:
def clusters_to_labels(clusters):
    """Convert a list of clusters into a flat list of cluster indices.

    ``clusters[k]`` lists the items that belong to cluster ``k``. The result
    holds, for item 0, 1, 2, ... in turn, the index of every cluster that
    contains it (normally exactly one per item).

    The output is walked over ``range(number of distinct items)``, so items
    are expected to be the integers ``0 .. m-1``; other ids contribute
    nothing to the result.
    """
    # Local import: ``defaultdict`` is not imported at the top of the file,
    # so the function previously raised NameError when called.
    from collections import defaultdict

    memberships = defaultdict(list)
    for index, sublist in enumerate(clusters):
        for item in sublist:
            memberships[item].append(index)
    # Fix the item count before the loop: looking up a missing id would
    # otherwise add a key and change len(memberships) while iterating.
    n_items = len(memberships)
    labels = []
    for item in range(n_items):
        labels.extend(memberships[item])
    return labels

## Ranking

In [8]:
def find_ancestors(model, target):
    """Return the chain of nodes from ``target`` up the merge tree.

    ``model.children_[k]`` is read as the pair merged into node
    ``len(model.labels_) + k``. The result starts with ``target`` and lists
    each successive parent. Chains of internal nodes are memoised in the
    module-level dict ``ances``, which must exist before the call and must be
    reset whenever a different tree is used.

    When no merge contains ``target`` the function returns a one-element list
    holding the id of the last merge in ``children_`` (this relies on the
    loop variable surviving the loop, so ``children_`` must be non-empty).
    """
    global ances
    n_samples = len(model.labels_)
    for ind, merge in enumerate(model.children_):
        if target not in merge:
            continue
        parent = n_samples + ind
        if parent not in ances:
            ances[parent] = find_ancestors(model, parent)
        return [target] + ances[parent]
    return [ind+n_samples]

def find_descendents(model,node):
    """Return the leaf ids underneath ``node`` in the merge tree.

    Ids below ``len(model.labels_)`` are leaves and are returned as
    ``[node]``. For an internal node the leaves of its two children (taken
    from ``model.children_``) are concatenated, left child first. Results for
    internal nodes are memoised in the module-level dict ``desc``, which must
    exist before the call and be reset when the tree changes.
    """
    global desc
    n_samples = len(model.labels_)
    if node in desc:
        return desc[node]
    if node < n_samples:
        return [node]
    merged = model.children_[node-n_samples]
    left_leaves = find_descendents(model, merged[0])
    right_leaves = find_descendents(model, merged[1])
    desc[node] = left_leaves + right_leaves
    return desc[node]

def get_ranking(model, target):
    """Rank every sample by how far up the tree it first joins ``target``.

    The ancestors of ``target`` are visited from ``target`` upwards. Samples
    that first appear among the leaves of the i-th node on that path (and not
    in the node before it) get rank ``i``. Entries never assigned, including
    ``target`` itself, stay at 0.

    Returns a float array of length ``len(model.labels_)``.
    """
    rank = np.zeros(len(model.labels_))
    lineage = find_ancestors(model, target)
    to_root = [find_descendents(model, cl) for cl in lineage]
    for level in range(1, len(to_root)):
        # Leaves gained when moving from the previous ancestor to this one.
        newcomers = list(set(to_root[level]) - set(to_root[level-1]))
        rank[newcomers] = level
    return rank

In [9]:
def find_ancestors_v1(children,n, target):
    """Return the chain of nodes from ``target`` up the merge tree.

    Same walk as ``find_ancestors`` but on a raw merge table: ``children[k]``
    is read as the pair merged into node ``n + k``, where ``n`` is the number
    of samples. Chains of internal nodes are memoised in the module-level
    dict ``ances``, which must exist before the call.

    When no merge contains ``target`` a one-element list with the id of the
    last merge is returned (``children`` must be non-empty for that).
    """
    global ances
    n_samples = n
    for ind, merge in enumerate(children):
        if target not in merge:
            continue
        parent = n_samples + ind
        if parent not in ances:
            ances[parent] = find_ancestors_v1(children, n, parent)
        return [target] + ances[parent]
    return [ind+n_samples]

def find_descendents_v1(children,n ,node):
    """Return the leaf ids underneath ``node`` in a raw merge table.

    ``children[k]`` is read as the pair merged into node ``n + k``, where
    ``n`` is the number of samples. Ids below ``n`` are leaves and are
    returned as ``[node]``. For an internal node the leaves of both children
    are concatenated, left child first. Results for internal nodes are
    memoised in the module-level dict ``desc``, which must exist before the
    call and be reset when the tree changes.
    """
    global desc
    n_samples = n
    if node in desc:
        return desc[node]
    if node < n_samples:
        return [node]
    pair = children[node-n_samples]
    # Both children are expanded downwards. The right child used to be passed
    # to find_ancestors_v1, which mixed ancestor ids into the leaf list.
    desc[node] = find_descendents_v1(children,n,pair[0])+find_descendents_v1(children,n,pair[1])
    return desc[node]

def get_ranking_v1(children,n, target):
    """Rank every sample by how far up the tree it first joins ``target``.

    Works on a raw merge table (``children``) with ``n`` samples. Samples
    that first appear among the leaves of the i-th node on the path from
    ``target`` upwards get rank ``i``; entries never assigned, including
    ``target`` itself, stay at 0.

    Returns a float array of length ``n``.
    """
    rank = np.zeros(n)
    lineage = find_ancestors_v1(children,n, target)
    to_root = [find_descendents_v1(children,n, cl) for cl in lineage]
    for level in range(1, len(to_root)):
        # Leaves gained when moving from the previous ancestor to this one.
        newcomers = list(set(to_root[level]) - set(to_root[level-1]))
        rank[newcomers] = level
    return rank

# Input data

In [14]:
# # df = pd.read_csv('train_40k.csv')
# # df = df[df['Cat3'] != 'unknown']

# min_length = 2
# lengths = [len(df['Text'].iloc[i].split()) for i in range(len(df))]
# length_ids = np.where(np.array(lengths) > min_length)[0]
# df = df.iloc[length_ids].reset_index()

# # ## use random sample of data 
# n = 5000
# df = df.sample(n=n, replace=False).reset_index(drop=True)
# n = len(df)

# ## list of common words to delete
# stopword = set(nltk.corpus.stopwords.words("english"))
# # gets rid of stopwords, symbols, makes lower case and base words
# df["Text"] = df.Text.apply(lambda row: clean_text(row))

# # min_length = 2
# # lengths = [len(df['Text'].iloc[i].split()) for i in range(n)]
# # length_ids = np.where(np.array(lengths) > min_length)[0]
# # df = df.iloc[length_ids].reset_index()

In [15]:
## load sample of review data
# Path is relative to the notebook's working directory.
# NOTE(review): later cells read the columns Text, Cat1, Cat2 and Cat3, so the CSV must provide them.
df = pd.read_csv('sample_df.csv')
# Row count; used below for the vectorizer's max_df and for the pairwise loops.
n = len(df)

In [16]:
## list of common words to delete
# clean_text() looks up this module-level name, so it has to be defined before the apply below.
stopword = set(nltk.corpus.stopwords.words("english"))
# Overwrites the Text column with its cleaned version (the raw text is not kept).
df["Text"] = df.Text.apply(lambda row: clean_text(row))

## vectorize data
# Integer thresholds are document counts: a term is kept only if it occurs in
# at least 3 documents and in at most n-500 documents.
vectorizer = TfidfVectorizer(min_df = 3, max_df = n-500)
Y = vectorizer.fit_transform(df.Text)

In [17]:
# Rebinds n to the number of rows of the TF-IDF matrix and sets p to its number of columns.
(n,p) = Y.shape
print(n,p)

5000 5588


In [18]:
# create true ranking
# Category labels (Cat1, Cat2, Cat3) of every document, keyed by row position.
id_2cats = {i:list(df[['Cat1','Cat2','Cat3']].iloc[i]) for i in range(n)}
# All unordered document pairs (i < j), row-major, i.e. the same order as np.triu_indices(n, 1).
pairs = np.array(list(itertools.combinations(list(range(n)), 2)))
# Build each document's label set once, rather than twice for every one of the
# n(n-1)/2 pairs. The sets are made from the very same objects as id_2cats.
cat_sets = {i: set(cats) for i, cats in id_2cats.items()}
# Number of labels shared by each pair, in the same order as `pairs`.
n_inter = [len(cat_sets[a] & cat_sets[b]) for a, b in itertools.combinations(range(n), 2)]

# Turn "shared labels" into a dissimilarity (0 = as many shared labels as the
# best pair) and mirror the upper triangle into a symmetric n x n matrix with
# a zero diagonal.
upper = np.zeros((n, n))
upper[np.triu_indices(n, 1)] = np.max(n_inter) - n_inter
true_ranking = upper + upper.T

In [19]:
## dimension selection
# rmin = 1
# rmax = 50
# ws = wasserstein_dim_select(Y, rmin = rmin, rmax = rmax)
# dim = np.argmin(ws) + rmin
# print(f'Dimension selected: {dim}')

In [20]:
# Embedding dimension, fixed by hand.
# NOTE(review): presumably the value the commented-out wasserstein_dim_select cell produced -- confirm.
dim = 22 

In [21]:
# Scores of Y on its `dim` leading right singular vectors, scaled by 1/sqrt(p).
zeta = p**-.5 * pc_scores(Y, dim)
# Coerce to a plain ndarray.
# NOTE(review): Y is still the vectorizer's output at this point, so the product
# inside pc_scores may not be an ndarray already -- confirm.
zeta = np.array(zeta)

# Method comparison

In [22]:
# Starts empty; later cells add one column per method holding [mean, standard error] of its Kendall tau scores.
method_comparison_df = pd.DataFrame()

In [23]:
# Rebinds Y to a dense ndarray. Not re-runnable on its own: once Y is an
# ndarray it has no .todense() method, so re-run the vectorizer cell first.
Y = np.asarray(Y.todense())

## dot product

### PCA 

In [24]:
%%time
# Average-linkage agglomerative clustering of zeta using the custom ip_affinity dissimilarity.
# distance_threshold=0 together with n_clusters=None makes scikit-learn build the full merge tree.
# NOTE(review): newer scikit-learn releases renamed `affinity` to `metric` -- confirm against the installed version.
ip_clust = AgglomerativeClustering(affinity = ip_affinity, linkage = 'average',distance_threshold=0, n_clusters=None)
ip_clust.fit(zeta);

CPU times: user 1min 17s, sys: 298 ms, total: 1min 18s
Wall time: 1min 18s


AgglomerativeClustering(affinity=<function ip_affinity at 0x7f2f34407e50>,
                        distance_threshold=0, linkage='average',
                        n_clusters=None)

In [25]:
%%time
# Reset the module-level caches used by find_ancestors / find_descendents:
# they are keyed by node id and so are only valid for one fitted tree.
ances = {}; desc = {}
# Row t holds the tree-based rank of every sample relative to target t.
ip_ranking = np.array([get_ranking(ip_clust,t) for t in range(n)])

# Kendall tau between each row of the tree ranking and the same row of true_ranking.
ip_kt_z = [kendalltau(ip_ranking[i], true_ranking[i]).correlation for i in range(ip_ranking.shape[0])]
np.mean(ip_kt_z)

CPU times: user 8min 32s, sys: 291 ms, total: 8min 32s
Wall time: 8min 34s


0.14268642895718653

### Y

In [26]:
%%time
# Same inner-product clustering as for zeta, now on the dense TF-IDF matrix Y.
# Rebinds ip_clust, so the tree fitted on zeta is no longer reachable under this name.
# NOTE(review): newer scikit-learn releases renamed `affinity` to `metric` -- confirm against the installed version.
ip_clust = AgglomerativeClustering(affinity = ip_affinity, linkage = 'average',distance_threshold=0, n_clusters=None)
ip_clust.fit(Y);

CPU times: user 3min 38s, sys: 444 ms, total: 3min 39s
Wall time: 3min 39s


AgglomerativeClustering(affinity=<function ip_affinity at 0x7f2f34407e50>,
                        distance_threshold=0, linkage='average',
                        n_clusters=None)

In [27]:
%%time
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# ip_kt_y was not produced in that session although the save cell reads it.
# Caches must be reset because ip_clust now refers to the tree fitted on Y.
ances = {}; desc = {}
ip_ranking = np.array([get_ranking(ip_clust,t) for t in range(n)])

# Kendall tau between each row of the tree ranking and the same row of true_ranking.
ip_kt_y = [kendalltau(ip_ranking[i], true_ranking[i]).correlation for i in range(ip_ranking.shape[0])]
np.mean(ip_kt_y)


KeyboardInterrupt



## ward

### PCA

In [28]:
%%time
# Ward-linkage agglomerative clustering of zeta.
# distance_threshold=0 with n_clusters=None makes scikit-learn build the full merge tree.
ward = AgglomerativeClustering(linkage="ward", distance_threshold=0, n_clusters = None)
ward.fit(zeta)

CPU times: user 1 s, sys: 52 ms, total: 1.06 s
Wall time: 1.05 s


AgglomerativeClustering(distance_threshold=0, n_clusters=None)

In [None]:
%%time
# Fresh caches for the Ward tree (find_ancestors / find_descendents memoise by node id).
ances = {}; desc = {}
# Row t holds the tree-based rank of every sample relative to target t.
w_ranking = np.array([get_ranking(ward,t) for t in range(n)])

# Kendall tau between each row of the tree ranking and the same row of true_ranking.
w_kt_z = [kendalltau(w_ranking[i], true_ranking[i]).correlation for i in range(w_ranking.shape[0])]
np.mean(w_kt_z)

### Y

In [None]:
%%time
# Ward-linkage clustering on the dense TF-IDF matrix Y; rebinds `ward`.
ward = AgglomerativeClustering(linkage="ward", distance_threshold=0, n_clusters = None)
ward.fit(Y)

In [None]:
%%time
# Caches must be reset because `ward` now refers to the tree fitted on Y.
ances = {}; desc = {}
w_ranking = np.array([get_ranking(ward,t) for t in range(n)])

# Kendall tau between each row of the tree ranking and the same row of true_ranking.
w_kt_y = [kendalltau(w_ranking[i], true_ranking[i]).correlation for i in range(w_ranking.shape[0])]
np.mean(w_kt_y)

## UPGMA

### PCA 

In [None]:
%%time
# Average-linkage clustering of zeta; no affinity is passed, so the estimator's default distance is used.
# distance_threshold=0 with n_clusters=None makes scikit-learn build the full merge tree.
average = AgglomerativeClustering(linkage="average", distance_threshold=0, n_clusters = None)
average.fit(zeta)

In [None]:
%%time
# Fresh caches for the average-linkage tree (memoised by node id).
ances = {}; desc  = {}
# Row t holds the tree-based rank of every sample relative to target t.
a_ranking = np.array([get_ranking(average,t) for t in range(n)])

# Kendall tau between each row of the tree ranking and the same row of true_ranking.
a_kt_z = [kendalltau(a_ranking[i], true_ranking[i]).correlation for i in range(a_ranking.shape[0])]
np.mean(a_kt_z)

### Y 

In [None]:
%%time
# Average-linkage clustering on the dense TF-IDF matrix Y; rebinds `average`.
average = AgglomerativeClustering(linkage="average", distance_threshold=0, n_clusters = None)
average.fit(Y)

In [None]:
%%time
# Caches must be reset because `average` now refers to the tree fitted on Y.
ances = {}; desc  = {}
a_ranking = np.array([get_ranking(average,t) for t in range(n)])

# Kendall tau between each row of the tree ranking and the same row of true_ranking.
a_kt_y = [kendalltau(a_ranking[i], true_ranking[i]).correlation for i in range(a_ranking.shape[0])]
np.mean(a_kt_y)

## cosine 

### PCA 

In [None]:
%%time
# Average-linkage clustering of zeta under cosine distance.
# NOTE(review): newer scikit-learn releases renamed `affinity` to `metric` -- confirm against the installed version.
cosine = AgglomerativeClustering(affinity = 'cosine', linkage = 'average', distance_threshold=0, n_clusters = None)
cosine.fit(zeta)

In [None]:
%%time
# Fresh caches for the cosine tree (memoised by node id).
ances = {}; desc  = {}
# Row t holds the tree-based rank of every sample relative to target t.
cs_ranking = np.array([get_ranking(cosine,t) for t in range(n)])

# Kendall tau between each row of the tree ranking and the same row of true_ranking.
cs_kt_z = [kendalltau(cs_ranking[i], true_ranking[i]).correlation for i in range(cs_ranking.shape[0])]
np.mean(cs_kt_z)

### Y

In [None]:
%%time
# Cosine / average-linkage clustering on the dense TF-IDF matrix Y; rebinds `cosine`.
# NOTE(review): newer scikit-learn releases renamed `affinity` to `metric` -- confirm against the installed version.
cosine = AgglomerativeClustering(affinity = 'cosine', linkage = 'average', distance_threshold=0, n_clusters = None)
cosine.fit(Y)

In [None]:
%%time
# Caches must be reset because `cosine` now refers to the tree fitted on Y.
ances = {}; desc  = {}
cs_ranking = np.array([get_ranking(cosine,t) for t in range(n)])

# Kendall tau between each row of the tree ranking and the same row of true_ranking.
cs_kt_y = [kendalltau(cs_ranking[i], true_ranking[i]).correlation for i in range(cs_ranking.shape[0])]
np.mean(cs_kt_y)

## save

In [None]:

# One column per method/input pair; row 0 is the mean Kendall tau over targets,
# row 1 its standard error (scipy.stats.sem).
# Suffix _z = fitted on the PCA scores zeta, _y = fitted on the dense TF-IDF matrix Y.
method_comparison_df['ip_z'] = [np.mean(ip_kt_z),sem(ip_kt_z)]
# NOTE(review): in the recorded run the cell that computes ip_kt_y was interrupted,
# so this line needs that cell to finish first.
method_comparison_df['ip_y'] = [np.mean(ip_kt_y),sem(ip_kt_y)]

method_comparison_df['w_z'] = [np.mean(w_kt_z),sem(w_kt_z)]
method_comparison_df['w_y'] = [np.mean(w_kt_y),sem(w_kt_y)]

method_comparison_df['a_z'] = [np.mean(a_kt_z),sem(a_kt_z)]
method_comparison_df['a_y'] = [np.mean(a_kt_y),sem(a_kt_y)]

method_comparison_df['cs_z'] = [np.mean(cs_kt_z),sem(cs_kt_z)]
method_comparison_df['cs_y'] = [np.mean(cs_kt_y),sem(cs_kt_y)]

# method_comparison_df.to_csv('method_comparison_df.csv', index=False)

## hdbscan

### PCA 

In [None]:
# if need to increase recursion limit
# find_ancestors / find_descendents recurse once per tree level, so very
# unbalanced trees can exceed Python's default recursion limit.

import sys
# Only ever raise the limit: an unconditional setrecursionlimit(10000) would
# lower it if something had already set it higher.
sys.setrecursionlimit(max(sys.getrecursionlimit(), 10000))

In [None]:
# HDBSCAN on the PCA scores zeta.
hdbscan_ = hdbscan.HDBSCAN(min_cluster_size=2)
hdbscan_.fit(zeta)

# Bare attribute access: the value is discarded and the trailing ';' suppresses any display.
hdbscan_.single_linkage_tree_;
# Attach a scikit-learn style children_ array (left/right child id of every merge) so that
# get_ranking, which reads model.labels_ and model.children_, can walk this tree as well.
hdbscan_.children_ = np.array(hdbscan_.single_linkage_tree_.to_pandas()[['left_child','right_child']], dtype=int)

In [None]:
%%time
# Fresh caches for the HDBSCAN single-linkage tree (memoised by node id).
ances = {}; desc = {}
# Row t holds the tree-based rank of every sample relative to target t.
hdbscan_ranking = np.array([get_ranking(hdbscan_,t) for t in range(n)]) 

# Kendall tau between each row of the tree ranking and the same row of true_ranking.
hdbscan_kt_z = [kendalltau(hdbscan_ranking[i], true_ranking[i]).correlation for i in range(hdbscan_ranking.shape[0])]
np.mean(hdbscan_kt_z)

### Y

In [None]:
# HDBSCAN on the dense TF-IDF matrix Y; rebinds hdbscan_.
hdbscan_ = hdbscan.HDBSCAN(min_cluster_size=2)
hdbscan_.fit(Y)

# Bare attribute access: the value is discarded and the trailing ';' suppresses any display.
hdbscan_.single_linkage_tree_;
# Attach a scikit-learn style children_ array so get_ranking can walk this tree.
hdbscan_.children_ = np.array(hdbscan_.single_linkage_tree_.to_pandas()[['left_child','right_child']], dtype=int)

In [None]:
%%time
# Caches must be reset because hdbscan_ now refers to the tree fitted on Y.
ances = {}; desc = {}
hdbscan_ranking = np.array([get_ranking(hdbscan_,t) for t in range(n)]) # range(n)

# Kendall tau between each row of the tree ranking and the same row of true_ranking.
hdbscan_kt_y = [kendalltau(hdbscan_ranking[i], true_ranking[i]).correlation for i in range(hdbscan_ranking.shape[0])]
np.mean(hdbscan_kt_y)

In [None]:
# Same layout as the other columns: row 0 = mean Kendall tau, row 1 = its standard error.
method_comparison_df['hdbscan_z'] = [np.mean(hdbscan_kt_z),sem(hdbscan_kt_z)]
method_comparison_df['hdbscan_y'] = [np.mean(hdbscan_kt_y),sem(hdbscan_kt_y)]

In [None]:
# method_comparison_df.to_csv('method_comparison_df.csv', index=False)

## Appendix method comparison

In [None]:
# Appendix grid: every (linkage, metric) pair from the two lists, four combinations in total.
linkage = ['complete','single']
metric = ['euclidean', 'cosine']
# Each entry of combs is a (linkage, metric) tuple, in that order.
combs = list(itertools.product(linkage, metric))

In [None]:
# Empty results table; the loop cell below appends one row per (linkage, metric) combination.
method_comparison_df_v2 = pd.DataFrame(columns = ['linkage','metric',
                                                  'zeta_mean','zeta_se','Y_mean','Y_se'])

In [None]:
# For every (linkage, metric) combination: fit the full tree on zeta and on Y,
# rank all samples against every target, and record the mean and standard error
# of the per-target Kendall tau against true_ranking.
# NOTE(review): newer scikit-learn releases renamed `affinity` to `metric` -- confirm against the installed version.
for i in range(len(combs)):
    print(combs[i])
    
    # combs[i] is (linkage, metric): index 1 is the metric, index 0 the linkage.
    on_zeta = AgglomerativeClustering(affinity = combs[i][1], linkage = combs[i][0],distance_threshold=0, n_clusters=None)
    on_zeta.fit(zeta);
    # Caches are per tree, so they are reset before each ranking pass.
    ances = {}; desc = {}
    on_zeta_ranking = np.array([get_ranking(on_zeta,t) for t in range(n)])
    # The `i` inside the comprehension is local to it (Python 3) and does not disturb the loop index.
    kt_z = [kendalltau(on_zeta_ranking[i], true_ranking[i]).correlation for i in range(on_zeta_ranking.shape[0])]
    
    on_Y = AgglomerativeClustering(affinity = combs[i][1], linkage = combs[i][0],distance_threshold=0, n_clusters=None)
    on_Y.fit(Y);
    ances = {}; desc = {}
    on_Y_ranking = np.array([get_ranking(on_Y,t) for t in range(n)])
    kt_Y = [kendalltau(on_Y_ranking[i], true_ranking[i]).correlation for i in range(on_Y_ranking.shape[0])]
    
    
    new_row = {'linkage': combs[i][0],'metric':combs[i][1],
               'zeta_mean': np.mean(kt_z),'zeta_se': sem(kt_z),
               'Y_mean': np.mean(kt_Y),'Y_se': sem(kt_Y)}
    # Appended inside the loop, so rows already computed survive if a later combination is interrupted.
    method_comparison_df_v2 = pd.concat([method_comparison_df_v2,pd.DataFrame(new_row, index=[0])]).reset_index(drop=True)

In [None]:
# Writes the appendix comparison table to the working directory, without the index column.
method_comparison_df_v2.to_csv('method_comparison_v2.csv', index=False)