## Word Embedding Experiment
- See whether word embeddings can be used with MStream
- Can we can create a meaningful lower dimensional representation of word embeddings?


### Experiment Setup

- Generate random seed vocabulary of size S
  - Find top 5 similar words to each word in group S
- Vocabulary size: V = 6S

1. Compute per-dimension variance of semantically similar words
2. Compute inter-/intra-group simiilarity
3. Experiment with methods for dimensionality reduction (D = 1-50)
4. Recompute steps 1-2 after reducing dimensionality
5. Experiment with LSH and how each word/dimension maps to bins. Check whether inter/intra-groups map to similar buckets

## TODO

- Calculate metric for reconstruction error of similarity
- Calculate metric for whether things bin to the same buckets
- Experiment with 1D
- Consider streaming setting

#### Embedding methods
- PCA (x)
- tSNE (x)
- StreamHashProjection (x)
- MDS (x)
- UMap (x)

In [1]:
from notebook_utils import resolve_paths_from_parent_directory
resolve_paths_from_parent_directory()
# auto reload
# 
#  notebook deps
%reload_ext autoreload
%autoreload 2

In [2]:
# Download fasttext embeddings and save them in data/embeddings
# https://fasttext.cc/docs/en/english-vectors.html
from gensim.models import KeyedVectors
fasttext = KeyedVectors.load_word2vec_format('../data/embeddings/wiki-news-300d-1M.vec')

In [114]:
import pandas as pd
import numpy as np
np.random.seed(42)

def cosine_similarity(a, b):
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

class WordGroupEmbeddings:
    def __init__(
        self, 
        word_groups, 
        get_embedding=(lambda word: fasttext.get_vector(word))
    ) -> None:
        self.word_groups = word_groups
        self.vectors = {}
        self.get_embedding = get_embedding

        # compute stats
        self.group_centroids = self.compute_centroids()
        self.df_embeddings = self.init_df_embeddings()

        # Inter vs. intra-group similarity
        # inter vs. intra-group variance
        self.df_stats = self.init_df_stats()

    def to_numpy(self):
        return self.df_embeddings.select_dtypes(
            include=np.number
        ).values

    def compute_centroids(self):
        group_centroids = {}
        for group in self.word_groups:
            seed_word = group[0]
            self.vectors[seed_word] = self.get_embedding(seed_word)
            group_embeddings = [self.get_embedding(seed_word)]
            for word in group[1:]:
                self.vectors[word] = self.get_embedding(word)
                group_embeddings.append(self.get_embedding(word))
            group_centroids[seed_word] = np.sum(np.vstack(group_embeddings), axis=0) / len(group_embeddings)        
        return group_centroids

    def init_df_embeddings(self):
        embeddings_with_labels = []
        embedding_size = 0

        for group in self.word_groups:
            seed_word = group[0]
            for word in group:
                row = [seed_word, word]
                embedding = self.vectors[word]
                embedding_size = len(embedding)
                for d in embedding:
                    row.append(d)
                embeddings_with_labels.append(row)

        return pd.DataFrame(
            embeddings_with_labels,
            columns=["seed_word", "word"] +[f"dim_{d}" for d in range(embedding_size)]
        )

    def init_df_stats(self):
        embeddings_agg_by_seed_word = self.df_embeddings.groupby("seed_word").agg([
            "var",
            "mean",
            "std"
        ])
        word_groups_data = []

        # Special case for all
        centroid_all = self.df_embeddings.mean(numeric_only=True).values
        
        word_groups_data.append([
            "all",
            "n/a",
            np.trace(np.cov(
                self.df_embeddings.select_dtypes(include=[np.number]),
                bias=True
            )),
            self.df_embeddings.var(numeric_only=True).mean(),
            np.mean(self.df_embeddings.std(numeric_only=True) / self.df_embeddings.mean(numeric_only=True)),
            np.mean(self.df_embeddings.var(numeric_only=True) / self.df_embeddings.mean(numeric_only=True)),
            np.nan,
            np.nan
        ])

        for group in self.word_groups:
            seed_word = group[0]
            intra_group_similarity = np.mean([cosine_similarity(
                self.get_embedding(seed_word),
                self.get_embedding(word),
            ) for word in group[1:]])
            inter_group_similarity = np.mean([cosine_similarity(
                self.group_centroids[seed_word],
                self.group_centroids[group2[0]],
            ) for group2 in self.word_groups if group2[0] != seed_word])

            covariance_trace = np.trace(
                np.cov(
                    self.df_embeddings[self.df_embeddings.seed_word == seed_word].select_dtypes(include=[np.number]),
                    bias=True
                )
            )
            mean_variance = embeddings_agg_by_seed_word.loc[seed_word].xs("var", level=1).mean()
            mean_coeff_variation = np.mean(embeddings_agg_by_seed_word.loc[seed_word].xs("std", level=1) / embeddings_agg_by_seed_word.loc[seed_word].xs("mean", level=1))
            mean_dispersion = np.mean(embeddings_agg_by_seed_word.loc[seed_word].xs("var", level=1) / embeddings_agg_by_seed_word.loc[seed_word].xs("mean", level=1))

            group_data = [
                seed_word, 
                ", ".join(group[1:]), 
                covariance_trace,
                mean_variance,
                mean_coeff_variation,
                mean_dispersion,
                intra_group_similarity,
                inter_group_similarity
            ]
            word_groups_data.append(group_data)

        return pd.DataFrame(
            word_groups_data,
            columns=(
                ["seed_word", "similar_words", "covariance_trace", "mean_variance", "mean_coefficient_variation", "mean_dispersion", "mean_intra_group_similarity", "mean_inter_group_similarity"]
            )
        ).set_index("seed_word").sort_values("covariance_trace", ascending=False)

test = WordGroupEmbeddings(
    [
        ["king", "queen"],
        ["Oslo", "Norway"],
        ["laptop", "computer"]
    ]
)


test.df_stats

Unnamed: 0_level_0,similar_words,covariance_trace,mean_variance,mean_coefficient_variation,mean_dispersion,mean_intra_group_similarity,mean_inter_group_similarity
seed_word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
all,,0.099019,0.00987,-0.015236,0.002932,,
Oslo,Norway,0.040097,0.005044,0.055075,0.019994,0.751196,0.373777
laptop,computer,0.030155,0.004961,0.146804,0.021744,0.678884,0.352565
king,queen,0.028767,0.003434,-3.998937,-0.313606,0.763854,0.403154


In [6]:
embeddings_agg_by_seed_word = test.df_embeddings.groupby("seed_word").agg([
    "var",
    "mean",
    "std"
])
seed_word = "king"
mean_variance = embeddings_agg_by_seed_word.loc[seed_word].xs("var", level=1).mean()
mean_coeff_variation = np.mean(embeddings_agg_by_seed_word.loc[seed_word].xs("std", level=1) / embeddings_agg_by_seed_word.loc[seed_word].xs("mean", level=1))
mean_dispersion = np.mean(embeddings_agg_by_seed_word.loc[seed_word].xs("var", level=1) / embeddings_agg_by_seed_word.loc[seed_word].xs("mean", level=1))
embeddings_agg_by_seed_word.loc[seed_word].xs("var", level=1)

print("trace for king<->queen", np.trace(
    np.cov(
        test.df_embeddings[test.df_embeddings.seed_word == "king"].select_dtypes(include=[np.number]),
        bias=True
    )
    #test.df_embeddings[test.df_embeddings.seed_word == "king"].cov()
))
print("trace for all", np.trace(
    np.cov(
        test.df_embeddings.select_dtypes(include=[np.number]),
        bias=True
    )
))

print("det for king<->queen", np.linalg.slogdet(test.df_embeddings[test.df_embeddings.seed_word == "king"].cov()))
print("det for all", np.linalg.slogdet(test.df_embeddings.cov()))

trace for king<->queen 0.028766704821706526
trace for all 0.09901925568134774
det for king<->queen (0.0, -inf)
det for all (1.0, -11664.242763715183)


In [186]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from umap import UMAP
from sklearn import random_projection
from utils.StreamhashProjection import StreamhashProjection
from collections import defaultdict
from sklearn.metrics import pairwise
from scipy.stats import spearmanr


class EmbeddingsExperiment:
    def __init__(self, S=50, word_group_size=5) -> None:
        # Sample S words from fasttext vocab top 25k frequent words
        self.seed_words = [fasttext.index_to_key[idx] for idx in np.random.choice(range(500, 25000), S, replace=False)]
        self.word_groups = []
        for seed_word in self.seed_words:
            self.word_groups.append([seed_word] + [word for (word, similarity) in fasttext.most_similar(seed_word, topn=word_group_size - 1)])

        self.initial_embeddings = WordGroupEmbeddings(
            self.word_groups
        )
        self.embeddings = defaultdict(lambda: {})
    
    def measure_embedding_quality(self, N, random_state=0): 
        # Sample N embeddings and compute pairwise distances & similarities
        indices = experiment.initial_embeddings.df_embeddings.sample(N, random_state=random_state).index
        initial_embeddings = experiment.initial_embeddings.to_numpy()[indices]

        orig_distances = pairwise.euclidean_distances(
            initial_embeddings,
            initial_embeddings
        ).reshape(-1)
        orig_similarities = pairwise.cosine_similarity(
            initial_embeddings,
            initial_embeddings
        ).reshape(-1)

        correlations = [[
            "initial", 
            initial_embeddings.shape[1],
            spearmanr(orig_distances, orig_distances).correlation,
            spearmanr(orig_similarities, orig_similarities).correlation,
        ]]
        for method in self.embeddings.keys():
            for d, projection in self.embeddings[method].items():
                lower_embeddings = projection.to_numpy()[indices]
                reduced_distances = pairwise.euclidean_distances(
                    lower_embeddings,
                    lower_embeddings
                ).reshape(-1)
                reduced_similarities = pairwise.cosine_similarity(
                    lower_embeddings,
                    lower_embeddings
                ).reshape(-1)
                correlations.append([
                    f"{method}",
                    d,
                    spearmanr(orig_distances, reduced_distances).correlation,
                    spearmanr(orig_similarities, reduced_similarities).correlation
                ])
        
        return pd.DataFrame(
            correlations,
            columns=[
                "method", 
                "dimensions", 
                "euclidean_distance_correlation", 
                "cosine_similarity_correlation"
            ]
        ).sort_values("euclidean_distance_correlation", ascending=False)



    def projection(self, model, D):
        init_word_vectors = np.array(list(self.initial_embeddings.vectors.values()))
        reduced_word_vectors = model.fit_transform(
            init_word_vectors
        )[:, :D]
        reduced_word_vector_lookup = {
            word: reduced_word_vectors[idx] 
            for idx, word in enumerate(self.initial_embeddings.vectors.keys())
        }
        return WordGroupEmbeddings(
            self.word_groups,
            lambda word: reduced_word_vector_lookup[word]
        )

    def stream_hash_projection(self, D: int):
        self.embeddings["Streamhash"][D] = self.projection(
            StreamhashProjection(
                n_components=D,
                random_state=0
            ),
            D
        )
    def random_projection(self, D: int, sparse=False):
        if (sparse):
            model = random_projection.SparseRandomProjection(
                n_components=D,
                random_state=0
            )
            self.embeddings["RandomSparse"][D] = self.projection(
                model,
                D
            )
        else:
            model = random_projection.GaussianRandomProjection(
                n_components=D,
                random_state=0
            )
            self.embeddings["RandomGaussian"][D] = self.projection(
                model,
                D
            )
    def mds_projection(self, D: int, metric=True, eps=1e-3):
        key = "MDS" if metric else "NMDS"
        self.embeddings[key][D] = self.projection(
            MDS(
                metric=metric,
                n_components=D,
                random_state=0,
                eps=eps
            ),
            D
        )

    def PCA_reduce_dimensionality(self, D: int):
        self.embeddings["PCA"][D] = self.projection(
            PCA(random_state=0),
            D
        )

    def TSNE_reduce_dimensionality(self, D: int, perplexity = 5, learning_rate = 200, n_iter = 10000, method = "exact"):
        model = TSNE(
            n_components=D, 
            random_state=0, 
            perplexity=perplexity, 
            learning_rate=learning_rate, 
            method=method,
            n_iter=n_iter
        )
        self.embeddings["TSNE"][D] = self.projection(
            model,
            D
        )
    
    def umap_projection(self, D: int, n_neighbors: int = 10, min_dist: float = 0.1):
        self.embeddings["UMAP"][D] = self.projection(
            UMAP(
                n_components=D,
                n_neighbors=n_neighbors,
                min_dist=min_dist,
                random_state=0
            ),
            D
        )

experiment = EmbeddingsExperiment(5)
experiment.initial_embeddings.df_stats

Unnamed: 0_level_0,similar_words,covariance_trace,mean_variance,mean_coefficient_variation,mean_dispersion,mean_intra_group_similarity,mean_inter_group_similarity
seed_word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
all,,0.484048,0.013872,4.780691,0.447691,,
paramilitary,"para-military, paramilitaries, Paramilitary, q...",0.111328,0.00764,0.572419,0.051593,0.754556,0.337892
deepest,"deep, profoundest, Deepest, darkest",0.105032,0.009429,-1.569054,-0.198294,0.69204,0.292408
jackets,"jacket, coats, vests, shirts",0.095477,0.007044,0.478852,0.030952,0.722785,0.365931
assemble,"gather, organize, assembled, reassemble",0.086641,0.006632,1.337185,0.081475,0.730041,0.263734
discrimination,"Discrimination, prejudice, discrimation, discr...",0.085569,0.006624,-2.322939,-0.218658,0.720238,0.325238


In [187]:
experiment.PCA_reduce_dimensionality(2)
experiment.embeddings["PCA"][2].df_stats

Unnamed: 0_level_0,similar_words,covariance_trace,mean_variance,mean_coefficient_variation,mean_dispersion,mean_intra_group_similarity,mean_inter_group_similarity
seed_word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
all,,8.342419,0.695202,-19790960.0,-15824190.0,,
deepest,"deep, profoundest, Deepest, darkest",4.682849,0.087226,-0.1280274,-0.05230699,0.983781,-0.253352
assemble,"gather, organize, assembled, reassemble",1.550833,0.037865,0.222184,0.03528686,0.995575,-0.340189
paramilitary,"para-military, paramilitaries, Paramilitary, q...",0.749416,0.030752,-0.2525868,-0.04652616,0.993512,-0.170855
jackets,"jacket, coats, vests, shirts",0.711993,0.009443,0.01096294,-0.006018459,0.986266,-0.238034
discrimination,"Discrimination, prejudice, discrimation, discr...",0.647329,0.010273,0.03979887,0.00123188,0.975134,-0.210498


In [188]:
experiment.TSNE_reduce_dimensionality(2)
experiment.embeddings["TSNE"][2].df_stats

Unnamed: 0_level_0,similar_words,covariance_trace,mean_variance,mean_coefficient_variation,mean_dispersion,mean_intra_group_similarity,mean_inter_group_similarity
seed_word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
all,,867353.915109,120716.1875,22.613871,8076.212402,,
assemble,"gather, organize, assembled, reassemble",489486.599741,3432.210205,-0.061795,-3.863424,0.990898,-0.266087
deepest,"deep, profoundest, Deepest, darkest",147545.232276,3709.590576,-0.222006,-13.752774,0.993724,-0.229451
discrimination,"Discrimination, prejudice, discrimation, discr...",104604.037465,3376.944824,-0.263104,-14.800782,0.991535,-0.239943
jackets,"jacket, coats, vests, shirts",94906.990036,3488.594604,-0.282149,-13.733147,0.959249,-0.24762
paramilitary,"para-military, paramilitaries, Paramilitary, q...",30811.055592,4153.963257,0.149532,9.531169,0.99176,-0.265162


In [190]:
experiment.stream_hash_projection(25)
experiment.embeddings["Streamhash"][25].df_stats

Unnamed: 0_level_0,similar_words,covariance_trace,mean_variance,mean_coefficient_variation,mean_dispersion,mean_intra_group_similarity,mean_inter_group_similarity
seed_word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
all,,6.337198,0.184286,-2.555011,-1.206954,,
paramilitary,"para-military, paramilitaries, Paramilitary, q...",1.619762,0.125841,-27.130746,-6.862761,0.753774,0.368112
assemble,"gather, organize, assembled, reassemble",1.216312,0.077711,26.819718,10.798135,0.819098,0.264535
deepest,"deep, profoundest, Deepest, darkest",1.21568,0.144623,-0.539175,-0.283796,0.629316,0.446153
jackets,"jacket, coats, vests, shirts",1.174383,0.106425,0.04686,0.035824,0.675262,0.488069
discrimination,"Discrimination, prejudice, discrimation, discr...",1.111061,0.064572,-0.091615,0.007201,0.8063,0.305607


In [191]:
experiment.mds_projection(2)
experiment.embeddings["MDS"][2].df_stats

Unnamed: 0_level_0,similar_words,covariance_trace,mean_variance,mean_coefficient_variation,mean_dispersion,mean_intra_group_similarity,mean_inter_group_similarity
seed_word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
all,,23.370297,1.904706,6.253304e+16,8.731155e+16,,
paramilitary,"para-military, paramilitaries, Paramilitary, q...",10.053624,0.289351,-0.1835055,-0.1570087,0.934914,-0.226657
assemble,"gather, organize, assembled, reassemble",6.37224,0.219059,-0.3839253,-0.1478682,0.970625,-0.25976
deepest,"deep, profoundest, Deepest, darkest",5.06028,0.398202,-4.883374,-3.418587,0.957196,-0.275163
discrimination,"Discrimination, prejudice, discrimation, discr...",1.436912,0.225053,-0.5124947,-0.2191225,0.961294,-0.254388
jackets,"jacket, coats, vests, shirts",0.447241,0.229148,0.4679133,0.2214094,0.967213,-0.231384


In [192]:
experiment.mds_projection(2, metric=False, eps=1e-12)
experiment.embeddings["NMDS"][2].df_stats

Unnamed: 0_level_0,similar_words,covariance_trace,mean_variance,mean_coefficient_variation,mean_dispersion,mean_intra_group_similarity,mean_inter_group_similarity
seed_word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
all,,1.513754,0.126063,58.757564,19.63112,,
deepest,"deep, profoundest, Deepest, darkest",0.760398,0.057524,-0.147903,-0.093115,0.938495,-0.179449
assemble,"gather, organize, assembled, reassemble",0.333809,0.040649,1.22952,0.22387,0.954618,-0.33634
paramilitary,"para-military, paramilitaries, Paramilitary, q...",0.206896,0.024928,-2.014264,-0.356597,0.923096,-0.228753
jackets,"jacket, coats, vests, shirts",0.190725,0.065693,1.268141,0.349848,0.518253,-0.305418
discrimination,"Discrimination, prejudice, discrimation, discr...",0.021926,0.02059,-1.820229,-0.255396,0.157873,-0.170221


In [139]:
experiment.umap_projection(2)
experiment.embeddings["UMAP"][2].df_stats

Unnamed: 0_level_0,similar_words,covariance_trace,mean_variance,mean_coefficient_variation,mean_dispersion,mean_intra_group_similarity,mean_inter_group_similarity
seed_word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
all,,403.240526,1.490502,0.156435,0.198463,,
nutrients,"nutrient, micronutrients, macronutrients, Nutr...",154.847566,0.084596,0.047325,0.013772,0.999597,0.980588
corpse,"corpses, cadaver, carcass, dead",84.757051,0.10502,0.044226,0.01401,0.999819,0.993351
tip,"tips, tipping, Tip, iceburg",79.626798,0.121166,0.039441,0.013455,0.999734,0.994567
Montenegro,"Montenegrin, Serbia, Podgorica, Montengro",42.873182,0.144142,0.040238,0.01453,0.999609,0.991622
rumor,"rumors, rumour, rumours, Rumors",41.135929,0.084891,0.029,0.008255,0.999531,0.986225


In [195]:
for d in [2, 3, 5, 15, 50, 75, 100]:
    experiment.stream_hash_projection(d)
    experiment.random_projection(d, sparse=True)
    experiment.random_projection(d)
    if d <= 15:
        experiment.umap_projection(d)

experiment.measure_embedding_quality(25)

Unnamed: 0,method,dimensions,euclidean_distance_correlation,cosine_similarity_correlation
0,initial,300,1.0,1.0
18,RandomSparse,100,0.90145,0.839774
26,RandomGaussian,100,0.873107,0.763435
17,RandomSparse,75,0.868474,0.754544
25,RandomGaussian,75,0.836196,0.705097
8,Streamhash,100,0.833401,0.801741
11,MDS,2,0.82082,0.669107
7,Streamhash,75,0.804913,0.790738
6,Streamhash,50,0.78231,0.7589
1,PCA,2,0.782179,0.669533


In [226]:
import plotly.graph_objs as go
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import random

def visualize_embeddings(
    embeddings: WordGroupEmbeddings,
    method="TSNE",
    n_viz_groups=15,
    random_state=0
):
    word_vectors = np.array(list(embeddings.vectors.values()))

    if word_vectors.shape[1] <= 2:
        # already reduced
        reduced_word_vectors = word_vectors
    else:
        if method == "TSNE":
            reduced_word_vectors = TSNE(
                n_components = 2, 
                random_state=random_state, 
                perplexity = 5, 
                learning_rate = 500, 
                n_iter = 1000
            ).fit_transform(word_vectors)[:,:2]
        else:
            reduced_word_vectors = PCA(random_state=random_state).fit_transform(
            word_vectors
            )[:, :2]

    reduced_word_vector_lookup = {
        word: reduced_word_vectors[idx] 
        for idx, word in enumerate(embeddings.vectors.keys())
    }

    data = []
    random.seed(random_state)
    sampled_groups = random.sample(
        embeddings.word_groups, 
        min(n_viz_groups, len(embeddings.word_groups))
    )
    
    for word_group in sampled_groups:
        seed_word = word_group[0]
        group_vectors = np.array([reduced_word_vector_lookup[word] for word in word_group])
        trace = go.Scatter( # for 3d go.Scatter3d and add z
            x = group_vectors[:, 0],
            y = group_vectors[:, 1] if group_vectors.shape[1] > 1 else [50] * group_vectors.shape[0],
            text = word_group,
            name = seed_word,
            textposition = "top center",
            textfont_size = 20,
            mode = 'markers+text',
            marker = {
                'size': 10,
                'opacity': 0.8,
                'color': 2
            }
        )
        data.append(trace)

    # Configure the layout
    layout = go.Layout(
        title=f"Embeddings reduced with {method}",
        margin = {'l': 0, 'r': 0, 'b': 0, 't': 0},
        showlegend=True,
        legend=dict(
        x=1,
        y=0.5,
        font=dict(
            family="Courier New",
            size=25,
            color="black"
        )),
        font = dict(
            family ="Courier New",
            size = 15
        ),
        autosize = False,
        width = 1000,
        height = 500
    )


    plot_figure = go.Figure(data=data, layout=layout)
    plot_figure.show()

visualize_embeddings(
    experiment.initial_embeddings,
    method="PCA"
)
visualize_embeddings(
    experiment.initial_embeddings,
    method="TSNE"
)

In [227]:
import plotly.graph_objs as go
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import random

def visualize_embedding_dimensions(
    embeddings: WordGroupEmbeddings,
    n_viz_groups=15,
    n_dims=15,
    random_state=0
):
    random.seed(random_state)
    data = []
    sampled_groups = random.sample(
        embeddings.word_groups, 
        min(n_viz_groups, len(embeddings.word_groups))
    )
    for word_group in sampled_groups:
        seed_word = word_group[0]
        group_vectors = np.array([embeddings.vectors[word] for word in word_group])[:, :n_dims] # (N, emb size)
        x = [dim_idx * 1 for dim_idx in range(group_vectors.shape[1]) for n in range(group_vectors.shape[0])] # (N * emb size)
        y = [group_vectors[n, dim_idx] for dim_idx in range(group_vectors.shape[1]) for n in range(group_vectors.shape[0])] # (N * emb size)
        trace = go.Scatter( # for 3d go.Scatter3d and add z
            x = x,
            y = y,
            name = ", ".join(word_group),
            textposition = "top center",
            textfont_size = 20,
            mode = 'markers+text',
            marker = {
                'size': 10,
                'opacity': 0.8,
                'color': 2
            }
        )
        data.append(trace)

    # Configure the layout
    layout = go.Layout(
        title=f"Scatter plot of embedding dimensions",
        margin = {'l': 0, 'r': 0, 'b': 0, 't': 0},
        showlegend=True,
        legend=dict(
        x=1,
        y=0.5,
        font=dict(
            family="Courier New",
            size=25,
            color="black"
        )),
        font = dict(
            family ="Courier New",
            size = 15
        ),
        autosize = False,
        width = 1000,
        height = 500
    )


    fig = go.Figure(data=data, layout=layout)
    fig.update_layout(legend=dict(
        orientation="h",
        yanchor="top",
        y=0,
        xanchor="right",
        x=1
    ))
    fig.show()

visualize_embedding_dimensions(
    experiment.initial_embeddings
)

In [228]:
D = 3
experiment.PCA_reduce_dimensionality(D)
experiment.TSNE_reduce_dimensionality(D)
experiment.stream_hash_projection(D)
experiment.umap_projection(D)
experiment.mds_projection(D)

for method in ["PCA", "TSNE", "Streamhash", "MDS", "UMAP"]:
    print(method, D)
    visualize_embedding_dimensions(
        experiment.embeddings[method][D]
    )

PCA 3


TSNE 3


Streamhash 3


MDS 3


UMAP 3


In [229]:
D = 1
experiment.PCA_reduce_dimensionality(D)
experiment.TSNE_reduce_dimensionality(D)
experiment.stream_hash_projection(D)
experiment.umap_projection(D)
experiment.mds_projection(D)
for method in ["PCA", "TSNE", "Streamhash", "MDS", "UMAP"]:
    print(method, D)
    visualize_embedding_dimensions(
        experiment.embeddings[method][D]
    )

PCA 1


TSNE 1


Streamhash 1


MDS 1


UMAP 1


# The experiment

In [209]:
# initialize embeddings, takes ~1 minute per 600 seed words
S = 500
full_experiment = EmbeddingsExperiment(S)
full_experiment.initial_embeddings.df_stats
methods = ["PCA", "TSNE", "Streamhash", "MDS", "UMAP", "RandomGaussian", "RandomSparse"]

In [210]:
# Reduce dimensionality

for d in [1, 2, 3, 5, 25]:
    print(d, "PCA")
    full_experiment.PCA_reduce_dimensionality(d)
    print(d, "Streamhash")
    # full_experiment.TSNE_reduce_dimensionality(d, method="barnes_hut", n_iter=1000)
    full_experiment.stream_hash_projection(d)
    print(d, "Random")
    full_experiment.random_projection(d)
    full_experiment.random_projection(d, sparse=True)
    print(d, "MDS")
    full_experiment.mds_projection(d)
    if (d < 25):
        print(d, "UMAP")
        full_experiment.umap_projection(d)

1 PCA
1 Streamhash
1 Random
1 MDS
1 UMAP
2 PCA
2 Streamhash
2 Random
2 MDS
2 UMAP
3 PCA
3 Streamhash
3 Random
3 MDS
3 UMAP
5 PCA
5 Streamhash
5 Random
5 MDS
5 UMAP
25 PCA
25 Streamhash
25 Random
25 MDS


In [230]:
full_experiment.measure_embedding_quality(25).sort_values("euclidean_distance_correlation", ascending=False)


An input array is constant; the correlation coefficient is not defined.


An input array is constant; the correlation coefficient is not defined.



Unnamed: 0,method,dimensions,euclidean_distance_correlation,cosine_similarity_correlation
0,initial,300,1.0,1.0
10,Streamhash,25,0.539748,0.451023
20,RandomSparse,25,0.499708,0.445654
5,PCA,25,0.483438,0.439943
19,RandomSparse,5,0.416164,0.348605
27,UMAP,2,0.38019,0.531923
28,UMAP,3,0.372934,0.554594
15,RandomGaussian,25,0.372696,0.436516
26,UMAP,1,0.370631,
25,MDS,25,0.363377,0.372035


In [232]:
for method in methods:
    try:
        print(f"{method}-2")
        visualize_embeddings(
            full_experiment.embeddings[method][2],
            n_viz_groups=5,
        )
    except:
        print("not found")
        pass

PCA-2


TSNE-2
not found
Streamhash-2


MDS-2


UMAP-2


RandomGaussian-2


RandomSparse-2


In [233]:
for method in methods:
    try:
        print(f"{method}-1")
        visualize_embeddings(
            full_experiment.embeddings[method][1],
            n_viz_groups=5
        )
    except:
        print("not found")
        pass

PCA-1


TSNE-1
not found
Streamhash-1


MDS-1


UMAP-1


RandomGaussian-1


RandomSparse-1
