In [2]:
import colorsys
import gensim
from glove_code.src.glove import Glove
from gensim.models.callbacks import LossLogger, LossSetter
from gensim.models.keyedvectors import PoincareWordEmbeddingsKeyedVectors
from IPython.display import HTML
import itertools
import logging
import matplotlib.pyplot as plt
from matplotlib import animation, rc, cm
from matplotlib.collections import LineCollection
import numpy as np
from numpy.linalg import norm
from numpy import dot
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import TSNE
import os
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import sys

# Initialise plotly's offline notebook mode so that iplot() figures render inline.
init_notebook_mode(connected=True)
# Only surface warnings and errors emitted through the logging module.
logging.basicConfig(level=logging.WARN)

# Root directory under which the trained models are stored. It defaults to the
# original author's local checkout; set the THESIS_ROOT environment variable to
# run the notebook on another machine without editing this cell.
ROOT = os.environ.get("THESIS_ROOT", "/Users/alext/Documents/Master/Thesis/")

In [18]:
# Hand-picked vocabulary used in the plots below: category label -> example words of that category
def read_word_dict():
    """
    Return the hard-coded dictionary of words to visualize.

    Returns
    -------
    dict
        Maps a category label (e.g. "president") to the list of lowercase
        example words belonging to that category. A fresh dictionary is built
        on every call, so callers may modify the result freely.
    """
    result_dict = {
        "president": ["obama", "eisenhower", "roosevelt", "clinton", "jefferson", "kennedy", "lincoln", "nixon", "reagan"],
        "mathematics": ["geometry", "inequality", "arithmetic", "calculus", "algebra", "equation", "theorem"],
        "number": ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"],
        # "chlorine" was misspelled "chlorium", which can never match a vocabulary entry.
        "chemistry": ["chemical", "oxygen", "carbon", "dioxide", "nitrogen", "chlorine", "zinc", "organic"],
        # "comaneci" was misspelled "comneci".
        "sport": ["maradona", "ronaldo", "messi", "lochte", "phelps", "spitz", "comaneci", "mayweather", "pacquiao"],
        "country": ["romania", "france", "germany", "england", "russia", "india", "china", "spain"],
    }
    return result_dict

def compute_poincare_aggregate(model):
    """
    Compute the average between the target and the context vector, for Poincare embeddings.
    We take as average the mid point between w and c on the geodesic that connects the 2 points
    (see page 89 in Ungar book).

    Parameters
    ----------
    model
        Trained model exposing `poincare`, `wv.vectors` (target vectors w),
        `trainables.syn1neg` (context vectors c) and `wv.moebius_mul_mat`.

    Returns
    -------
    The matrix of aggregated vectors, one row per word. If `model.wv` already
    carries a non-None `agg_vectors` attribute, that cached matrix is returned
    instead of being recomputed (previously the function returned None in that
    case). Returns None when `model.poincare` is falsy.
    """
    if not model.poincare:
        # This aggregation is only defined here for Poincare embeddings.
        return None

    cached = getattr(model.wv, 'agg_vectors', None)
    if cached is not None:
        return cached

    print("precomputing aggregated vectors w+c for Poincare embeddings")
    targets = model.wv.vectors
    contexts = model.trainables.syn1neg
    # 1 / (1 - ||x||^2) for every row of the target and of the context matrix.
    gamma_w_sq = 1 / (1 - np.sum(targets * targets, axis=1))
    gamma_c_sq = 1 / (1 - np.sum(contexts * contexts, axis=1))
    denominator = gamma_w_sq + gamma_c_sq - 1
    agg = (targets * (gamma_w_sq / denominator)[:, None] +
           contexts * (gamma_c_sq / denominator)[:, None])

    # Moebius scalar multiplication by 0.5 of the combined point.
    return model.wv.moebius_mul_mat(agg, 0.5)

def HSVToRGB(h, s, v):
    """Convert an HSV color to an (R, G, B) tuple of ints, each channel scaled by 255 and truncated."""
    return tuple(int(255 * channel) for channel in colorsys.hsv_to_rgb(h, s, v))

def get_colors(word_dict):
    """
    Assign one fully saturated, full-brightness RGB color to every label (key) of `word_dict`.

    The hue range is split into len(labels) + 1 equal steps and the i-th label
    receives the hue i * step.
    """
    labels = word_dict.keys()
    hue_step = 1.0 / (len(labels) + 1)
    return {
        label: HSVToRGB(hue_step * position, 1.0, 1.0)
        for position, label in enumerate(labels)
    }

def plot_wordnet_embeddings(model, word_dict, left_offset=None, right_offset=None, ratio_words=1, aggregate="w"):
    """
    Scatter-plot the first two coordinates of the embeddings of the words in `word_dict`.

    One plotly trace (with its own color) is drawn per label of `word_dict`.

    Parameters
    ----------
    model
        Trained model exposing `wv` (with `vocab`, `index2entity`, `index2word`
        and `vectors`), `trainables.syn1neg` and `poincare`.
    word_dict : dict
        Maps a label to a list of words. The label itself is plotted together
        with its words. Words (or labels) missing from the vocabulary are skipped.
    left_offset, right_offset : array-like or None
        Optional vector that is Moebius-added on the left / right of every
        plotted point, i.e. a change of origin. None disables the translation.
    ratio_words : float
        Fraction of the in-vocabulary words of each label that is plotted
        (the first ones are kept).
    aggregate : {"w", "c", "w+c"}
        Which vectors to plot: `model.wv.vectors`, `model.trainables.syn1neg`,
        or the result of `compute_poincare_aggregate(model)`.

    Returns
    -------
    The value returned by `iplot`, or None when `aggregate` is not one of the
    supported values (nothing is plotted in that case).
    """
    if aggregate == "w+c":
        vectors = compute_poincare_aggregate(model)
    elif aggregate == "w":
        vectors = model.wv.vectors
    elif aggregate == "c":
        vectors = model.trainables.syn1neg
    else:
        return None
    wv = model.wv

    # Derive the colors from the dictionary that is actually plotted rather than
    # from a notebook-level global, so any `word_dict` can be passed in.
    colors = get_colors(word_dict)

    embeddings = np.array([vectors[i] for i in range(len(wv.index2entity))])
    traces = []

    for label, words in word_dict.items():
        # Keep only in-vocabulary entries; an out-of-vocabulary label is skipped
        # exactly like an out-of-vocabulary word.
        candidates = list(words) + [label]
        idxs = [wv.vocab[w].index for w in candidates if w in wv.vocab]
        idxs = idxs[:int(len(idxs) * ratio_words)]

        # Change the origin of the system.
        embs = embeddings[idxs, :]
        if left_offset is not None:
            offset_mat = np.tile(left_offset.reshape(1, -1), (embs.shape[0], 1))
            embs = PoincareWordEmbeddingsKeyedVectors.moebius_add_mat(offset_mat, embs)
        if right_offset is not None:
            offset_mat = np.tile(right_offset.reshape(1, -1), (embs.shape[0], 1))
            embs = PoincareWordEmbeddingsKeyedVectors.moebius_add_mat(embs, offset_mat)

        # The "country" labels are drawn below their markers; all others above.
        text_pos = 'top right'
        if label == "country":
            text_pos = 'bottom right'
        traces.append(
            go.Scatter(
                x=embs[:, 0],
                y=embs[:, 1],
                text=[wv.index2word[idx] for idx in idxs],
                textposition=text_pos,
                name=label,
                mode="markers+text",
                marker=dict(color="rgb"+str(colors[label]))))

    if model.poincare == 1:
        # Fix the axes to [-1, 1] x [-1, 1] for Poincare models.
        layout = {
            'width': 700,
            'height': 650,
            'xaxis': {
                'range': [-1, 1],
            },
            'yaxis': {
                'range': [-1, 1],
            },
        }
    else:
        layout = {}

    return iplot(dict(data=traces, layout=layout))

# Build the label -> words dictionary once; the plotting cells below reuse it.
labeled_word_dict = read_word_dict()
# Display the number of labels (categories) in the dictionary.
len(labeled_word_dict)

6

# Word2Vec - 2D dot product embeddings on text8

In [4]:
# Alternative checkpoint, kept for reference:
# model_fn = os.path.join(ROOT, "models/word2vec_baseline/w2v_levy_sg_5_2_A025_a0001_n5_w5_c25000_cosine_OPTsgd")
# Checkpoint of the baseline Word2Vec model visualized in this section.
model_fn = os.path.join(ROOT, "models/word2vec_baseline/w2v_levy_sg_3_2_A025_a0001_n10_w5_c100_cosine")
model = gensim.models.Word2Vec.load(model_fn)
# Shortcut to the model's keyed vectors.
wv = model.wv

#### Note: Notice how for 2D word2vec the words seem to be embedded on a line (perhaps in higher dimensions this is still a relatively limited submanifold). This is similar to what Fig. 3 from Hashimoto et al. shows.

In [5]:
# Plot the context vectors (aggregate="c", i.e. model.trainables.syn1neg) of all labeled words.
plot_wordnet_embeddings(model, labeled_word_dict, ratio_words=1, aggregate="c")

# Word2Vec - 2D Poincare embeddings

In [4]:
# levy
# Saved Poincare Word2Vec checkpoints, keyed by "<objective>_<similarity function>".
# Each key is meant to agree with the objective (nll / sg) and the SIM<...> suffix
# of the file it points to. The paths of "SGNS_dist_sq" and "SGNS_cosh_dist_sq"
# used to be swapped (each key pointed at the other one's file); they now match.
# NOTE(review): "NLL_dist_sq" points at a file without a SIM suffix -- presumably
# the default similarity; confirm.
hyp_model_fn = {
    "NLL_dist_sq": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_1_2_A05_a001_n5_w5_c100_poincare_OPTwfullrsgd_burnin1"),
    "NLL_cosh_dist": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_3_2_A01_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMcosh-dist_burnin1"),
    "NLL_cosh_dist_sq": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_3_2_A01_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMcosh-dist-sq_burnin1"),
    "NLL_log_dist": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_3_2_A005_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMlog-dist_burnin1"),
    "NLL_log_dist_sq": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_3_2_A005_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMlog-dist-sq_burnin1"),
    "SGNS_dist_sq": os.path.join(ROOT, "models/geometric_emb/w2v_levy_sg_3_2_A01_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMdist-sq_burnin1"),
    "SGNS_cosh_dist": os.path.join(ROOT, "models/geometric_emb/w2v_levy_sg_3_2_A01_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMcosh-dist_burnin1"),
    "SGNS_cosh_dist_sq": os.path.join(ROOT, "models/geometric_emb/w2v_levy_sg_3_2_A01_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMcosh-dist-sq_burnin1"),
    "NLL_exp_dist": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_3_2_A005_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMexp-dist_burnin1"),
}

# text8
# hyp_model_fn = os.path.join(ROOT, "models/geometric_emb/w2v_text8_nll_3_2_A01_a01_n5_w5_c50_poincare_OPTwfullrsgd_SIMcosh-dist-sq")
# hyp_model_fn = os.path.join(ROOT, "models/geometric_emb/w2v_text8_nll_1_2_A05_a001_n5_w5_c50_poincare_OPTwfullrsgd_bias_burnin1")
# hyp_model_fn = os.path.join(ROOT, "models/geometric_emb/w2v_text8_nll_1_2_A005_a005_n5_w5_c50_poincare_OPTwfullrsgd_SIMlog-dist")

In [19]:
# Load the Poincare Word2Vec checkpoint registered under "NLL_log_dist_sq".
hyp_model = gensim.models.Word2Vec.load(hyp_model_fn["NLL_log_dist_sq"])
hyp_wv = hyp_model.wv
# Optional change of origin applied to every point before plotting; None disables it.
offset = None # np.array([0.25, 0])

# The same `offset` is passed as both the left and the right Moebius offset;
# aggregate="c" plots the context vectors (model.trainables.syn1neg).
plot_wordnet_embeddings(hyp_model, labeled_word_dict, ratio_words=1,
                        left_offset=offset, right_offset=offset, aggregate="c")

# GloVe - 2D Poincare embeddings

In [49]:
# Saved Poincare GloVe checkpoints, keyed by distance function (with an optional "-scale5" suffix).
# vocab=10k
hyp_model_fn_10k = {
    "dist_sq": os.path.join(ROOT, "models/glove/geometric_emb/glove_ep5_size2_lr0.01_vocab10000_poincare_OPTradagrad_COOCCFUNClog_DISTFUNCdist-sq_bias"),
    "dist_sq-scale5": os.path.join(ROOT, "models/glove/geometric_emb/glove_ep5_size2_lr0.01_vocab10000_poincare_OPTradagrad_COOCCFUNClog_DISTFUNCdist-sq_bias_scale5"),
    "cosh_dist": os.path.join(ROOT, "models/glove/geometric_emb/glove_ep5_size2_lr0.01_vocab10000_poincare_OPTradagrad_COOCCFUNClog_DISTFUNCcosh-dist_bias"),
    "cosh_dist-scale5": os.path.join(ROOT, "models/glove/geometric_emb/glove_ep5_size2_lr0.01_vocab10000_poincare_OPTradagrad_COOCCFUNClog_DISTFUNCcosh-dist_bias_scale5"),
}

# vocab=180k
# NOTE(review): the file names below say vocab200000 rather than 180k, and the two
# entries use different learning rates (lr0.05 vs lr0.01) -- confirm both are intended.
hyp_model_fn_180k = {
    "dist_sq": os.path.join(ROOT, "models/glove/geometric_emb/glove_ep50_size2_lr0.05_vocab200000_poincare_OPTradagrad_COOCCFUNClog_DISTFUNCdist-sq_bias"),
    "dist_sq-scale5": os.path.join(ROOT, "models/glove/geometric_emb/glove_ep50_size2_lr0.01_vocab200000_poincare_OPTradagrad_COOCCFUNClog_DISTFUNCdist-sq_bias_scale5"),
}

In [56]:
# Load the 10k-vocabulary GloVe checkpoint registered under "dist_sq-scale5".
# NOTE: this rebinds hyp_model / hyp_wv / offset, which the Poincare Word2Vec cell
# above also assigns, so the values depend on which cell ran last.
hyp_model = Glove.load(hyp_model_fn_10k["dist_sq-scale5"])
hyp_wv = hyp_model.wv
# Optional change of origin applied to every point before plotting; None disables it.
offset = None # np.array([0.25, 0])

# The same `offset` is passed as both the left and the right Moebius offset;
# aggregate="c" plots the context vectors (model.trainables.syn1neg).
plot_wordnet_embeddings(hyp_model, labeled_word_dict, ratio_words=1,
                        left_offset=offset, right_offset=offset, aggregate="c")