In [2]:
import colorsys
import gensim
from gensim.models.callbacks import LossLogger, LossSetter
from gensim.models.keyedvectors import PoincareWordEmbeddingsKeyedVectors
from IPython.display import HTML
import itertools
import logging
import matplotlib.pyplot as plt
from matplotlib import animation, rc, cm
from matplotlib.collections import LineCollection
import numpy as np
from numpy.linalg import norm
from numpy import dot
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import TSNE
import os
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import sys

# One-time notebook setup: plotly notebook mode and logging verbosity.
init_notebook_mode(connected=True)
logging.basicConfig(level=logging.WARN)

# Base directory that contains the `data/` and `models/` folders referenced below.
# The historical location remains the default; set the THESIS_ROOT environment
# variable to run the notebook from another machine without editing this cell.
ROOT = os.environ.get("THESIS_ROOT", "/Users/alext/Documents/Master/Thesis/")

In [7]:
# Get vocabulary of the Google word analogy benchmark + labels for each section
def read_word_dict():
    """
    Load the labeled vocabulary of the Google word analogy benchmark.

    Reads ``data/google_analogy_vocab_labeled.txt`` below ``ROOT``. Every non-blank
    line must consist of exactly two space-separated fields: a label followed by
    a word.

    Returns:
        dict mapping each label to the list of its words, in file order.

    Raises:
        ValueError: if a non-blank line does not split into exactly two fields.
    """
    filename = os.path.join(ROOT, "data/google_analogy_vocab_labeled.txt")

    result_dict = {}
    with open(filename, "r") as f:
        # Iterate over the file directly instead of materialising it with readlines().
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # A blank line (e.g. stray empty line at the end of the file) would
                # otherwise break the two-field unpacking below.
                continue
            label, word = line.split(" ")
            result_dict.setdefault(label, []).append(word)
    return result_dict

def compute_poincare_aggregate(model):
    """
    Return the average between the target and the context vector, for Poincare embeddings.
    We take as average the mid point between w and c on the geodesic that connects the 2 points
    (see page 89 in Ungar book).

    Args:
        model: object exposing `poincare`, `wv.vectors`, `wv.moebius_mul_mat` and
            `trainables.syn1neg` (optionally `wv.agg_vectors`).

    Returns:
        The aggregated matrix (one row per row of `wv.vectors`), or None when
        `model.poincare` is falsy.
    """
    if not model.poincare:
        # Nothing to aggregate for non-Poincare models; None is returned as before.
        return None

    cached = getattr(model.wv, 'agg_vectors', None)
    if cached is not None:
        # Previously this case fell through and returned None, so callers that index
        # the result crashed. NOTE(review): assumes `agg_vectors` holds an aggregate
        # computed earlier, which is how the original guard treated it.
        return cached

    print("precomputing aggregated vectors w+c for Poincare embeddings")
    target = model.wv.vectors
    context = model.trainables.syn1neg
    # Per-row 1 / (1 - ||x||^2) terms for target and context vectors.
    gamma_w_sq = 1 / (1 - np.sum(target * target, axis=1))
    gamma_c_sq = 1 / (1 - np.sum(context * context, axis=1))
    denominator = gamma_w_sq + gamma_c_sq - 1
    # Weighted combination of both matrices; [:, None] broadcasts the per-row weights.
    agg = (target * (gamma_w_sq / denominator)[:, None] +
           context * (gamma_c_sq / denominator)[:, None])

    return model.wv.moebius_mul_mat(agg, 0.5)

def HSVToRGB(h, s, v):
    """
    Convert an HSV colour to an integer RGB triple.

    The components are passed to `colorsys.hsv_to_rgb`; each resulting channel is
    scaled by 255 and truncated with int().
    """
    return tuple(int(255 * channel) for channel in colorsys.hsv_to_rgb(h, s, v))

def get_colors(word_dict):
    """
    Build a label -> RGB colour mapping for the keys of `word_dict`.

    Hues are spaced evenly at multiples of 1 / (number of labels + 1), with
    saturation and value fixed at 1.0; the i-th key gets the i-th hue.
    """
    label_names = word_dict.keys()
    hue_step = 1.0 / (len(label_names) + 1)
    palette = {}
    for position, label_name in enumerate(label_names):
        palette[label_name] = HSVToRGB(hue_step * position, 1.0, 1.0)
    return palette

def plot_target_vs_context(model, word_dict, right_offset=None, left_offset=None, ratio_words=0.1):
    """
    Scatter-plot the first two coordinates of the target and context vectors of labeled words.

    Args:
        model: model exposing `wv` (with `vectors`, `vocab`, `index2entity`) and
            `trainables.syn1neg`.
        word_dict: mapping label -> list of words; words missing from `wv.vocab` are ignored.
        right_offset: optional vector that is Moebius-added on the right of every point.
        left_offset: optional vector that is Moebius-added on the left of every point.
        ratio_words: fraction of the in-vocabulary words of each label that is kept
            (the leading `int(len * ratio_words)` entries).

    Returns:
        The result of `iplot` for the figure.
    """
    wv = model.wv
    vocab_size = len(wv.index2entity)

    # Only the rows that belong to vocabulary entries are of interest; a slice avoids
    # copying the matrices row by row in Python.
    target_embeddings = np.asarray(wv.vectors)[:vocab_size]
    context_embeddings = np.asarray(model.trainables.syn1neg)[:vocab_size]

    # Gather the row indices of all labels first and index each matrix once. This
    # replaces the repeated np.vstack and also yields an empty (0, dim) selection,
    # rather than None, when word_dict has no entries.
    selected = []
    for words in word_dict.values():
        idxs = [wv.vocab[w].index for w in words if w in wv.vocab]
        selected.extend(idxs[:int(len(idxs) * ratio_words)])
    selected = np.asarray(selected, dtype=int)

    target_points = target_embeddings[selected]
    context_points = context_embeddings[selected]

    # Change the origin of the system.
    if right_offset is not None:
        offset_mat = np.tile(right_offset.reshape(1, -1), (target_points.shape[0], 1))
        target_points = PoincareWordEmbeddingsKeyedVectors.moebius_add_mat(target_points, offset_mat)
        context_points = PoincareWordEmbeddingsKeyedVectors.moebius_add_mat(context_points, offset_mat)
    if left_offset is not None:
        offset_mat = np.tile(left_offset.reshape(1, -1), (target_points.shape[0], 1))
        target_points = PoincareWordEmbeddingsKeyedVectors.moebius_add_mat(offset_mat, target_points)
        context_points = PoincareWordEmbeddingsKeyedVectors.moebius_add_mat(offset_mat, context_points)

    traces = [
        go.Scatter(
            x=target_points[:, 0],
            y=target_points[:, 1],
            name="target vectors",
            mode="markers"),
        go.Scatter(
            x=context_points[:, 0],
            y=context_points[:, 1],
            name="context vectors",
            mode="markers")
    ]

    layout = {
        'width': 700,
        'height': 650
    }

    # For Poincare keyed vectors, draw the outline of the circle spanning [-1, 1] on both axes.
    if isinstance(model.wv, gensim.models.keyedvectors.PoincareWordEmbeddingsKeyedVectors):
        layout['shapes'] = [
            # unfilled circle
            {
                'type': 'circle',
                'xref': 'x',
                'yref': 'y',
                'x0': -1,
                'y0': -1,
                'x1': 1,
                'y1': 1,
                'line': {
                    'color': 'rgba(0, 0, 0, 1)',
                },
            }
        ]

    return iplot(dict(data=traces, layout=layout))

def plot_embeddings(model, word_dict, ratio_words=0.1, aggregate="w"):
    """
    Scatter-plot the first two coordinates of the embeddings of labeled words, one trace per label.

    Args:
        model: model exposing `wv` (with `vectors`, `vocab`, `index2entity`, `index2word`),
            `trainables.syn1neg` and `poincare`.
        word_dict: mapping label -> list of words; labels outside the whitelist below and
            words missing from `wv.vocab` are ignored.
        ratio_words: fraction of the in-vocabulary words of each label that is plotted
            (the leading `int(len * ratio_words)` entries).
        aggregate: which vectors to plot - "w" (`wv.vectors`), "c" (`trainables.syn1neg`)
            or "w+c" (result of `compute_poincare_aggregate`).

    Returns:
        The result of `iplot` for the figure, or None if `aggregate` is not recognised.
    """
    label_whitelist = [
        'capital-world', 'days', 'currency', 'seasons','family', 'city-in-state', 'capital-common-countries', 'digits',
        'gram6-nationality-adjective', 'gram9-plural-verbs', 'gram2-opposite', 'gram5-present-participle',
        'gram1-adjective-to-adverb', 'gram8-plural', 'gram4-superlative', 'gram7-past-tense', 'gram3-comparative'
    ]
    # Derive the palette from the dictionary that is actually plotted. The previous
    # version read the notebook-global `labeled_word_dict`, which broke (KeyError or
    # mismatched colours) for any other `word_dict`.
    colors = get_colors(word_dict)
    if aggregate == "w+c":
        vectors = compute_poincare_aggregate(model)
    elif aggregate == "w":
        vectors = model.wv.vectors
    elif aggregate == "c":
        vectors = model.trainables.syn1neg
    else:
        # Unknown selector: nothing is plotted.
        return None
    wv = model.wv

    embeddings = np.array([vectors[i] for i in range(len(wv.index2entity))])

    traces = []
    for label, words in word_dict.items():
        if label not in label_whitelist:
            continue
        idxs = [wv.vocab[w].index for w in words if w in wv.vocab]
        idxs = idxs[:int(len(idxs) * ratio_words)]

        traces.append(
            go.Scatter(
                x=embeddings[idxs, 0],
                y=embeddings[idxs, 1],
                text=[wv.index2word[idx] for idx in idxs],
                textposition='top right',
                name=label,
                mode="markers+text",
                # str() of the RGB tuple yields e.g. "(255, 0, 0)", giving "rgb(255, 0, 0)".
                marker=dict(color="rgb"+str(colors[label]))))

    if model.poincare == 1:
        # Fix both axes to [-1, 1] for Poincare models.
        layout = {
            'width': 700,
            'height': 650,
            'xaxis': {
                'range': [-1, 1],
            },
            'yaxis': {
                'range': [-1, 1],
            },
        }
    else:
        layout = {}

    return iplot(dict(data=traces, layout=layout))

# Load the label -> words mapping once; the plotting cells below pass it to the plot helpers.
labeled_word_dict = read_word_dict()
# Display the number of distinct labels that were read.
len(labeled_word_dict)

17

# 2D dot product embeddings on text8

In [25]:
# Path of the baseline word2vec checkpoint under ROOT; the commented-out line below
# is an alternative checkpoint from the same folder.
model_fn = os.path.join(ROOT, "models/word2vec_baseline/w2v_levy_nll_5_2_A025_a0001_n5_w5_c100_cosine_OPTsgd")
# model_fn = os.path.join(ROOT, "models/word2vec_baseline/w2v_levy_sg_3_2_A025_a0001_n10_w5_c100_cosine")
model = gensim.models.Word2Vec.load(model_fn)
# Shortcut to the keyed vectors of the loaded model.
wv = model.wv

#### Note: For 2D word2vec, the words appear to be embedded along a line (perhaps in higher dimensions they still occupy a relatively limited submanifold). This is similar to what Fig. 3 of Hashimoto et al. shows.

In [26]:
# Plot the target vectors (aggregate="w") of the baseline model; ratio_words=1 keeps every in-vocabulary word of each label.
plot_embeddings(model, labeled_word_dict, ratio_words=1, aggregate="w")

In [27]:
# Compare target and context vectors of the baseline model for all labeled words (no origin offset).
plot_target_vs_context(model, labeled_word_dict, ratio_words=1)

# 2D Poincare embeddings

In [4]:
# levy
# Maps a short name to the checkpoint path of each Poincare model under ROOT.
# NOTE(review): the keys "SGNS_dist_sq" and "SGNS_cosh_dist_sq" look swapped relative
# to the "SIM..." suffix of their paths ("SGNS_dist_sq" -> SIMcosh-dist-sq,
# "SGNS_cosh_dist_sq" -> SIMdist-sq), unlike the NLL entries - confirm against the
# checkpoints before relying on these names.
hyp_model_fn = {
    "NLL_dist_sq": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_1_2_A05_a001_n5_w5_c100_poincare_OPTwfullrsgd_burnin1"),
    "NLL_cosh_dist": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_3_2_A01_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMcosh-dist_burnin1"),
    "NLL_cosh_dist_sq": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_3_2_A01_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMcosh-dist-sq_burnin1"),
    "NLL_log_dist": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_3_2_A005_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMlog-dist_burnin1"),
    "NLL_log_dist_sq": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_3_2_A005_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMlog-dist-sq_burnin1"),
    "SGNS_dist_sq": os.path.join(ROOT, "models/geometric_emb/w2v_levy_sg_3_2_A01_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMcosh-dist-sq_burnin1"),
    "SGNS_cosh_dist": os.path.join(ROOT, "models/geometric_emb/w2v_levy_sg_3_2_A01_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMcosh-dist_burnin1"),
    "SGNS_cosh_dist_sq": os.path.join(ROOT, "models/geometric_emb/w2v_levy_sg_3_2_A01_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMdist-sq_burnin1"),
    "NLL_exp_dist": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_3_2_A005_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMexp-dist_burnin1"),
}

# text8
# hyp_model_fn = os.path.join(ROOT, "models/geometric_emb/w2v_text8_nll_3_2_A01_a01_n5_w5_c50_poincare_OPTwfullrsgd_SIMcosh-dist-sq")
# hyp_model_fn = os.path.join(ROOT, "models/geometric_emb/w2v_text8_nll_1_2_A05_a001_n5_w5_c50_poincare_OPTwfullrsgd_bias_burnin1")
# hyp_model_fn = os.path.join(ROOT, "models/geometric_emb/w2v_text8_nll_1_2_A005_a005_n5_w5_c50_poincare_OPTwfullrsgd_SIMlog-dist")

In [8]:
# Load the checkpoint registered under "SGNS_dist_sq" and plot its context vectors
# (aggregate="c"), keeping every in-vocabulary word of each label.
hyp_model = gensim.models.Word2Vec.load(hyp_model_fn["SGNS_dist_sq"])
hyp_wv = hyp_model.wv
plot_embeddings(hyp_model, labeled_word_dict, ratio_words=1, aggregate="c")

In [19]:
# Load the checkpoint registered under "NLL_cosh_dist"; note that this rebinds
# `hyp_model` / `hyp_wv`.
hyp_model = gensim.models.Word2Vec.load(hyp_model_fn["NLL_cosh_dist"])
hyp_wv = hyp_model.wv
# Negated mean of the target vectors, printed for inspection. It is only passed to the
# plot by the commented-out variants below; the active call uses no offset.
offset = -np.average(hyp_wv.vectors, axis=0)
print(offset)

# Active call: target vs. context vectors without changing the origin.
plot_target_vs_context(hyp_model, labeled_word_dict, left_offset=None, right_offset=None, ratio_words=1)
# Alternatives: apply `offset` as a right or as a left Moebius offset.
# plot_target_vs_context(hyp_model, labeled_word_dict, left_offset=None, right_offset=offset, ratio_words=1)
# plot_target_vs_context(hyp_model, labeled_word_dict, left_offset=offset, right_offset=None, ratio_words=1)

[-0.17335184 -0.3077575 ]
