In [2]:
import colorsys
import gensim
from glove_code.src.glove import Glove
from gensim.models.callbacks import LossLogger, LossSetter
from gensim.models.keyedvectors import PoincareWordEmbeddingsKeyedVectors
from IPython.display import HTML
import itertools
import logging
import matplotlib.pyplot as plt
from matplotlib import animation, rc, cm
from matplotlib.collections import LineCollection
from nltk.corpus import wordnet as wn
import numpy as np
from numpy.linalg import norm
from numpy import dot
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import TSNE
import os
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import sys

# Render plotly figures inline, loading plotly.js from the CDN (connected=True).
init_notebook_mode(connected=True)
logging.basicConfig(level=logging.WARNING)

# Directory that contains the `models/` tree used by the cells below.
# Set the THESIS_ROOT environment variable to run the notebook on another
# machine; the original author's location is kept as the default.
ROOT = os.environ.get("THESIS_ROOT", "/Users/alext/Documents/Master/Thesis/")

In [4]:
# Bucket the WordNet noun synsets by depth: level2words[d] collects the word
# part of the name of every noun synset whose max_depth() equals d.
level2words = []
for ss in wn.all_synsets(pos="n"):
    word = ss.name().rsplit(".", 2)[0]
    level = ss.max_depth()
    # Grow the table on demand instead of assuming a fixed number of levels,
    # so a deeper hierarchy cannot raise IndexError.
    while len(level2words) <= level:
        level2words.append([])
    level2words[level].append(word)

# One summary line per level: depth, number of words, first word seen.
for level, word_list in enumerate(level2words):
    print(level, len(word_list), word_list[0] if word_list else "")

0 1 entity
1 3 physical_entity
2 22 thing
3 223 whole
4 1541 congener
5 4573 organism
6 8174 benthos
7 14831 beachhead
8 13350 agon
9 13775 abort
10 10847 abdominoplasty
11 6590 realization
12 3706 male_orgasm
13 1967 double_fault
14 1144 spiccato
15 644 highland_fling
16 458 carp
17 223 domestic_carp
18 42 leather_carp
19 1 rock_hind


In [32]:
# Flatten the words of the T shallowest and the B deepest WordNet levels
# and print each group as "<count> <space-separated words>".
T = 5
B = 5
top_levels = list(itertools.chain.from_iterable(level2words[:T]))
bottom_levels = list(itertools.chain.from_iterable(level2words[-B:]))

for words in (top_levels, bottom_levels):
    print(len(words), " ".join(words))

1790 entity physical_entity abstraction thing thing object causal_agent matter psychological_feature attribute process group relation communication measure change freshener horror jimdandy pacifier security_blanket stinker whacker otherworld set substance whole substance cognition motivation state location shape time space event possession social_relation phenomenon publication charm curio draw film hoodoo je_ne_sais_quoi keepsake makeweight part property snake stuff subject triviality human_nature trait character thing common_denominator personality cheerfulness uncheerfulness ballast ethos eidos quality property inheritance position probability depth quantum interval group message contagion language written_communication message didacticism signal sign indication visual_communication display expressive_style paralanguage auditory_communication voice psychic_communication voice paring arrangement straggle kingdom biological_group community people social_group collection edition electr

In [24]:
# Get vocabulary from WordNet + labels for each section
def read_word_dict(levels=None, head=50, stride=10):
    """
    Build the {level label -> sampled words} mapping used for plotting.

    For every level, the first `head` words are kept in full and the rest is
    subsampled by taking every `stride`-th word.

    Parameters
    ----------
    levels : list of lists of str, optional
        Words grouped by level; position in the outer list is the level.
        Defaults to the notebook-level `level2words`.
    head : int
        Number of leading words of each level that are always kept.
    stride : int
        Sampling step applied to the words after the first `head`.

    Returns
    -------
    dict
        Maps str(level) to the sampled word list, in level order.
    """
    if levels is None:
        levels = level2words
    return {
        str(depth): words[:head] + words[head::stride]
        for depth, words in enumerate(levels)
    }

def compute_poincare_aggregate(model):
    """
    Compute the average between the target and the context vector, for Poincare embeddings.
    We take as average the mid point between w and c on the geodesic that connects the 2 points
    (see page 89 in Ungar book).

    Parameters
    ----------
    model : object exposing `poincare`, `wv.vectors`, `wv.moebius_mul_mat` and
        `trainables.syn1neg`. The target and context matrices are combined row
        by row.

    Returns
    -------
    The aggregated matrix, or None when `model.poincare` is falsy.
    If `model.wv.agg_vectors` is already set, that matrix is returned as is
    (assumed to hold the same w+c aggregate -- TODO confirm against the library).
    The freshly computed result is not stored on the model.
    """
    if not model.poincare:
        return None

    # Reuse an existing aggregate instead of falling through and returning None.
    cached = getattr(model.wv, 'agg_vectors', None)
    if cached is not None:
        return cached

    print("precomputing aggregated vectors w+c for Poincare embeddings")
    target = model.wv.vectors
    context = model.trainables.syn1neg
    # Squared gamma factors, 1 / (1 - ||x||^2), one per row.
    gamma_w_sq = 1 / (1 - np.sum(target * target, axis=1))
    gamma_c_sq = 1 / (1 - np.sum(context * context, axis=1))
    denominator = gamma_w_sq + gamma_c_sq - 1
    agg = (target * (gamma_w_sq / denominator)[:, None] +
           context * (gamma_c_sq / denominator)[:, None])

    # Moebius scalar multiplication by 0.5 applied to the weighted combination.
    return model.wv.moebius_mul_mat(agg, 0.5)

def HSVToRGB(h, s, v):
    """
    Convert an HSV colour (all components in [0, 1]) to an integer RGB triple.

    Each channel returned by `colorsys.hsv_to_rgb` is scaled by 255 and
    truncated with `int`, giving a tuple of three ints.
    """
    return tuple(int(255 * channel) for channel in colorsys.hsv_to_rgb(h, s, v))

def get_colors(word_dict):
    """
    Assign one fully saturated, full-brightness colour to every key of `word_dict`.

    Hues are spaced evenly: with n keys, the key at position i (in dict
    order) gets hue i / (n + 1). Returns a dict mapping each key to the
    integer RGB triple produced by `HSVToRGB`.
    """
    labels = word_dict.keys()
    hue_step = 1.0 / (len(labels) + 1)
    return {
        label: HSVToRGB(hue_step * position, 1.0, 1.0)
        for position, label in enumerate(labels)
    }

def plot_wordnet_per_level(model, word_dict, right_offset=None, left_offset=None, ratio_words=0.1, aggregate="w"):
    """
    Scatter-plot the first two embedding coordinates of the given words, one plotly trace per level.

    Parameters
    ----------
    model : object exposing `wv` (with `vocab`, `index2word`, `index2entity`, `vectors`)
        and `trainables.syn1neg`.
    word_dict : dict
        Maps a level label to the list of words of that level. Words that are
        not in `model.wv.vocab` are skipped. Each level gets its own colour.
    right_offset, left_offset : 1-D array or None
        When given, every plotted point is combined with the offset through
        `PoincareWordEmbeddingsKeyedVectors.moebius_add_mat`, with the offset as
        the left (`left_offset`) or right (`right_offset`) operand, to change
        the origin of the system. The left offset is applied first.
    ratio_words : float
        Fraction of the in-vocabulary words of each level that is plotted
        (the leading ones are kept).
    aggregate : {"w", "c", "w+c"}
        Which vectors to plot: `model.wv.vectors`, `model.trainables.syn1neg`,
        or the result of `compute_poincare_aggregate(model)`.

    Returns
    -------
    The return value of `iplot`, or None when `aggregate` is not recognised
    (a warning is logged in that case).

    Raises
    ------
    ValueError
        If `aggregate="w+c"` and `compute_poincare_aggregate` yields no vectors.
    """
    wv = model.wv
    if aggregate == "w+c":
        vectors = compute_poincare_aggregate(model)
        if vectors is None:
            # Fail here with a clear message rather than later on `None[i]`.
            raise ValueError("aggregate='w+c' requested but no aggregated vectors are available for this model")
    elif aggregate == "w":
        vectors = wv.vectors
    elif aggregate == "c":
        vectors = model.trainables.syn1neg
    else:
        logging.warning("plot_wordnet_per_level: unknown aggregate %r (expected 'w', 'c' or 'w+c')", aggregate)
        return None

    # Colour the levels that are actually plotted, not a notebook-level global.
    colors = get_colors(word_dict)

    # Keep only the rows that correspond to vocabulary entries.
    embeddings = np.asarray(vectors)[:len(wv.index2entity)]

    traces = []
    for level, words in word_dict.items():
        idxs = [wv.vocab[w].index for w in words if w in wv.vocab]
        idxs = idxs[:int(len(idxs) * ratio_words)]

        # Indexing with a list copies, so the model's vectors are never modified.
        embs = embeddings[idxs, :]
        # Change the origin of the system.
        if left_offset is not None:
            offset_mat = np.tile(left_offset.reshape(1, -1), (embs.shape[0], 1))
            embs = PoincareWordEmbeddingsKeyedVectors.moebius_add_mat(offset_mat, embs)
        if right_offset is not None:
            offset_mat = np.tile(right_offset.reshape(1, -1), (embs.shape[0], 1))
            embs = PoincareWordEmbeddingsKeyedVectors.moebius_add_mat(embs, offset_mat)

        traces.append(
            go.Scatter(
                x=embs[:, 0],
                y=embs[:, 1],
                text=[wv.index2word[idx] for idx in idxs],
                textposition='top right',
                name=str(level),
                mode="markers",
                marker=dict(color="rgb" + str(colors[level]))))

    layout = {
        'width': 700,
        'height': 650,
    }

    if isinstance(wv, PoincareWordEmbeddingsKeyedVectors):
        # Outline the unit circle for Poincare embeddings.
        layout['shapes'] = [
            {
                'type': 'circle',
                'xref': 'x',
                'yref': 'y',
                'x0': -1,
                'y0': -1,
                'x1': 1,
                'y1': 1,
                'line': {
                    'color': 'rgba(0, 0, 0, 1)',
                },
            }
        ]

    return iplot(dict(data=traces, layout=layout))

labeled_word_dict = read_word_dict()
# Report how many words were sampled for each level, in level order.
for sampled_words in labeled_word_dict.values():
    print(len(sampled_words))

1
3
22
68
200
503
863
1529
1380
1423
1130
704
416
242
160
110
91
68
42
1


# Word2Vec - 2D dot product embeddings on text8

In [11]:
# Alternative baseline checkpoint, kept for reference:
# model_fn = os.path.join(ROOT, "models/word2vec_baseline/w2v_levy_sg_5_2_A025_a0001_n5_w5_c25000_cosine_OPTsgd")
model_fn = os.path.join(
    ROOT,
    "models/word2vec_baseline/w2v_levy_sg_3_2_A025_a0001_n10_w5_c100_cosine",
)
# Load the word2vec baseline and keep a handle on its keyed vectors.
model = gensim.models.Word2Vec.load(model_fn)
wv = model.wv

#### Note: For 2D word2vec, the words appear to be embedded along a line (in higher dimensions they may likewise occupy a relatively limited submanifold). This is similar to what Fig. 3 of Hashimoto et al. shows.

In [12]:
plot_wordnet_per_level(model, labeled_word_dict, ratio_words=1, aggregate="c")

# Word2Vec - 2D Poincare embeddings

In [6]:
# levy
# Saved Poincare word2vec checkpoints, keyed by "<loss>_<similarity function>".
# The key mirrors the `nll`/`sg` and `SIM<...>` parts of the checkpoint name.
# "NLL_dist_sq" is the only checkpoint without a SIM tag; presumably it uses the
# default similarity function -- TODO confirm.
hyp_model_fn = {
    "NLL_dist_sq": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_1_2_A05_a001_n5_w5_c100_poincare_OPTwfullrsgd_burnin1"),
    "NLL_cosh_dist": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_3_2_A01_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMcosh-dist_burnin1"),
    "NLL_cosh_dist_sq": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_3_2_A01_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMcosh-dist-sq_burnin1"),
    "NLL_log_dist": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_3_2_A005_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMlog-dist_burnin1"),
    "NLL_log_dist_sq": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_3_2_A005_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMlog-dist-sq_burnin1"),
    # The two entries below used to point at each other's checkpoint.
    "SGNS_dist_sq": os.path.join(ROOT, "models/geometric_emb/w2v_levy_sg_3_2_A01_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMdist-sq_burnin1"),
    "SGNS_cosh_dist": os.path.join(ROOT, "models/geometric_emb/w2v_levy_sg_3_2_A01_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMcosh-dist_burnin1"),
    "SGNS_cosh_dist_sq": os.path.join(ROOT, "models/geometric_emb/w2v_levy_sg_3_2_A01_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMcosh-dist-sq_burnin1"),
    "NLL_exp_dist": os.path.join(ROOT, "models/geometric_emb/w2v_levy_nll_3_2_A005_a0001_n5_w5_c100_poincare_OPTwfullrsgd_SIMexp-dist_burnin1"),
}

# text8
# hyp_model_fn = os.path.join(ROOT, "models/geometric_emb/w2v_text8_nll_3_2_A01_a01_n5_w5_c50_poincare_OPTwfullrsgd_SIMcosh-dist-sq")
# hyp_model_fn = os.path.join(ROOT, "models/geometric_emb/w2v_text8_nll_1_2_A05_a001_n5_w5_c50_poincare_OPTwfullrsgd_bias_burnin1")
# hyp_model_fn = os.path.join(ROOT, "models/geometric_emb/w2v_text8_nll_1_2_A005_a005_n5_w5_c50_poincare_OPTwfullrsgd_SIMlog-dist")

In [19]:
# Load the SGNS / cosh-distance Poincare checkpoint and plot its w+c aggregate.
hyp_model = gensim.models.Word2Vec.load(hyp_model_fn["SGNS_cosh_dist"])
hyp_wv = hyp_model.wv
# No re-centring by default; use e.g. np.array([0.25, 0]) to move the origin.
offset = None

plot_wordnet_per_level(
    hyp_model,
    labeled_word_dict,
    ratio_words=1,
    left_offset=offset,
    right_offset=offset,
    aggregate="w+c",
)

precomputing aggregated vectors w+c for Poincare embeddings


# GloVe - 2D Poincare embeddings

In [22]:
# GloVe Poincare checkpoints, keyed by a short configuration label.
# NOTE: this rebinds `hyp_model_fn`, replacing the word2vec mapping defined in an
# earlier cell; re-running the word2vec Poincare plot cell after this one will
# fail with a KeyError unless that earlier cell is executed again.
hyp_model_fn = {
    "dist_sq-scale10": os.path.join(
        ROOT,
        "models/glove/geometric_emb/glove_ep50_size2_lr0.05_vocab200000_poincare_OPTradagrad_COOCCFUNClog_DISTFUNCdist-sq_bias",
    ),
}

In [27]:
# Load the GloVe Poincare checkpoint and plot its w+c aggregate per WordNet level.
hyp_model = Glove.load(hyp_model_fn["dist_sq-scale10"])
hyp_wv = hyp_model.wv
# No re-centring by default; use e.g. np.array([0.25, 0]) to move the origin.
offset = None

plot_wordnet_per_level(
    hyp_model,
    labeled_word_dict,
    ratio_words=1,
    left_offset=offset,
    right_offset=offset,
    aggregate="w+c",
)

precomputing aggregated vectors w+c for Poincare embeddings
