In [2]:
import colorsys
import gensim
from glove_code.src.glove import Glove
from gensim.models.callbacks import LossLogger, LossSetter
from gensim.models.keyedvectors import PoincareWordEmbeddingsKeyedVectors
from IPython.display import HTML
import itertools
import logging
import matplotlib.pyplot as plt
from matplotlib import animation, rc, cm
from matplotlib.collections import LineCollection
from nltk.corpus import wordnet as wn
import numpy as np
from numpy.linalg import norm
from numpy import dot
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import TSNE
import os
import plotly.plotly as py
from plotly import tools
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import sys

# Switch plotly's offline renderer into notebook mode before any figure is drawn.
init_notebook_mode(connected=True)
# Configure the root logger to emit warnings and above only.
logging.basicConfig(level=logging.WARNING)

# Base directory under which saved model files are looked up. Set the
# THESIS_ROOT environment variable to point the notebook at another checkout;
# when it is unset the original location is used, so existing runs are unchanged.
ROOT = os.environ.get("THESIS_ROOT", "/Users/alext/Documents/Master/Thesis/")
# Number of subplot columns in the per-embedding scatter grid.
PLOTS_PER_ROW = 3

In [2]:
# Bucket the head word of every WordNet noun synset by the synset's
# max_depth(): level2words[d] collects the words of all noun synsets at depth d.
level2words = [[] for _ in range(20)]
for ss in wn.all_synsets("n"):  # ask WordNet for nouns only instead of filtering afterwards
    # Synset names have the form "<word>.<pos>.<sense>"; splitting from the
    # right keeps any dots inside the word itself intact.
    word = ss.name().rsplit(".", 2)[0]
    level = ss.max_depth()
    # The table starts with 20 levels; grow it rather than raising IndexError
    # if the installed WordNet has a deeper hierarchy.
    while level >= len(level2words):
        level2words.append([])
    level2words[level].append(word)
for level, word_list in enumerate(level2words):
    # Print level, number of words, and a sample word (None when the level is empty).
    print(level, len(word_list), word_list[0] if word_list else None)

0 1 entity
1 3 physical_entity
2 22 thing
3 223 whole
4 1541 congener
5 4573 organism
6 8174 benthos
7 14831 beachhead
8 13350 agon
9 13775 abort
10 10847 abdominoplasty
11 6590 realization
12 3706 male_orgasm
13 1967 double_fault
14 1144 spiccato
15 644 highland_fling
16 458 carp
17 223 domestic_carp
18 42 leather_carp
19 1 rock_hind


In [20]:
# Get vocabulary from WordNet + labels for each section
def read_word_dict(levels=None, head=50, stride=10):
    """Build a down-sampled ``{level label: words}`` dict from per-level word lists.

    For every level the first ``head`` words are kept in full, followed by
    every ``stride``-th word of the remainder.

    Args:
        levels: sequence of word lists, one list per level. When omitted, the
            notebook-global ``level2words`` is used (the original behaviour).
        head: number of leading words kept verbatim for each level.
        stride: sampling step applied to the words after the first ``head``.

    Returns:
        dict mapping ``str(level_index)`` to the sampled list for that level.
    """
    if levels is None:
        levels = level2words
    return {
        str(level): word_list[:head] + word_list[head::stride]
        for level, word_list in enumerate(levels)
    }

def HSVToRGB(h, s, v):
    """Convert an HSV colour to a tuple of three integer RGB channels.

    Each channel returned by ``colorsys.hsv_to_rgb`` is multiplied by 255 and
    truncated with ``int``.
    """
    return tuple(int(255 * channel) for channel in colorsys.hsv_to_rgb(h, s, v))

def get_colors(word_dict):
    """Assign one fully saturated RGB colour to every key of ``word_dict``.

    With ``n`` keys the hue axis is cut into ``n + 1`` equal steps and the
    k-th key (in iteration order) receives hue ``k / (n + 1)``.

    Returns:
        dict mapping each key to the integer RGB tuple produced by ``HSVToRGB``.
    """
    labels = word_dict.keys()
    hue_step = 1.0 / (len(labels) + 1)
    return {
        label: HSVToRGB(hue_step * position, 1.0, 1.0)
        for position, label in enumerate(labels)
    }

def _unit_circle_shape(axis_number):
    """Return a plotly layout shape: an unfilled unit circle on axes x<axis_number>/y<axis_number>."""
    return {
        'type': 'circle',
        'xref': 'x' + str(axis_number),
        'yref': 'y' + str(axis_number),
        'x0': -1,
        'y0': -1,
        'x1': 1,
        'y1': 1,
        'line': {
            'color': 'rgba(0, 0, 0, 1)',
        },
    }


def plot_wordnet_per_level(model, word_dict, right_offset=None, left_offset=None, ratio_words=0.1, aggregate="w",
                           skip_levels=range(4, 20)):
    """Scatter-plot the words of each level, one subplot per small embedding.

    The model's vectors are treated as ``model.num_embs`` consecutive chunks of
    ``vector_size // num_embs`` coordinates; the first two coordinates of each
    chunk are drawn in that chunk's own subplot, laid out ``PLOTS_PER_ROW``
    per row. Every level gets its own colour (from ``get_colors``).

    Args:
        model: object exposing ``wv`` (with ``vocab``, ``index2entity``,
            ``index2word``, ``vectors``), ``trainables.syn1neg``,
            ``vector_size`` and ``num_embs``.
        word_dict: mapping from a level label (convertible with ``int``) to
            the list of words on that level.
        right_offset: currently unused; the origin-shifting code is disabled.
        left_offset: currently unused; the origin-shifting code is disabled.
        ratio_words: fraction of each level's in-vocabulary words to plot.
        aggregate: ``"w"`` plots ``model.wv.vectors``, ``"c"`` plots
            ``model.trainables.syn1neg`` (presumably word vs. context
            vectors -- confirm against the model class).
        skip_levels: container of integer levels that are not plotted. The
            default reproduces the previously hard-coded ``range(4, 20)``.

    Returns:
        Whatever ``iplot(fig)`` returns, or ``None`` when ``aggregate`` is not
        one of the supported values.
    """
    wv = model.wv
    if aggregate == "w":
        vectors = wv.vectors
    elif aggregate == "c":
        vectors = model.trainables.syn1neg
    else:
        # Unsupported aggregate: keep the existing contract of returning None.
        return None

    # Derive the colours from the dict that was passed in. The previous code
    # read the notebook-global `labeled_word_dict`, which only worked when the
    # caller happened to pass that very object.
    colors = get_colors(word_dict)

    # One row per vocabulary entry; a slice avoids copying the matrix row by row.
    embeddings = np.asarray(vectors)[:len(wv.index2entity)]

    num_embs = model.num_embs
    small_emb_size = model.vector_size // num_embs
    rows = int(np.ceil(num_embs / PLOTS_PER_ROW))
    fig = tools.make_subplots(rows=rows, cols=PLOTS_PER_ROW)
    for level, words in word_dict.items():
        if int(level) in skip_levels:
            continue

        # Vocabulary indices of the words the model knows, truncated to the
        # requested fraction.
        idxs = [wv.vocab[w].index for w in words if w in wv.vocab]
        idxs = idxs[:int(len(idxs) * ratio_words)]
        # Hover labels are identical for every subplot of this level.
        hover_text = [wv.index2word[idx] for idx in idxs]

        embs = embeddings[idxs, :]
#         # Change the origin of the system.
#         if left_offset is not None:
#             offset_mat = np.tile(left_offset.reshape(1, -1), (embs.shape[0], 1))
#             embs = PoincareWordEmbeddingsKeyedVectors.moebius_add_mat(offset_mat, embs)
#         if right_offset is not None:
#             offset_mat = np.tile(right_offset.reshape(1, -1), (embs.shape[0], 1))
#             embs = PoincareWordEmbeddingsKeyedVectors.moebius_add_mat(embs, offset_mat)

        for i in range(num_embs):
            # Subplot grid positions are 1-based.
            row = i // PLOTS_PER_ROW + 1
            col = i % PLOTS_PER_ROW + 1

            trace = go.Scatter(
                x=embs[:, small_emb_size * i],
                y=embs[:, small_emb_size * i + 1],
                text=hover_text,
                textposition='top right',
                name=level,
                mode="markers",
                marker=dict(color="rgb"+str(colors[level])))
            fig.append_trace(trace, row, col)

    layout = {
        'width': PLOTS_PER_ROW * 300,
        'height': rows * 300
    }

    if isinstance(wv, PoincareWordEmbeddingsKeyedVectors):
        shapes = []
        for i, trace in enumerate(fig['data']):
            # NOTE(review): this hides the legend entry of every trace except
            # the first two, i.e. not one entry per level. Behaviour is kept
            # as-is; confirm whether one legend entry per level was intended.
            if i > 1:
                trace['showlegend'] = False
            # Draw the unit circle once per subplot (axes are numbered from 1).
            if i < num_embs:
                shapes.append(_unit_circle_shape(i + 1))

        layout['shapes'] = shapes

    fig["layout"].update(layout)

    return iplot(fig)

# Build the sampled per-level vocabulary and print how many words each level kept.
labeled_word_dict = read_word_dict()
for lvl, sampled_words in labeled_word_dict.items():
    print(len(sampled_words))

1
3
22
68
200
503
863
1529
1380
1423
1130
704
416
242
160
110
91
68
42
1


# MIX-GloVe - 10x2D Poincare embeddings

In [5]:
# Paths of saved models under ROOT, keyed by a short tag.
# NOTE(review): the tag reads "50_2D" while the file name says size20 /
# NUMEMBS10 -- confirm the tag is intentional before relying on it.
hyp_model_fn = {}
hyp_model_fn["50_2D"] = os.path.join(
    ROOT,
    "models/glove/geometric_emb/glove_ep15_size20_lr0.05_vocab200000_mix-poincare_OPTmixradagrad_COOCCFUNClog_DISTFUNCdist-sq_NUMEMBS10_bias",
)

In [21]:
# Load the saved model registered under "50_2D" and keep a handle on its keyed vectors.
hyp_model = Glove.load(hyp_model_fn["50_2D"])
hyp_wv = hyp_model.wv
# No origin shift is applied; an example offset is kept for reference.
offset = None # np.array([0.25, 0])

# Plot every in-vocabulary word of the plotted levels using the "w" vectors.
plot_wordnet_per_level(
    hyp_model,
    labeled_word_dict,
    right_offset=offset,
    left_offset=offset,
    ratio_words=1,
    aggregate="w",
)

This is the format of your plot grid:
[ (1,1) x1,y1 ]    [ (1,2) x2,y2 ]    [ (1,3) x3,y3 ]  
[ (2,1) x4,y4 ]    [ (2,2) x5,y5 ]    [ (2,3) x6,y6 ]  
[ (3,1) x7,y7 ]    [ (3,2) x8,y8 ]    [ (3,3) x9,y9 ]  
[ (4,1) x10,y10 ]  [ (4,2) x11,y11 ]  [ (4,3) x12,y12 ]

