In [1]:
import colorsys
import gensim
from glove_code.src.glove import Glove
from gensim.models.callbacks import LossLogger, LossSetter
from gensim.models.keyedvectors import PoincareWordEmbeddingsKeyedVectors
from IPython.display import HTML
import itertools
import logging
import matplotlib.pyplot as plt
from matplotlib import animation, rc, cm
from matplotlib.collections import LineCollection
import numpy as np
from numpy.linalg import norm
from numpy import dot
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import TSNE
import os
from plotly import tools
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import sys

# Enable inline rendering of plotly figures in the notebook.
init_notebook_mode(connected=True)
# Only show warnings and errors from the libraries used below.
logging.basicConfig(level=logging.WARNING)

# Root directory that holds the trained models. It can be overridden with the
# THESIS_ROOT environment variable so the notebook runs on other machines;
# the default keeps the original local path.
ROOT = os.environ.get("THESIS_ROOT", "/Users/alext/Documents/Master/Thesis/")
# Number of subplot columns in the grid drawn by plot_wordnet_embeddings.
PLOTS_PER_ROW = 3

In [20]:
# Hard-coded vocabulary used for the plots: section label -> list of words in that section
def read_word_dict():
    """Return the labelled vocabulary as a fresh dict.

    Each key is a section label and each value is the list of words that belong
    to that section. Sections are inserted in the order they are declared here.
    """
    sections = (
        ("president", ["obama", "eisenhower", "roosevelt", "clinton", "jefferson", "kennedy", "lincoln", "nixon", "reagan"]),
        ("mathematics", ["geometry", "inequality", "arithmetic", "calculus", "algebra", "equation", "theorem"]),
        ("number", ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"]),
        ("chemistry", ["chemical", "oxygen", "carbon", "dioxide", "nitrogen", "chlorium", "zinc", "organic"]),
        ("sport", ["maradona", "ronaldo", "messi", "lochte", "phelps", "spitz", "comneci", "mayweather", "pacquiao"]),
        ("country", ["romania", "france", "germany", "england", "russia", "india", "china", "spain"]),
    )
    return dict(sections)

def compute_poincare_aggregate(model):
    """
    Return the average between the target and the context vector, for Poincare embeddings.
    We take as average the mid point between w and c on the geodesic that connects the 2 points
    (see page 89 in Ungar book).

    Parameters
    ----------
    model : object exposing `poincare`, `wv.vectors`, `wv.moebius_mul_mat` and
        `trainables.syn1neg` (and optionally `wv.agg_vectors`).

    Returns
    -------
    The aggregated matrix with one row per row of `model.wv.vectors`, or None when
    `model.poincare` is falsy.
    """
    if not model.poincare:
        # No aggregate is defined here for non-Poincare models.
        return None

    # If aggregated vectors are already stored on the keyed vectors, reuse them.
    # Previously this case fell through and returned None, which made the caller
    # fail when it tried to index the result.
    cached = getattr(model.wv, 'agg_vectors', None)
    if cached is not None:
        return cached

    print("precomputing aggregated vectors w+c for Poincare embeddings")
    target = model.wv.vectors
    context = model.trainables.syn1neg
    # Squared gamma factor per row: 1 / (1 - ||x||^2).
    gamma_w_sq = 1 / (1 - np.sum(target * target, axis=1))
    gamma_c_sq = 1 / (1 - np.sum(context * context, axis=1))
    denominator = gamma_w_sq + gamma_c_sq - 1
    # Gamma-weighted combination of every (w, c) pair, row by row.
    agg = (target * (gamma_w_sq / denominator)[:, None] +
           context * (gamma_c_sq / denominator)[:, None])

    # Moebius scalar multiplication by 0.5 turns the combination into the mid point.
    return model.wv.moebius_mul_mat(agg, 0.5)

def HSVToRGB(h, s, v):
    """Convert an HSV colour (components in [0, 1]) to an (r, g, b) tuple of ints.

    Each channel returned by colorsys is scaled by 255 and truncated with int().
    """
    channels = colorsys.hsv_to_rgb(h, s, v)
    return tuple(int(255 * channel) for channel in channels)

def get_colors(word_dict):
    """Assign one fully saturated, full-brightness RGB colour to every label of `word_dict`.

    The hue step is 1 / (number of labels + 1); the k-th label (in key order,
    starting at 0) gets hue k * step. Returns a dict label -> (r, g, b).
    """
    labels = word_dict.keys()
    hue_step = 1.0 / (len(labels) + 1)
    palette = [HSVToRGB(hue_step * position, 1.0, 1.0) for position in range(len(labels))]
    color_by_label = {}
    for label, rgb in zip(labels, palette):
        color_by_label[label] = rgb
    return color_by_label

def plot_wordnet_embeddings(model, word_dict, left_offset=None, right_offset=None, ratio_words=1, aggregate="w"):
    """Scatter-plot the labelled words in every sub-embedding of `model`, one subplot each.

    The embedding of size `model.vector_size` is treated as `model.num_embs`
    consecutive sub-embeddings; for each of them the first two coordinates of the
    selected words are drawn. Subplots are laid out PLOTS_PER_ROW per row.

    Parameters
    ----------
    model : trained model exposing `wv`, `trainables.syn1neg`, `num_embs` and `vector_size`.
    word_dict : dict mapping a section label to the list of words of that section.
        The label itself is plotted too; words missing from the vocabulary are skipped.
    left_offset, right_offset : accepted for backward compatibility but currently
        not applied (the Moebius translation of the origin is disabled).
    ratio_words : fraction of the found words of each section to keep. The label
        is last in the list, so it is the first one dropped when the ratio is < 1.
    aggregate : "w" for `model.wv.vectors`, "c" for `model.trainables.syn1neg`,
        "w+c" for the result of `compute_poincare_aggregate(model)`.

    Returns
    -------
    The result of `iplot(fig)`, or None when `aggregate` is not one of the
    supported values.
    """
    if aggregate == "w+c":
        vectors = compute_poincare_aggregate(model)
    elif aggregate == "w":
        vectors = model.wv.vectors
    elif aggregate == "c":
        vectors = model.trainables.syn1neg
    else:
        return None
    wv = model.wv

    # Colours come from the dictionary that was passed in, not from a notebook
    # global, so the function works with any word dictionary.
    colors = get_colors(word_dict)

    embeddings = np.array([vectors[i] for i in range(len(wv.index2entity))])

    rows = int(np.ceil(model.num_embs / PLOTS_PER_ROW))
    fig = tools.make_subplots(rows=rows, cols=PLOTS_PER_ROW)
    # Width of one sub-embedding; the same for every label, so compute it once.
    small_emb_size = model.vector_size // model.num_embs

    for label, words in word_dict.items():
        # Indices of the section words (plus the label) that exist in the vocabulary.
        idxs = [wv.vocab[w].index for w in words + [label] if w in wv.vocab]
        idxs = idxs[:int(len(idxs) * ratio_words)]

        embs = embeddings[idxs, :]
        point_labels = [wv.index2word[idx] for idx in idxs]
        marker_color = "rgb" + str(colors[label])

        for i in range(model.num_embs):
            # Sub-embedding i goes to the i-th cell of the grid (1-based row/col).
            row = i // PLOTS_PER_ROW + 1
            col = i % PLOTS_PER_ROW + 1
            trace = go.Scatter(
                x=embs[:, small_emb_size * i],
                y=embs[:, small_emb_size * i + 1],
                text=point_labels,
                textposition='top right',
                name=label,
                mode="markers+text",
                marker=dict(color=marker_color))
            fig.append_trace(trace, row, col)

    layout = {
        'width': PLOTS_PER_ROW * 300,
        'height': rows * 300
    }

    # Fix the range of every subplot axis to [-1, 1]. Iterate over the subplots
    # of the grid (not over the traces, of which there are labels x num_embs),
    # so that exactly the existing axes are configured.
    for axis_no in range(1, rows * PLOTS_PER_ROW + 1):
        layout['xaxis' + str(axis_no)] = {
            'range': [-1, 1],
        }
        layout['yaxis' + str(axis_no)] = {
            'range': [-1, 1],
        }

    fig["layout"].update(layout)

    return iplot(fig)

# Build the labelled vocabulary once; it is passed to plot_wordnet_embeddings further down.
labeled_word_dict = read_word_dict()
# Last expression of the cell: displays the number of sections.
len(labeled_word_dict)

6

# GloVe - MIX-Poincare embeddings

In [4]:
# vocab=180k
# NOTE(review): the file name below contains "vocab200000" while this cell says 180k —
# presumably 200k is the requested cap and ~180k the resulting vocabulary size; confirm.
# Maps a short model tag to the path of the saved model under ROOT.
hyp_model_fn_180k = {
    "20D_distsq": os.path.join(ROOT, "models/glove/geometric_emb/glove_ep15_size20_lr0.05_vocab200000_mix-poincare_OPTmixradagrad_COOCCFUNClog_DISTFUNCdist-sq_NUMEMBS10_bias")
}

In [21]:
# Load the saved model registered under the "20D_distsq" tag.
hyp_model = Glove.load(hyp_model_fn_180k["20D_distsq"])
hyp_wv = hyp_model.wv
# Offsets are forwarded to the plot function, which currently does not apply them.
offset = None # np.array([0.25, 0])

# Plot the context vectors (aggregate="c" selects model.trainables.syn1neg) of all labelled words.
plot_wordnet_embeddings(hyp_model, labeled_word_dict, ratio_words=1,
                        left_offset=offset, right_offset=offset, aggregate="c")

This is the format of your plot grid:
[ (1,1) x1,y1 ]    [ (1,2) x2,y2 ]    [ (1,3) x3,y3 ]  
[ (2,1) x4,y4 ]    [ (2,2) x5,y5 ]    [ (2,3) x6,y6 ]  
[ (3,1) x7,y7 ]    [ (3,2) x8,y8 ]    [ (3,3) x9,y9 ]  
[ (4,1) x10,y10 ]  [ (4,2) x11,y11 ]  [ (4,3) x12,y12 ]

