## Visualize Word Embeddings with [Tensorflow](http://www.insightsbot.com/blog/27LIms/visualize-word-embeddings-with-tensorflow)

https://projector.tensorflow.org/  
https://stackoverflow.com/questions/50492676/visualize-gensim-word2vec-embeddings-in-tensorboard-projector

In [1]:
import gensim
from gensim.models import Word2Vec,KeyedVectors

In [2]:
# Base folder: the model file is read from here and the TensorBoard files
# are written below it.
FOLDER_PATH = "./nlp/"

# Load our own word vectors from a file in the binary word2vec format.
model = KeyedVectors.load_word2vec_format(FOLDER_PATH+'our_model.bin', binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [3]:
# Report the number of entries in the loaded model's vocabulary.
print("Vocabulary Size: {0}".format(len(model.vocab)))

Vocabulary Size: 31262


In [4]:
# Peek at the first six vocabulary entries as a quick sanity check.
for position, token in enumerate(model.vocab):
    if position >= 6:
        break
    print(token)

gewerbe
körper
besonderer
wild
feld
wer


In [6]:
# Inspect the shape of a single word vector.
model["gewerbe"].shape

(100,)

In [8]:
import numpy as np

# Important parameters.
# Lower VOCAB_SIZE to export only the first N words of the model.
VOCAB_SIZE = len(model.vocab)
# Read the dimensionality from the model itself instead of looking up a
# specific word, so the cell also works for models with another vocabulary.
EMBEDDING_DIM = model.vector_size

# Embedding matrix, one row per word; filled while the metadata is written.
w2v = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

In [10]:
import os

tsv_file_path = FOLDER_PATH+"tensorboard/metadata.tsv"

# open() does not create missing parent folders, so make sure the
# TensorBoard directory exists before writing into it.
os.makedirs(os.path.dirname(tsv_file_path), exist_ok=True)

# Fill the embedding matrix and write the label file in the same pass, so
# that row i of w2v always matches line i of the metadata file.
with open(tsv_file_path, 'w', encoding='utf-8') as file_metadata:
    for i,word in enumerate(model.index2word[:VOCAB_SIZE]):
        w2v[i] = model[word]
        file_metadata.write(word+'\n')

In [12]:
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

# FOLDER_PATH already ends with a slash, so no extra separator is added here;
# this keeps the path identical to the directory the metadata file is written to.
TENSORBOARD_FILES_PATH = FOLDER_PATH+"tensorboard"

In [13]:
# Placeholder that carries the embedding matrix into the graph, and the
# variable that is initialised from it.
# NOTE(review): presumably the placeholder is used so the matrix is fed at
# initialisation time rather than stored as a constant in the graph - confirm.
X_init = tf.placeholder(tf.float32, shape=(VOCAB_SIZE, EMBEDDING_DIM), name="embedding")
X = tf.Variable(X_init)

# Initializer op for all variables defined so far (must come after X).
init = tf.global_variables_initializer()

# Start the session and initialise X with the word vectors collected in w2v.
sess = tf.Session()
sess.run(init, feed_dict={X_init: w2v})

# Saver for the checkpoint, plus a writer that stores the graph in the
# TensorBoard directory. Both are used again in the next cell.
saver = tf.train.Saver()
writer = tf.summary.FileWriter(TENSORBOARD_FILES_PATH, sess.graph)

In [14]:
import os

# Configure a TensorBoard projector entry for the embedding variable.
config = projector.ProjectorConfig()
embed = config.embeddings.add()
# Tie the entry to the saved variable; without a tensor name the projector
# has no way to associate the label file with the embedding.
embed.tensor_name = X.name
# Use an absolute path so the label file is found no matter which directory
# TensorBoard is started from or how it resolves relative paths.
embed.metadata_path = os.path.abspath(tsv_file_path)

# Write projector_config.pbtxt into the writer's log directory.
projector.visualize_embeddings(writer,config)

# Save a checkpoint containing the embedding variable.
saver.save(sess, TENSORBOARD_FILES_PATH+'/model.ckpt', global_step = VOCAB_SIZE)

# Flush and release the event file, then close the session.
writer.close()
sess.close()

In [None]:
!python -m tensorboard.main --logdir=~/Documents/repos/exploreAT-Concepts/nlp/tensorboard

In [None]:
#https://gist.github.com/BrikerMan/7bd4e4bd0a00ac9076986148afc06507
# encoding: utf-8
"""
@author: BrikerMan
@contact: eliyar917@gmail.com
@blog: https://eliyar.biz
@version: 1.0
@license: Apache Licence
@file: w2v_visualizer.py
@time: 2017/7/30 9:37 AM
"""
import sys
import os
import pathlib
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector


def visualize(model, output_path):
    """Export word vectors as files for the TensorBoard embedding projector.

    Three artefacts are written into ``output_path``: ``w2x_metadata.tsv``
    with one vocabulary word per line, a ``w2x_metadata.ckpt`` checkpoint
    holding the embedding matrix as a non-trainable variable, and the
    projector configuration that links the two.

    Args:
        model: Loaded gensim vectors. Must provide ``wv.index2word``,
            ``vector_size`` and vector lookup via ``model[word]``.
        output_path: Directory that receives the files. It is created if it
            does not exist yet.
    """
    meta_file = "w2x_metadata.tsv"
    os.makedirs(output_path, exist_ok=True)

    words = model.wv.index2word
    placeholder = np.zeros((len(words), model.vector_size))

    # Fill the matrix and write the labels in one pass so row i and line i
    # always refer to the same word.
    with open(os.path.join(output_path, meta_file), 'wb') as file_metadata:
        for i, word in enumerate(words):
            placeholder[i] = model[word]
            # temporary solution for https://github.com/tensorflow/tensorflow/issues/9094
            if word == '':
                print("Emply Line, should replecaed by any thing else, or will cause a bug of tensorboard")
                file_metadata.write("{0}".format('<Empty Line>').encode('utf-8') + b'\n')
            else:
                file_metadata.write("{0}".format(word).encode('utf-8') + b'\n')

    # Build the model (no training) in a private graph. This keeps variables
    # that already live in the default graph out of the initializer and the
    # checkpoint, and lets the function be called repeatedly without the
    # variable being renamed to 'w2x_metadata_1', 'w2x_metadata_2', ...
    graph = tf.Graph()
    with graph.as_default(), tf.Session(graph=graph) as sess:
        # The variable only has to exist in the graph; it is saved by name.
        tf.Variable(placeholder, trainable=False, name='w2x_metadata')
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        writer = tf.summary.FileWriter(output_path, sess.graph)
        try:
            # Register the embedding and its label file with the projector.
            config = projector.ProjectorConfig()
            embed = config.embeddings.add()
            embed.tensor_name = 'w2x_metadata'
            embed.metadata_path = meta_file

            projector.visualize_embeddings(writer, config)
            saver.save(sess, os.path.join(output_path, 'w2x_metadata.ckpt'))
        finally:
            # Always flush and release the event file, even if saving fails.
            writer.close()

    print('Run `tensorboard --logdir={0}` to run visualize result on tensorboard'.format(output_path))


if __name__ == "__main__":
    # Use model.save_word2vec_format to save the w2v model in word2vec format,
    # then run `python w2v_visualizer.py word2vec.text visualize_result`.
    try:
        model_path = sys.argv[1]
        output_path = sys.argv[2]
    except IndexError:
        # Stop here: continuing without both arguments would only fail later
        # with a NameError on the missing variables.
        print("Please provice model path and output path")
        sys.exit(1)
    model = KeyedVectors.load_word2vec_format(model_path)
    pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
    # Kept inside the guard so importing this module does not start an export.
    visualize(model, output_path)