In [1]:
import numpy as np
import pandas as pd

import os
import collections
from absl import logging
from scipy.spatial import distance
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
import sentencepiece as spm

In [2]:
# Load the encoder from the local TF-Hub module directory.
module = hub.Module("./model")

# Token IDs are fed in as a sparse int64 matrix; both dimensions are left
# unspecified so any batch size and sentence length can be supplied.
input_placeholder = tf.sparse_placeholder(tf.int64, shape=[None, None])

# Pass the three components of the sparse placeholder to the module; the
# resulting `encodings` tensor is what later cells evaluate.
encodings = module(inputs={
    "values": input_placeholder.values,
    "indices": input_placeholder.indices,
    "dense_shape": input_placeholder.dense_shape,
})


INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [3]:
# Evaluate the module's "spm_path" signature to find where the SentencePiece
# model file shipped with the hub module is located on disk.
with tf.Session() as sess:
    spm_path = sess.run(module(signature="spm_path"))

# Load that model into a SentencePiece processor; `sp` is used by later cells
# to turn raw text into token IDs.
sp = spm.SentencePieceProcessor()
sp.Load(spm_path)
# spm_path comes back from sess.run as bytes, so it prints with a b'...' prefix.
print("SentencePiece model loaded at {}.".format(spm_path))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
SentencePiece model loaded at b'./model/assets/universal_encoder_8k_spm.model'.


In [4]:
def process_to_IDs_in_sparse_format(sp, sentences):
  """Tokenize sentences and pack the token IDs as sparse-tensor components.

  Args:
    sp: Tokenizer exposing `EncodeAsIds(text)` that returns a sequence of
      integer IDs (in this notebook, a loaded SentencePieceProcessor).
    sentences: Iterable of strings to encode. May be empty.

  Returns:
    A `(values, indices, dense_shape)` tuple:
      values: flat list of every token ID, sentence after sentence.
      indices: list of `[row, col]` pairs, one per entry in `values`, where
        `row` is the sentence position and `col` the token position.
      dense_shape: `(number of sentences, longest token count)`; `(0, 0)`
        when `sentences` is empty.
  """
  ids = [sp.EncodeAsIds(x) for x in sentences]
  # default=0 keeps an empty batch from raising ValueError inside max().
  max_len = max((len(x) for x in ids), default=0)
  dense_shape = (len(ids), max_len)
  values = [item for sublist in ids for item in sublist]
  indices = [[row, col]
             for row, sublist in enumerate(ids)
             for col in range(len(sublist))]
  return (values, indices, dense_shape)

In [5]:
# Example inputs of different lengths: a single word, a sentence, a short
# paragraph, and four short sentences used for similarity comparisons below.
word = "Elephant"
sentence = "I am a sentence for which I would like to get its embedding."
paragraph = (
    "Universal Sentence Encoder embeddings also support short paragraphs. "
    "There is no hard limit on how long the paragraph is. Roughly, the longer "
    "the more 'diluted' the embedding will be.")

sentence3 = "I like to go climbing."
sentence4 = "I prefer mountains"
sentence5 = "We go to run every morning"
sentence6 = "How old are you?"
# The position in this list is the index used to look up each embedding later.
messages = [word, sentence, paragraph, sentence3, sentence4, sentence5, sentence6]

# Tokenize every message and get the three components that are fed to the
# sparse input placeholder.
values, indices, dense_shape = process_to_IDs_in_sparse_format(sp, messages)

# Reduce logging output.
logging.set_verbosity(logging.ERROR)

with tf.Session() as session:
  # Initialize variables and lookup tables before evaluating the encoder.
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  # Compute the embeddings for all messages in a single run call.
  message_embeddings = session.run(
      encodings,
      feed_dict={input_placeholder.values: values,
                input_placeholder.indices: indices,
                input_placeholder.dense_shape: dense_shape})

  # Show each message with its embedding length and first three components.
  for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
    print("Message: {}".format(messages[i]))
    print("Embedding size: {}".format(len(message_embedding)))
    message_embedding_snippet = ", ".join(
        (str(x) for x in message_embedding[:3]))
    print("Embedding: [{}, ...]\n".format(message_embedding_snippet))

Message: Elephant
Embedding size: 512
Embedding: [0.053387485444545746, 0.053194381296634674, -0.05235603079199791, ...]

Message: I am a sentence for which I would like to get its embedding.
Embedding size: 512
Embedding: [0.03533292934298515, -0.04714975133538246, 0.012305558659136295, ...]

Message: Universal Sentence Encoder embeddings also support short paragraphs. There is no hard limit on how long the paragraph is. Roughly, the longer the more 'diluted' the embedding will be.
Embedding size: 512
Embedding: [-0.004081725142896175, -0.08954869210720062, 0.03737190365791321, ...]

Message: I like to go climbing.
Embedding size: 512
Embedding: [-0.059705447405576706, 0.018434198573231697, -0.03026321716606617, ...]

Message: I prefer mountains
Embedding size: 512
Embedding: [-0.05673089623451233, 0.021162638440728188, 0.05424637347459793, ...]

Message: We go to run every morning
Embedding size: 512
Embedding: [-0.05583438649773598, 0.058561235666275024, 0.02889111638069153, ...]

M

In [6]:
# Convert the embeddings to plain Python lists, one list per message.
vectors = np.array(message_embeddings).tolist()

# Cosine distance between message 3 ("I like to go climbing.") and each of
# the three messages after it; a smaller distance means more similar
# embeddings. A loop replaces three copy-pasted prints with identical output.
for other in (4, 5, 6):
  print("Distance 3 - {} {}".format(
      other, distance.cosine(vectors[3], vectors[other])))

Distance 3 - 4 0.397930018651406
Distance 3 - 5 0.4438911406813929
Distance 3 - 6 0.8192398775599674
