In [9]:
import tensorflow as tf
import json
from os import listdir
from os.path import isfile, join

In [2]:
# Let TensorFlow grow GPU memory on demand instead of reserving all of it up
# front. On a machine without a GPU the list is empty and this is a no-op.
physical_devices = tf.config.list_physical_devices("GPU")
for gpu_device in physical_devices:
    # The setting is applied to every GPU: TensorFlow requires memory growth
    # to be configured identically on all visible GPUs.
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [3]:
from absl import logging

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

# TF-Hub handle of the sentence-encoder module used throughout the notebook.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"

# Load the module once; `embed` below calls this object.
model = hub.load(module_url)
print("module %s loaded" % (module_url,))
def embed(input):
  """Apply the loaded TF-Hub `model` to `input` and return its result.

  In this notebook `input` is a list of strings and the result holds one
  embedding row per string.
  """
  embeddings = model(input)
  return embeddings

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [4]:
# Three sample inputs of increasing length: a word, a sentence and a paragraph.
word = "Elephant"
sentence = "I am a sentence for which I would like to get its embedding."
paragraph = (
    "Universal Sentence Encoder embeddings also support short paragraphs. "
    "There is no hard limit on how long the paragraph is. Roughly, the longer "
    "the more 'diluted' the embedding will be.")
messages = [word, sentence, paragraph]

# Only show errors from absl logging while the model runs.
logging.set_verbosity(logging.ERROR)

message_embeddings = embed(messages)

# Print each message with its embedding length and first three components.
embedding_rows = np.array(message_embeddings).tolist()
for message, message_embedding in zip(messages, embedding_rows):
  message_embedding_snippet = ", ".join(map(str, message_embedding[:3]))
  print("Message: {}".format(message))
  print("Embedding size: {}".format(len(message_embedding)))
  print("Embedding: [{}, ...]\n".format(message_embedding_snippet))

Message: Elephant
Embedding size: 512
Embedding: [0.008344488218426704, 0.00048081763088703156, 0.06595246493816376, ...]

Message: I am a sentence for which I would like to get its embedding.
Embedding size: 512
Embedding: [0.0508086122572422, -0.016524313017725945, 0.015737785026431084, ...]

Message: Universal Sentence Encoder embeddings also support short paragraphs. There is no hard limit on how long the paragraph is. Roughly, the longer the more 'diluted' the embedding will be.
Embedding size: 512
Embedding: [-0.028332678601145744, -0.0558621846139431, -0.012941466644406319, ...]



In [10]:
dir_path = '../data/meetings'

# Keep regular files only (listdir also returns sub-directories, which cannot
# be opened) and sort the paths so the row order of `sentences` is the same on
# every run and every machine.
files = sorted(
    join(dir_path, file_name)
    for file_name in listdir(dir_path)
    if isfile(join(dir_path, file_name))
)

# Number of leading turns kept per meeting; meetings with fewer turns are dropped.
NUM_TURNS = 70

# `sentences` is a 2-D list: one row per meeting, each row holding the
# utterance text of that meeting's first NUM_TURNS turns.
sentences = []

for file in files:
    # Each file is read as JSON lines: one meeting object per line.
    with open(file, 'r') as json_file:
        for json_str in json_file:
            if not json_str.strip():
                continue  # tolerate blank / trailing empty lines

            res = json.loads(json_str)
            turns = res['meeting'][:NUM_TURNS]
            if len(turns) < NUM_TURNS:
                continue  # meeting too short to fill NUM_TURNS turns

            # An utterance is stored as a list of words; join it back into text.
            sentences.append([' '.join(obj['utt']['word']) for obj in turns])

In [11]:
from numpy import savez_compressed

# Number of leading embedding components kept per sentence (the encoder
# produces 512 components; the rest are discarded).
EMBEDDING_DIM = 100
EMBEDDINGS_PATH = '../data/obj/sentence_embeddings.npz'

if not sentences:
    raise ValueError("`sentences` is empty; nothing to embed")

# Create the output directory up front so the save cannot fail after the
# (expensive) embedding pass has already run.
os.makedirs(os.path.dirname(EMBEDDINGS_PATH), exist_ok=True)

# One float32 matrix per meeting, truncated to EMBEDDING_DIM columns, stacked
# into a single (meetings, turns, EMBEDDING_DIM) array.
sentence_embeddings = np.stack([
    np.array(embed(meeting_sentences)).astype('float32')[:, :EMBEDDING_DIM]
    for meeting_sentences in sentences
])

# Passed positionally, so the array is stored under NumPy's default key 'arr_0'.
savez_compressed(EMBEDDINGS_PATH, sentence_embeddings)

In [14]:
# Sanity check: display the dimensions of the embedding array built above.
sentence_embeddings.shape

(94, 70, 100)