# Importing libraries

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
import numpy as np
import os
# Set before `import tensorflow` below; presumably this quiets TensorFlow's
# C++ log output (level '3') -- TODO confirm the exact level semantics.
os.environ['TF_CPP_MIN_LOG_LEVEL']='3'
import time
import random
import tensorflow as tf

# NOTE(review): presumably lets TensorFlow grow GPU memory on demand instead of
# reserving it all up front -- confirm against the TensorFlow GPU guide.
%env TF_FORCE_GPU_ALLOW_GROWTH=true
# Making sure we cache the models and they are not downloaded all the time
%env TFHUB_CACHE_DIR=./tfhub_modules

env: TF_FORCE_GPU_ALLOW_GROWTH=true
env: TFHUB_CACHE_DIR=./tfhub_modules


## Using pretrained ELMo model

### Downloading the ELMo model from TFHub

In [2]:
import tensorflow_hub as hub
import tensorflow.keras.backend as K

# Start from a clean Keras state so re-running this cell does not stack up
# leftovers from a previous run.
K.clear_session()

# Fetch ELMo (v3) from TF Hub and wrap it as a Keras layer. The "tokens"
# signature is fed a dict with pre-tokenized text and sequence lengths, and
# the outputs are requested as a dict keyed by output name.
elmo_layer = hub.KerasLayer(
    "https://tfhub.dev/google/elmo/3",
    signature="tokens",
    signature_outputs_as_dict=True,
)

### Formatting the input for ELMo

In [3]:
def format_text_for_elmo(texts, lower=True, split=" ", max_len=None):
    """Format raw strings into the inputs consumed by the ELMo "tokens" signature.

    Each text is tokenized with Keras' ``text_to_word_sequence``; every token
    sequence is then truncated or right-padded with empty strings so that the
    whole batch is rectangular.

    Args:
        texts: An iterable of strings. A single bare string is treated as a
            batch containing that one text.
        lower: Passed through to ``text_to_word_sequence`` (lowercase the text).
        split: Passed through to ``text_to_word_sequence`` (token separator).
        max_len: Optional upper bound on the number of tokens kept per text.
            A falsy value (``None`` or ``0``) means "use the longest sequence
            in the batch". A value larger than the longest sequence is clamped
            down to it.

    Returns:
        A dict with two tensors:
            "tokens": string tensor of shape ``(num_texts, seq_len)``.
            "sequence_len": int32 tensor of shape ``(num_texts,)`` holding the
                number of real (non-padding) tokens per text.

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    # Fail loudly on a nonsensical length instead of relying on an `assert`
    # (asserts are stripped when Python runs with -O).
    if max_len is not None and max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")

    # A bare string would otherwise be iterated character by character.
    if isinstance(texts, str):
        texts = [texts]

    # Tokenize every text (one list of tokens per input string)
    token_inputs = [
        tf.keras.preprocessing.text.text_to_word_sequence(text, lower=lower, split=split)
        for text in texts
    ]

    # Longest sequence in the batch (0 for an empty batch)
    max_len_inferred = max((len(tokens) for tokens in token_inputs), default=0)

    # The padded width must never exceed the longest input in the batch.
    # With an arbitrarily large width ELMo fails with an error such as:
    #   InvalidArgumentError:  Incompatible shapes: [2,6,1] vs. [2,10,1024]
    max_len = min(max_len, max_len_inferred) if max_len else max_len_inferred

    padded_inputs = []  # Token sequences, all of length max_len
    token_lengths = []  # Number of real tokens kept for each sequence
    for token_seq in token_inputs:
        # Truncate to max_len, then pad with "" up to max_len
        kept = token_seq[:max_len]
        token_lengths.append(len(kept))
        padded_inputs.append(kept + [""] * (max_len - len(kept)))

    # dtype/shape are given explicitly so that an empty batch (or a batch of
    # texts with no tokens) still yields a rank-2 string tensor rather than
    # the float32 tensor tf.constant would infer from an empty list.
    return {
        "tokens": tf.constant(
            padded_inputs, dtype=tf.string, shape=(len(padded_inputs), max_len)
        ),
        "sequence_len": tf.constant(token_lengths, dtype=tf.int32),
    }


# Quick sanity check: max_len=10 is larger than the longest input (6 tokens),
# so the padded width is clamped to 6 rather than 10.
print(format_text_for_elmo(["the cat sat on the mat", "the mat sat"], max_len=10))

{'tokens': <tf.Tensor: shape=(2, 6), dtype=string, numpy=
array([[b'the', b'cat', b'sat', b'on', b'the', b'mat'],
       [b'the', b'mat', b'sat', b'', b'', b'']], dtype=object)>, 'sequence_len': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([6, 3], dtype=int32)>}


In [4]:
# Headlines of the first five BBC business articles (001.txt - 005.txt in bbc/business)
business_titles = [
    "Ad sales boost Time Warner profit",
    "Dollar gains on Greenspan speech",
    "Yukos unit buyer faces loan claim",
    "High fuel prices hit BA's profits",
    "Pernod takeover talk lifts Domecq",
]
elmo_inputs = format_text_for_elmo(business_titles)

# Run the formatted batch through the ELMo layer
elmo_result = elmo_layer(elmo_inputs)

# Report the shape of every tensor ELMo returned
for k in elmo_result:
    v = elmo_result[k]
    print(f"Tensor under key={k} is a {v.shape} shaped Tensor")

Tensor under key=word_emb is a (5, 6, 512) shaped Tensor
Tensor under key=default is a (5, 1024) shaped Tensor
Tensor under key=elmo is a (5, 6, 1024) shaped Tensor
Tensor under key=lstm_outputs2 is a (5, 6, 1024) shaped Tensor
Tensor under key=lstm_outputs1 is a (5, 6, 1024) shaped Tensor
Tensor under key=sequence_len is a (5,) shaped Tensor
