In [None]:
import numpy as np
np.random.seed(42)
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
import os
import time
import sys

In [None]:
# In case your sys.path does not contain the base repo, cd there.
print(sys.path)
%cd 'PATH_OF_BASE_REPO'  # In the solution it will be the path to my repo. This is such that python loads al the files from the top.

In [None]:
path = "dataset/train_corpus_descriptions_airbnb.csv"
# Only load 1M characters for speed.
# The context manager closes the file handle, and read(1000000) stops after at most
# 1M decoded characters instead of decoding the whole file and slicing afterwards.
# newline='' turns off newline translation so the characters are the same ones a raw
# UTF-8 decode of the bytes would give.
with open(path, encoding='utf-8', newline='') as corpus_file:
    text = corpus_file.read(1000000)
# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')


In [None]:
# Preview the first 250 characters of the loaded text.
print(text[:250])


In [None]:
# TODO(exercise): replace the empty set below with the distinct characters of `text`.
vocab = set()  # Construct the vocabulary of distinct characters in the text
print(f'{len(vocab)} unique characters')


In [None]:
# Use the preprocessing StringLookup class to create ids from the characters
# TODO(exercise): the None placeholder must be replaced with that layer; later cells call `ids_from_chars(...)`.
ids_from_chars = None

In [None]:
# Use the preprocessing StringLookup class to recover the characters
# TODO(exercise): the None placeholder must be replaced with that layer; `text_from_ids` calls `chars_from_ids(ids)`.
chars_from_ids = None

In [None]:
def text_from_ids(ids):
  """Turn ids into characters with `chars_from_ids` and join them along the last axis."""
  chars = chars_from_ids(ids)
  return tf.strings.reduce_join(chars, axis=-1)

Notice that, with the preprocessors in place, we can convert back and forth between text, characters, and IDs.

In [None]:
# Split an example sentence into UTF-8 characters and look up the id of each one.
ids = ids_from_chars(tf.strings.unicode_split('Only you can prevent forest fires', input_encoding='UTF-8'))
ids

In [None]:
# Round trip: join the characters recovered from `ids` back into a single string.
text_from_ids(ids)

In [None]:
# Prepare the dataset

# Encode the whole text as one long tensor of character ids.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
# Build a dataset by slicing `all_ids` along its first axis.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
seq_length = 75  # Maximum alternate query size
# Number of full (seq_length + 1)-character chunks that fit in the text.
examples_per_epoch = len(text)//(seq_length+1)
# Group the ids into chunks of seq_length + 1: one extra id so that input and target
# can be shifted by one position. The incomplete tail is dropped.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)


In [None]:
def split_input_target(sequence):
    """Return the (input, target) pair for `sequence`.

    The input is every element but the last and the target is every element but
    the first, so the target is the input shifted one step ahead and both have
    the same length.
    """
    return sequence[:-1], sequence[1:]

A key part of our model is reading the first N characters and trying to predict the next one. Since producing a single character from N inputs would be dimensionally more complex, we keep the same input and output dimensions and simply shift the text by one character: at every position, the target is the character that follows. This may look odd to us, but it makes it easier for the model to learn from the context.

In [None]:
# Try the split on a plain Python list of characters.
split_input_target(list("Apache Solr"))

In [None]:
# TODO(exercise): apply the method split_input_target to the `sequences` tf.data.Dataset.
# This way we create a generator that will yield (input_ids, target_ids) tuples.
dataset = None

In [None]:
# Take one (input, target) pair from the dataset and print both decoded back to text.
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

In [None]:
# Training hyperparameters
BATCH_SIZE = 64
EPOCHS = 10
BUFFER_SIZE = 2000
# NOTE(review): this count comes from `vocab` alone; the cell that builds the model sizes it
# from `ids_from_chars.get_vocabulary()` instead, which may differ — confirm before reusing it.
vocab_size = len(vocab)
embedding_dim = 100
rnn_units = 128   # 1024 is more recommended with enough GPUs


# Create a TF Dataset Generator that shuffles the dataset and batches in the corresponding BATCH_SIZE
# TODO(exercise): the empty tuple is a placeholder to be replaced.
dataset = ()

Larger Keras models are usually written as Python classes that inherit from the `keras.Model` class. Besides the constructor, the only method you need to implement is `call`.

In [None]:
class QueryGenerator(tf.keras.Model):
  """Character-level model: embedding -> recurrent layer -> dense layer.

  The three layers are exercise placeholders (None) that have to be filled in
  before the model can be called.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the layers of the model.

    Args:
      vocab_size: vocabulary size, first dimension of the embedding.
      embedding_dim: second dimension of the embedding.
      rnn_units: number of units of the recurrent layer.
    """
    # `super()` already binds the instance, so it must not be passed again as a
    # positional argument (OneStep in this notebook is initialised the same way).
    super().__init__()
    self.embedding = None # Create an Embedding of size (vocab_size, embedding_dim)
    self.rnn = None # Create a GRU layer [may be SimpleRNN too] with rnn_units, any activation of your liking, and to return both sequences and states.
    # It is important to return the states because our model will get the embedding from letter N, the states from N-1, and from that predict N+1
    self.dense = None  # Create a Dense layer to predict which character to use. Which activation is best?

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run embedding, recurrent layer and dense layer on `inputs`.

    Args:
      inputs: input passed to the embedding layer.
      states: initial state of the recurrent layer; when None it is obtained
        from `self.rnn.get_initial_state`.
      return_state: when true, also return the state of the recurrent layer.
      training: forwarded to every layer.

    Returns:
      The dense layer output, or the tuple (output, states) when
      `return_state` is true.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.rnn.get_initial_state(x)
    x, states = self.rnn(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [None]:
# Build the model from the hyperparameters defined earlier in the notebook.
model = QueryGenerator(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [None]:
# Initialize the model by predicting one input batch:
for input_example_batch, target_example_batch in dataset.take(1):
    # TODO(exercise): replace None with the model's output for `input_example_batch`.
    example_batch_predictions = None
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

In [None]:
# Print the model summary (see the remark about parameters right after this cell).
model.summary()

The number of parameters can grow considerably if we use all the text (a larger vocabulary) or replace the GRU with an LSTM.

In [None]:
# TODO(exercise): compile the model with any optimizer and a categorical cross-entropy loss
# computed from logits, which is best for classification tasks such as this one.
# NOTE(review): the targets built earlier are integer character ids, so presumably the
# sparse variant of the loss is the one that fits — confirm.
None

In [None]:
# Train on `dataset` for EPOCHS epochs.
model.fit(dataset, epochs=EPOCHS)

In [None]:
# Do not worry about this, it is the code to sample the next character by predicting the output of letter N and remembering the states of N-1.
class OneStep(tf.keras.Model):
  """Wraps a model and its lookup layers to sample one next character per call."""

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    """Store the model, the lookup layers and the temperature, and build the "[UNK]" mask."""
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Additive mask as long as the vocabulary, with -inf at the id of "[UNK]"
    # so that adding it to the logits prevents "[UNK]" from being generated.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocabulary_size = len(ids_from_chars.get_vocabulary())
    unk_mask = tf.SparseTensor(
        indices=unk_ids,
        values=[-float('inf')]*len(unk_ids),
        dense_shape=[vocabulary_size])
    self.prediction_mask = tf.sparse.to_dense(unk_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: strings to continue.
      states: state forwarded to the wrapped model (None on the first call).

    Returns:
      A tuple (predicted characters, state returned by the model).
    """
    # Strings -> UTF-8 characters -> dense tensor of token ids.
    chars = tf.strings.unicode_split(inputs, 'UTF-8')
    token_ids = self.ids_from_chars(chars).to_tensor()

    # logits.shape is [batch, char, next_char_logits]
    logits, states = self.model(
        inputs=token_ids, states=states, return_state=True)

    # Keep only the last position, scale by the temperature, then add the
    # mask that rules out "[UNK]".
    last_logits = logits[:, -1, :]/self.temperature
    last_logits = last_logits + self.prediction_mask

    # Draw one token id per row and drop the sample axis.
    sampled_ids = tf.squeeze(
        tf.random.categorical(last_logits, num_samples=1), axis=-1)

    # Map the sampled ids back to characters and hand back the model state too.
    return self.chars_from_ids(sampled_ids), states

In [None]:
# Wrap the model for sampling and time the whole generation.
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)
start = time.time()
states = None
# Seed text; `result` collects the seed followed by every generated character.
next_char = tf.constant(['Midtown Sunny 2-Bedroom'])
result = [next_char]

# Create a 75 character suggestion to continue the query above!
for n in range(75):
  # TODO(exercise): replace None with the next character produced by `one_step_model`.
  next_char = None
  result.append(next_char)

# Concatenate the collected pieces into one string tensor.
result = tf.strings.join(result)
end = time.time()
print(result, '\n\n' + '_'*80)
print('\nRun time:', end - start)

Not that bad for something we have done in 15 minutes, right? Now imagine models like GPT-2, which are trained for far longer!

In [None]:
# Export the one-step generator with tf.saved_model.save to the given directory.
tf.saved_model.save(one_step_model, '3-query-generation/lab6/alternative_queries')