<a href="https://colab.research.google.com/github/axel-sirota/ml_ad_ai_course/blob/main/NLP%20with%20Deep%20Learning/12_RentalGenerator_GRU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Generation with a character based RNN/GRU

In this notebook we will train a character-level text generation model from scratch, in an unsupervised fashion, to replicate the descriptions of Airbnb rentals. Because of time limitations it will not be the best possible model, but it is structured so that you can easily adapt it to your own needs.

Take it easy and pay attention to the model, to how we pass the states, and to the complexity of threading those states through the network character by character.

You can run this lab either locally or in Colab.

- To run in Colab, go to `https://colab.research.google.com`, sign in, and upload this notebook. Colab offers free GPU access.
- To run locally, run `jupyter notebook` and open the notebook for this lab. You will first need to install the requirements in `requirements.txt`.

Follow the instructions. Good luck!

In [None]:
!nvidia-smi

In [None]:
!pip install textblob 'keras-nlp' 'keras-preprocessing' 'gensim==4.2.0' np_utils

In [None]:
import multiprocessing
import tensorflow as tf
import sys
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda, ELU, Conv1D, MaxPooling1D, Dropout, GRU
from keras import Model, Input
import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from textblob import TextBlob, Word
from keras_preprocessing.sequence import pad_sequences
from keras.initializers import Constant
from tensorflow.keras.layers.experimental import preprocessing
import keras_nlp
import os
import time
import sys
import numpy as np
import random
import os
import pandas as pd
import gensim
import warnings
import nltk
import pickle
from tensorflow.nn import leaky_relu

import re
import warnings
from sklearn.model_selection import train_test_split
from textblob import TextBlob


# When True, set_seeds_and_trace() turns on TensorFlow device-placement logging.
TRACE = False
# Embedding width passed to RentalGenerator.
embedding_dim = 100
# Number of recurrent units passed to RentalGenerator.
rnn_units = 128
# Presumably the number of training epochs; the training cell is still a placeholder — TODO confirm.
epochs=50
# Shuffle buffer size used when shuffling the train and test datasets.
buffer_size = 2000
# Number of rows sampled from the training CSV.
corpus_size=30000
# Number of rows sampled from the test CSV.
test_corpus_size=1000
# Number of sequences per batch; the datasets are batched with drop_remainder=True.
batch_size = 64
# Chunk length used by get_ids_tensor when slicing reviews; also handed to RentalGenerator.
seq_length = 100
# NOTE(review): not referenced by any code visible in this notebook — confirm whether it is still needed.
BATCH = True

def set_seeds_and_trace():
  """Seed the NumPy, TensorFlow and Python RNGs with 42 and optionally trace device placement.

  Also sets the PYTHONHASHSEED environment variable to '0'. When the module-level
  TRACE flag is truthy, TensorFlow device-placement logging is switched on.
  """
  os.environ['PYTHONHASHSEED'] = '0'
  # Same seed for every library, applied in a fixed order: NumPy, TensorFlow, random.
  for seed_fn in (np.random.seed, tf.random.set_seed, random.seed):
    seed_fn(42)
  if not TRACE:
    return
  tf.debugging.set_log_device_placement(True)

def set_session_with_gpus_and_cores():
  """Create a TF1-compat session and register it as the Keras backend session.

  The session config declares one CPU device per core reported by
  multiprocessing and one GPU device per physical GPU TensorFlow lists, and
  pins both intra-op and inter-op parallelism to a single thread.
  """
  tf_v1 = tf.compat.v1
  cpu_total = multiprocessing.cpu_count()
  gpu_total = len(tf.config.list_physical_devices('GPU'))
  session_config = tf_v1.ConfigProto(
      device_count={'GPU': gpu_total, 'CPU': cpu_total},
      intra_op_parallelism_threads=1,
      inter_op_parallelism_threads=1,
  )
  tf_v1.keras.backend.set_session(tf_v1.Session(config=session_config))

# Seed the Python, NumPy and TensorFlow RNGs (and enable placement logging if TRACE is set).
set_seeds_and_trace()
# Register the single-threaded TF1-compat session as the Keras backend session.
set_session_with_gpus_and_cores()
# Suppress all Python warnings from here on.
warnings.filterwarnings('ignore')
# Fetch NLTK's 'punkt' data; presumably needed by TextBlob's word tokenization — TODO confirm.
nltk.download('punkt')
# Word tokenizer used by preprocess_text: returns TextBlob(x).words for the given text x.
textblob_tokenizer = lambda x: TextBlob(x).words

In [None]:
%%writefile get_data.sh
if [ ! -f train_corpus_descriptions_airbnb.csv ]; then
  wget -O train_corpus_descriptions_airbnb.csv https://www.dropbox.com/s/5rp7ibop99qyafo/train_corpus_descriptions_airbnb.csv?dl=0
fi

if [ ! -f test_corpus_descriptions_airbnb.csv ]; then
    wget -O test_corpus_descriptions_airbnb.csv https://www.dropbox.com/s/a29bbkg8hi4q4f4/test_corpus_descriptions_airbnb.csv?dl=0
fi

In [None]:
!bash get_data.sh

In [None]:
def _load_descriptions(csv_path, sample_size):
    """Read a header-less, single-column CSV and return `sample_size` randomly sampled non-null rows.

    The column is named "review" and the returned frame has a fresh 0-based index.
    """
    frame = pd.read_csv(csv_path, header=None, names=["review"])
    frame = frame.dropna()
    frame = frame.sample(n=sample_size)
    return frame.reset_index(drop=True)


train_path = "./train_corpus_descriptions_airbnb.csv"
test_path = "./test_corpus_descriptions_airbnb.csv"
# Load train before test so the two random samples are always drawn in the same order.
airbnb_reviews = _load_descriptions(train_path, corpus_size)
test_airbnb_reviews = _load_descriptions(test_path, test_corpus_size)


In [None]:
airbnb_reviews

In [None]:
# Take a look at the first review in text
print(airbnb_reviews.iloc[0].review)


In [None]:
vocab = set()

# Construct the vocabulary, you are free to iterate the DF or use TF utilities. What is key is that the vocab will be of characters!

In [None]:
print(f'{len(vocab)} unique characters')


In [None]:
# Use StringLookup to construct a map with ids_from_chars
ids_from_chars = None

In [None]:
# And construct the inverse
chars_from_ids = None

In [None]:
def text_from_ids(ids):
  """Turn ids back into text: look up each id's character, then join along the last axis."""
  chars = chars_from_ids(ids)
  joined = tf.strings.reduce_join(chars, axis=-1)
  return joined

In [None]:
ids = ids_from_chars(tf.strings.unicode_split('Only you can prevent forest fires', input_encoding='UTF-8'))
ids

In [None]:
text_from_ids(ids)

In [None]:
def preprocess_text(text, should_join=True):
    """Normalise a piece of text into lower-case tokens.

    The input is coerced to ``str``, split into words with ``textblob_tokenizer``,
    lower-cased, cleaned with two regex passes and finally tokenised with
    ``gensim.utils.simple_preprocess``.

    Args:
      text: value to clean; anything accepted by ``str()``.
      should_join: when true return the tokens joined by single spaces,
        otherwise return the token list itself.

    Returns:
      A space-joined string, or the list produced by ``simple_preprocess``.
    """
    lowered_words = [str(token).lower() for token in textblob_tokenizer(str(text))]
    cleaned = ' '.join(lowered_words)
    # Pad . , ! ? with spaces, then squash every run of other non-letters into one space.
    cleaned = re.sub(r"([.,!?])", r" \1 ", cleaned)
    cleaned = re.sub(r"[^a-zA-Z.,!?]+", r" ", cleaned)
    tokens = gensim.utils.simple_preprocess(cleaned)
    return ' '.join(tokens) if should_join else tokens

In [None]:
def get_ids_tensor(df):
  """Stack fixed-width chunks of character ids from every review in ``df.review``.

  Each review is cut into consecutive chunks of ``seq_length`` items (the trailing
  remainder is dropped) and every chunk becomes one row of the result. The
  ``None`` placeholders below are lab exercises and must be filled in before
  this function can run.

  Args:
    df: object exposing a ``review`` attribute that iterates over the reviews.

  Returns:
    An int64 tensor with ``seq_length`` columns and one row per chunk.
  """
  # This is a cool trick. Since tf.concat requires same shapes, start with a row of 0's,
  # concat all that's needed and then return all rows but the first
  all_ids = tf.constant(np.zeros((1, seq_length)), dtype='int64')

  for review in df.review:
      review = None # Preprocess the review (should it output text or a list?)
      review_length = len(review)
      batches = review_length // seq_length
      for batch in range(batches):  # As a review may be longer than the seq_length, and the output tensor must have the same width, we need to batch
        lower_limit = None  # Fill lower limit of batch in the review
        upper_limit = None  # Fill upper limit of batch in the review (we will drop the final remainder)
        value = None # Get tensor of ids for each character position
        # Use seq_length (not a literal 100) so the row width follows the configured
        # chunk size, matching the seed row and the chunk count computed above.
        value = tf.reshape(value, [1, seq_length])  # In case the end shape is different
        output = tf.concat([all_ids,value], axis=0)  # Add row of that batch
        all_ids = tf.reshape(output, [-1, seq_length])  # Ensure column width
  return all_ids[1:]

all_ids = get_ids_tensor(df=airbnb_reviews)

In [None]:
print(all_ids.shape)

In [None]:
ids_from_chars(tf.strings.unicode_split(airbnb_reviews.review[0][0:100], 'UTF-8'))

In [None]:
test_all_ids = get_ids_tensor(df=test_airbnb_reviews)

In [None]:
#Prepare the dataset
ids_dataset = None # Create tensorflow Dataset object from all_ids tensor
test_ids_dataset = None # Do likewise for test set

In [None]:
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair shifted by one position.

    The input is everything except the last element; the target is everything
    except the first, so target[i] is the element that follows input[i].
    """
    return sequence[:-1], sequence[1:]

In [None]:
dataset = ids_dataset.map(split_input_target)
test_dataset = test_ids_dataset.map(split_input_target)


In [None]:
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

In [None]:
def _shuffle_and_batch(ds):
    """Shuffle `ds` with a `buffer_size` buffer, then group it into full batches of `batch_size`."""
    shuffled = ds.shuffle(buffer_size)
    return shuffled.batch(batch_size=batch_size, drop_remainder=True)


# Same pipeline for both splits; train is built first, then test.
dataset = _shuffle_and_batch(dataset)
test_dataset = _shuffle_and_batch(test_dataset)

In [None]:
class RentalGenerator(tf.keras.Model):
  """Character-level generator made of an embedding, a recurrent layer and a dense layer.

  The three layers and two steps of ``call`` are left as ``None`` placeholders to
  be completed as part of the lab.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units, seq_length):
    """Record the sizes and declare the (placeholder) layers.

    Args:
      vocab_size: size of the vocabulary; stored on the instance.
      embedding_dim: intended width of the embedding layer.
      rnn_units: intended number of units of the recurrent layer.
      seq_length: input sequence length; stored and used by ``build_graph``.
    """
    # The base initializer takes no model argument. The previous
    # `super().__init__(self)` passed the instance a second time as a stray
    # positional argument; OneStep in this notebook already uses the plain form.
    super().__init__()
    self.vocab_size = vocab_size
    self.seq_length = seq_length
    self.embedding = None # Create Embedding layer, which should be its shape? If you want test out training a word2vec and set it here to fine-tune
    self.rnn = None # Create a GRU layer with tanh activation to ensure you can use cuDNN libraries. You should return both sequences and states
    self.dense = None # Final Dense layer as always, should you return a softmax?

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the forward pass.

    Args:
      inputs: batch of id sequences fed to the embedding.
      states: recurrent state to start from; when ``None`` the recurrent
        layer's initial state is requested.
      return_state: when true, also return the recurrent state.
      training: forwarded to the layers.

    Returns:
      The dense layer's output, or ``(output, states)`` when ``return_state`` is true.
    """
    x = inputs
    x = None # Apply embedding, pass the training var
    if states is None:
      states = self.rnn.get_initial_state(x)
    x, states = None, None # Apply rnn layer, passing the states
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

  def build_graph(self):
    """Trace ``call`` on a symbolic input of length ``seq_length`` and build the model."""
    x = Input(shape=(self.seq_length, ))   # doesn't consider the batch
    # The functional Model is created only to run `call` on the symbolic input; it is discarded.
    Model(inputs=x, outputs=self.call(x))
    self.build((None, self.seq_length, ))   # takes into consideration the batch size

In [None]:
model = RentalGenerator(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    seq_length=seq_length)

In [None]:
model.build_graph()

In [None]:
model.summary()

In [None]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

In [None]:
# Set a perplexity metric
perplexity = None
# Compile the model, passing perplexity as the metric and check that the loss is correct

In [None]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, min_delta=0.05)
history = None # train the model

In [None]:
import matplotlib.pyplot as plt

# function for plotting loss
def plot_metrics(train_metric, val_metric=None, metric_name=None, title=None, ylim=5):
    """Plot a training metric curve and, optionally, its validation counterpart.

    Args:
      train_metric: sequence of per-epoch training values (drawn in blue).
      val_metric: optional sequence of per-epoch validation values (drawn in green).
      metric_name: legend label for the training curve; the validation curve is
        labelled ``'val_' + metric_name``. When ``None`` neither curve is labelled.
      title: plot title.
      ylim: upper bound of the y axis; the lower bound is 0.
    """
    plt.title(title)
    plt.ylim(0,ylim)
    plt.plot(train_metric,color='blue',label=metric_name)
    if val_metric is not None:
        # Build the 'val_' label only when a name was given: with the default
        # metric_name=None, 'val_' + metric_name raised TypeError.
        val_label = None if metric_name is None else 'val_' + metric_name
        plt.plot(val_metric,color='green',label=val_label)
    plt.legend(loc="upper right")

# plot loss history
plot_metrics(history.history['loss'], val_metric=history.history['val_loss'], metric_name="Loss", title="Loss", ylim=5.0)

In [None]:
plot_metrics(history.history['perplexity'], val_metric=history.history['val_perplexity'], metric_name="perplexity", title="perplexity", ylim=20.0)

In [None]:
class OneStep(tf.keras.Model):
  """Wraps a model plus its two lookup layers to sample one next character per call."""

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    """Keep the collaborators and precompute the additive mask that bans "[UNK]".

    Args:
      model: callable accepting ``inputs``, ``states`` and ``return_state``.
      chars_from_ids: lookup mapping ids to characters.
      ids_from_chars: lookup mapping characters to ids; must expose ``get_vocabulary()``.
      temperature: divisor applied to the logits before sampling.
    """
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Dense vector over the vocabulary: -inf at the "[UNK]" id, default (zero) elsewhere.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocab_len = len(ids_from_chars.get_vocabulary())
    unk_mask = tf.SparseTensor(
        indices=unk_ids,
        values=[-float('inf')] * len(unk_ids),
        dense_shape=[vocab_len])
    self.prediction_mask = tf.sparse.to_dense(unk_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: string tensor with the text generated so far (or the seed).
      states: recurrent state from the previous call, or ``None``.

    Returns:
      ``(predicted_chars, states)``: the sampled characters and the updated state.
    """
    # Strings -> UTF-8 characters -> ids, densified for the model.
    token_ids = self.ids_from_chars(
        tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

    # logits.shape is [batch, char, next_char_logits]
    logits, states = self.model(
        inputs=token_ids, states=states, return_state=True)

    # Keep the last position only, scale by temperature, then forbid "[UNK]".
    last_logits = logits[:, -1, :] / self.temperature
    last_logits = last_logits + self.prediction_mask

    # Draw one id per batch row and drop the sample axis.
    sampled_ids = tf.random.categorical(last_logits, num_samples=1)
    sampled_ids = tf.squeeze(sampled_ids, axis=-1)

    # Ids back to characters, returned together with the model state.
    return self.chars_from_ids(sampled_ids), states

In [None]:
# Wrap the trained model and both lookup layers in the single-step sampler.
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)
start = time.time()
# No recurrent state yet: the first generate_one_step call is made with states=None.
states = None
# Seed text. NOTE(review): it contains upper-case letters, a digit and a hyphen; if the training
# text goes through preprocess_text these characters would presumably map to [UNK] — confirm.
next_char = tf.constant(['Midtown Sunny 2-Bedroom'])
result = [next_char]

# Generate 250 characters, feeding each sampled character and the returned states back in.
for n in range(250):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Join the seed and every sampled character into a single string tensor.
result = tf.strings.join(result)
end = time.time()
print(result, '\n\n' + '_'*80)
print('\nRun time:', end - start)



In [None]:
tf.saved_model.save(one_step_model, 'rental_generator')