# Transfer Learning from GPT-2


In this notebook we will generate rental descriptions by using transfer learning from GPT-2 as an encoder.

Take your time and inspect the outputs of an inference pass. Remember to set the GPT-2 layer as non-trainable, or training can easily take weeks!

You can run this lab either locally or in Colab.

- To run in Colab, go to `https://colab.research.google.com`, sign in, and upload this notebook. Colab offers free GPU access.
- To run locally, start `jupyter notebook` and open the notebook for this lab. You will first need to install the requirements listed in `requirements.txt`.

Follow the instructions. Good luck!

In [None]:
!nvidia-smi

In [None]:
!pip install textblob 'gensim==4.2.0' 'keras-nlp' 'transformers'

In [None]:
import multiprocessing
import tensorflow as tf
import sys
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda, ELU, Conv1D, MaxPooling1D, Dropout
from keras import Model, Input
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from textblob import TextBlob, Word
from keras_preprocessing.sequence import pad_sequences
from keras.initializers import Constant
from tensorflow.keras.layers.experimental import preprocessing
from transformers import GPT2Tokenizer, TFGPT2Model
import keras_nlp
import os
import time
import sys
import numpy as np
import random
import os
import pandas as pd
import gensim
import warnings
import nltk
import pickle
from tensorflow.nn import leaky_relu

import re
import warnings
from sklearn.model_selection import train_test_split
from textblob import TextBlob
from collections import defaultdict


# --- Notebook-wide configuration ---
# When True, set_seeds_and_trace() turns on TensorFlow device-placement logging.
TRACE = False
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether it is still needed.
embedding_dim = 100
# Presumably the LSTM width to pass to RentalGenerator(rnn_units=...) — TODO confirm when instantiating the model.
rnn_units = 128
# Presumably the number of epochs for the training call the reader has to write — TODO confirm.
epochs=25
# Shuffle buffer size handed to Dataset.shuffle() when the training pipeline is built.
buffer_size = 64
# Number of rows sampled from the training CSV and from the test CSV, respectively.
corpus_size=25000
test_corpus_size=5000
# Batch size used for both the training and the test dataset.
batch_size = 64
# NOTE(review): min_count_words and BATCH are not referenced by any visible cell — confirm whether they are still needed.
min_count_words = 3
BATCH = True

def set_seeds_and_trace():
  """Seed every random number generator the notebook uses and optionally enable tracing.

  Sets the ``PYTHONHASHSEED`` environment variable to ``'0'`` and seeds NumPy,
  TensorFlow and the standard ``random`` module with 42. When the module-level
  ``TRACE`` flag is truthy, TensorFlow device-placement logging is switched on.
  """
  seed = 42
  # Python reads PYTHONHASHSEED at interpreter start-up, so this only reaches
  # processes launched after this point.
  os.environ['PYTHONHASHSEED'] = '0'
  for seed_fn in (np.random.seed, tf.random.set_seed, random.seed):
    seed_fn(seed)
  if not TRACE:
    return
  tf.debugging.set_log_device_placement(True)

def set_session_with_gpus_and_cores():
  """Install a TF1-compat Keras session configured from the detected hardware.

  The session advertises every detected CPU core and GPU as a device, while
  both the intra-op and inter-op thread pools are limited to a single thread.
  """
  cpu_total = multiprocessing.cpu_count()
  gpu_total = len(tf.config.list_physical_devices('GPU'))
  session_config = tf.compat.v1.ConfigProto(
      device_count={'GPU': gpu_total, 'CPU': cpu_total},
      intra_op_parallelism_threads=1,
      inter_op_parallelism_threads=1,
  )
  tf.compat.v1.keras.backend.set_session(
      tf.compat.v1.Session(config=session_config)
  )

# One-off start-up: seed the RNGs, configure the session, silence warnings and
# fetch the NLTK 'punkt' resource.
set_seeds_and_trace()
set_session_with_gpus_and_cores()
warnings.filterwarnings('ignore')
nltk.download('punkt')


def tokenizer(x):
  """Split raw text into words with TextBlob.

  NOTE: a later cell rebinds the name ``tokenizer`` to the GPT-2 tokenizer.
  """
  return TextBlob(x).words

In [1]:
%%writefile get_data.sh
# Download the Airbnb description corpora unless they are already present.
#
# The URLs are quoted: unquoted, the shell treats `&` as the background
# operator, which detaches wget and drops the trailing query parameter.
# `dl=1` asks Dropbox for the raw file rather than the HTML preview page.
# A failed download is removed so the `-f` guard does not mistake a partial
# file for a finished one on the next run.
if [ ! -f train_corpus_descriptions_airbnb.csv ]; then
  wget -O train_corpus_descriptions_airbnb.csv "https://www.dropbox.com/scl/fi/rbrynlq7871cshi0krftj/train_corpus_descriptions_airbnb.csv?rlkey=td1pfjgqjccap0xu9g4eliube&dl=1" \
    || rm -f train_corpus_descriptions_airbnb.csv
fi

if [ ! -f test_corpus_descriptions_airbnb.csv ]; then
  wget -O test_corpus_descriptions_airbnb.csv "https://www.dropbox.com/scl/fi/eys05bzwwnhskadqh7aux/test_corpus_descriptions_airbnb.csv?rlkey=p1zuz90khh5t7dx3hkfba1dzm&dl=1" \
    || rm -f test_corpus_descriptions_airbnb.csv
fi

Writing get_data.sh


In [None]:
!bash get_data.sh

In [None]:
train_path = "./train_corpus_descriptions_airbnb.csv"
test_path = "./test_corpus_descriptions_airbnb.csv"


def load_review_sample(path, sample_size, seed=42):
    """Read a one-column CSV of descriptions and return a random sample of it.

    Args:
        path: CSV file without a header row; its single column is named "review".
        sample_size: Number of rows wanted. It is capped at the number of
            non-null rows so a short file does not raise.
        seed: Random state for the sample, fixed so that re-running the cell
            yields the same rows.

    Returns:
        DataFrame with a fresh 0..n-1 index and a "review" column.
    """
    reviews = pd.read_csv(path, header=None, names=["review"]).dropna()
    n_rows = min(sample_size, len(reviews))
    return reviews.sample(n=n_rows, random_state=seed).reset_index(drop=True)


airbnb_reviews = load_review_sample(train_path, corpus_size)
test_airbnb_reviews = load_review_sample(test_path, test_corpus_size)


In [None]:
airbnb_reviews.head()

In [None]:
def preprocess_text(text, should_join=True):
    """Normalise a raw description into lowercase word tokens.

    Args:
        text: Anything convertible with ``str()``; typically one review.
        should_join: When True return the tokens joined by single spaces,
            otherwise return them as a list.

    Returns:
        A space-joined string or a list of tokens, depending on ``should_join``.
    """
    text = str(text)
    # Call TextBlob directly rather than the module-level `tokenizer` name.
    # That name is rebound to the GPT-2 tokenizer later in the notebook, and
    # iterating the GPT-2 tokenizer's output yields its field names instead of
    # the words of `text`.
    text = ' '.join(str(word).lower() for word in TextBlob(text).words)
    # Put spaces around sentence punctuation, then collapse every other
    # non-letter run into a single space.
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    tokens = gensim.utils.simple_preprocess(text)
    return ' '.join(tokens) if should_join else tokens

In [None]:
def get_maximum_review_length(df):
    """Return the largest preprocessed token count among the reviews in ``df``.

    Each entry of ``df.review`` is run through ``preprocess_text`` in list mode
    and the longest resulting token list wins; an empty frame yields 0.
    """
    token_counts = (
        len(preprocess_text(review, should_join=False))
        for review in df.review
    )
    return max(token_counts, default=0)


maximum = get_maximum_review_length(airbnb_reviews)


In [None]:
tokenizer = None # Load the GPT-2 tokenizer with max_len as maximum
gpt_model = None # Load the GPT-2 model

In [None]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})


In [None]:
vocab_size = len(tokenizer.get_vocab())

In [None]:
tokenizer(preprocess_text(airbnb_reviews.review[1], should_join=True), return_tensors='tf', padding=True)

In [None]:
def get_ids_tensor(df):
  """Build an int32 tensor of shape (len(df), maximum) with one row of token ids per review.

  The three ``None`` assignments inside the loop are exercise placeholders and
  must be completed before this function can run.

  Rows are collected in a list and concatenated once at the end. Concatenating
  inside the loop copies the whole accumulated tensor on every iteration, which
  is quadratic in the number of reviews.

  NOTE(review): ``maximum`` is a word count from ``preprocess_text``; GPT-2's
  sub-word tokenizer may emit more ids than words, so the padding step
  presumably also has to truncate to ``maximum`` — TODO confirm.
  """
  rows = []
  for review in df.review:
      review = None # preprocess text noting that its output should be input to the tokenizer
      value = None  # Use the tokenizer with return_tensors='tf' and padding=True on the preprocessed review
      value = None # Pad zeroes until the tensor has size (1, maximum)
      rows.append(tf.reshape(value, [1, maximum]))
  if not rows:
      # Keep the (0, maximum) int32 result an empty frame produced before.
      return tf.zeros((0, maximum), dtype='int32')
  return tf.cast(tf.concat(rows, axis=0), 'int32')

all_ids = get_ids_tensor(df=airbnb_reviews)
print(all_ids)

In [None]:
test_all_ids = get_ids_tensor(df=test_airbnb_reviews)

In [None]:
#Prepare the dataset
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
test_ids_dataset = tf.data.Dataset.from_tensor_slices(test_all_ids)

In [None]:
def split_input_target(sequence):
    """Return ``(input, target)`` where the target is the input shifted one step left.

    The input drops the last element of ``sequence`` and the target drops the
    first, so position ``i`` of the target is the element following position
    ``i`` of the input.
    """
    return sequence[:-1], sequence[1:]

In [None]:
dataset = ids_dataset.map(split_input_target)
test_dataset = test_ids_dataset.map(split_input_target)

In [None]:
# Build both pipelines from the un-batched id datasets rather than from
# `dataset` / `test_dataset` themselves, so re-running this cell does not
# batch data that is already batched.
dataset = (
    ids_dataset
    .map(split_input_target)
    .shuffle(buffer_size)
    .batch(batch_size=batch_size, drop_remainder=True)
)
test_dataset = (
    test_ids_dataset
    .map(split_input_target)
    .batch(batch_size=batch_size, drop_remainder=True)
)

In [None]:
dataset.take(1)

In [None]:
class RentalGenerator(tf.keras.Model):
  """Text generator that stacks an LSTM and a dense vocabulary head on a pretrained model.

  Args:
    model: The pretrained GPT-2 model used as the first layer.
    vocab_size: Output width of the final dense layer (one logit per token id).
    rnn_units: Number of units of the LSTM layer.
  """

  def __init__(self, model, vocab_size, rnn_units):
    # Call the base constructor without arguments, as OneStep does. Passing
    # `self` again hands the instance to Keras as an extra positional argument,
    # which is not a valid constructor argument (newer Keras releases raise on it).
    super().__init__()
    self.pretrained_layer = model   # -> This is the GPT-2 model
    self.rnn = tf.keras.layers.LSTM(rnn_units,
                                   activation='tanh',
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Forward pass (exercise placeholder, to be implemented by the reader).

    The rest of the notebook calls the model with a batch of token ids and
    expects logits of shape (batch_size, sequence_length, vocab_size). When
    ``return_state=True`` it unpacks a ``(logits, states)`` pair.
    """
    # Implement the forward pass
    pass

In [None]:
model = None # Instantiate the model and set the first layer as non trainable

In [None]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

In [None]:
model.summary()  # -> Validate the non-trainable parameters

In [None]:
# Compile and train the model such that it early stops if the perplexity in the val set does not decrease

In [None]:
import matplotlib.pyplot as plt

# function for plotting loss
def plot_metrics(train_metric, val_metric=None, metric_name=None, title=None, ylim=5):
    """Plot a training curve and, optionally, its validation counterpart.

    Args:
        train_metric: Sequence of per-epoch training values (drawn in blue).
        val_metric: Optional sequence of validation values (drawn in green).
        metric_name: Legend label for the training curve; the validation curve
            is labelled ``'val_' + metric_name``. When None the curves are
            drawn without labels and no legend is added.
        title: Plot title.
        ylim: Upper limit of the y axis; the lower limit is always 0.
    """
    plt.title(title)
    plt.ylim(0, ylim)
    plt.plot(train_metric, color='blue', label=metric_name)
    if val_metric is not None:
        # Only build the label when there is a name: 'val_' + None raises TypeError.
        val_label = None if metric_name is None else 'val_' + metric_name
        plt.plot(val_metric, color='green', label=val_label)
    if metric_name is not None:
        plt.legend(loc="upper right")

# plot loss history
plot_metrics(history.history['loss'], val_metric=history.history['val_loss'], metric_name="Loss", title="Loss", ylim=5.0)

In [None]:
plot_metrics(history.history['perplexity'], val_metric=history.history['val_perplexity'], metric_name="perplexity", title="perplexity", ylim=2000.0)

In [None]:
class OneStep(tf.keras.Model):
  """Wrap a trained generator so it can sample one next token at a time.

  Args:
    model: Generator called as ``model(inputs=ids, states=..., return_state=True)``
      and expected to return ``(logits, states)``.
    tokenizer: Callable turning a string into token ids (the GPT-2 tokenizer).
    ix_to_word: Mapping from token id to its vocabulary string.
    temperature: Divisor applied to the logits before sampling.
  """

  def __init__(self, model, tokenizer, ix_to_word, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.tokenizer = tokenizer
    self.ix_to_word = ix_to_word

  def expand_dims_if_neccesary(self, input):
    """Prepend a leading axis when ``input`` has fewer than three dimensions."""
    if len(input.shape) < 3:
      input = tf.expand_dims(input, axis=0)
    return input

  @staticmethod
  def _as_text(inputs):
    """Turn a str/bytes value or a string tensor/array/list into one Python string."""
    if isinstance(inputs, tf.Tensor):
      inputs = inputs.numpy()
    if isinstance(inputs, np.ndarray):
      inputs = inputs.ravel().tolist()
    if isinstance(inputs, (list, tuple)):
      return ' '.join(OneStep._as_text(item) for item in inputs)
    if isinstance(inputs, bytes):
      return inputs.decode('utf-8')
    return str(inputs)

  def generate_one_step(self, inputs, states=None):
    """Sample the next token for ``inputs``.

    Args:
      inputs: Prompt text as a Python string, bytes, or a string tensor.
      states: Recurrent state from the previous step, or None to start fresh.

    Returns:
      ``(predicted_word, states)``: a string tensor of shape (1,) holding the
      sampled vocabulary entry, and the state returned by the model.

    Raises:
      ValueError: If nothing is left of ``inputs`` after preprocessing.
    """
    # Decode tensors/bytes first: str() on a tensor returns its repr
    # ("tf.Tensor([b'...'], shape=...)"), which would be tokenized as text.
    input_words = preprocess_text(self._as_text(inputs), should_join=True)
    if not input_words:
      raise ValueError("generate_one_step needs at least one token after preprocessing")
    # Convert the text to a (1, sequence_length) tensor of token ids, the same
    # kind of input the model receives from the training dataset.
    input_ids = self.tokenizer(input_words, return_tensors='tf')['input_ids']
    # Run the model.
    # predicted_logits.shape is [batch, token, next_token_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    pred_id = int(predicted_ids.numpy()[0][0])
    # Convert the sampled token id back to its vocabulary string.
    predicted_word = self.expand_dims_if_neccesary(tf.constant(self.ix_to_word[pred_id]))

    # Return the sampled token and the model state.
    return predicted_word, states

In [None]:
word_to_ix = tokenizer.get_vocab()
ix_to_word = {ix: word for word, ix in word_to_ix.items()}

In [None]:
one_step_model = None # Instantiate the OneStepModel
start = time.time()
states = None
description = tf.constant(['Midtown Sunny 2-Bedroom'])

for n in range(200):
  next_word, states = None # Generate the next word and the states
  description = None # Append the word


result = tf.strings.join(description, separator=" ")
end = time.time()
print(result, '\n\n' + '_'*80)
print('\nRun time:', end - start)



In [None]:
tf.saved_model.save(one_step_model, 'lstm_rental_generator')