# Lab 10.1
## Text Generation with GPT-2


In this notebook we will generate rental descriptions with GPT-2, a large pre-trained transformer language model that we load through the HuggingFace `transformers` library. Because the model has not been fine-tuned on rental descriptions, we can expect fluent, general-purpose text but not high accuracy for this specific domain.

You can run this lab either locally or in Colab.

- To run in Colab, go to `https://colab.research.google.com`, sign in, and upload this notebook. Colab offers free GPU access.
- To run locally, first install the requirements listed in `requirements.txt`, then run `jupyter notebook` and open the notebook for this lab.

Follow the instructions. Good luck!

In [None]:
!pip install textblob 'gensim==4.2.0' 'keras_nlp' 'transformers'

In [None]:
import multiprocessing
import os
import random
import re
import warnings

import gensim
import keras.backend as K
import nltk
import numpy as np
import pandas as pd
import smart_open
import tensorflow as tf
from gensim.models.callbacks import CallbackAny2Vec
from transformers import set_seed, pipeline
from textblob import TextBlob

# --- Notebook configuration -------------------------------------------------
# Flags
TRACE = False   # when True, set_seeds_and_trace() enables TF device-placement logging
BATCH = True    # NOTE(review): not referenced in the visible cells

# Embedding size; the doc2vec exercise further down asks for a model of this size.
embedding_dim = 100

# The remaining values are not referenced in the visible cells -- presumably
# shared with the other labs of this series; confirm before relying on them.
rnn_units = 128
epochs = 25
batch_size = 64
buffer_size = 64
corpus_size = 25000
test_corpus_size = 5000
min_count_words = 5

def set_seeds_and_trace():
  """Seed every random number source used by the notebook and optionally trace devices.

  Sets PYTHONHASHSEED to '0' in ``os.environ``, then seeds NumPy, TensorFlow,
  Python's ``random`` module and HuggingFace ``transformers`` (via ``set_seed``)
  with the value 42. When the module-level ``TRACE`` flag is truthy,
  TensorFlow device-placement logging is switched on as well.

  NOTE: PYTHONHASHSEED is read by the interpreter at start-up, so assigning it
  here does not change string hashing in the already-running kernel; it only
  affects processes launched afterwards.
  """
  seed = 42
  os.environ['PYTHONHASHSEED'] = '0'
  # Same order as before: NumPy, TensorFlow, stdlib random, transformers.
  for seed_fn in (np.random.seed, tf.random.set_seed, random.seed, set_seed):
    seed_fn(seed)
  if TRACE:
    tf.debugging.set_log_device_placement(True)

def set_session_with_gpus_and_cores():
  """Create a TF1-compat session and register it as the Keras backend session.

  The session's ``ConfigProto`` lists the number of detected GPUs and CPU cores
  in ``device_count`` and restricts both intra-op and inter-op parallelism to a
  single thread.
  """
  cpu_total = multiprocessing.cpu_count()
  gpu_total = len(tf.config.list_physical_devices('GPU'))
  session_config = tf.compat.v1.ConfigProto(
      device_count={'GPU': gpu_total, 'CPU': cpu_total},
      intra_op_parallelism_threads=1,
      inter_op_parallelism_threads=1,
  )
  K.set_session(tf.compat.v1.Session(config=session_config))

# One-time setup: seed the RNGs, then install the TF session.
set_seeds_and_trace()
set_session_with_gpus_and_cores()

# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Fetch the NLTK 'punkt' resource (presumably needed by TextBlob's word
# tokenization -- confirm against the installed TextBlob version).
nltk.download('punkt')


def tokenizer(x):
  """Return the words of ``x`` as produced by ``TextBlob(x).words``."""
  return TextBlob(x).words

In [None]:
# Exercise: replace the two `None` placeholders below.
# `pipeline` is already imported from `transformers` in the imports cell.
generator = None # Create a gpt-2 pipeline generator
# Prompt that GPT-2 should expand into full rental descriptions.
query = 'Midtown Sunny 2-Bedroom'
# A later cell iterates over this value and reads each item's "generated_text" key.
generated_descriptions = None # Create 10 expansions of 200 words

In [None]:
%%writefile get_data.sh
# Download the Airbnb description training corpus unless it is already present.
# The URL is quoted because it contains '?', which the shell treats as a glob
# character. wget -O creates the output file even when the transfer fails, so a
# failed download is deleted; otherwise the existence check above would skip
# every later retry and leave an empty or partial CSV behind.
if [ ! -f train_corpus_descriptions_airbnb.csv ]; then
  if ! wget -O train_corpus_descriptions_airbnb.csv 'https://www.dropbox.com/s/5rp7ibop99qyafo/train_corpus_descriptions_airbnb.csv?dl=0'; then
    rm -f train_corpus_descriptions_airbnb.csv
    exit 1
  fi
fi


In [None]:
# Run the script written by the previous cell; it downloads
# train_corpus_descriptions_airbnb.csv only when the file is not already present.
!bash get_data.sh

In [None]:
def preprocess_text(text, should_join=True):
    """Normalise a piece of text into lower-case word tokens.

    The input is converted to ``str``, split into words with the module-level
    ``tokenizer``, lower-cased, padded around ``. , ! ?`` and stripped of every
    character that is neither an ASCII letter nor one of those punctuation
    marks. The result is then tokenised by ``gensim.utils.simple_preprocess``.

    Args:
        text: Value to clean; anything accepted by ``str()``.
        should_join: When True (default) return the tokens joined by single
            spaces; when False return the token list itself.

    Returns:
        A space-separated string of tokens, or the list of tokens.
    """
    lowered_words = [str(word).lower() for word in tokenizer(str(text))]
    # Put spaces around sentence punctuation so it never sticks to a word.
    spaced = re.sub(r"([.,!?])", r" \1 ", ' '.join(lowered_words))
    # Collapse every run of other characters into a single space.
    letters_only = re.sub(r"[^a-zA-Z.,!?]+", r" ", spaced)
    tokens = gensim.utils.simple_preprocess(letters_only)
    return ' '.join(tokens) if should_join else tokens

In [None]:
# Exercise: create a doc2vec model with vectors of size `embedding_dim` and a
# min_count of 2, build its vocabulary and train it. Use the `read_corpus`
# TaggedDocuments helper from the earlier lab.
# NOTE(review): `read_corpus` is not defined in the visible cells of this
# notebook, so it has to be copied in before this cell can run.
# NOTE(review): the config cell defines min_count_words = 5, while this
# instruction asks for a min_count of 2 -- confirm which value is intended.

doc2vec_model = None

In [None]:
# Pair every generated text with its similarity to the query, then rank the
# results with the most similar description first.
descriptions = []
for description in generated_descriptions:
    tokenized = None # tokenize the generated description
    similarity = None # Find the similarity with the query
    descriptions.append({
        "description": description["generated_text"],
        "similarity": similarity,
    })

df_desc = (
    pd.DataFrame(descriptions)
    .sort_values(by="similarity", ascending=False)
)
df_desc.head()

In [None]:
# Full text of the highest-ranked generated description.
df_desc.iloc[0]["description"]

In [None]:
# Full text of the second-ranked generated description.
df_desc.iloc[1]["description"]