<a href="https://colab.research.google.com/github/c-w-m/anlp-tf2/blob/master/chapter5-nlg-with-transformer-gpt/charRNN-text-generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd

# Display the installed TensorFlow version (shown as the cell's output).
tf.__version__

In [None]:
######## GPU CONFIGS FOR RTX 2070 ###############
## Please ignore if not training on GPU       ##
## this is important for running CuDNN on GPU ##

# Drop any Keras state left over from earlier runs so the notebook can be
# re-executed from this cell.
tf.keras.backend.clear_session()

# Check whether TensorFlow can see a GPU. An empty list means the rest of
# this cell does nothing.
# tf.debugging.set_log_device_placement(True)
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only use the first GPU, and let it grow its memory
  # allocation on demand.
  try:
    # set_memory_growth is only exposed under tf.config.experimental.
    tf.config.experimental.set_memory_growth(gpus[0], True)
    tf.config.set_visible_devices(gpus[0], 'GPU')
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
  except RuntimeError as e:
    # Visible devices must be set before GPUs have been initialized
    print(e)
###############################################

# Setup Tokenization

In [None]:
# Character vocabulary: every distinct character of the literal below, in
# sorted (code point) order. The backslash is written as an explicit "\\"
# escape; the resulting characters are unchanged, so vocabulary indices stay
# the same.
chars = sorted(set("abcdefghijklmnopqrstuvwxyz0123456789 -,;.!?:’’’/\\|_@#$%ˆ&*˜‘+-=()[]{}' ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
EOS = '<EOS>'
UNK = "<UNK>"
PAD = "<PAD>"  # need to move mask to index 0 for TF
# Special tokens go after the regular characters: UNK first, then EOS.
chars.append(UNK)
chars.append(EOS)  # end of sentence


## need to handle padding characters as well
chars.insert(0, PAD)  # now padding should get index of 0

In [None]:
# Two-way lookup tables between vocabulary entries and their integer ids:
#   char2idx: entry -> position in `chars`
#   idx2char: position -> entry (NumPy array, so it can be indexed by id)
char2idx = dict(zip(chars, range(len(chars))))
idx2char = np.array(chars)

In [None]:
def char_idx(c):
    """Return the vocabulary index of character ``c``.

    Characters that are not in the vocabulary map to the index of the
    unknown token (``UNK``).
    """
    # One dict lookup instead of scanning the ``chars`` list and then
    # indexing the dict; ``char2idx`` has exactly the entries of ``chars``.
    return char2idx.get(c, char2idx[UNK])

# Load the Model

In [None]:
# Hyper-parameters handed to build_gen_model() below.
# NOTE(review): presumably these must match the values used when the
# checkpoint was trained, otherwise the weights will not load -- confirm.

# Length of the vocabulary in chars (includes the PAD, UNK and EOS entries)
vocab_size = len(chars)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

# Batch size: text is generated one sequence at a time
BATCH_SIZE=1

In [None]:
# Define the model
# this one is without padding masking or dropout layer
def build_gen_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level generation network.

    The network is a three-layer stack: an embedding over character ids, a
    stateful GRU that returns its output at every time step, and a dense
    projection to ``vocab_size`` values per step (no activation).

    Args:
      vocab_size: number of entries in the vocabulary; sets both the
        embedding input size and the width of the output layer.
      embedding_dim: size of each character embedding vector.
      rnn_units: number of units in the GRU layer.
      batch_size: fixed batch size of the model input; the sequence length
        is left unspecified.

    Returns:
      The ``tf.keras.Sequential`` model.
    """
    char_embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([char_embedding, recurrent, projection])


# Instantiate the generation model with the hyper-parameters defined above.
gen_model = build_gen_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)

In [None]:
# Now setup the location of the checkpoint
# and load the latest checkpoint
# Directory where the checkpoints were saved
checkpoint_dir = './training_checkpoints/2020-Oct-01-14-29-55'

# latest_checkpoint() returns None when the directory holds no checkpoint;
# fail with a clear message instead of handing None to load_weights().
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError("No checkpoint found in %r" % checkpoint_dir)
gen_model.load_weights(latest_checkpoint)

# Build for a batch of one sequence of unspecified length.
gen_model.build(tf.TensorShape([1, None]))

In [None]:
def generate_text(model, start_string, temperature=0.7, num_generate=75):
  """Sample a character-level continuation of ``start_string``.

  The prompt is fed through the model once; after that each sampled
  character is fed back as the next single-character input, so the model is
  expected to carry its recurrent state between calls.

  Args:
    model: character model called with a ``[1, seq_len]`` tensor of
      character ids and returning logits for every position.
    start_string: non-empty prompt. Characters outside the vocabulary are
      mapped to the unknown token.
    temperature: positive scaling factor for the logits. Low temperatures
      result in more predictable text, higher temperatures in more
      surprising text.
    num_generate: number of characters to sample.

  Returns:
    ``start_string`` followed by the sampled characters.

  Raises:
    ValueError: if ``start_string`` is empty or ``temperature`` is not
      positive.
  """
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # The models built in this notebook use a stateful GRU; without a reset
  # each call would start from the hidden state left by the previous prompt.
  model.reset_states()

  # Converting our start string to numbers (vectorizing). char_idx() falls
  # back to the unknown token instead of raising KeyError.
  input_eval = [char_idx(s) for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Sampled characters, joined at the end
  text_generated = []

  # Here batch size == 1
  for _ in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # Sample the next character id from the temperature-scaled logits;
    # only the sample for the last position is used.
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # We pass the predicted character as the next input to the model;
    # the previous hidden state is kept inside the stateful layer.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])
    # NOTE: generation does not stop when <EOS> is sampled; it always
    # produces num_generate characters.

  return (start_string + ''.join(text_generated))

In [None]:
# Sample from the prompt "Obama" with the default temperature (0.7) and length (75 characters).
print(generate_text(gen_model, start_string=u"Obama"))

In [None]:
# Sample from the prompt "Apple" with the default temperature (0.7) and length (75 characters).
print(generate_text(gen_model, start_string=u"Apple"))

## Model with 0.6993 loss

In [None]:
# Prompt "Google", default temperature (0.7) and length (75 characters).
print(generate_text(gen_model, start_string=u"Google"))

In [None]:
# Prompt "S&P", default temperature (0.7) and length (75 characters).
print(generate_text(gen_model, start_string=u"S&P"))

In [None]:
# Prompt "Market", default temperature (0.7) and length (75 characters).
print(generate_text(gen_model, start_string=u"Market"))

In [None]:
# Prompt "Beyonce", default temperature (0.7) and length (75 characters).
print(generate_text(gen_model, start_string=u"Beyonce"))

## Model with 0.7031 loss

In [None]:
# Prompt "Apple", default temperature (0.7) and length (75 characters).
print(generate_text(gen_model, start_string=u"Apple"))

In [None]:
# Prompt "Google", default temperature (0.7) and length (75 characters).
print(generate_text(gen_model, start_string=u"Google"))

In [None]:
# Prompt "S&P", default temperature (0.7) and length (75 characters).
print(generate_text(gen_model, start_string=u"S&P"))

In [None]:
# Prompt "Market", default temperature (0.7) and length (75 characters).
print(generate_text(gen_model, start_string=u"Market"))

In [None]:
# Prompt "Beyonce", default temperature (0.7) and length (75 characters).
print(generate_text(gen_model, start_string=u"Beyonce"))

## Different temperature settings with the same model

In [None]:
# Temperature sweep on the prompt "S&P": temperature 0.1, the lowest in the sweep.
print(generate_text(gen_model, start_string=u"S&P", temperature=0.1))

In [None]:
# Temperature sweep on the prompt "S&P": temperature 0.3.
print(generate_text(gen_model, start_string=u"S&P", temperature=0.3))

In [None]:
# Temperature sweep on the prompt "S&P": temperature 0.5.
print(generate_text(gen_model, start_string=u"S&P", temperature=0.5))

In [None]:
# Temperature sweep on the prompt "S&P": temperature 0.7 (same as the function default).
print(generate_text(gen_model, start_string=u"S&P", temperature=0.7))

In [None]:
# Temperature sweep on the prompt "S&P": temperature 0.9, the highest in the sweep.
print(generate_text(gen_model, start_string=u"S&P", temperature=0.9))

In [None]:
# Temperature sweep on the prompt "Kim", from high to low: temperature 0.9.
print(generate_text(gen_model, start_string=u"Kim", temperature=0.9))

In [None]:
# Temperature sweep on the prompt "Kim": temperature 0.7 (same as the function default).
print(generate_text(gen_model, start_string=u"Kim", temperature=0.7))

In [None]:
# Temperature sweep on the prompt "Kim": temperature 0.5.
print(generate_text(gen_model, start_string=u"Kim", temperature=0.5))

In [None]:
# Temperature sweep on the prompt "Kim": temperature 0.3.
print(generate_text(gen_model, start_string=u"Kim", temperature=0.3))

In [None]:
# Temperature sweep on the prompt "Kim": temperature 0.1, the lowest in the sweep.
print(generate_text(gen_model, start_string=u"Kim", temperature=0.1))

## In-progress model

In [None]:
# Second model with the same architecture, restored from a different
# training run. Note that this rebinds checkpoint_dir.
gen_model2 = build_gen_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)
checkpoint_dir = './training_checkpoints/'+ '2020-Jun-02-22-38-17'  # -> with 0.6993 loss
#''2020-Jun-02-01-02-14' # -> .7031 loss

# latest_checkpoint() returns None when the directory holds no checkpoint;
# fail with a clear message instead of handing None to load_weights().
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError("No checkpoint found in %r" % checkpoint_dir)
gen_model2.load_weights(latest_checkpoint)

# Build for a batch of one sequence of unspecified length.
gen_model2.build(tf.TensorShape([1, None]))

In [None]:
# Second model, prompt "S&P": temperature 1 leaves the logits unscaled;
# num_generate=75 is the same as the function default.
print(generate_text(gen_model2, start_string=u"S&P", 
                    temperature=1, num_generate=75))

In [None]:
# Second model, single-character prompt "S", temperature 0.7.
print(generate_text(gen_model2, start_string=u"S", temperature=0.7))

In [None]:
# Second model, prompt "NBA", temperature 0.4.
print(generate_text(gen_model2, start_string=u"NBA", temperature=0.4))

In [None]:
# Second model, prompt "Sta", temperature 0.7.
print(generate_text(gen_model2, start_string=u"Sta", temperature=0.7))

# Greedy Search with Bigrams

In [None]:
# Download the n-gram frequency archive into the current working directory.
!wget http://norvig.com/tsv/ngrams-all.tsv.zip

In [None]:
# gzip file masquerading as a ZIP file: rename it to a .z suffix so that
# gunzip accepts it, then decompress. The result is ngrams-all.tsv, which
# the next cell reads.
!mv ngrams-all.tsv.zip ngrams-all.tsv.z
!gunzip ngrams-all.tsv.z

In [None]:
# First 27 rows are characters
# next 669 are bigrams
# head/tail keep lines 29-697 of the file, i.e. the first 28 lines are
# skipped. NOTE(review): presumably one header line plus the 27 character
# rows -- confirm against the downloaded file.
!head -n 697 ngrams-all.tsv | tail -n 669 > bigrams.tsv

In [None]:
# file format col1: bigram, col2: overall freq, ignore other cols
from collections import Counter
import csv

# counts: bigram -> relative frequency (raw count divided by `totals`).
counts = Counter()
# bitree: first letter -> {second letter -> relative frequency}.
bitree = {}
# NOTE(review): presumably the total number of bigram occurrences in the
# source corpus, used as the normaliser -- confirm against the data source.
totals = 2819662855499
# newline='' is what the csv module expects for files passed to csv.reader.
with open("bigrams.tsv", 'r', newline='') as fl:
    big = csv.reader(fl, delimiter='\t')
    for bigram in big:
        if not bigram:
            # csv.reader yields an empty row for a blank line; skip it
            # instead of failing on bigram[0].
            continue
        key = bigram[0]
        count = int(bigram[1]) / totals
        counts[key] = count
        # Create the per-first-letter sub-dict on first use.
        bitree.setdefault(key[0], {})[key[1]] = count

In [None]:
# Show the ten bigrams with the highest relative frequency.
counts.most_common(10)

In [None]:
# Show every second letter recorded after 'T', with its relative frequency.
print(bitree['T'])

In [None]:
# Install anytree, which provides the Node and RenderTree classes imported
# in the next cell.
!pip install anytree

In [None]:
from anytree import Node, RenderTree
# construct 5-letter words given a starting bigram
start = 'WI'
# Root of the completion tree: the first letter of the bigram, with
# probability 1. Each node stores its letter and an accumulated `prob`.
compl = Node(start[0], prob=1)  # to store completions and probabilities
# NOTE(review): `cnt` does not appear to be used anywhere in the visible code.
cnt = 0

def recurse(letter, prob, level, parent):
    """Attach ``letter`` under ``parent`` and expand its likeliest successors.

    The new node's ``prob`` is ``prob`` multiplied by the parent's ``prob``.
    While ``level`` is at most 2, the three highest-valued successors of
    ``letter`` in ``bitree`` are attached recursively one level deeper;
    beyond that the node is left as a leaf.

    Args:
      letter: character stored in the new node.
      prob: value of this letter given its predecessor.
      level: current depth of the expansion.
      parent: tree node the new node is attached to.

    Returns:
      The node created for ``letter``.
    """
    node = Node(letter, parent=parent, prob=prob * parent.prob)
    if level <= 2:
        top_successors = Counter(bitree[letter]).most_common(3)
        for next_letter, next_prob in top_successors:
            recurse(next_letter, next_prob, level + 1, node)
    return node


# Grow the tree from the second letter of the starting bigram. It is given
# probability 1 and level 0, so three more letters are added below it.
recurse(start[1], 1, 0, compl)
    

In [None]:
# Print the completion tree, one node per line: the tree-drawing prefix,
# the node's letter, and its accumulated probability to 8 decimal places.
for pre, fill, node in RenderTree(compl):
    print("%s%s (%2.8f)" % (pre, node.name, node.prob))