### Text Generation with RNN

In [278]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed


In [279]:
ds = tfds.load('bool_q')

In [280]:
ds

{'train': <_PrefetchDataset element_spec={'answer': TensorSpec(shape=(), dtype=tf.bool, name=None), 'passage': TensorSpec(shape=(), dtype=tf.string, name=None), 'question': TensorSpec(shape=(), dtype=tf.string, name=None), 'title': TensorSpec(shape=(), dtype=tf.string, name=None)}>,
 'validation': <_PrefetchDataset element_spec={'answer': TensorSpec(shape=(), dtype=tf.bool, name=None), 'passage': TensorSpec(shape=(), dtype=tf.string, name=None), 'question': TensorSpec(shape=(), dtype=tf.string, name=None), 'title': TensorSpec(shape=(), dtype=tf.string, name=None)}>}

In [281]:
train_ds, valid_ds = ds['train'], ds['validation']

In [282]:
train_ds = train_ds.map(lambda sample : (sample['passage'], sample['title']))
valid_ds = valid_ds.map(lambda sample : (sample['passage'], sample['title']))

In [283]:
for text,label in train_ds.take(1):
    print(text, label)

tf.Tensor(b'There are four ways an individual can acquire Canadian citizenship: by birth on Canadian soil; by descent (being born to a Canadian parent); by grant (naturalization); and by adoption. Among them, only citizenship by birth is granted automatically with limited exceptions, while citizenship by descent or adoption is acquired automatically if the specified conditions have been met. Citizenship by grant, on the other hand, must be approved by the Minister of Immigration, Refugees and Citizenship.', shape=(), dtype=string) tf.Tensor(b'Canadian nationality law', shape=(), dtype=string)


In [284]:
texts = []
labels = []
def general_function(dataset):
    for text,label in dataset:
        texts.append(text.numpy().decode('utf-8'))
        labels.append(label.numpy().decode('utf-8'))

general_function(train_ds)
general_function(valid_ds)

print(len(texts), len(labels), texts[1], labels[1])

12697 12697 Star Trek: Discovery is an American television series created for CBS All Access by Bryan Fuller and Alex Kurtzman. It is the first series developed specifically for that service, and the first Star Trek series since Star Trek: Enterprise concluded in 2005. Set roughly a decade before the events of the original Star Trek series and separate from the timeline of the concurrently produced feature films, Discovery explores the Federation--Klingon war while following the crew of the USS Discovery. Gretchen J. Berg and Aaron Harberts serve as showrunners on the series, with producing support from Akiva Goldsman. Star Trek: Discovery


In [285]:
# Tokenize the complete dataset (training and testing) for text generation
data_to_tokenize = texts + labels
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_to_tokenize)
tokenizer.word_index

{'the': 1,
 'of': 2,
 'and': 3,
 'in': 4,
 'a': 5,
 'to': 6,
 'is': 7,
 'as': 8,
 'on': 9,
 'by': 10,
 'for': 11,
 'with': 12,
 'or': 13,
 'was': 14,
 'that': 15,
 'it': 16,
 'are': 17,
 'from': 18,
 'an': 19,
 'be': 20,
 'at': 21,
 'states': 22,
 'which': 23,
 'united': 24,
 'not': 25,
 'has': 26,
 'have': 27,
 'also': 28,
 'series': 29,
 'one': 30,
 'their': 31,
 'its': 32,
 'film': 33,
 'they': 34,
 'this': 35,
 'first': 36,
 'his': 37,
 'season': 38,
 'but': 39,
 'two': 40,
 'new': 41,
 'other': 42,
 'he': 43,
 'may': 44,
 'after': 45,
 'world': 46,
 'can': 47,
 'all': 48,
 'who': 49,
 'her': 50,
 'american': 51,
 'when': 52,
 'only': 53,
 'been': 54,
 'most': 55,
 'were': 56,
 'time': 57,
 'used': 58,
 's': 59,
 'state': 60,
 'such': 61,
 'more': 62,
 'into': 63,
 '1': 64,
 'than': 65,
 'she': 66,
 'known': 67,
 'u': 68,
 'some': 69,
 'while': 70,
 'between': 71,
 'if': 72,
 '2': 73,
 'had': 74,
 'no': 75,
 '2018': 76,
 '2017': 77,
 'cup': 78,
 'three': 79,
 'game': 80,
 'second':

In [286]:
for text, label in train_ds.take(1):
    print(text, label)

tf.Tensor(b'There are four ways an individual can acquire Canadian citizenship: by birth on Canadian soil; by descent (being born to a Canadian parent); by grant (naturalization); and by adoption. Among them, only citizenship by birth is granted automatically with limited exceptions, while citizenship by descent or adoption is acquired automatically if the specified conditions have been met. Citizenship by grant, on the other hand, must be approved by the Minister of Immigration, Refugees and Citizenship.', shape=(), dtype=string) tf.Tensor(b'Canadian nationality law', shape=(), dtype=string)


In [287]:
def tokenize_map(text,label):
    text = tokenizer.texts_to_sequences([text.numpy().decode('utf-8')])[0]
    label = tokenizer.texts_to_sequences([label.numpy().decode('utf-8')])[0]
    return tf.constant(text), tf.constant(label)

train_ds = train_ds.map(lambda text, label : tf.py_function(tokenize_map, [text, label], [tf.int32, tf.int32]))
valid_ds = valid_ds.map(lambda text, label : tf.py_function(tokenize_map, [text, label], [tf.int32, tf.int32]))

In [288]:
for text, label in train_ds.take(1):
    print(text, label)

tf.Tensor(
[  82   17  114 2132   19  593   47 4386  540  439   10  683    9  540
 3281   10 3977   93  329    6    5  540 1195   10 2795 4824    3   10
 2402  437  159   53  439   10  683    7 1742 2133   12  615 1434   70
  439   10 3977   13 2402    7 1435 2133   72    1 2273 1322   27   54
 1436  439   10 2795    9    1   42  505  163   20 1929   10    1 1539
    2 2371 5115    3  439], shape=(75,), dtype=int32) tf.Tensor([ 540 1460   90], shape=(3,), dtype=int32)


In [289]:
length = [len(text) for text,_ in train_ds]
label_len = [len(label) for _, label in train_ds]
print(max(length), max(label_len)) # max length
print(type(length))
print(np.percentile(length, 30), np.percentile(label_len, 30)) # length at 30th percentile
print(np.percentile(length, 60), np.percentile(label_len, 60)) # length at 60th percentile
print(np.percentile(length, 90), np.percentile(label_len, 90)) # length at 90th percentile

763 13
<class 'list'>
62.0 2.0
98.0 3.0
161.0 6.0


In [290]:
def pad_sequence(text,label):
    text = pad_sequences([text.numpy()], maxlen=128, padding='post')[0]
    # Make the label of the size 1, with only keeping the most used sequence from the list
    label = [np.bincount(label.numpy()).argmax()]
    return text, label

def gen_func(text, label):
    text,label = tf.py_function(pad_sequence, [text, label], [tf.int32, tf.int32])
    text = tf.convert_to_tensor(text, dtype=tf.int32)
    label = tf.convert_to_tensor(label, dtype=tf.int32)
    text.set_shape(tf.TensorShape([128]))
    label.set_shape(tf.TensorShape([1]))
    return text, label

train_ds = train_ds.map(gen_func)
valid_ds = valid_ds.map(gen_func)


In [291]:
for text, label in train_ds.take(1):
    print(text, label)

tf.Tensor(
[  82   17  114 2132   19  593   47 4386  540  439   10  683    9  540
 3281   10 3977   93  329    6    5  540 1195   10 2795 4824    3   10
 2402  437  159   53  439   10  683    7 1742 2133   12  615 1434   70
  439   10 3977   13 2402    7 1435 2133   72    1 2273 1322   27   54
 1436  439   10 2795    9    1   42  505  163   20 1929   10    1 1539
    2 2371 5115    3  439    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0], shape=(128,), dtype=int32) tf.Tensor([90], shape=(1,), dtype=int32)


In [292]:
batch_size = 32
train_ds = train_ds.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
valid_ds = valid_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [293]:
for text,label in train_ds.take(1):
    print(text.shape, label.shape)

(32, 128) (32, 1)


In [294]:
# Build the model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32),
    LSTM(64),
    Dense(len(tokenizer.word_index), activation='softmax')
])

In [295]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [296]:
model.summary()

In [297]:
history = model.fit(train_ds, validation_data=valid_ds, epochs=20)

Epoch 1/20
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 195ms/step - accuracy: 0.1318 - loss: 8.6206 - val_accuracy: 0.1257 - val_loss: 6.8437
Epoch 2/20
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 190ms/step - accuracy: 0.1376 - loss: 6.0204 - val_accuracy: 0.1257 - val_loss: 6.8230
Epoch 3/20
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 197ms/step - accuracy: 0.1403 - loss: 5.9532 - val_accuracy: 0.1257 - val_loss: 6.8741
Epoch 4/20
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 186ms/step - accuracy: 0.1372 - loss: 5.9496 - val_accuracy: 0.1257 - val_loss: 6.9350
Epoch 5/20
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 195ms/step - accuracy: 0.1399 - loss: 5.9180 - val_accuracy: 0.1257 - val_loss: 6.9671
Epoch 6/20
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 179ms/step - accuracy: 0.1382 - loss: 5.8742 - val_accuracy: 0.1257 - val_loss: 7.0422
Epoch 7/20

In [312]:
for text_batch, label_batch in valid_ds.take(1):
    prediction = model.predict(text_batch)
    # print(prediction, label_batch)
    predicted_id = tf.argmax(prediction, axis=-1).numpy()
    predicted_title = tokenizer.sequences_to_texts([predicted_id])
    predicted_title = predicted_title[0].split()
    label_batch_list = label_batch.numpy()
    actual_title = tokenizer.sequences_to_texts(label_batch_list)
    for i in range(len(predicted_title)):
        print(f"Predicted Title: {predicted_title[i]}, Actual Title: {actual_title[i]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
Predicted Title: the, Actual Title: disney
Predicted Title: and, Actual Title: from
Predicted Title: four, Actual Title: drop
Predicted Title: of, Actual Title: the
Predicted Title: film, Actual Title: of
Predicted Title: and, Actual Title: 8
Predicted Title: tract, Actual Title: strand
Predicted Title: league, Actual Title: red
Predicted Title: and, Actual Title: king
Predicted Title: and, Actual Title: ball
Predicted Title: states, Actual Title: season
Predicted Title: tract, Actual Title: full
Predicted Title: in, Actual Title: money
Predicted Title: s, Actual Title: and
Predicted Title: season, Actual Title: birth
Predicted Title: system, Actual Title: league
Predicted Title: states, Actual Title: 3
Predicted Title: water, Actual Title: energy
Predicted Title: red, Actual Title: randy
Predicted Title: cougar, Actual Title: character
Predicted Title: film, Actual Title: in
Predicted Title: puerto, Actual Title: 