### Natural Language Processing with RNNs

In [450]:
import pandas as np
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf 
import tensorflow_datasets as tfds
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [451]:
# Load the dataset of bool_q, this dataset contains:

# passage: A short paragraph of text.
# question: A question related to the passage.
# answer: A binary label (True or False) indicating whether the answer to the question is "Yes" or "No".
# title: Context or metadata for the passage (not critical for training).

ds = tfds.load('bool_q')


In [452]:
train_ds, valid_ds = ds['train'], ds['validation']

In [453]:
# Split the train data into train and test data
train_ds = train_ds.take(int(len(train_ds)*0.9))
test_ds = train_ds.skip(int(len(train_ds)*0.9))

In [454]:
len(train_ds), len(test_ds)

(8484, 849)

In [455]:
# Inspect the dataset
for sample in train_ds.take(1):
    print(sample['passage'].numpy())

b'There are four ways an individual can acquire Canadian citizenship: by birth on Canadian soil; by descent (being born to a Canadian parent); by grant (naturalization); and by adoption. Among them, only citizenship by birth is granted automatically with limited exceptions, while citizenship by descent or adoption is acquired automatically if the specified conditions have been met. Citizenship by grant, on the other hand, must be approved by the Minister of Immigration, Refugees and Citizenship.'


In [456]:
# Preprocess the data
def preprocess(sample):
    passage = sample['passage']
    question = sample['question']
    answer = sample['answer']

    #Combine passage and question
    combine_text = tf.strings.join([passage, question], separator= ' ')
    return combine_text, answer

# Apply the preprocessing
train_ds = train_ds.map(preprocess)

In [457]:
# Analyze the preprocessed data
for sample in train_ds.take(1):
    print(sample[0])

tf.Tensor(b'There are four ways an individual can acquire Canadian citizenship: by birth on Canadian soil; by descent (being born to a Canadian parent); by grant (naturalization); and by adoption. Among them, only citizenship by birth is granted automatically with limited exceptions, while citizenship by descent or adoption is acquired automatically if the specified conditions have been met. Citizenship by grant, on the other hand, must be approved by the Minister of Immigration, Refugees and Citizenship. can i get canadian citizenship if my grandfather was canadian', shape=(), dtype=string)


In [458]:
# Tokenization
# This step split the text into words(token) and create the word index mapping each token with a unique integer
# i.e. ["I love Python"] --> {"I":1, "love":2, "Python":3}
tokenizer = Tokenizer(oov_token="<OOV>")  # use oov to handle the out of vacab words, for testing and validation
text_data = [text.numpy().decode('utf-8') for text,_ in train_ds]
tokenizer.fit_on_texts(text_data)

In [459]:
for text in text_data:
    print(type(text))
    print(tokenizer.word_index)
    break


<class 'str'>


In [460]:
# Convert the text to sequences (of numbers)
def tokenize_map(text,label):
    """
    This function maps the text into sequence of numbers (encoding) from the word index created above
    i.e. ["I love Python", "I love Movies"] ---> [[1,2,3], [1,2,4]]
    """
    tokenized_text = tokenizer.texts_to_sequences([text.numpy().decode('utf-8')])[0]
    return tf.constant(tokenized_text), label

In [461]:
# Use tf.py_function for tokenization
def tokenize_dataset(text, label):
    """"
    This Function function bridges the gap between TensorFlow's tensor operations 
    and Python functions like tokenize_map by using tf.py_function, so we can excute the python function 
    in tensorflow compatible way.
    """
    tokenized_text, label = tf.py_function(tokenize_map, [text, label], [tf.int32, tf.bool])
    tokenized_text.set_shape([None])
    label.set_shape([])
    return tokenized_text, label

train_ds = train_ds.map(tokenize_dataset)

In [462]:
# Analyze the tokenized dataset
for sample in train_ds.take(1):
    print(sample)
    print(sample[0].shape)
    print(tf.expand_dims(sample[0], 0))
    print(sample[0].shape)

(<tf.Tensor: shape=(85,), dtype=int32, numpy=
array([  36,   17,  123, 1965,   20,  615,   28, 4407,  490,  377,   12,
        616,   10,  490, 3276,   12, 4733,  104,  276,    7,    5,  490,
       1146,   12, 2536, 3968,    4,   12, 2537,  449,  175,   54,  377,
         12,  616,    8, 1611, 1704,   13,  636, 1333,   77,  377,   12,
       4733,   16, 2537,    8, 1232, 1704,   83,    2, 2272, 1363,   23,
         56, 1345,  377,   12, 2536,   10,    2,   44,  589,  173,   21,
       1904,   12,    2, 1098,    3, 2191, 6939,    4,  377,   28,  134,
        219,  490,  377,   83,  838, 4939,   14,  490], dtype=int32)>, <tf.Tensor: shape=(), dtype=bool, numpy=False>)
(85,)
tf.Tensor(
[[  36   17  123 1965   20  615   28 4407  490  377   12  616   10  490
  3276   12 4733  104  276    7    5  490 1146   12 2536 3968    4   12
  2537  449  175   54  377   12  616    8 1611 1704   13  636 1333   77
   377   12 4733   16 2537    8 1232 1704   83    2 2272 1363   23   56
  1345  377   12 25

In [463]:
length = [len(seq[0]) for seq in train_ds]
print(max(length)) # max length
print(type(length))
print(np.percentile(length, 30)) # length at 30th percentile
print(np.percentile(length, 60)) # length at 60th percentile
print(np.percentile(length, 90)) # length at 90th percentile

770
<class 'list'>
72.0
106.0
171.0


In [464]:
# Padding -- to make the length of the data equals,
# If most sequences are much shorter than max_length(770), choose a smaller length (e.g., 128 or 256)
# we can see that the 90% of the lengths are below 171, so we can use 128 as the max length
def pad_sequence(text, label):
    text = pad_sequences([text.numpy()], maxlen=128, padding = 'post')[0]
    return text, label

def general_function(text, label):
    text, label = tf.py_function(pad_sequence, [text, label], [tf.int32, tf.bool])
    text = tf.convert_to_tensor(text, dtype=tf.int32)
    label = tf.convert_to_tensor(label, dtype=tf.bool)
    text.set_shape(tf.TensorShape([128]))
    label.set_shape(tf.TensorShape([]))
    return text, label

train_ds = train_ds.map(general_function)


In [465]:
for text, label in train_ds.take(2):
    print(text)

tf.Tensor(
[  36   17  123 1965   20  615   28 4407  490  377   12  616   10  490
 3276   12 4733  104  276    7    5  490 1146   12 2536 3968    4   12
 2537  449  175   54  377   12  616    8 1611 1704   13  636 1333   77
  377   12 4733   16 2537    8 1232 1704   83    2 2272 1363   23   56
 1345  377   12 2536   10    2   44  589  173   21 1904   12    2 1098
    3 2191 6939    4  377   28  134  219  490  377   83  838 4939   14
  490    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0], shape=(128,), dtype=int32)
tf.Tensor(
[  269  3106  3277     8    20    57   135    33   272    11  1399    49
   948    12  6177  7398     4  1461 10266    15     8     2    40    33
   463  1131    11    18   282     4     2    40   269  3106    33    97
   269  3106  4734  1054     6   633   176  2381     5  5632   113     2
   565 

In [466]:
batch_size = 32
train_ds = train_ds.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [467]:
# Build the model
model = Sequential([
    # This embedding layer generates a vector of output_dim for each unique sequence or word(token) 
    # with similar(not same) vector for the words used in similar context, the
    # input_dim is the length of word_index + 1(for padded 0's)
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=16),
    LSTM(16),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid') #Binary classication Yes or No
])

In [468]:
#Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [469]:
# preprocess the validation and test data
valid_ds = valid_ds.map(preprocess).map(tokenize_dataset).map(general_function)
test_ds = test_ds.map(preprocess).map(tokenize_dataset).map(general_function)

valid_ds = valid_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)


In [470]:
for text, label in train_ds.take(1):
    print(type(text), text.shape, label.shape)

for text, label in valid_ds.take(1):
    print(type(text), text.shape, label.shape)

for sample in test_ds.take(1):
    print(type(sample), sample[0].shape)

<class 'tensorflow.python.framework.ops.EagerTensor'> (32, 128) (32,)
<class 'tensorflow.python.framework.ops.EagerTensor'> (32, 128) (32,)
<class 'tuple'> (32, 128)


In [471]:
len(train_ds)*32, len(valid_ds)*32, len(test_ds)*32

(8512, 3296, 864)

In [472]:
model.summary()

In [473]:
# Train the model
history = model.fit(train_ds, validation_data=valid_ds, epochs=10)

Epoch 1/10
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 103ms/step - accuracy: 0.6258 - loss: 0.6666 - val_accuracy: 0.6217 - val_loss: 0.6633
Epoch 2/10
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 101ms/step - accuracy: 0.6291 - loss: 0.6581 - val_accuracy: 0.6217 - val_loss: 0.6660
Epoch 3/10
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 96ms/step - accuracy: 0.6769 - loss: 0.6077 - val_accuracy: 0.6312 - val_loss: 0.6857
Epoch 4/10
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 98ms/step - accuracy: 0.7191 - loss: 0.5388 - val_accuracy: 0.6272 - val_loss: 0.7054
Epoch 5/10
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 90ms/step - accuracy: 0.7200 - loss: 0.4818 - val_accuracy: 0.6300 - val_loss: 0.7783
Epoch 6/10
[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 100ms/step - accuracy: 0.7984 - loss: 0.4115 - val_accuracy: 0.6086 - val_loss: 0.8510
Epoch 7/10
[

In [474]:
# Evaluate the model
evaluation = model.evaluate(test_ds)
print(evaluation)

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 0.9655 - loss: 0.1162
[0.13022999465465546, 0.9587750434875488]


In [475]:
for text_batch, label_batch in test_ds.take(1):
    prediction = model.predict(text_batch)
    predict = [[1 if prob >= 0.5 else 0 for prob in prob_list] for prob_list in prediction]
    actual_label = label_batch.numpy().tolist()
    for i in range(len(predict)):
        print(f"Actual Label : {actual_label[i]}, Prediction : {predict[i]}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 492ms/step
Actual Label : True, Prediction : [1]
Actual Label : False, Prediction : [0]
Actual Label : False, Prediction : [0]
Actual Label : True, Prediction : [1]
Actual Label : True, Prediction : [1]
Actual Label : True, Prediction : [1]
Actual Label : True, Prediction : [1]
Actual Label : False, Prediction : [0]
Actual Label : False, Prediction : [0]
Actual Label : True, Prediction : [1]
Actual Label : True, Prediction : [1]
Actual Label : True, Prediction : [1]
Actual Label : False, Prediction : [0]
Actual Label : False, Prediction : [0]
Actual Label : True, Prediction : [1]
Actual Label : True, Prediction : [1]
Actual Label : True, Prediction : [1]
Actual Label : True, Prediction : [1]
Actual Label : True, Prediction : [1]
Actual Label : True, Prediction : [1]
Actual Label : False, Prediction : [0]
Actual Label : True, Prediction : [1]
Actual Label : False, Prediction : [0]
Actual Label : False, Prediction : [0]
Actual