### Installs and Imports

In [None]:
!pip install -q transformers

In [None]:
import re
import json
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

from transformers import AutoTokenizer, TFAutoModel
from transformers import logging
logging.set_verbosity_error()  # skip warnings

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

In [None]:
# Mount Google Drive at /content/drive; the model checkpoint directory used
# later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# BoolQ Dataset

### Download, Extract, and Load the bool_q dataset

In [None]:
# Load BoolQ through TensorFlow Datasets. `boolq` maps split name -> dataset;
# with_info=True additionally returns the dataset metadata as `boolq_info`.
boolq, boolq_info = tfds.load('bool_q', with_info=True)

In [None]:
boolq

{Split('train'): <_PrefetchDataset element_spec={'answer': TensorSpec(shape=(), dtype=tf.bool, name=None), 'passage': TensorSpec(shape=(), dtype=tf.string, name=None), 'question': TensorSpec(shape=(), dtype=tf.string, name=None), 'title': TensorSpec(shape=(), dtype=tf.string, name=None)}>,
 Split('validation'): <_PrefetchDataset element_spec={'answer': TensorSpec(shape=(), dtype=tf.bool, name=None), 'passage': TensorSpec(shape=(), dtype=tf.string, name=None), 'question': TensorSpec(shape=(), dtype=tf.string, name=None), 'title': TensorSpec(shape=(), dtype=tf.string, name=None)}>}

In [None]:
boolq_info.features

FeaturesDict({
    'answer': bool,
    'passage': Text(shape=(), dtype=string),
    'question': Text(shape=(), dtype=string),
    'title': Text(shape=(), dtype=string),
})

In [None]:
# Number of examples in the train and validation splits.
train_len, val_len = (
    boolq[split_name].cardinality().numpy()
    for split_name in ('train', 'validation')
)
print('bool_q train examples: ', train_len)
print('bool_q val examples: ', val_len)

bool_q train examples:  9427
bool_q val examples:  3270


In [None]:
# Pull each split into memory as one batch holding every example, giving a
# dict of field name -> tensor per split.
train_batches = boolq['train'].batch(train_len)
val_batches = boolq['validation'].batch(val_len)
boolq_train_data = next(iter(train_batches))
boolq_val_data = next(iter(val_batches))

In [None]:
# Preview the first four training examples, one field per line.
for example_idx in range(4):
    print()
    for field_name in boolq_train_data:
        field_value = boolq_train_data[field_name][example_idx]
        print(f"{field_name:9s}: {field_value.numpy()}")


answer   : False
passage  : b'There are four ways an individual can acquire Canadian citizenship: by birth on Canadian soil; by descent (being born to a Canadian parent); by grant (naturalization); and by adoption. Among them, only citizenship by birth is granted automatically with limited exceptions, while citizenship by descent or adoption is acquired automatically if the specified conditions have been met. Citizenship by grant, on the other hand, must be approved by the Minister of Immigration, Refugees and Citizenship.'
question : b'can i get canadian citizenship if my grandfather was canadian'
title    : b'Canadian nationality law'

answer   : True
passage  : b'Star Trek: Discovery is an American television series created for CBS All Access by Bryan Fuller and Alex Kurtzman. It is the first series developed specifically for that service, and the first Star Trek series since Star Trek: Enterprise concluded in 2005. Set roughly a decade before the events of the original Star Trek s

In [None]:
pd.DataFrame(boolq_train_data['answer']).value_counts()

True     5874
False    3553
dtype: int64

### Baseline metric: predict most common class (YES)

In [None]:
# Baseline accuracy: the fraction of training answers that are True, i.e. the
# accuracy of always answering "yes" (the most common class in the counts above).
# Taking the mean of the boolean labels avoids building a DataFrame and looking
# up the scalar key `True` in its value_counts() index.
boolq_train_data['answer'].numpy().mean()

0.6231038506417736

# BERT

### Load BERT Models

In [None]:
# Hugging Face checkpoints compared in this section.
bert_model_names = [
    'bert-base-uncased',
    'distilbert-base-cased-distilled-squad',
]

# One tokenizer and one encoder per checkpoint, in the same order as the names.
tokenizers = list(map(AutoTokenizer.from_pretrained, bert_model_names))
bert_models = list(map(TFAutoModel.from_pretrained, bert_model_names))

# Per-model layer number handed to build_boolq_model as `last_layer_num`
# (the transformer block that is left trainable).
bert_last_layer_num = [11, 5]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

In [None]:
# Directory on the mounted Google Drive where per-epoch weight checkpoints are
# written during training and read back for analysis. Adjust for your own Drive.
checkpoint_dir = '/content/drive/MyDrive/266 - Natural Language Processing/Final Project/model_checkpoints/'

### Preprocess data for BERT

In [None]:
def preprocess_boolq(df, tokenizer, max_length=128):
    """Tokenize BoolQ (passage, question) pairs for a BERT-style model.

    Args:
        df: mapping with 'passage', 'question' and 'answer' entries, each
            exposing `.numpy()`; passages and questions are byte strings.
        tokenizer: tokenizer providing `batch_encode_plus`.
        max_length: every pair is padded / truncated to this many tokens.

    Returns:
        A tuple `(inputs, labels)` where `inputs` is the list
        `[input_ids, attention_mask, token_type_ids]` produced by the tokenizer
        and `labels` is `df['answer']` converted with `.numpy()`.
    """
    raw_passages = df['passage'].numpy()
    raw_questions = df['question'].numpy()

    # The passage is the first segment and the question the second.
    text_pairs = []
    for idx in range(raw_questions.shape[0]):
        text_pairs.append((raw_passages[idx].decode(), raw_questions[idx].decode()))

    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="tf",
    )
    encoded = tokenizer.batch_encode_plus(text_pairs, **encode_options)

    model_inputs = [encoded[key]
                    for key in ("input_ids", "attention_mask", "token_type_ids")]
    labels = df['answer'].numpy()
    return model_inputs, labels

### Build model API

In [None]:
# Build the BoolQ classifier: take a pretrained BERT-style encoder, freeze every
# weight outside one transformer block, then pass the first-position ([CLS])
# vector of the encoder output through dropout into a single sigmoid unit.

def build_boolq_model(bert_model, max_length, last_layer_num,
                      dropout_rate=0.3, learning_rate=0.001):
    """Wrap a pretrained encoder in a compiled binary (yes/no) classifier.

    Args:
        bert_model: pretrained transformers TF encoder (BERT or DistilBERT).
            Its weights are modified in place: everything outside block
            `last_layer_num` is marked non-trainable.
        max_length: token length of every input sequence.
        last_layer_num: number of the transformer block that stays trainable.
        dropout_rate: dropout applied to the [CLS] vector before the dense head.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        A compiled Keras model taking `[input_ids, attention_masks,
        token_type_ids]` and producing one sigmoid probability per example.
    """
    # Freeze everything except the selected transformer block. The match is on
    # the weight name only, so the embeddings and the pooler are frozen too.
    # The trailing '/' keeps e.g. block 1 from also matching blocks 10 and 11.
    # NOTE(review): this flips the private `_trainable` flag on each variable
    # rather than using the public `layer.trainable` API -- confirm it is
    # honored by the installed TF/Keras version.
    trainable_block_tag = 'layer_._%d/' % last_layer_num
    for w in bert_model.weights:
        if trainable_block_tag not in w.name:
            w._trainable = False

    input_ids = keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
    attention_masks = keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_masks')
    token_type_ids = keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='token_type_ids')

    # DistilBERT is called without token_type_ids. Keras appends a numeric
    # suffix to the model name when the class is instantiated more than once
    # (e.g. after re-running the loading cell), so match on the prefix.
    if bert_model.name.startswith('tf_distil_bert_model'):
        bert_output = bert_model(input_ids, attention_mask=attention_masks)
    else:
        bert_output = bert_model(input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids)
    # First output of the encoder, position 0 of every sequence.
    cls_output = bert_output[0][:, 0, :]

    dropout_output = keras.layers.Dropout(dropout_rate)(cls_output)
    final_output = keras.layers.Dense(1, activation="sigmoid")(dropout_output)

    # token_type_ids stays a model input for both encoder types so the same
    # three-element input list can be fed to every model.
    model = tf.keras.models.Model(inputs=[input_ids, attention_masks, token_type_ids],
                                  outputs=[final_output])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

### Train and save models

In [None]:
# Token sequence length shared by preprocessing and model construction.
max_length = 128

In [None]:
# Fine-tune one classifier per checkpoint and keep it under its checkpoint name.
boolq_models = {}
for i, model_name in enumerate(bert_model_names):

    # Tokenize both splits with the tokenizer belonging to this checkpoint.
    boolq_train_inputs, boolq_train_labels = preprocess_boolq(
        boolq_train_data, tokenizers[i], max_length)
    boolq_val_inputs, boolq_val_labels = preprocess_boolq(
        boolq_val_data, tokenizers[i], max_length)

    # Weight each training example by (1 - relative frequency of its class).
    n_train = len(boolq_train_labels)
    sample_weight = np.ones(shape=(n_train,))
    for label in range(np.max(boolq_train_labels)+1):
        label_mask = boolq_train_labels == label
        class_weight = 1 - np.sum(label_mask) / n_train
        sample_weight[label_mask] = class_weight

    # One weights file per epoch; epoch number and val accuracy go in the name.
    checkpoint_prefix = checkpoint_dir + 'boolq_weights_%s' % model_name
    checkpoint_filepath = checkpoint_prefix + '.{epoch:02d}-{val_accuracy:.2f}.hdf5'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True
    )

    classifier = build_boolq_model(
        bert_models[i],
        max_length,
        bert_last_layer_num[i]
    )
    boolq_models[model_name] = classifier
    classifier.fit(
        boolq_train_inputs,
        boolq_train_labels,
        sample_weight=sample_weight,
        validation_data=[boolq_val_inputs, boolq_val_labels],
        epochs=5,
        callbacks=[model_checkpoint_callback]
    )

### BERT analysis

In [None]:
# Restore, for each model, the checkpoint of the epoch with the highest
# validation accuracy. The ModelCheckpoint callback wrote these files from the
# full classifier models in `boolq_models` (encoder + dense head), so they are
# loaded back into those same models.
# NOTE(review): loading them into the bare encoders in `bert_models` with
# by_name=True appears to match none of the saved top-level layer names, which
# would silently restore nothing and leave the classification head untouched.
best_checkpoints = {
    'bert-base-uncased': 'boolq_weights_bert-base-uncased.03-0.64.hdf5',
    'distilbert-base-cased-distilled-squad': 'boolq_weights_distilbert-base-cased-distilled-squad.01-0.65.hdf5',
}
for model_name, checkpoint_name in best_checkpoints.items():
    boolq_models[model_name].load_weights(checkpoint_dir + checkpoint_name)

In [None]:
def plot_confusion_matrix(y_true, y_pred, label_names):
    """Draw a row-normalized confusion matrix as an annotated heatmap.

    Args:
        y_true: true class labels.
        y_pred: predicted class labels.
        label_names: tick labels used on both axes.
    """
    counts = tf.math.confusion_matrix(y_true, y_pred)
    # Divide every row by its total so each row sums to 1.
    row_totals = counts.numpy().sum(axis=1)
    cm = counts / row_totals[:, tf.newaxis]

    plt.figure(figsize=(20, 7))
    sns.heatmap(cm,
                annot=True,
                xticklabels=label_names,
                yticklabels=label_names)
    plt.xlabel("Predicted")
    plt.ylabel("True")

In [None]:
# Model name -> indices of misclassified validation examples, kept for later inspection.
boolq_wrong_guesses = {}

for i, model_name in enumerate(bert_model_names):
    print(model_name)

    # Re-tokenize the validation split with this model's own tokenizer.
    boolq_val_inputs, boolq_val_labels = preprocess_boolq(
        boolq_val_data, tokenizers[i], max_length)

    # Threshold the sigmoid outputs at 0.5 to get boolean predictions.
    boolq_y_probs = boolq_models[model_name].predict(boolq_val_inputs)
    boolq_y_pred = np.squeeze(boolq_y_probs >= 0.5)

    report = classification_report(boolq_val_labels, boolq_y_pred)
    print(report)

    plot_confusion_matrix(boolq_val_labels, boolq_y_pred, ['No', 'Yes'])

    is_wrong = boolq_val_labels != boolq_y_pred
    boolq_wrong_guesses[model_name] = np.where(is_wrong)

# T5

### Import and load T5 libs

In [None]:
# Must INSTALL sentencepiece and THEN RESTART runtime to be able to instantiate the T5-finetuned tokenizer
!pip install -q sentencepiece
!pip install -q accelerate

In [None]:
from transformers import TFT5ForConditionalGeneration

### T5 Base

In [None]:
# Load the pretrained t5-base tokenizer and TF model. get_answer_t5 reads these
# module-level `tokenizer` / `model` names; both are rebound further down when
# the fine-tuned checkpoint is loaded, so re-run this cell before calling
# get_answer_t5 again.
tokenizer = AutoTokenizer.from_pretrained('t5-base')
model = TFT5ForConditionalGeneration.from_pretrained('t5-base', from_pt=True)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
def get_answer_t5(data, i=0, max_length=128):
    """Ask the currently loaded T5 model one BoolQ question.

    Builds a "(1) Yes (2) No" multiple-choice prompt from example `i` of `data`,
    tokenizes it and generates a short answer. Uses the module-level `tokenizer`
    and `model`.

    Args:
        data: mapping with 'question' and 'passage' entries whose elements
            expose `.numpy()` returning UTF-8 encoded bytes.
        i: index of the example to answer.
        max_length: the prompt is truncated to this many tokens.

    Returns:
        The decoded generation as a string; special tokens are not stripped
        (answers start with '<pad> ').
    """
    # Convert only example i; turning the whole column into numpy on every
    # call is wasted work when a single element is needed.
    question = data['question'][i].numpy().decode('utf-8')
    passage = data['passage'][i].numpy().decode('utf-8')
    input_text = [f"question: {question}? (1) Yes (2) No  context: {passage}"]

    features = tokenizer(input_text,
                         return_tensors='tf',
                         padding=True,
                         truncation=True,
                         max_length=max_length)

    output = model.generate(input_ids=features['input_ids'],
                attention_mask=features['attention_mask'],
                max_length=2) # Only want one word, but all answers start with a '<pad> ' token

    return tokenizer.decode(output[0])

In [None]:
# Answer the first 100 training questions, keeping only the text that follows
# the leading '<pad> ' token.
t5_base_candidates = [
    get_answer_t5(boolq_train_data, example_idx).split('<pad> ')[1]
    for example_idx in range(100)
]
t5_base_candidates[90:100]

['Philadelphia', 'No', 'Hal', 'No', 'Richard', 'No', 'No', 'No', 'No', 'No']

In [None]:
# Show training examples 90-99 next to the generations listed above.
for example_idx in range(90, 100):
    print()
    for field_name in boolq_train_data:
        field_value = boolq_train_data[field_name][example_idx]
        print(f"{field_name:9s}: {field_value.numpy()}")


answer   : True
passage  : b"The NFC East is a division of the National Football League (NFL)'s National Football Conference (NFC). It currently has four members: the Philadelphia Eagles, the New York Giants, the Dallas Cowboys, and the Washington Redskins."
question : b'are the eagles and cowboys in the same division'
title    : b'NFC East'

answer   : False
passage  : b'The first season was released on Netflix on July 15, 2016. It received critical acclaim for its characterization, pacing, atmosphere, acting, soundtrack, directing, writing, and homages to 1980s genre films. The series has received several industry nominations and awards, including winning the Screen Actors Guild Award for Outstanding Performance by an Ensemble in a Drama Series in 2016, and receiving eighteen nominations for the 69th Primetime Emmy Awards, including Outstanding Drama Series. On August 31, 2016, Netflix renewed the series for a second season of nine episodes, which was released on October 27, 2017. I

In [None]:
pd.DataFrame(t5_base_candidates).value_counts(dropna=False)[:10]

No        45
           2
third      2
Tom        2
The        2
no         2
comedy     2
in         1
double     1
each       1
dtype: int64

The model seems to be outputting either 'No' or a word that explains why the answer is 'Yes'.  
So, treat any answer that is not 'No' as a 'Yes'.

In [None]:
# Treat every generation other than "no" as a "yes". The comparison ignores case
# and surrounding whitespace so that lowercase 'no' outputs (present in the
# counts above) are not counted as "yes".
t5_base_candidates_bool = [candidate.strip().lower() != 'no' for candidate in t5_base_candidates]
t5_base_candidates_bool[90:100]

[True, False, True, False, True, False, False, False, False, False]

In [None]:
def get_accuracy(data, candidates):
    """Return the fraction of `candidates` that equal the reference answers.

    Candidate k is compared with `data['answer'][k]`; only the first
    `len(candidates)` answers are used. A candidate of None never matches.
    """
    n_scored = len(candidates)
    predictions = np.asarray(candidates)
    references = data['answer'][:n_scored]
    is_correct = np.equal(references, predictions)
    return np.sum(is_correct) / n_scored

In [None]:
get_accuracy(boolq_train_data, t5_base_candidates_bool)

0.47

### T5 Fine-tuned on BoolQ (`mrm8488/t5-small-finetuned-boolq`)

In [None]:
# Rebind the module-level `tokenizer` / `model` to the BoolQ fine-tuned T5
# checkpoint. get_answer_t5ft reads these names, and so does the earlier
# get_answer_t5, which will use this checkpoint from here on.
tokenizer = AutoTokenizer.from_pretrained('mrm8488/t5-small-finetuned-boolq')
model = TFT5ForConditionalGeneration.from_pretrained('mrm8488/t5-small-finetuned-boolq', from_pt=True)

Downloading pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [None]:
def get_answer_t5ft(data, i=0, max_length=128):
    """Ask the currently loaded fine-tuned T5 model one BoolQ question.

    Builds an "(A) Yes (B) No" multiple-choice prompt from example `i` of
    `data`, tokenizes it and generates a short answer. Uses the module-level
    `tokenizer` and `model`.

    Args:
        data: mapping with 'question' and 'passage' entries whose elements
            expose `.numpy()` returning UTF-8 encoded bytes.
        i: index of the example to answer.
        max_length: the prompt is truncated to this many tokens.

    Returns:
        The decoded generation as a string; special tokens are not stripped
        (answers start with '<pad> ').
    """
    # Convert only example i; turning the whole column into numpy on every
    # call is wasted work when a single element is needed.
    question = data['question'][i].numpy().decode('utf-8')
    passage = data['passage'][i].numpy().decode('utf-8')
    input_text = [f"question: {question}? (A) Yes (B) No  context: {passage}"]

    features = tokenizer(input_text,
                         return_tensors='tf',
                         padding=True,
                         truncation=True,
                         max_length=max_length)

    output = model.generate(input_ids=features['input_ids'],
                attention_mask=features['attention_mask'],
                max_length=2) # Only want one word, but all answers start with a '<pad> ' token

    return tokenizer.decode(output[0])

In [None]:
# Answer the first 100 training questions with the fine-tuned model, keeping
# only the text that follows the leading '<pad> ' token.
t5ft_candidates = [
    get_answer_t5ft(boolq_train_data, example_idx).split('<pad> ')[1]
    for example_idx in range(100)
]
t5ft_candidates[90:100]

['yes', 'no', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes']

In [None]:
pd.DataFrame(t5ft_candidates).value_counts()

yes    59
no     41
dtype: int64

The fine-tuned model performs the task much better: on these 100 examples it produces only 'yes' and 'no' answers. We will consider anything other than 'yes' or 'no' (ignoring case) to be a non-answer and assign it `None`.

In [None]:
# Map each generation to a boolean: 'yes' -> True, 'no' -> False (case-insensitive);
# any other output is a non-answer and becomes None.
answer_lookup = {'yes': True, 'no': False}
t5ft_candidates_bool = [
    answer_lookup.get(str.lower(candidate))
    for candidate in t5ft_candidates
]
t5ft_candidates_bool[90:100]

[True, False, True, True, False, True, True, True, True, True]

In [None]:
get_accuracy(boolq_train_data, t5ft_candidates_bool)

0.85