In [1]:
from transformers import (
    AutoTokenizer,
    TFAutoModelForQuestionAnswering,
    DefaultDataCollator,
    GPT2TokenizerFast
)
import tensorflow as tf
from question_answering.constants import constants
from question_answering.custom_models.tf_gpt2_for_question_answering import TFGPT2ForQuestionAnswering
from question_answering.utils import core_qa_utils
from question_answering.utils.extractive_qa import (
    graphs,
    model_management,
    predictions,
    squad_metrics,
    pure_metrics,
    squad_preprocessing,
    medical_preprocessing,
)
from question_answering.paths import extractive_qa_paths
import ipynbname
from question_answering.keras_callbacks.time_measure_callback import TimeMeasureCallback
import numpy as np
from datasets import Dataset

In [15]:
model_name = "gpt2"  # Change to your desired GPT variant
tokenizer = GPT2TokenizerFast.from_pretrained(model_name, add_bos_token = True)

In [16]:
tokenizer.add_bos_token

True

In [17]:
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [3]:
from transformers import TFAutoModelForQuestionAnswering

model = TFAutoModelForQuestionAnswering.from_pretrained("gpt2")

ValueError: Unrecognized configuration class <class 'transformers.models.gpt2.configuration_gpt2.GPT2Config'> for this kind of AutoModel: TFAutoModelForQuestionAnswering.
Model type should be one of AlbertConfig, BertConfig, CamembertConfig, ConvBertConfig, DebertaConfig, DebertaV2Config, DistilBertConfig, ElectraConfig, FlaubertConfig, FunnelConfig, GPTJConfig, LayoutLMv3Config, LongformerConfig, MobileBertConfig, MPNetConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoFormerConfig, XLMConfig, XLMRobertaConfig, XLNetConfig.

In [19]:
from question_answering.paths import extractive_qa_paths
from question_answering.utils import core_qa_utils

raw_train_dataset, raw_test_dataset = core_qa_utils.load_datasets_from_json(
    dataset_path=extractive_qa_paths.squad_dataset_dir,
    filenames=["original_train.json", "original_test.json"],
)

In [20]:
train_dataset = raw_train_dataset.select(range(80000))
val_dataset = raw_train_dataset.select(range(80000, 87599))
test_dataset = raw_test_dataset

In [21]:
train_dataset, val_dataset, test_dataset

(Dataset({
     features: ['id', 'title', 'context', 'question', 'answers', 'answer_text', 'answer_start'],
     num_rows: 80000
 }),
 Dataset({
     features: ['id', 'title', 'context', 'question', 'answers', 'answer_text', 'answer_start'],
     num_rows: 7599
 }),
 Dataset({
     features: ['id', 'title', 'context', 'question', 'answers', 'answer_text', 'answer_start'],
     num_rows: 10570
 }))

In [22]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.sep_token = '<|sep|>'
tokenizer.cls_token = '<|cls|>'

In [23]:
special_tokens_map = {'additional_special_tokens': ['<|sep|>', '<|cls|>', '<|question|>', '<|context|>']}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_map)

In [24]:
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'sep_token': '<|sep|>', 'pad_token': '<|endoftext|>', 'cls_token': '<|cls|>', 'additional_special_tokens': ['<|context|>', '<|question|>', '<|cls|>', '<|sep|>']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50257: AddedToken("<|context|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50258: AddedToken("<|question|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50259: AddedToken("<|cls|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50260: AddedToken("<|sep|>", rstrip=False, lstrip=False, single_word=False,

In [25]:
sampled_set = train_dataset.select(range(5))

In [26]:
max_length = 384
stride = 128

"[Q] question [C] context"

def preprocess_samples(samples):
    questions = [f"<|question|> {q.strip()} " for q in samples["question"]]
    contexts = [f"<|context|> {c.strip()} {c.strip()} {c.strip()} " for c in samples["context"]]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        padding="max_length",
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )
    input_ids = inputs.pop("input_ids")

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answer_starts = samples["answer_start"]
    answer_texts = samples["answer_text"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        decoded = tokenizer.decode(input_ids[i])
        sample_idx = sample_map[i]
        answer_start = answer_starts[sample_idx][0]
        answer_text = answer_texts[sample_idx][0]
        start_char = answer_start
        end_char = start_char + len(answer_text)
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if (
                offset[context_start][0] > start_char
                or offset[context_end][1] < end_char
        ):
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end TOKEN positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


a = sampled_set.map(
    preprocess_samples, batched=True, remove_columns=sampled_set.column_names
)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [40]:
a

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 5
})

In [39]:
decoded = tokenizer.decode(a[0]["input_ids"])

In [42]:
sampled_set["answer_text"]

[['Saint Bernadette Soubirous'],
 ['a copper statue of Christ'],
 ['the Main Building'],
 ['a Marian place of prayer and reflection'],
 ['a golden statue of the Virgin Mary']]

In [41]:
decoded[a[0]["start_positions"]:a[0]["end_positions"] + 1]

'e Main B'

In [11]:
from question_answering.utils.extractive_qa import squad_preprocessing

tokenized_train_dataset = squad_preprocessing.preprocess_squad_training_dataset(
    dataset=train_dataset,
    tokenizer=tokenizer,
    max_length=max_length,
    stride=stride,
    remove_columns=train_dataset.column_names,
)
tokenized_val_dataset = squad_preprocessing.preprocess_squad_training_dataset(
    dataset=val_dataset,
    tokenizer=tokenizer,
    max_length=max_length,
    stride=stride,
    remove_columns=val_dataset.column_names,
)

In [12]:
tokenized_test_dataset = squad_preprocessing.preprocess_squad_test_dataset(
    dataset=test_dataset,
    tokenizer=tokenizer,
    max_length=max_length,
    stride=stride,
    remove_columns=test_dataset.column_names,
)

In [13]:
from question_answering.constants import constants
import ipynbname

# Model parameters
full_model_name = "-".join(ipynbname.name().split("_"))

# Checkpoints
checkpoint_filename_template = constants.checkpoint_filename_template
checkpoints_path = (
        extractive_qa_paths.training_checkpoints_dir
        / full_model_name
        / checkpoint_filename_template
)

# Saved models
saved_models_path = extractive_qa_paths.saved_models_dir / full_model_name

# Evaluation
model_evaluation_dir = extractive_qa_paths.model_evaluation_dir / full_model_name
figures_dir = model_evaluation_dir / constants.figures_dir_name

# Hyper parameters
batch_size = 4
train_epochs = 10
initial_learning_rate = 2e-5
end_learning_rate = 0

In [10]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="tf")

tf_train_dataset = core_qa_utils.convert_to_tf_dataset(
    hf_dataset=tokenized_train_dataset,
    columns=["input_ids", "token_type_ids", "attention_mask"],
    label_cols=["start_positions", "end_positions"],
    collator=data_collator,
    batch_size=batch_size,
)
tf_val_dataset = core_qa_utils.convert_to_tf_dataset(
    hf_dataset=tokenized_val_dataset,
    columns=["input_ids", "token_type_ids", "attention_mask"],
    label_cols=["start_positions", "end_positions"],
    collator=data_collator,
    batch_size=batch_size,
)
tf_test_dataset = core_qa_utils.convert_to_tf_dataset(
    hf_dataset=tokenized_test_dataset,
    columns=["input_ids", "token_type_ids", "attention_mask"],
    label_cols=None,
    collator=data_collator,
    batch_size=batch_size,
)