In [4]:
import pandas as pd
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering, DefaultDataCollator, create_optimizer
import datasets
import tensorflow as tf

In [3]:
# def load_train_val_test_datasets(dataset_path='./../data/datasets/squad'):
#     train = pd.read_csv(f'{dataset_path}/train.csv').dropna()
#     val = pd.read_csv(f'{dataset_path}/dev.csv').dropna()
#     test = pd.read_csv(f'{dataset_path}/test.csv').dropna()
#     return train, val, test


# def convert_dataframes_to_datasets(dataframes: list):
#     return tuple(
#         [datasets.Dataset.from_pandas(dataframe, preserve_index=False) for dataframe in
#          dataframes])


# df_train, df_val, df_test = load_train_val_test_datasets()

# train_dataset, val_dataset, test_dataset = convert_dataframes_to_datasets([df_train, df_val, df_test])

In [5]:
from datasets import load_dataset

raw_datasets = load_dataset("squad")
train_dataset = raw_datasets['train']
val_dataset = raw_datasets['validation']

Found cached dataset squad (C:/Users/Artur/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
train_dataset, val_dataset

(Dataset({
     features: ['id', 'title', 'context', 'question', 'answers'],
     num_rows: 87599
 }),
 Dataset({
     features: ['id', 'title', 'context', 'question', 'answers'],
     num_rows: 10570
 }))

In [8]:
model_checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [9]:
def tokenize_sample(sample, max_tokens=None, padding=False):
    question = sample['question'].strip()
    context = sample['context'].strip()

    return tokenizer(
        question,
        context,
        max_length=max_tokens,
        padding=padding
    )


tokenized_train_dataset = train_dataset.map(tokenize_sample)
tokenized_val_dataset = val_dataset.map(tokenize_sample)
# tokenized_test_dataset = test_dataset.map(tokenize_sample)

print('Max number of tokens in tokenized train dataset: ', len(max(tokenized_train_dataset['input_ids'], key=len)))
print('Max number of tokens in tokenized val dataset: ', len(max(tokenized_val_dataset['input_ids'], key=len)))
# print('Max number of tokens in tokenized test dataset: ', len(max(tokenized_test_dataset['input_ids'], key=len)))

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (551 > 512). Running this sequence through the model will result in indexing errors
Loading cached processed dataset at C:\Users\Artur\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-7db0a997d2ba11fd.arrow


Max number of tokens in tokenized train dataset:  882
Max number of tokens in tokenized val dataset:  833


In [10]:
tokenized_train_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 87599
})

In [11]:
tokenized_train_dataset[0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]},
 'input_ids': [101,
  1706,
  2292,
  1225,
  1103

In [12]:
max_length = 384


def filter_samples_below_number_of_tokens(dataset, max_tokens: int):
    indices_to_remove = []

    # Find indices of samples where number of tokens exceeds max number of tokens
    for index, sample in enumerate(dataset):
        tokenized_sample = tokenize_sample(sample)
        if len(tokenized_sample['input_ids']) > max_tokens:
            indices_to_remove.append(index)

    # Keep only samples with number of tokens less or equal than max number of tokens
    dataset_indices = range(len(dataset))
    filtered_dataset = dataset.select(
        index for index in dataset_indices if index not in set(indices_to_remove)
    )

    return filtered_dataset


filtered_train_dataset = filter_samples_below_number_of_tokens(train_dataset, max_tokens=max_length)
filtered_val_dataset = filter_samples_below_number_of_tokens(val_dataset, max_tokens=max_length)
# filtered_test_dataset = filter_samples_below_number_of_tokens(test_dataset, max_tokens=max_length)



In [13]:
print('Number of samples in tokenized train dataset before filtering: ', len(train_dataset))
print('Number of samples in tokenized val dataset before filtering: ', len(val_dataset))
# print('Number of samples in tokenized test dataset before filtering: ', len(test_dataset))

print('\n---------------\n')

print('Number of samples in tokenized train dataset after filtering: ', len(filtered_train_dataset))
print('Number of samples in tokenized val dataset after filtering: ', len(filtered_val_dataset))
# print('Number of samples in tokenized test dataset after filtering: ', len(filtered_test_dataset))

Number of samples in tokenized train dataset before filtering:  87599
Number of samples in tokenized val dataset before filtering:  10570

---------------

Number of samples in tokenized train dataset after filtering:  86512
Number of samples in tokenized val dataset after filtering:  10353


In [19]:
def tokenize(examples):
    questions = [q.strip() for q in examples['question']]
    contexts = [c.strip() for c in examples['context']]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        padding='max_length'
    )

    return inputs


def preprocess_dataset(dataset):
    dataset = dataset.map(tokenize, batched=True)

    answers = dataset['answers']
    start_positions = []
    end_positions = []

    for index, _ in enumerate(dataset):
        start_char = answers[index]['answer_start'][0]
        end_char = start_char + len(answers[index]['text'][0])

        start_positions.append(start_char)
        end_positions.append(end_char)

    dataset = dataset.add_column('start_positions', start_positions)
    dataset = dataset.add_column('end_positions', end_positions)
    return dataset


tokenized_train_dataset = preprocess_dataset(filtered_train_dataset)
tokenized_val_dataset = preprocess_dataset(filtered_val_dataset)
# tokenized_test_dataset = preprocess_dataset(filtered_test_dataset)

Loading cached processed dataset at C:\Users\Artur\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-8e6dd48790a0de3f.arrow


Map:   0%|          | 0/10353 [00:00<?, ? examples/s]

In [20]:
tokenized_train_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 86512
})

In [18]:
filtered_train_dataset['answers'][0]['answer_start'][0]

515

In [21]:
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [23]:
print(f'All tokenized train dataset entries have {max_length} tokens: ',
      all([len(input_ids) == max_length for input_ids in tokenized_train_dataset['input_ids']]))
print(f'All tokenized val dataset entries have {max_length} tokens: ',
      all([len(input_ids) == max_length for input_ids in tokenized_val_dataset['input_ids']]))
# print(f'All tokenized test dataset entries have {max_length} tokens: ',
      # all([len(input_ids) == max_length for input_ids in tokenized_test_dataset['input_ids']]))

All tokenized train dataset entries have 384 tokens:  True
All tokenized val dataset entries have 384 tokens:  True


In [24]:
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

Some weights or buffers of the TF 2.0 model TFBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
data_collator = DefaultDataCollator(return_tensors='tf')

In [26]:
tokenized_train_dataset['start_positions']

[515,
 188,
 279,
 381,
 92,
 248,
 441,
 598,
 126,
 908,
 119,
 145,
 234,
 356,
 675,
 487,
 46,
 126,
 271,
 155,
 496,
 68,
 155,
 647,
 358,
 624,
 1163,
 92,
 757,
 4,
 466,
 303,
 377,
 360,
 136,
 145,
 188,
 344,
 394,
 109,
 138,
 213,
 488,
 618,
 32,
 362,
 565,
 155,
 918,
 0,
 353,
 406,
 638,
 85,
 3,
 136,
 123,
 222,
 49,
 0,
 963,
 1049,
 1099,
 86,
 0,
 68,
 233,
 4,
 80,
 118,
 427,
 753,
 891,
 71,
 196,
 1446,
 1588,
 49,
 6,
 136,
 350,
 32,
 368,
 73,
 197,
 331,
 1237,
 251,
 702,
 90,
 228,
 385,
 862,
 244,
 595,
 8,
 66,
 430,
 117,
 204,
 354,
 251,
 274,
 297,
 571,
 1193,
 819,
 842,
 11,
 11,
 292,
 321,
 587,
 428,
 522,
 720,
 4,
 575,
 37,
 181,
 262,
 82,
 439,
 82,
 625,
 921,
 1199,
 141,
 64,
 314,
 403,
 576,
 191,
 6,
 68,
 138,
 488,
 596,
 162,
 202,
 349,
 474,
 730,
 56,
 73,
 157,
 284,
 535,
 120,
 336,
 398,
 613,
 1755,
 142,
 471,
 596,
 750,
 198,
 289,
 535,
 210,
 0,
 85,
 122,
 221,
 424,
 78,
 60,
 134,
 242,
 275,
 4,
 159,
 179,

In [27]:
tokenized_train_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 86512
})

In [28]:
tf_train_dataset = model.prepare_tf_dataset(
    tokenized_train_dataset,
    collate_fn=data_collator,
    shuffle=True,
    batch_size=8,
)
tf_val_dataset = model.prepare_tf_dataset(
    tokenized_val_dataset,
    collate_fn=data_collator,
    shuffle=False,
    batch_size=8,
)

In [29]:
tf_train_dataset

<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(8, 384), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(8, 384), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(8, 384), dtype=tf.int64, name=None)}, {'start_positions': TensorSpec(shape=(8,), dtype=tf.int64, name=None), 'end_positions': TensorSpec(shape=(8,), dtype=tf.int64, name=None)})>

In [30]:
# <_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(16, 384), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(16, 384), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(16, 384), dtype=tf.int64, name=None)}, {'start_positions': TensorSpec(shape=(16,), dtype=tf.int64, name=None), 'end_positions': TensorSpec(shape=(16,), dtype=tf.int64, name=None)})>

In [None]:
# tf_train_dataset = tokenized_train_dataset.to_tf_dataset(
#     columns=['input_ids', 'token_type_ids', 'attention_mask'],
#     label_cols=['start_positions', 'end_positions'],
#     batch_size=8,
#     shuffle=False
# )

# tf_val_dataset = tokenized_val_dataset.to_tf_dataset(
#     columns=['input_ids', 'token_type_ids', 'attention_mask'],
#     label_cols=['start_positions', 'end_positions'],
#     batch_size=8,
#     shuffle=False
# )

In [None]:
# tf_train_dataset

<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 384), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, 384), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 384), dtype=tf.int64, name=None)}, {'start_positions': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'end_positions': TensorSpec(shape=(None,), dtype=tf.int64, name=None)})>

In [31]:
num_train_epochs = 3
num_train_steps = len(tf_train_dataset) * num_train_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 4070, compute capability 8.9


In [32]:
train_dataset["question"][0]


'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'

In [33]:
train_dataset["context"][0]


'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [92]:
input_test = tokenizer(train_dataset["question"][0], train_dataset["context"][0], return_tensors="tf")
output = model(**input_test)

answer_start_index = int(tf.math.argmax(output.start_logits, axis=-1)[0])
answer_end_index = int(tf.math.argmax(output.end_logits, axis=-1)[0])

predict_answer_tokens = input_test.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

''

In [93]:
input_test = question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

input_test = tokenizer(question, text, return_tensors="tf")
output = model(**input_test)

answer_start_index = int(tf.math.argmax(output.start_logits, axis=-1)[0])
answer_end_index = int(tf.math.argmax(output.end_logits, axis=-1)[0])

predict_answer_tokens = input_test.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

''

In [94]:
input_test

{'input_ids': <tf.Tensor: shape=(1, 16), dtype=int32, numpy=
array([[  101,  2627,  1108,  3104,  1124, 15703,   136,   102,  3104,
         1124, 15703,  1108,   170,  3505, 16797,   102]])>, 'token_type_ids': <tf.Tensor: shape=(1, 16), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]])>, 'attention_mask': <tf.Tensor: shape=(1, 16), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])>}

In [95]:
output

TFQuestionAnsweringModelOutput(loss=None, start_logits=<tf.Tensor: shape=(1, 16), dtype=float16, numpy=
array([[-0.3826 , -0.02153, -0.414  , -0.1685 ,  0.11456, -0.08124,
         0.0786 , -0.5254 , -0.00936,  0.328  ,  0.1414 , -0.273  ,
        -0.1273 , -0.1489 , -0.2788 , -0.5254 ]], dtype=float16)>, end_logits=<tf.Tensor: shape=(1, 16), dtype=float16, numpy=
array([[ 0.7124 , -0.0422 , -0.367  , -0.1194 ,  0.09766, -0.01522,
         0.1018 , -0.2605 , -0.09937, -0.1198 , -0.2312 , -0.1819 ,
        -0.3196 , -0.1366 ,  0.06235, -0.2605 ]], dtype=float16)>, hidden_states=None, attentions=None)

In [63]:
model.loss

'categorical_crossentropy'

In [75]:
model

<transformers.models.bert.modeling_tf_bert.TFBertForQuestionAnswering at 0x1e93d01b460>

In [34]:
model.fit(tf_train_dataset, validation_data=tf_val_dataset, epochs=num_train_epochs)

Epoch 1/3
  278/10814 [..............................] - ETA: 38:39 - loss: nan

KeyboardInterrupt: 

In [69]:
model.summary()

Model: "tf_bert_for_question_answering"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108891648 
_________________________________________________________________
qa_outputs (Dense)           multiple                  1538      
Total params: 108,893,186
Trainable params: 108,893,186
Non-trainable params: 0
_________________________________________________________________
