In [1]:
from datasets import load_dataset
import torch
from transformers import AutoModelForQuestionAnswering
import numpy as np
import evaluate
from tqdm.auto import tqdm

# Load the dataset published under the name "squad"; later cells read its
# "train" and "validation" splits.
raw_datasets = load_dataset("squad")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Take the first training record and show it as the cell output.
example = raw_datasets["train"][0]
example

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [3]:
# Slice the context from character 515 (this example's answer_start) to
# confirm that the answer text begins at that offset.
example["context"][515:]

'Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [4]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer loaded here and the model loaded
# further down in the notebook.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
# Probe sentence mixing ordinary words, a run of spaces and non-Latin
# characters; encode() returns one flat list of token ids (shown below).
test = "In a shocking finding, scientists a herd of unicorns. Some spaces:     . Some Chinese: 边界. "
token_ids = tokenizer.encode(test)
token_ids

[101,
 1130,
 170,
 19196,
 4006,
 117,
 6479,
 170,
 17804,
 1104,
 8362,
 23941,
 1116,
 119,
 1789,
 6966,
 131,
 119,
 1789,
 1922,
 131,
 100,
 100,
 119,
 102]

In [6]:
# Decode each id on its own to see the individual pieces, then decode the
# whole list at once to see the reconstructed string.
for token in token_ids:
    print(tokenizer.decode(token))
tokenizer.decode(token_ids)

[CLS]
In
a
shocking
finding
,
scientists
a
herd
of
un
##icorn
##s
.
Some
spaces
:
.
Some
Chinese
:
[UNK]
[UNK]
.
[SEP]


'[CLS] In a shocking finding, scientists a herd of unicorns. Some spaces :. Some Chinese : [UNK] [UNK]. [SEP]'

In [7]:
# Encode a (question, context) pair capped at 40 tokens per feature. Only the
# second sequence (the context) is truncated; the part that does not fit is
# returned as an extra feature that overlaps the previous one by `stride`
# tokens, and every feature is padded to the maximum length.
test2 = "In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English."
tokens = tokenizer(
    "Here is a question. ",
    test2,
    max_length=40,
    truncation="only_second",
    stride=5,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding="max_length",
)
# NOTE(review): `token_ids` is the list produced by the earlier
# `tokenizer.encode(test)` cell, not part of this encoding - confirm that
# displaying it here is intended.
len(tokens["input_ids"]), token_ids

(2,
 [101,
  1130,
  170,
  19196,
  4006,
  117,
  6479,
  170,
  17804,
  1104,
  8362,
  23941,
  1116,
  119,
  1789,
  6966,
  131,
  119,
  1789,
  1922,
  131,
  100,
  100,
  119,
  102])

In [8]:
# Decode the first two features to see how the context was split between them
# and which tokens they share.
print(tokenizer.decode(tokens["input_ids"][0]))
print(tokenizer.decode(tokens["input_ids"][1]))

[CLS] Here is a question. [SEP] In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even [SEP]
[CLS] Here is a question. [SEP] the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [9]:
# List the fields returned by the tokenizer call above.
tokens.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])

In [10]:
# Per-token segment ids of each feature; in the output below they are 0 over
# the question part, 1 over the context part and 0 again over the padding.
tokens["token_type_ids"]

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]]

In [11]:
# Decode the first ten token ids of the first feature one at a time.
for token_id in tokens["input_ids"][0][:10]:
    print(tokenizer.decode(token_id))

[CLS]
Here
is
a
question
.
[SEP]
In
a
shocking


In [12]:
# Per-token (start, end) character offsets for each feature. In the output
# below, special and padding tokens show up as (0, 0), and the second
# feature's context offsets continue from where the overlap starts rather
# than restarting at 0.
tokens["offset_mapping"]

[[(0, 0),
  (0, 4),
  (5, 7),
  (8, 9),
  (10, 18),
  (18, 19),
  (0, 0),
  (0, 2),
  (3, 4),
  (5, 13),
  (14, 21),
  (21, 22),
  (23, 32),
  (33, 43),
  (44, 45),
  (46, 50),
  (51, 53),
  (54, 56),
  (56, 61),
  (61, 62),
  (63, 69),
  (70, 72),
  (73, 74),
  (75, 81),
  (81, 82),
  (83, 93),
  (94, 97),
  (97, 98),
  (98, 99),
  (99, 103),
  (103, 104),
  (105, 111),
  (111, 112),
  (113, 115),
  (116, 119),
  (120, 125),
  (126, 135),
  (135, 136),
  (137, 141),
  (0, 0)],
 [(0, 0),
  (0, 4),
  (5, 7),
  (8, 9),
  (10, 18),
  (18, 19),
  (0, 0),
  (116, 119),
  (120, 125),
  (126, 135),
  (135, 136),
  (137, 141),
  (142, 146),
  (147, 157),
  (158, 160),
  (161, 164),
  (165, 176),
  (177, 180),
  (181, 184),
  (185, 189),
  (190, 194),
  (195, 198),
  (199, 201),
  (201, 206),
  (206, 207),
  (208, 213),
  (214, 221),
  (222, 229),
  (229, 230),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0)]]

In [13]:
# For each feature, the index of the input example it was produced from.
tokens["overflow_to_sample_mapping"]

[0, 0]

In [14]:
# Tokenize four training examples (rows 2-5) with a short window so that each
# one overflows into several overlapping features.
sample_batch = raw_datasets["train"][2:6]
inputs = tokenizer(
    sample_batch["question"],
    sample_batch["context"],
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)

# Report how many features were produced and which example each belongs to.
feature_count = len(inputs["input_ids"])
feature_sources = inputs["overflow_to_sample_mapping"]
print(f"The 4 examples gave {feature_count} features.")
print(f"Here is where each comes from: {feature_sources}.")

The 4 examples gave 19 features.
Here is where each comes from: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3].


In [15]:
# Tokenization settings read by the preprocessing functions in this notebook:
# the maximum number of tokens per feature and the number of tokens shared by
# consecutive features of the same example.
max_length = 384
stride = 128


def _find_answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Map a character span of the context to token indices within one feature.

    Args:
        offsets: Per-token ``(start_char, end_char)`` pairs for the feature.
        sequence_ids: Per-token sequence ids for the feature; tokens whose id
            is 1 are treated as context tokens.
        start_char: Index of the answer's first character in the context.
        end_char: Index one past the answer's last character in the context.

    Returns:
        ``(start_token, end_token)``, both inclusive, or ``(0, 0)`` when the
        feature has no context tokens or does not fully contain the answer.
    """
    n_tokens = len(sequence_ids)

    # First context token. The bound check keeps a feature without any
    # context token from indexing past the end of `sequence_ids`.
    context_start = 0
    while context_start < n_tokens and sequence_ids[context_start] != 1:
        context_start += 1
    if context_start == n_tokens:
        return 0, 0

    # Last token of the contiguous run of context tokens.
    context_end = context_start
    while context_end + 1 < n_tokens and sequence_ids[context_end + 1] == 1:
        context_end += 1

    # The answer must lie entirely inside this feature's slice of the context.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Walk forward past every token starting at or before the answer start;
    # the last such token is the one containing the first answer character.
    start_token = context_start
    while start_token <= context_end and offsets[start_token][0] <= start_char:
        start_token += 1

    # Walk backward past every token ending at or after the answer end; the
    # last such token is the one containing the final answer character.
    end_token = context_end
    while end_token >= context_start and offsets[end_token][1] >= end_char:
        end_token -= 1

    return start_token - 1, end_token + 1


def preprocess_training_examples(examples):
    """Tokenize a batch of examples and label each feature with its answer span.

    Questions are stripped of surrounding whitespace and tokenized together
    with their contexts using the notebook-level ``tokenizer``, ``max_length``
    and ``stride``. A long context yields several overlapping features; each
    feature gets the token indices of the first listed answer.

    Args:
        examples: Batch with ``"question"`` and ``"context"`` lists and an
            ``"answers"`` list of dicts holding ``"text"`` and
            ``"answer_start"`` lists.

    Returns:
        The tokenizer output without ``offset_mapping`` and
        ``overflow_to_sample_mapping``, plus ``start_positions`` and
        ``end_positions`` (one entry per feature). A feature is labelled
        ``(0, 0)`` when its example has no answer, when it has no context
        tokens, or when the answer is not fully inside it.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # One example can produce several features; look up its answer.
        answer = answers[sample_map[i]]

        # Examples without any answer get the same (0, 0) label as features
        # that do not contain the answer, instead of raising IndexError.
        if len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        start_token, end_token = _find_answer_token_span(
            offsets, inputs.sequence_ids(i), start_char, end_char
        )
        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [16]:
# Run the training preprocessing over the whole train split in batches,
# dropping the original columns so only the tokenizer outputs and labels stay.
train_split = raw_datasets["train"]
train_dataset = train_split.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=train_split.column_names,
)
# Number of examples before vs. number of features after preprocessing.
(len(train_split), len(train_dataset))

(87599, 88729)

In [17]:
# Inspect two of the preprocessed training features.
train_dataset[2:4]

{'input_ids': [[101,
   1109,
   19349,
   1104,
   1103,
   11373,
   1762,
   1120,
   10360,
   8022,
   1110,
   3148,
   1106,
   1134,
   2401,
   136,
   102,
   22182,
   1193,
   117,
   1103,
   1278,
   1144,
   170,
   2336,
   1959,
   119,
   1335,
   4184,
   1103,
   4304,
   4334,
   112,
   188,
   2284,
   10945,
   1110,
   170,
   5404,
   5921,
   1104,
   1103,
   6567,
   2090,
   119,
   13301,
   1107,
   1524,
   1104,
   1103,
   4304,
   4334,
   1105,
   4749,
   1122,
   117,
   1110,
   170,
   7335,
   5921,
   1104,
   4028,
   1114,
   1739,
   1146,
   14089,
   5591,
   1114,
   1103,
   7051,
   107,
   159,
   21462,
   1566,
   24930,
   2508,
   152,
   1306,
   3965,
   107,
   119,
   5893,
   1106,
   1103,
   4304,
   4334,
   1110,
   1103,
   19349,
   1104,
   1103,
   11373,
   4641,
   119,
   13301,
   1481,
   1103,
   171,
   17506,
   9538,
   1110,
   1103,
   144,
   10595,
   2430,
   117,
   170,
   14789,
   1282,
   1104,
   8

In [18]:
def preprocess_validation_examples(examples):
    """Tokenize a batch of validation examples into features.

    Questions are stripped and tokenized together with their contexts using
    the notebook-level ``tokenizer``, ``max_length`` and ``stride``. Each
    resulting feature records the id of the example it came from, and its
    offset mapping keeps entries only for context tokens.

    Args:
        examples: Batch with ``"question"``, ``"context"`` and ``"id"`` lists.

    Returns:
        The tokenizer output without ``overflow_to_sample_mapping``, with
        ``offset_mapping`` entries set to ``None`` for every token whose
        sequence id is not 1, plus an ``example_id`` list (one per feature).
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    feature_to_example = inputs.pop("overflow_to_sample_mapping")
    n_features = len(inputs["input_ids"])

    # Several features can share one example; remember which one each is from.
    source_ids = examples["id"]
    example_ids = [source_ids[feature_to_example[k]] for k in range(n_features)]

    for feature_idx in range(n_features):
        seq_ids = inputs.sequence_ids(feature_idx)
        # Blank out offsets of non-context tokens so they can be told apart
        # from context tokens later on.
        context_only = []
        for token_idx, span in enumerate(inputs["offset_mapping"][feature_idx]):
            context_only.append(span if seq_ids[token_idx] == 1 else None)
        inputs["offset_mapping"][feature_idx] = context_only

    inputs["example_id"] = example_ids
    return inputs

In [19]:
# Run the validation preprocessing over the whole validation split in batches,
# dropping the original columns so only the produced feature fields remain.
validation_split = raw_datasets["validation"]
validation_dataset = validation_split.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=validation_split.column_names,
)
# Number of examples before vs. number of features after preprocessing.
(len(validation_split), len(validation_dataset))


(10570, 10822)

In [26]:
# IPython help query (not plain Python): prints the signature and docstring
# of tokenizer.get_special_tokens_mask.
# NOTE(review): this cell carries execution count 26 although it sits between
# cells 19 and 20 - it looks like a leftover exploratory lookup; consider
# removing it before sharing the notebook.
?tokenizer.get_special_tokens_mask

[0;31mSignature:[0m
[0mtokenizer[0m[0;34m.[0m[0mget_special_tokens_mask[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtoken_ids_0[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtoken_ids_1[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0malready_has_special_tokens[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

Args:
    token_ids_0 (`List[int]`):
        List of ids of the first sequence.
    token_ids_1 (`List[int]`, *optional*):
        List 

In [20]:
# Load the same checkpoint as the tokenizer, wrapped for question answering.
# The warning printed below reports that the `qa_outputs` weights are newly
# initialized, so the model has to be trained before it is used.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
from transformers import Trainer, TrainingArguments

# Training configuration: checkpoints are written to "bert-finetuned-squad"
# once per epoch, no evaluation runs during training, and nothing is pushed
# to the Hub.
args = TrainingArguments(
    "bert-finetuned-squad",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=False,
    # Disable wandb logging. "none" turns off every reporting integration,
    # so no experiment tracker is contacted when training starts.
    report_to="none",
)

# eval_dataset is handed over for later evaluation calls; with
# evaluation_strategy="no" it is not used while training runs.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
trainer.train()



Step,Training Loss


KeyboardInterrupt: 