Load the SQuAD (Stanford Question Answering Dataset) dataset

In [63]:
from datasets import load_dataset

# Load the "squad" dataset; the result holds a train and a validation split
raw_datasets = load_dataset("squad")
#raw_datasets

Print some values from the dataset

In [64]:
# Show the available splits with their features and row counts
print("Summary:",raw_datasets)

# Inspect one training example (index 1): id, title, context, question, answers
raw_datasets["train"][1]

Summary: DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})


{'id': '5733be284776f4190066117f',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'What is in front of the Notre Dame Main Building?',
 'answers': {'text': ['a copper statue of Christ'], 'answer_start': [188]}}

# Validation examples can have multiple reference answers for one question
raw_datasets["validation"][2]["answers"]


Start Training Process

In [65]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is used for the data preparation below
model_checkpoint = "distilbert-base-cased"
# NOTE: `tokenizer` is read as a global by tokenize_fn_train and tokenize_fn_validation
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [66]:
# Try with sample data
context = raw_datasets["train"][1]["context"]
question = raw_datasets["train"][1]["question"]

# Note: the (question, context) pair is encoded as a single row of data
# (no chunking arguments are passed here)
inputs = tokenizer(question, context)
print("Raw token ids:",inputs)

# Decode the token ids back into text
print("Decoded tokens:",tokenizer.decode(inputs["input_ids"]))

Raw token ids: {'input_ids': [101, 1327, 1110, 1107, 1524, 1104, 1103, 10360, 8022, 4304, 4334, 136, 102, 22182, 1193, 117, 1103, 1278, 1144, 170, 2336, 1959, 119, 1335, 4184, 1103, 4304, 4334, 112, 188, 2284, 10945, 1110, 170, 5404, 5921, 1104, 1103, 6567, 2090, 119, 13301, 1107, 1524, 1104, 1103, 4304, 4334, 1105, 4749, 1122, 117, 1110, 170, 7335, 5921, 1104, 4028, 1114, 1739, 1146, 14089, 5591, 1114, 1103, 7051, 107, 159, 21462, 1566, 24930, 2508, 152, 1306, 3965, 107, 119, 5893, 1106, 1103, 4304, 4334, 1110, 1103, 19349, 1104, 1103, 11373, 4641, 119, 13301, 1481, 1103, 171, 17506, 9538, 1110, 1103, 144, 10595, 2430, 117, 170, 14789, 1282, 1104, 8070, 1105, 9284, 119, 1135, 1110, 170, 16498, 1104, 1103, 176, 10595, 2430, 1120, 10111, 20500, 117, 1699, 1187, 1103, 6567, 2090, 25153, 1193, 1691, 1106, 2216, 17666, 6397, 3786, 1573, 25422, 13149, 1107, 8109, 119, 1335, 1103, 1322, 1104, 1103, 1514, 2797, 113, 1105, 1107, 170, 2904, 1413, 1115, 8200, 1194, 124, 11739, 1105, 1103, 3487, 

In [67]:
# More complex tokenizer call
# Split the context into multiple chunks
# i.e. turn one data row into multiple samples using overlapping chunks
inputs = tokenizer(
  question,
  context,
  max_length=100, # max number of tokens per chunk (question + context chunk + special tokens)
  truncation="only_second", # only chunk/truncate the second string, which is the context
  stride=50, # number of tokens that overlap between consecutive chunks
  # a better name for return_overflowing_tokens would be return_overlapping_tokens
  return_overflowing_tokens=True, # if True, tokens beyond max_length are returned as additional chunks
  return_offsets_mapping=True  # also return the (start_char, end_char) span of every token
)

# Print the chunked inputs for the 1 data row
# each chunk's token ids start with 101 (for [CLS]) and end with 102 (for [SEP])
print("Chunked context ids:",inputs["input_ids"])

# Decode the token ids of each chunk
# Note the decoded question will be the same in every chunk
for ids in inputs["input_ids"]:
  print("Decoded and chunked tokens:",tokenizer.decode(ids))

# Format is [CLS] question [SEP] context [SEP]

Chunked context ids: [[101, 1327, 1110, 1107, 1524, 1104, 1103, 10360, 8022, 4304, 4334, 136, 102, 22182, 1193, 117, 1103, 1278, 1144, 170, 2336, 1959, 119, 1335, 4184, 1103, 4304, 4334, 112, 188, 2284, 10945, 1110, 170, 5404, 5921, 1104, 1103, 6567, 2090, 119, 13301, 1107, 1524, 1104, 1103, 4304, 4334, 1105, 4749, 1122, 117, 1110, 170, 7335, 5921, 1104, 4028, 1114, 1739, 1146, 14089, 5591, 1114, 1103, 7051, 107, 159, 21462, 1566, 24930, 2508, 152, 1306, 3965, 107, 119, 5893, 1106, 1103, 4304, 4334, 1110, 1103, 19349, 1104, 1103, 11373, 4641, 119, 13301, 1481, 1103, 171, 17506, 9538, 1110, 1103, 144, 102], [101, 1327, 1110, 1107, 1524, 1104, 1103, 10360, 8022, 4304, 4334, 136, 102, 4749, 1122, 117, 1110, 170, 7335, 5921, 1104, 4028, 1114, 1739, 1146, 14089, 5591, 1114, 1103, 7051, 107, 159, 21462, 1566, 24930, 2508, 152, 1306, 3965, 107, 119, 5893, 1106, 1103, 4304, 4334, 1110, 1103, 19349, 1104, 1103, 11373, 4641, 119, 13301, 1481, 1103, 171, 17506, 9538, 1110, 1103, 144, 10595, 2430,

Note the new keys overflow_to_sample_mapping and offset_mapping

overflow_to_sample_mapping: maps every chunk back to the index of the input row it was cut from, which shows how the input data is split into multiple samples. For example, when three rows are tokenized together it may contain [0, 0, 0, 1, 1, 2, 2].

[0, 0, 0, 1, 1, 2, 2] means that the first row (index 0) was split into 3 chunks, while rows 1 and 2 were each split into 2 chunks.

offset_mapping: Shows offset/character positions of tokens in the mapping. For example, if the decoded tokens are:

[CLS] What is in front of the Notre Dame Main Building?

The offset mapping will be like: [(0, 0), (0, 4), (5, 7), (8, 10), ...

(0,0) : for CLS
(0,4) : For 'What', start from 0 and go till offset 4 as length is 4
(5,7) : For 'is', start from 5 and go till offset 7 as length is 2
(8,10): For 'in', start from 8 and go till offset 10 as length is 2

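Note: offsets are relative to the string a token came from (question tokens index into the question, context tokens into the context). Every chunk's list starts again with (0, 0) for its [CLS] token, but the context offsets inside each chunk still refer to character positions in the original, unchunked context string.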

In [68]:
print(inputs.keys())
inputs['overflow_to_sample_mapping']

# Output will be [0, 0, 0, 0], which shows that all 4 chunks belong to the same data row (row 0)

dict_keys(['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])


[0, 0, 0, 0]

In [69]:
# Special tokens ([CLS] and [SEP]) show up as (0, 0) in this mapping
inputs['offset_mapping'] # (start_char, end_char) of each token within its source string

[[(0, 0),
  (0, 4),
  (5, 7),
  (8, 10),
  (11, 16),
  (17, 19),
  (20, 23),
  (24, 29),
  (30, 34),
  (35, 39),
  (40, 48),
  (48, 49),
  (0, 0),
  (0, 13),
  (13, 15),
  (15, 16),
  (17, 20),
  (21, 27),
  (28, 31),
  (32, 33),
  (34, 42),
  (43, 52),
  (52, 53),
  (54, 56),
  (56, 58),
  (59, 62),
  (63, 67),
  (68, 76),
  (76, 77),
  (77, 78),
  (79, 83),
  (84, 88),
  (89, 91),
  (92, 93),
  (94, 100),
  (101, 107),
  (108, 110),
  (111, 114),
  (115, 121),
  (122, 126),
  (126, 127),
  (128, 139),
  (140, 142),
  (143, 148),
  (149, 151),
  (152, 155),
  (156, 160),
  (161, 169),
  (170, 173),
  (174, 180),
  (181, 183),
  (183, 184),
  (185, 187),
  (188, 189),
  (190, 196),
  (197, 203),
  (204, 206),
  (207, 213),
  (214, 218),
  (219, 223),
  (224, 226),
  (226, 229),
  (229, 232),
  (233, 237),
  (238, 241),
  (242, 248),
  (249, 250),
  (250, 251),
  (251, 254),
  (254, 256),
  (257, 259),
  (260, 262),
  (263, 264),
  (264, 265),
  (265, 268),
  (268, 269),
  (269, 270),
 

In [70]:
# For our example, there are 4 input chunks, and each chunk has the question and a chunk of the context
# the sequence id is 0 for question tokens and 1 for context tokens of the chunk index passed in
# this is similar to token type ids

inputs.sequence_ids(0)  # Shows the sequence id of each token for chunk 0

# NOTE: inputs.sequence_ids(4) will throw an error as the 4 chunks are indexed 0-3
# [SEP] and [CLS] tokens are shown as None

[None,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 None,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 None]

In [71]:
# Reference answer of the sample: the answer text and its start character offset in the context
answer = raw_datasets["train"][1]["answers"]
answer

{'text': ['a copper statue of Christ'], 'answer_start': [188]}

In [72]:
# Find the token indices in sequence_ids where the context starts and ends
# Remember that the sequence_id is 0 for question and 1 for context

sequence_ids = inputs.sequence_ids(0)

# x[a:b:c] in Python means start at a, stop before b, step by c; x[::-1] is x reversed
# find where sequence_id changes from 0 to 1
ctx_start = sequence_ids.index(1) # .index() returns the first index where the value is 1
# searching the reversed list tells how far the last 1 is from the right-hand end
idx = sequence_ids[::-1].index(1) # 1 here: only the trailing [SEP] (None) comes after the context
ctx_end= len(sequence_ids) - idx - 1 # convert that distance into the index of the last 1
# Example: sequence_ids = [0,0,1,1,1,1,None]
# ctx_start = 2, ctx_end = 7 - 1 - 1 = 5
ctx_start, ctx_end

(13, 98)

In [73]:
# Prepare the check of whether or not the answer is fully contained within this chunk's context
# if it is not, the target is (start, end) = (0, 0)
print("answer:",answer)

ans_start_char = answer['answer_start'][0] # start character of the first answer; 188 for the sample
ans_end_char = ans_start_char + len(answer['text'][0]) # character offset just past the end of the answer

offset = inputs['offset_mapping'][0] # [0] gives the offsets of the first question + context chunk
print("offset:",offset)

answer: {'text': ['a copper statue of Christ'], 'answer_start': [188]}
offset: [(0, 0), (0, 4), (5, 7), (8, 10), (11, 16), (17, 19), (20, 23), (24, 29), (30, 34), (35, 39), (40, 48), (48, 49), (0, 0), (0, 13), (13, 15), (15, 16), (17, 20), (21, 27), (28, 31), (32, 33), (34, 42), (43, 52), (52, 53), (54, 56), (56, 58), (59, 62), (63, 67), (68, 76), (76, 77), (77, 78), (79, 83), (84, 88), (89, 91), (92, 93), (94, 100), (101, 107), (108, 110), (111, 114), (115, 121), (122, 126), (126, 127), (128, 139), (140, 142), (143, 148), (149, 151), (152, 155), (156, 160), (161, 169), (170, 173), (174, 180), (181, 183), (183, 184), (185, 187), (188, 189), (190, 196), (197, 203), (204, 206), (207, 213), (214, 218), (219, 223), (224, 226), (226, 229), (229, 232), (233, 237), (238, 241), (242, 248), (249, 250), (250, 251), (251, 254), (254, 256), (257, 259), (260, 262), (263, 264), (264, 265), (265, 268), (268, 269), (269, 270), (271, 275), (276, 278), (279, 282), (283, 287), (288, 296), (297, 299), (30

In [74]:
# The answer is provided in terms of character positions in the context
# However, for the neural network we need to provide the answer in terms of token positions
# This function will find the token positions of the answer in the context
def find_answer_token_idx(
    ctx_start,
    ctx_end,
    ans_start_char,
    ans_end_char,
    offset):
  """Convert an answer's character span into token positions within one chunk.

  Args:
    ctx_start: index of the first context token in the chunk.
    ctx_end: index of the last context token in the chunk (inclusive).
    ans_start_char: character offset in the context where the answer starts.
    ans_end_char: character offset in the context just past the answer's end.
    offset: the chunk's offset mapping, one (start_char, end_char) pair per
      token. The context tokens are assumed to be in text order.

  Returns:
    (start_idx, end_idx): inclusive token indices covering the answer, or
    (0, 0) when the answer is not fully contained in this chunk's context.
  """
  # Units matter here: ctx_* and the returned values are TOKEN indices,
  # while ans_* and the values stored in `offset` are CHARACTER positions.
  if offset[ctx_start][0] > ans_start_char or offset[ctx_end][1] < ans_end_char:
    # The answer starts before or ends after this chunk: target is (0, 0)
    return 0, 0

  # Last context token that begins at or before the answer start. Comparing
  # with <= (not ==) still gives a valid token when the answer starts in the
  # middle of a token instead of leaving the start at 0.
  start_idx = ctx_start
  while start_idx < ctx_end and offset[start_idx + 1][0] <= ans_start_char:
    start_idx += 1

  # First context token that ends at or after the answer end. The search is
  # bounded by the context, so the trailing [SEP] / padding entries are never
  # inspected.
  end_idx = ctx_end
  while end_idx > ctx_start and offset[end_idx - 1][1] >= ans_end_char:
    end_idx -= 1

  return start_idx, end_idx

# Token positions (inclusive) where the answer starts and ends in chunk 0
start_idx, end_idx = find_answer_token_idx(ctx_start, ctx_end, ans_start_char, ans_end_char, offset)

print (f"start_idx, end_idx: {start_idx, end_idx}")

start_idx, end_idx: (53, 57)


In [75]:
# Verify the answer by slicing chunk 0 with start_idx and end_idx
# Check the token ids (end_idx is inclusive, hence the + 1)
input_ids = inputs['input_ids'][0]
print("Token ids for answer",input_ids[start_idx : end_idx + 1])

# Decoded text of those tokens; it should equal the answer text
print("Decoded Values:",tokenizer.decode(input_ids[start_idx : end_idx + 1]))

Token ids for answer [170, 7335, 5921, 1104, 4028]
Decoded Values: a copper statue of Christ


Start the process of tokenizing the entire data set.

In [76]:
# Create a tokenize function for the entire batch which will be called from the map function

# Use these values as Google used 384 for SQuAD
# Both are read as globals by tokenize_fn_train and tokenize_fn_validation
max_length = 384
stride =  128

# This function is used only for the train data
def tokenize_fn_train(batch):
  """Tokenize a batch of training rows and attach answer token positions.

  Each (question, context) pair is cut into overlapping chunks using the
  global `tokenizer`, `max_length` and `stride`. For every chunk the answer's
  character span is converted to token indices with find_answer_token_idx.

  Args:
    batch: a batch of rows with "question", "context" and "answers" columns.

  Returns:
    The tokenizer output with "offset_mapping" and
    "overflow_to_sample_mapping" removed and "start_positions" /
    "end_positions" added (one entry per chunk).
  """
  # every chunk is padded to max_length, so all rows have the same length
  tokenizer_kwargs = dict(
    max_length=max_length,
    truncation="only_second",
    stride=stride,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding="max_length",
  )
  encoded = tokenizer(
    # some questions have leading and/or trailing whitespace
    [text.strip() for text in batch["question"]],
    batch["context"],
    **tokenizer_kwargs,
  )

  # These two entries are only needed for building the targets, so take them
  # out of the result. chunk_to_sample[k] is the row that chunk k was cut from,
  # e.g. [0, 0, 0, 0, 1, 1, 1, 1] => chunks 0-3 are row 0, chunks 4-7 are row 1.
  chunk_offsets = encoded.pop("offset_mapping")
  chunk_to_sample = encoded.pop("overflow_to_sample_mapping")
  gold_answers = batch['answers']

  token_spans = []
  for chunk_no, (offsets, sample_no) in enumerate(zip(chunk_offsets, chunk_to_sample)):
    # character span of the (first) answer of the row this chunk belongs to
    gold = gold_answers[sample_no]
    first_char = gold['answer_start'][0]
    last_char = first_char + len(gold['text'][0])

    # context tokens are the ones with sequence id 1: locate first and last
    seq_ids = encoded.sequence_ids(chunk_no)
    ctx_first = seq_ids.index(1)
    ctx_last = len(seq_ids) - 1 - seq_ids[::-1].index(1)

    # (0, 0) means the answer is not inside this chunk; because of the stride
    # the same answer can show up in more than one chunk
    token_spans.append(
      find_answer_token_idx(ctx_first, ctx_last, first_char, last_char, offsets))

  # Add the targets as new fields of the result
  encoded["start_positions"] = [span[0] for span in token_spans]
  encoded["end_positions"] = [span[1] for span in token_spans]
  return encoded

In [None]:
# Prepare the train dataset
# Use the map function to tokenize the data batch by batch
train_dataset = raw_datasets["train"].map(
  tokenize_fn_train,
  batched=True,
  # Remove all columns of the original data from the new dataset
  # This ensures that none of the original columns are present in the new dataset
  # as they are not used
  remove_columns=raw_datasets["train"].column_names, 
)

# Chunking can produce more than one row per original example
# See the difference in the length of the raw dataset and the tokenized dataset
len(raw_datasets["train"]), len(train_dataset)

Map: 100%|██████████| 87599/87599 [00:29<00:00, 3016.50 examples/s]


KeyboardInterrupt: 

Do data prep for validation data set

In [78]:
# Check one validation sample (its 'answers' field lists several reference answers)
raw_datasets["validation"][0]

{'id': '56be4db0acb8001400a502ec',
 'title': 'Super_Bowl_50',
 'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.',
 'question': 'Which NFL team represented the AFC at Super Bowl 50?',
 'answers': {'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'],


In [79]:
# tokenize the validation set differently
# we won't need the targets since we will just compare with the original answer
# also: overwrite offset_mapping with Nones in place of question
def tokenize_fn_validation(batch):
  """Tokenize a batch of validation rows for later answer extraction.

  The rows are chunked exactly like the training data (global `tokenizer`,
  `max_length`, `stride`), but no target positions are computed. Instead the
  offset mapping is kept and every chunk is tagged with the id of its row.

  Args:
    batch: a batch of rows with "id", "question" and "context" columns.

  Returns:
    The tokenizer output without "overflow_to_sample_mapping", with
    "offset_mapping" set to None for every non-context token, and with a new
    "sample_id" field holding the originating row id of each chunk.
  """
  # every chunk is padded to max_length, so all rows have the same length
  tokenizer_kwargs = dict(
    max_length=max_length,
    truncation="only_second",
    stride=stride,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding="max_length",
  )
  encoded = tokenizer(
    # some questions have leading and/or trailing whitespace, strip them
    [text.strip() for text in batch["question"]],
    batch["context"],
    **tokenizer_kwargs,
  )

  # Only needed to look up the row ids, so take it out of the result;
  # the offset mapping stays because it is used to find the answer text.
  chunk_to_sample = encoded.pop("overflow_to_sample_mapping")
  row_ids = batch['id']
  chunk_sample_ids = []

  for chunk_no in range(len(encoded["input_ids"])):
    chunk_sample_ids.append(row_ids[chunk_to_sample[chunk_no]])

    # Keep an offset only where the token belongs to the context
    # (sequence id 1); question and special tokens become None. This will
    # be helpful later on when we compute metrics.
    seq_ids = encoded.sequence_ids(chunk_no)
    context_only = []
    for token_no, span in enumerate(encoded["offset_mapping"][chunk_no]):
      context_only.append(span if seq_ids[token_no] == 1 else None)
    encoded["offset_mapping"][chunk_no] = context_only

  encoded['sample_id'] = chunk_sample_ids
  return encoded

In [80]:
# Generate the validation dataset using the map function and tokenize_fn_validation defined earlier
validation_dataset = raw_datasets["validation"].map(
  tokenize_fn_validation,
  batched=True,
    # drop the original columns; only the tokenizer output and sample_id are kept
    remove_columns=raw_datasets["validation"].column_names,
)
# The lengths will differ as chunking creates additional samples
len(raw_datasets["validation"]), len(validation_dataset)

Map: 100%|██████████| 10570/10570 [00:04<00:00, 2219.55 examples/s]


(10570, 10822)

Build code for the Metrics

In [83]:
# ----------- Older approach, not used anymore
# from datasets import load_metric
# metric = load_metric("squad")
#------------------------
# The evaluate package is used instead (install it with: pip install evaluate)
import evaluate

# Most standard datasets for NLP tasks have an associated metric; load the one for SQuAD
metric = evaluate.load("squad")




Downloading builder script: 100%|██████████| 4.53k/4.53k [00:00<00:00, 9.92MB/s]
Downloading extra modules: 100%|██████████| 3.32k/3.32k [00:00<?, ?B/s]


This shows a sample structure of the predicted and true answers, and how they are passed to the compute function.

In [84]:
# Predictions: one dict per question with its id and the predicted answer text
predicted_answers = [
  {'id': '1', 'prediction_text': 'Albert Einstein'},
  {'id': '2', 'prediction_text': 'physicist'},
  {'id': '3', 'prediction_text': 'general relativity'},
]
# References: same ids, with the answers in the format used by the dataset
true_answers = [
  {'id': '1', 'answers': {'text': ['Albert Einstein'], 'answer_start': [100]}},
  {'id': '2', 'answers': {'text': ['physicist'], 'answer_start': [100]}},
  {'id': '3', 'answers': {'text': ['special relativity'], 'answer_start': [100]}},
]

# id and answer_start seem superfluous but you'll get an error if not included
# metric.compute returns the exact_match and f1 scores (as percentages);
# here 2 of the 3 predictions match exactly and the third matches partially
metric.compute(predictions=predicted_answers, references=true_answers)

{'exact_match': 66.66666666666667, 'f1': 83.33333333333333}

Create a smaller validation data set

In [85]:
# next problem: how to go from logits to prediction text?
small_validation_dataset = raw_datasets["validation"].select(range(100)) # first 100 examples (indices 0-99)

# Let's work on an already-trained question-answering model
# Model name will be used in both AutoTokenizer and AutoModelForQuestionAnswering
trained_checkpoint = "distilbert-base-cased-distilled-squad"

tokenizer2 = AutoTokenizer.from_pretrained(trained_checkpoint)

# temporarily assign tokenizer2 to tokenizer since it's used as a global
# in tokenize_fn_validation. The original tokenizer is declared earlier in the code
old_tokenizer = tokenizer
tokenizer = tokenizer2

try:
    small_validation_processed = small_validation_dataset.map(
        tokenize_fn_validation,
        batched=True,
        remove_columns=raw_datasets["validation"].column_names,
    )
finally:
    # change it back, even when map() fails or the cell is interrupted;
    # otherwise every later cell would silently keep using tokenizer2
    tokenizer = old_tokenizer

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 100/100 [00:00<00:00, 1700.09 examples/s]


Start the definition of the model here

In [None]:
# Get the model prepped for training