In [1]:
import pandas as pd
from transformers import AutoTokenizer
import datasets

In [2]:
df_train = pd.read_csv('../data/datasets/squad/train.csv')
df_val = pd.read_csv('../data/datasets/squad/dev.csv')
df_test = pd.read_csv('../data/datasets/squad/test.csv')

In [3]:
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [4]:
train_dataset = datasets.Dataset.from_pandas(df_train)
val_dataset = datasets.Dataset.from_pandas(df_val)
test_dataset = datasets.Dataset.from_pandas(df_test)

In [5]:
train_dataset

Dataset({
    features: ['Unnamed: 0', 'context', 'question', 'answers', '__index_level_0__'],
    num_rows: 68718
})

In [6]:
def firstly_tokenize(row):
    return tokenizer(
        row["question"],
        row["context"]
    )

In [7]:
max_length = 384

tokenized_datasets = train_dataset.map(firstly_tokenize, batched=True)
print(len(tokenized_datasets))
tokenized_datasets = tokenized_datasets.filter(lambda example: len(example['input_ids']) <= max_length)
print(len(tokenized_datasets))

Map:   0%|          | 0/68718 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (520 > 512). Running this sequence through the model will result in indexing errors


68718


Filter:   0%|          | 0/68718 [00:00<?, ? examples/s]

67966


In [16]:
indices_to_remove = []

for i, row in enumerate(train_dataset):
    tokenized_row = firstly_tokenize(row)
    if len(tokenized_row["input_ids"]) > max_length:
        indices_to_remove.append(i)

filtered_dataset = train_dataset.select(
    i for i in range(len(train_dataset))
    if i not in set(indices_to_remove)
)



In [17]:
print(len(indices_to_remove))

67966


In [18]:
print(68718 - 67966)

752


In [52]:
len(max(tokenized_datasets["input_ids"], key=len))

384

In [53]:
tokenized_datasets

Dataset({
    features: ['Unnamed: 0', 'context', 'question', 'answers', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 67966
})

In [None]:
max_length = 384

def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]

    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        padding="max_length"
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [7]:
def map_word_counts(row):
    return len(row['text'].split())

def map_count_belonging(row, divider):
    int_division_result = int(row['token_count'] / divider) + 1
    lower_boundary = divider * int_division_result - divider
    upper_boundary = divider * int_division_result - 1
    return f"{lower_boundary}-{upper_boundary}"

def map_correctly_predicted(row):
    return int(row['label'] == row['prediction'])


test_df['token_count'] = test_df.apply(lambda row: map_token_counts(row), axis=1)
test_df['count_belonging'] = test_df.apply(lambda row: map_count_belonging(row, divider=6), axis=1)
test_df['correctly_predicted'] = test_df.apply(lambda row: map_correctly_predicted(row), axis=1)
test_df

NameError: name 'test_df' is not defined

In [17]:
context = train_dataset[0]["context"]
question = train_dataset[0]["question"]

inputs = tokenizer(
    question,
    context,
    max_length=384,
    truncation="only_second",
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])

In [12]:
tokenizer.decode(inputs.input_ids[0])
# len(inputs.input_ids[0])

"[CLS] along with the united democratic party, what party currently rules the marshall islands? [SEP] legislative power lies with the nitijela. the upper house of parliament, called the council of iroij, is an advisory body comprising twelve tribal chiefs. the executive branch consists of the president and the presidential cabinet, which consists of ten ministers appointed by the president with the approval of the nitijela. the twenty - four electoral districts into which the country is divided correspond to the inhabited islands and atolls. there are currently four political parties in the marshall islands : aelon kein ad ( aka ), united people's party ( upp ), kien eo am ( kea ) and united democratic party ( udp ). rule is shared by the aka and the udp. the following senators are in the legislative body : [SEP]"

In [19]:
question

'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'

In [20]:
inputs["offset_mapping"][0]

[(0, 0),
 (0, 2),
 (3, 7),
 (8, 11),
 (12, 15),
 (16, 22),
 (23, 27),
 (28, 37),
 (38, 44),
 (45, 47),
 (48, 52),
 (53, 55),
 (56, 59),
 (59, 63),
 (64, 70),
 (70, 71),
 (0, 0),
 (0, 13),
 (13, 15),
 (15, 16),
 (17, 20),
 (21, 27),
 (28, 31),
 (32, 33),
 (34, 42),
 (43, 52),
 (52, 53),
 (54, 56),
 (56, 58),
 (59, 62),
 (63, 67),
 (68, 76),
 (76, 77),
 (77, 78),
 (79, 83),
 (84, 88),
 (89, 91),
 (92, 93),
 (94, 100),
 (101, 107),
 (108, 110),
 (111, 114),
 (115, 121),
 (122, 126),
 (126, 127),
 (128, 139),
 (140, 142),
 (143, 148),
 (149, 151),
 (152, 155),
 (156, 160),
 (161, 169),
 (170, 173),
 (174, 180),
 (181, 183),
 (183, 184),
 (185, 187),
 (188, 189),
 (190, 196),
 (197, 203),
 (204, 206),
 (207, 213),
 (214, 218),
 (219, 223),
 (224, 226),
 (226, 229),
 (229, 232),
 (233, 237),
 (238, 241),
 (242, 248),
 (249, 250),
 (250, 251),
 (251, 254),
 (254, 256),
 (257, 259),
 (260, 262),
 (263, 264),
 (264, 265),
 (265, 268),
 (268, 269),
 (269, 270),
 (271, 275),
 (276, 278),
 (279, 2

In [21]:
inputs = tokenizer(
    raw_datasets["train"][2:6]["question"],
    raw_datasets["train"][2:6]["context"],
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)

print(f"The 4 examples gave {len(inputs['input_ids'])} features.")
print(f"Here is where each comes from: {inputs['overflow_to_sample_mapping']}.")

The 4 examples gave 19 features.
Here is where each comes from: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3].


In [22]:
answers = raw_datasets["train"][2:6]["answers"]
start_positions = []
end_positions = []

for i, offset in enumerate(inputs["offset_mapping"]):
    sample_idx = inputs["overflow_to_sample_mapping"][i]
    answer = answers[sample_idx]
    start_char = answer["answer_start"][0]
    end_char = answer["answer_start"][0] + len(answer["text"][0])
    sequence_ids = inputs.sequence_ids(i)

    # Find the start and end of the context
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, label is (0, 0)
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Otherwise it's the start and end token positions
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

start_positions, end_positions

([83, 51, 19, 0, 0, 64, 27, 0, 34, 0, 0, 0, 67, 34, 0, 0, 0, 0, 0],
 [85, 53, 21, 0, 0, 70, 33, 0, 40, 0, 0, 0, 68, 35, 0, 0, 0, 0, 0])

In [23]:
idx = 0
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0]

start = start_positions[idx]
end = end_positions[idx]
labeled_answer = tokenizer.decode(inputs["input_ids"][idx][start : end + 1])

print(f"Theoretical answer: {answer}, labels give: {labeled_answer}")

Theoretical answer: the Main Building, labels give: the Main Building


In [24]:
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [25]:
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
len(raw_datasets["train"]), len(train_dataset)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

(87599, 88729)

In [26]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [27]:
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)
len(raw_datasets["validation"]), len(validation_dataset)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

(10570, 10822)

In [32]:
validation_dataset["offset_mapping"][0]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 [0, 5],
 [6, 10],
 [11, 13],
 [14, 17],
 [18, 20],
 [21, 29],
 [30, 38],
 [39, 43],
 [44, 46],
 [47, 56],
 [57, 60],
 [61, 69],
 [70, 72],
 [73, 76],
 [77, 85],
 [86, 94],
 [95, 101],
 [102, 103],
 [103, 106],
 [106, 107],
 [108, 111],
 [112, 115],
 [116, 120],
 [121, 127],
 [127, 128],
 [129, 132],
 [133, 141],
 [142, 150],
 [151, 161],
 [162, 163],
 [163, 166],
 [166, 167],
 [168, 176],
 [177, 183],
 [184, 191],
 [192, 200],
 [201, 204],
 [205, 213],
 [214, 222],
 [223, 233],
 [234, 235],
 [235, 238],
 [238, 239],
 [240, 248],
 [249, 257],
 [258, 266],
 [267, 269],
 [269, 270],
 [270, 272],
 [273, 275],
 [276, 280],
 [281, 286],
 [287, 292],
 [293, 298],
 [299, 303],
 [304, 309],
 [309, 310],
 [311, 314],
 [315, 319],
 [320, 323],
 [324, 330],
 [331, 333],
 [334, 342],
 [343, 344],
 [344, 345],
 [346, 350],
 [350, 351],
 [352, 354],
 [355, 359],
 [359, 360],
 [360, 361],
 [362, 369],
 [370, 37

In [47]:
answers = raw_datasets["train"][2:6]["answers"]
start_positions = []
end_positions = []

In [48]:
answers

[{'text': ['the Main Building'], 'answer_start': [279]},
 {'text': ['a Marian place of prayer and reflection'], 'answer_start': [381]},
 {'text': ['a golden statue of the Virgin Mary'], 'answer_start': [92]},
 {'text': ['September 1876'], 'answer_start': [248]}]

In [68]:
raw_datasets["train"][2]

{'id': '5733be284776f41900661180',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'The Basilica of the Sacred heart at Notre Dame is beside to which structure?',
 'answers': {'text': ['the Main Building'], 'answer_start': [279]}}

In [65]:
a = "Guam is served by the Antonio B. Won Pat International Airport, which is a hub for United Airlines. The island is outside the United States customs zone so Guam is responsible for establishing and operating its own customs and quarantine agency and jurisdiction. Therefore, the U.S. Customs and Border Protection only carries immigration (but not customs) functions. Since Guam is under federal immigration jurisdiction, passengers arriving directly from the United States skip immigration and proceed directly to Guam Customs and Quarantine."
a[279:]

'.S. Customs and Border Protection only carries immigration (but not customs) functions. Since Guam is under federal immigration jurisdiction, passengers arriving directly from the United States skip immigration and proceed directly to Guam Customs and Quarantine.'

In [62]:
len(inputs["offset_mapping"])

[(0, 0),
 (0, 3),
 (4, 12),
 (13, 15),
 (16, 19),
 (20, 26),
 (27, 32),
 (33, 35),
 (36, 41),
 (42, 46),
 (47, 49),
 (50, 56),
 (57, 59),
 (60, 65),
 (66, 75),
 (75, 76),
 (0, 0),
 (0, 13),
 (13, 15),
 (15, 16),
 (17, 20),
 (21, 27),
 (28, 31),
 (32, 33),
 (34, 42),
 (43, 52),
 (52, 53),
 (54, 58),
 (59, 62),
 (63, 67),
 (68, 76),
 (76, 77),
 (77, 78),
 (79, 83),
 (84, 88),
 (89, 91),
 (92, 93),
 (94, 100),
 (101, 107),
 (108, 110),
 (111, 114),
 (115, 121),
 (122, 126),
 (126, 127),
 (128, 139),
 (140, 142),
 (143, 148),
 (149, 151),
 (152, 155),
 (156, 160),
 (161, 169),
 (170, 173),
 (174, 180),
 (181, 183),
 (183, 184),
 (185, 187),
 (188, 189),
 (190, 196),
 (197, 203),
 (204, 206),
 (207, 213),
 (214, 218),
 (219, 223),
 (224, 226),
 (226, 229),
 (229, 232),
 (233, 237),
 (238, 241),
 (242, 248),
 (249, 250),
 (250, 252),
 (252, 254),
 (254, 256),
 (257, 259),
 (260, 262),
 (263, 265),
 (265, 268),
 (268, 269),
 (269, 270),
 (271, 275),
 (276, 278),
 (279, 282),
 (283, 287),
 (28

In [70]:
sample_idx = inputs["overflow_to_sample_mapping"][0]
answer = answers[sample_idx]
start_char = answer["answer_start"][0]
end_char = answer["answer_start"][0] + len(answer["text"][0])
sequence_ids = inputs.sequence_ids(0)
sequence_ids

[None,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 None,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 None]

In [50]:
answer

{'text': ['the Main Building'], 'answer_start': [279]}

In [51]:
idx = 0
while sequence_ids[idx] != 1:
    idx += 1
context_start = idx
while sequence_ids[idx] == 1:
    idx += 1
context_end = idx - 1
context_end

98

In [58]:
inputs["offset_mapping"][2][17][0]

279

In [23]:
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [24]:
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
len(raw_datasets["train"]), len(train_dataset)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

(87599, 88524)

In [25]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [26]:
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)
len(raw_datasets["validation"]), len(validation_dataset)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

(10570, 10784)

In [27]:
small_eval_set = raw_datasets["validation"].select(range(100))
trained_checkpoint = "distilbert-base-cased-distilled-squad"

tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [28]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [30]:
import tensorflow as tf
from transformers import TFAutoModelForQuestionAnswering

eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("numpy")

batch = {k: eval_set_for_model[k] for k in eval_set_for_model.column_names}
trained_model = TFAutoModelForQuestionAnswering.from_pretrained(trained_checkpoint)

outputs = trained_model(**batch)

Downloading model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFDistilBertForQuestionAnswering.

All the weights of TFDistilBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.


In [31]:
start_logits = outputs.start_logits.numpy()
end_logits = outputs.end_logits.numpy()

In [32]:
import collections

example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    example_to_features[feature["example_id"]].append(idx)

In [33]:
import numpy as np

n_best = 20
max_answer_length = 30
predicted_answers = []

for example in small_eval_set:
    example_id = example["id"]
    context = example["context"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = eval_set["offset_mapping"][feature_index]

        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    best_answer = max(answers, key=lambda x: x["logit_score"])
    predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})

In [35]:
import evaluate

metric = evaluate.load("squad")

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [36]:
theoretical_answers = [
    {"id": ex["id"], "answers": ex["answers"]} for ex in small_eval_set
]

In [37]:
print(predicted_answers[0])
print(theoretical_answers[0])

{'id': '56be4db0acb8001400a502ec', 'prediction_text': 'Denver Broncos'}
{'id': '56be4db0acb8001400a502ec', 'answers': {'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'], 'answer_start': [177, 177, 177]}}


In [38]:
metric.compute(predictions=predicted_answers, references=theoretical_answers)

{'exact_match': 83.0, 'f1': 88.25000000000004}

In [39]:
from tqdm.auto import tqdm


def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [40]:
compute_metrics(start_logits, end_logits, eval_set, small_eval_set)

  0%|          | 0/100 [00:00<?, ?it/s]

{'exact_match': 83.0, 'f1': 88.25000000000004}

In [7]:
from transformers import TFAutoModelForQuestionAnswering


model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

Some weights or buffers of the TF 2.0 model TFBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
for name, param in model.parameters():
    print(name, param.requires_grad)

AttributeError: 'TFBertForQuestionAnswering' object has no attribute 'parameters'

In [13]:
import tensorflow as tf

tf.keras.utils.plot_model(model=model, show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [14]:
for layer in model.layers:
    print(layer.trainable)

True
True


In [18]:
model.summary()

Model: "tf_bert_for_question_answering"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  107719680 
                                                                 
 qa_outputs (Dense)          multiple                  1538      
                                                                 
Total params: 107,721,218
Trainable params: 107,721,218
Non-trainable params: 0
_________________________________________________________________


In [43]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="tf")

In [53]:
tf_train_dataset = model.prepare_tf_dataset(
    train_dataset,
    collate_fn=data_collator,
    shuffle=True,
    batch_size=8,
)
tf_eval_dataset = model.prepare_tf_dataset(
    validation_dataset,
    collate_fn=data_collator,
    shuffle=False,
    batch_size=8,
)

In [54]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_train_epochs = 3
num_train_steps = len(tf_train_dataset) * num_train_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [55]:
from transformers.keras_callbacks import PushToHubCallback

# We're going to do validation afterwards, so no validation mid-training
model.fit(tf_train_dataset, epochs=num_train_epochs)

Epoch 1/3

KeyboardInterrupt: 