# QA finetuning
This notebook demonstrates fine-tuning a BERT model for an extractive question-answering (QA) task.

In [1]:
import pandas as pd
import transformers
from sympy import field
from transformers import BertForQuestionAnswering, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# One checkpoint name drives both the model and its tokenizer so the two
# can never drift apart.
SQUAD_CHECKPOINT = 'bert-large-uncased-whole-word-masking-finetuned-squad'

# Question-answering model (start/end span logits on top of BERT).
model = BertForQuestionAnswering.from_pretrained(SQUAD_CHECKPOINT)

# Matching tokenizer for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(SQUAD_CHECKPOINT)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Running on the SQuAD [1] dataset

In [5]:
from datasets import load_dataset

# Load SQuAD through the `datasets` library; the "train" and "validation"
# splits of the returned object are used in the cells below.
raw_datasets = load_dataset("squad")

In [6]:
# Tokenizer window settings shared by the training and validation preprocessing:
# `max_length` is the fixed token length of every feature, `stride` is passed to
# the tokenizer as the overlap between consecutive overflow windows.
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    """Tokenize a batch of SQuAD-style examples and attach answer span labels.

    Args:
        examples: batch dict with "question", "context" and "answers" columns,
            where each answer has "text" and "answer_start" lists. Only the
            first answer of each example is used.

    Returns:
        The tokenizer output with "offset_mapping" and
        "overflow_to_sample_mapping" removed and two added lists,
        "start_positions" and "end_positions", holding one token index per
        produced feature. A feature whose window does not fully contain the
        answer is labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    # Question is the first sequence, context the second; only the context is
    # truncated, and overflowing context is returned as additional features.
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Both fields are only needed to compute labels, so they are removed from
    # the returned features.
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        # Feature i was produced from example sample_map[i].
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        # Character span [start_char, end_char) of the first answer.
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        # (tokens whose sequence id is 1; context_end is inclusive).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            # Scan forward past every token starting at or before start_char;
            # the token just before the stop point is the answer's first token.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # Scan backward past every token ending at or after end_char;
            # the token just after the stop point is the answer's last token.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [14]:
# Work on the first 1000 training examples only; the subset also replaces the
# "train" split so later cells see the same data.
train_subset = raw_datasets["train"].select(range(1000))
raw_datasets["train"] = train_subset

# Tokenize in batches and drop the raw text columns, leaving only the
# features produced by preprocess_training_examples.
train_dataset = train_subset.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=train_subset.column_names,
)

# (number of examples, number of features after windowing)
(len(train_subset), len(train_dataset))

Map: 100%|██████████| 1000/1000 [00:00<00:00, 3721.27 examples/s]


(1000, 1032)

In [15]:
def preprocess_validation_examples(examples):
    """Tokenize a batch of validation examples for answer extraction.

    Unlike the training preprocessing, no labels are computed. Instead each
    feature keeps:
      * "example_id": the id of the example the feature was cut from, and
      * "offset_mapping": character offsets, with every entry that does not
        belong to the context (sequence id != 1) replaced by None.

    Args:
        examples: batch dict with "id", "question" and "context" columns.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" removed and the
        two fields described above set.
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    feature_to_example = inputs.pop("overflow_to_sample_mapping")
    num_features = len(inputs["input_ids"])

    # Blank out offsets of non-context tokens so post-processing can skip them.
    for feature_idx in range(num_features):
        seq_ids = inputs.sequence_ids(feature_idx)
        inputs["offset_mapping"][feature_idx] = [
            span if seq_id == 1 else None
            for span, seq_id in zip(inputs["offset_mapping"][feature_idx], seq_ids)
        ]

    # Remember which original example every feature came from.
    inputs["example_id"] = [
        examples["id"][feature_to_example[feature_idx]]
        for feature_idx in range(num_features)
    ]
    return inputs

In [16]:
# Evaluate on the first 200 validation examples only; the subset replaces the
# "validation" split because later cells read it from raw_datasets.
validation_subset = raw_datasets["validation"].select(range(200))
raw_datasets["validation"] = validation_subset

# Tokenize in batches, keeping only the features produced by
# preprocess_validation_examples.
validation_dataset = validation_subset.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=validation_subset.column_names,
)

# (number of examples, number of features after windowing)
(len(validation_subset), len(validation_dataset))

Map: 100%|██████████| 200/200 [00:00<00:00, 1509.46 examples/s]


(200, 200)

In [63]:
import torch

# The model only accepts tensor inputs, so strip the bookkeeping columns that
# are needed for post-processing but not for the forward pass.
eval_set_for_model = validation_dataset.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

# Collect every remaining column into one dict of tensors covering the whole
# evaluation set.
batch = {}
for column in eval_set_for_model.column_names:
    batch[column] = eval_set_for_model[column]
batch

{'input_ids': tensor([[ 101, 2029, 5088,  ...,    0,    0,    0],
         [ 101, 2029, 5088,  ...,    0,    0,    0],
         [ 101, 2073, 2106,  ...,    0,    0,    0],
         ...,
         [ 101, 2029, 3792,  ...,    0,    0,    0],
         [ 101, 2129, 2116,  ...,    0,    0,    0],
         [ 101, 1999, 2054,  ...,    0,    0,    0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [18]:
from tqdm.auto import tqdm
import collections
import numpy as np

import evaluate

# SQuAD metric object; its compute() is called with predictions and references.
metric = evaluate.load("squad")

# Number of highest-scoring start and end token candidates examined per feature.
n_best = 20
# Longest answer span, counted in tokens, that is accepted as a candidate.
max_answer_length = 30

def compute_metrics(start_logits, end_logits, features, examples):
    """Turn span logits into text predictions and score them with `metric`.

    Args:
        start_logits: per-feature start logits, indexable by feature index.
        end_logits: per-feature end logits, indexable by feature index.
        features: tokenized features providing "example_id" and
            "offset_mapping" (None for tokens outside the context).
        examples: original examples providing "id", "context" and "answers".

    Returns:
        The result of ``metric.compute`` for the predicted answers against the
        reference answers of ``examples``.
    """
    # Group feature indices by the example they were cut from.
    example_to_features = collections.defaultdict(list)
    for feature_idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(feature_idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        candidates = []

        # Gather candidate spans from every feature of this example.
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            # Indices of the n_best largest logits, best first.
            top_starts = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            top_ends = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()

            for start_index in top_starts:
                start_span = offsets[start_index]
                # A start outside the context can never form a valid answer.
                if start_span is None:
                    continue
                for end_index in top_ends:
                    end_span = offsets[end_index]
                    if end_span is None:
                        continue
                    # Reject reversed spans and spans longer than allowed.
                    span_length = end_index - start_index + 1
                    if span_length < 1 or span_length > max_answer_length:
                        continue

                    candidates.append(
                        {
                            "text": context[start_span[0] : end_span[1]],
                            "logit_score": start_logit[start_index] + end_logit[end_index],
                        }
                    )

        # Keep the highest-scoring candidate, or an empty string if none qualified.
        if candidates:
            prediction_text = max(candidates, key=lambda c: c["logit_score"])["text"]
        else:
            prediction_text = ""
        predicted_answers.append({"id": example_id, "prediction_text": prediction_text})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [20]:
# Single forward pass over the prepared batch; gradient tracking is disabled
# because the outputs are only used for evaluation.
with torch.no_grad():
    outputs = model(**batch)

In [21]:
# Bring both logit tensors to host memory as NumPy arrays for post-processing.
start_logits, end_logits = (
    logits.cpu().numpy()
    for logits in (outputs.start_logits, outputs.end_logits)
)

In [22]:
import collections

# Map each example id to the indices of the features derived from it.
# Only the "example_id" column is read: iterating over full dataset rows would
# also decode input_ids and offset_mapping for every feature for no benefit.
example_to_features = collections.defaultdict(list)
for idx, example_id in enumerate(validation_dataset["example_id"]):
    example_to_features[example_id].append(idx)

In [24]:
import numpy as np

# Number of top start/end candidates examined per feature, and the longest
# answer span (in tokens) that is accepted.
n_best = 20
max_answer_length = 30
predicted_answers = []

# Read the offsets column once. Indexing the dataset by column name inside the
# loop would re-materialize the whole column for every single feature.
all_offsets = validation_dataset["offset_mapping"]

for example in raw_datasets["validation"]:
    example_id = example["id"]
    context = example["context"]
    answers = []

    # Collect candidate spans from every feature cut from this example.
    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = all_offsets[feature_index]

        # Indices of the n_best largest logits, best first.
        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    # Pick the best-scoring span. When no candidate survived the filters, fall
    # back to an empty prediction (same convention as compute_metrics) instead
    # of letting max() raise ValueError on an empty list.
    if answers:
        best_answer = max(answers, key=lambda x: x["logit_score"])
        predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})
    else:
        predicted_answers.append({"id": example_id, "prediction_text": ""})

In [26]:
# Reference answers in the shape the SQuAD metric expects: one record per
# validation example, carrying its id and its full answers field.
theoretical_answers = []
for ex in raw_datasets["validation"]:
    theoretical_answers.append({"id": ex["id"], "answers": ex["answers"]})

In [27]:
# Sanity check: show the first prediction next to its reference answers.
for first_record in (predicted_answers[0], theoretical_answers[0]):
    print(first_record)

{'id': '56be4db0acb8001400a502ec', 'prediction_text': 'Denver Broncos'}
{'id': '56be4db0acb8001400a502ec', 'answers': {'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'], 'answer_start': [177, 177, 177]}}


In [28]:
# Score the predictions against the reference answers with the SQuAD metric.
metric.compute(predictions=predicted_answers, references=theoretical_answers)

{'exact_match': 95.5, 'f1': 97.58095238095238}

## Fine-tuning on coupon data
### Dataset preparation

In [1]:
import json, pandas

In [2]:
# Columns of the screen dump that are serialized into each QA context.
USED_COLUMNS = ["Text", "X 1", "Y 1", "View Class Name", "View Depth"]

# Expected coupon annotations. Read explicitly as UTF-8: the data contains
# non-ASCII product names, and the platform default encoding is not
# guaranteed to be UTF-8.
with open("ds/18929485529_expected.json", "r", encoding="utf-8") as f:
    resps = json.load(f)

# These two fields are not used as QA targets; drop them when present
# (a coupon lacking one of them is not an error).
for coupon in resps["coupons"]:
    coupon.pop("discount", None)
    coupon.pop("validity", None)

frame = pandas.read_csv("ds/18929485529.csv")
# currently hardcoded
# Row ranges of the CSV belonging to each coupon, in the same order as
# resps["coupons"] (the answers are matched to contexts by position).
sample_indices = [slice(2, 7), slice(7, 12), slice(49, 54), slice(78, 83), slice(92, 97), slice(97, 102)]

frame = frame[USED_COLUMNS]
# One CSV-formatted text context per coupon; .iloc makes the positional row
# selection explicit.
contexts = [frame.iloc[ind].to_csv() for ind in sample_indices]

# Question asked for each annotated coupon field.
QUESTIONS = {
    "old_price": "What is the old price of product?",
    "new_price": "What is the new price of product?",
    "product_name": "What is the name of the discounted product?"
}

### Converting answers to locations in contexts

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Per question key: one [start_token_idx, end_token_idx] pair per context.
answers_converted = {k: [] for k in QUESTIONS}
# Per question key: the raw answer string for each coupon, aligned with `contexts`.
answers = {k: [e[k] for e in resps['coupons']] for k in QUESTIONS}

# Contexts are tokenized on their own, without special tokens, so the token
# indices computed below are relative to the context alone.
tokenized = tokenizer(contexts, return_offsets_mapping=True, add_special_tokens=False)

for i, (ctx, tokenized_ctx, ctx_offsets) in enumerate(
    zip(contexts, tokenized["input_ids"], tokenized["offset_mapping"])
):
    decoded_tokens = tokenizer.convert_ids_to_tokens(tokenized_ctx)
    # Debug aid: each token with its character span and the text it covers.
    # Inspect `token_offsets` when an answer is located incorrectly.
    token_offsets = [
        {"token": token, "start": start, "end": end, "text": ctx[start:end]}
        for token, (start, end) in zip(decoded_tokens, ctx_offsets)
    ]

    for q in answers:
        answer = answers[q][i]
        start_char = ctx.find(answer)

        # str.find returns -1 when the answer is absent. Without this guard the
        # -1 would silently be turned into a bogus character/token span.
        if start_char == -1:
            print(f"WARNING: answer '{answer}' not found in context {i}; no span recorded")
            answers_converted[q].append([None, None])
            continue

        end_char = start_char + len(answer)

        # Locate the corresponding tokens
        start_token_idx = None
        end_token_idx = None

        for idx, (start, end) in enumerate(ctx_offsets):
            # Token containing the first character of the answer.
            if start <= start_char < end:
                start_token_idx = idx
            # Token containing the last character of the answer; nothing after
            # it can matter, so stop scanning.
            if start < end_char <= end:
                end_token_idx = idx
                break

        print(f"Answer: '{answer}'")
        print(f"Character-level Start: {start_char}, End: {end_char}")
        print(f"Token-level Start: {start_token_idx}, End: {end_token_idx}")

        answers_converted[q].append([start_token_idx, end_token_idx])

  from .autonotebook import tqdm as notebook_tqdm


Answer: '14.99'
Character-level Start: 47, End: 52
Token-level Start: 19, End: 21
Answer: '9.99'
Character-level Start: 90, End: 94
Token-level Start: 39, End: 41
Answer: 'JOHNNIE WALKER Red Label Blended Scotch'
Character-level Start: 172, End: 211
Token-level Start: 78, End: 83
Answer: '0.99'
Character-level Start: 47, End: 51
Token-level Start: 19, End: 21
Answer: '0.75'
Character-level Start: 89, End: 93
Token-level Start: 40, End: 42
Answer: 'SAN MIGUEL Especial'
Character-level Start: 173, End: 192
Token-level Start: 81, End: 85
Answer: '2.99'
Character-level Start: 48, End: 52
Token-level Start: 19, End: 21
Answer: '2.79'
Character-level Start: 92, End: 96
Token-level Start: 40, End: 42
Answer: 'FELIX Knabber Mix'
Character-level Start: 178, End: 195
Token-level Start: 79, End: 83
Answer: '8.99'
Character-level Start: 48, End: 52
Token-level Start: 19, End: 21
Answer: '5.85'
Character-level Start: 92, End: 96
Token-level Start: 40, End: 42
Answer: 'CHANTRÉ Weinbrand'
Character-l

### Create JSON dataset and convert it to datasets library object

In [5]:
# Build one SQuAD-style record per (context, question) pair.
#
# In the SQuAD format "answer_start" is a CHARACTER offset into the context,
# not a token index, so it is computed directly from the context string.
as_json = []
for ci, ctx in enumerate(contexts):
    for qi, q_key in enumerate(QUESTIONS):
        answer_text = answers[q_key][ci]
        answer_start = ctx.find(answer_text)
        if answer_start == -1:
            # A record whose answer is not in its context cannot be labelled.
            raise ValueError(
                f"Answer {answer_text!r} for question {q_key!r} not found in context {ci}"
            )
        as_json.append({
            "id": ci * len(QUESTIONS) + qi,
            "title": "example_title",
            "context": ctx,
            "question": QUESTIONS[q_key],
            "answers": {
                "text": [answer_text],
                "answer_start": [answer_start],
            },
        })

# Write the records as JSON Lines (one JSON object per line).
with open("ds.json", "w") as f:
    for entry in as_json:
        json.dump(entry, f)
        f.write("\n")

from datasets import load_dataset

dataset = load_dataset("json", data_files="ds.json")
dataset

Generating train split: 18 examples [00:00, 7358.43 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 18
    })
})

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    """Tokenize question/context pairs and attach answer token positions.

    Args:
        examples: batch dict with "question", "context" and "answers" columns.
            ``answers["answer_start"][0]`` is interpreted as a character offset
            into the context and ``answers["text"][0]`` as the answer string.

    Returns:
        The tokenizer output with added "start_positions" and "end_positions"
        lists (one token index per example). An answer that cannot be mapped
        to tokens, e.g. because it was truncated away, is labelled (0, 0).
    """
    questions = examples["question"]
    contexts = examples["context"]
    answers = examples["answers"]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    start_positions = []
    end_positions = []

    for i, answer in enumerate(answers):
        start_char = answer['answer_start'][0]
        end_char = start_char + len(answer['text'][0])

        # Map character positions to token indices. The context is the SECOND
        # sequence of the pair, so sequence_index=1 is required; the default
        # (0) would look the offsets up inside the question instead.
        start_token = inputs.char_to_token(i, start_char, sequence_index=1)
        end_token = inputs.char_to_token(i, end_char - 1, sequence_index=1)

        # If either end cannot be mapped, point both labels at position 0
        # rather than at an out-of-range or padding index.
        if start_token is None or end_token is None:
            start_token, end_token = 0, 0

        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

tokenized_dataset = dataset['train'].map(preprocess_function, batched=True)


Map: 100%|██████████| 18/18 [00:00<00:00, 979.46 examples/s]


In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# bert-base-uncased has no trained QA head; the span-prediction layer is
# freshly initialized and learned during fine-tuning.
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=10_000,
    save_total_limit=2,
)

# NOTE(review): the same dataset is used for training and evaluation, so the
# reported eval loss does not measure generalization. Use a held-out split
# once more annotated coupons are available.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    # `processing_class` replaces the deprecated `tokenizer=` argument.
    processing_class=tokenizer,
)

trainer.train()


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


### Sources
[1]: https://arxiv.org/abs/1606.05250

1. https://arxiv.org/abs/1606.05250