# BERT for Question Answer

## Setup

In [16]:
# if using colab, uncomment the below
# !pip install torch "argilla" datasets accelerate transformers setfit
# !pip install wandb

In [4]:
from datasets import load_dataset

In [5]:
squadv2 = load_dataset('squad_v2')

Downloading builder script:   0%|          | 0.00/5.28k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/801k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
print(squadv2)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})


In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Preprocessing Data

Our sequences will look like

```
[CLS] ...question tokens... [SEP] ...context tokens... [SEP]
```

In cases where the context is too long, we'll split into multiple sequences, like

```
[CLS] ...question tokens... [SEP] ...some context tokens... [SEP]
[CLS] ...question tokens... [SEP] ...overlap from prev sequence... ...more context tokens... [SEP]
...
```

Bassed on the question tokens, the model needs to get a contiguous subset of the context tokens as the answer. Our dataset contains the start position of the answer in the original context string.

The HuggingFace tokenizer is able to map each item in the tokenized sequence to the start and end indices in the original context string.

We need to find which indices in the tokenized sequence map to the start and end of the answer so that our model knows how to predict the contiguous answer section.

If there is no answer available in a sequence, we will set the answer start and end to the `[CLS]` token.

Additionally, for context split accross multiple tokenized sequences, for sequences without the answer (or with only a part of the answer), we will treat it the same as 'no answer' sequences.

In [8]:
def map_answer(offset, ans_start, ans_end, sequence_ids):

    # get start and end indices in tokenized sequence
    idx = 0
    while sequence_ids[idx] != 1: idx += 1
    context_start = idx
    while sequence_ids[idx] == 1: idx += 1
    context_end = idx - 1

    # start with [CLS]
    start, end = 0, 0

    # if answer is not fully in this tokenized sequence, map to [CLS]
    if offset[context_end][0] > ans_end or offset[context_end][1] < ans_start:
        return start, end
    
    idx = context_start
    while idx <= context_end and offset[idx][0] <= ans_start: idx += 1
    start = idx - 1

    idx = context_end
    while idx >= context_start and offset[idx][1] >= ans_end: idx -= 1
    end = idx + 1

    return start, end

def get_answer_mapped_data(batch):
    questions = batch['question']
    contexts = batch['context']
    answers = batch['answers']

    inputs = tokenizer(
        # add data for tokenizing and padding
        questions, contexts,        # data to tokenize
        max_length=400,             # max_length per sequence
        padding='max_length',       # pad til max_length

        # handling truncation
        truncation='only_second',   # only truncate context
        stride=128,                 # overlap size
        return_overflowing_tokens=True, # tokenizer automatically 
                                        # makes extra sequences

        # get mappings to original sentence
        return_offsets_mapping=True,# used to map answer to sequence
    )

    offset_mapping = inputs.pop('offset_mapping')
    sample_map = inputs.pop('overflow_to_sample_mapping')
    starts = []
    ends = []

    for i, offset in enumerate(offset_mapping):

        map_i = sample_map[i]

        answer = answers[map_i]
        text = answer['text']
        
        # SQuAD v2 has some adversarial examples with 'unanswerable' questions
        # in this case, map to [CLS]
        if len(text) < 1:
            starts.append(0)
            ends.append(0)
            continue

        ans_start = answer['answer_start'][0]
        ans_end = ans_start + len(text[0])
        sequence_ids = inputs.sequence_ids(map_i)

        start, end = map_answer(offset, ans_start, ans_end, sequence_ids)

        starts.append(start)
        ends.append(end)

    inputs['start_positions'] = starts
    inputs['end_positions'] = ends

    return inputs

In [9]:
tokenized_squadv2 = squadv2.map(get_answer_mapped_data,
                                batched=True,
                                remove_columns=squadv2['train'].column_names)

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [10]:
print(tokenized_squadv2['train'][0].keys())

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])


# Train

### Set Up HuggingFace Training

In [17]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import DefaultDataCollator

We will use DistilBERT for lower memory usage and thus faster training (from larger batch sizes).

In [18]:
dbert_qa = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [19]:
BATCH_SIZE = 16
LR = 2e-5
EPOCHS = 3
WEIGHT_DECAY = 0.01
CHKPT_DIR = 'checkpoints'

In [20]:
data_collator = DefaultDataCollator()

train_args = TrainingArguments(
    # save model
    output_dir=CHKPT_DIR,

    # epochs
    evaluation_strategy='epoch',
    num_train_epochs=EPOCHS,

    # batch sizes
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    
    # hyperparams
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,

    # log to wandb
    report_to='wandb',

)

trainer = Trainer(
    model=dbert_qa,
    args=train_args,
    train_dataset=tokenized_squadv2['train'],
    eval_dataset=tokenized_squadv2['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

### Run Training

In [21]:
import wandb

# use to log in to wandb if needed
# API_KEY = # wandb api key
# wandb.login(key=API_KEY)

wandb.init(
    project='SQuAD2.0 with Fine-Tuned DistilBERT',
    notes='Solving Standford\'s SQuAD 2.0 Q&A dataset with DistilBERT transfer learning.',
)

wandb.config = {
    'epochs': EPOCHS, 
    'learning_rate': LR, 
    'batch_size': BATCH_SIZE,
    'weight_decay': WEIGHT_DECAY,
}

trainer.train()

wandb.finish()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss
1,0.1419,0.109952
2,0.118,0.121593
3,0.0924,0.144285


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,▁▃█
eval/runtime,▁█▆
eval/samples_per_second,█▁▃
eval/steps_per_second,█▁▃
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█▃▃▃▃▃▃▂▃▂▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▂▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.14428
eval/runtime,41.8398
eval/samples_per_second,289.294
eval/steps_per_second,18.093
train/epoch,3.0
train/global_step,24654.0
train/learning_rate,0.0
train/loss,0.0924
train/total_flos,4.02621817931424e+16
train/train_loss,0.13578
