# Training of the question answering model

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 30.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 30.3 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 53.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 26.7 M

## Loading the dataset

In [None]:
from datasets import load_dataset

cuad = load_dataset("cuad")

Downloading builder script:   0%|          | 0.00/5.19k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.5k [00:00<?, ?B/s]

Downloading and preparing dataset cuad/default to /root/.cache/huggingface/datasets/cuad/default/1.0.0/01ed7dc61ab84230462731422e77cbb6f54ea8590b22a2d881b594f4d7f3746c...


Downloading data:   0%|          | 0.00/18.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/22450 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4182 [00:00<?, ? examples/s]

Dataset cuad downloaded and prepared to /root/.cache/huggingface/datasets/cuad/default/1.0.0/01ed7dc61ab84230462731422e77cbb6f54ea8590b22a2d881b594f4d7f3746c. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

## Tokenization

### Downloading the tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")

Downloading:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

### Pre-processing function
Used to add the "end" index of the answers 

In [None]:
def preprocess_function(examples):
    #print(examples["context"][0])
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        if (len(answer["answer_start"])==0):
          start_char = -1
        else:
          start_char = answer["answer_start"][0]

        if (len(answer["answer_start"])==0):
          end_char = -1
        else:
          end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

### Tokenization of the train and validation set 

In [None]:
print(cuad["train"].column_names)

['id', 'title', 'context', 'question', 'answers']


In [None]:
tokenized_cuad = cuad.map(preprocess_function, batched=True, remove_columns=cuad["train"].column_names)

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

## Initialization of the data collectors

In [None]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

## Downloading the model

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

Downloading:   0%|          | 0.00/496M [00:00<?, ?B/s]

## Training of the model

In [None]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/CSI5386 : Assignment 2/trained-models/roberta-base-CUAD",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    save_steps = 1000
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_cuad["train"],
    eval_dataset=tokenized_cuad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train(resume_from_checkpoint = False)

***** Running training *****
  Num examples = 22450
  Num Epochs = 6
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 8424
  Number of trainable parameters = 124056578


Epoch,Training Loss,Validation Loss
1,0.31,0.141579
2,0.2581,0.11387
3,0.2447,0.13629
4,0.2173,0.131713
5,0.1891,0.141734
6,0.1736,0.14257


Saving model checkpoint to /content/drive/MyDrive/CSI5386 : Question Answering /trained_models/roberta-base-CUAD/checkpoint-1000
Configuration saved in /content/drive/MyDrive/CSI5386 : Question Answering /trained_models/roberta-base-CUAD/checkpoint-1000/config.json
Model weights saved in /content/drive/MyDrive/CSI5386 : Question Answering /trained_models/roberta-base-CUAD/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/CSI5386 : Question Answering /trained_models/roberta-base-CUAD/checkpoint-1000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/CSI5386 : Question Answering /trained_models/roberta-base-CUAD/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4182
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/CSI5386 : Question Answering /trained_models/roberta-base-CUAD/checkpoint-2000
Configuration saved in /content/drive/MyDrive/CSI5386 : Question Answering /train

TrainOutput(global_step=8424, training_loss=0.23298567041033014, metrics={'train_runtime': 10679.5746, 'train_samples_per_second': 12.613, 'train_steps_per_second': 0.789, 'total_flos': 2.63975048492544e+16, 'train_loss': 0.23298567041033014, 'epoch': 6.0})

# Predictions

In [3]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import torch
import os

## Running predictions

In [5]:
from datasets import load_dataset
from transformers import pipeline
cuad = load_dataset("cuad")

Downloading builder script:   0%|          | 0.00/5.19k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.5k [00:00<?, ?B/s]

Downloading and preparing dataset cuad/default to /root/.cache/huggingface/datasets/cuad/default/1.0.0/01ed7dc61ab84230462731422e77cbb6f54ea8590b22a2d881b594f4d7f3746c...


Downloading data:   0%|          | 0.00/18.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/22450 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4182 [00:00<?, ? examples/s]

Dataset cuad downloaded and prepared to /root/.cache/huggingface/datasets/cuad/default/1.0.0/01ed7dc61ab84230462731422e77cbb6f54ea8590b22a2d881b594f4d7f3746c. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

## Functions for evaluation of the model 

In [6]:
def get_prediction(qid):
    # given a question id (qas_id or qid), load the example, get the model outputs and generate an answer

    found = False
    index=0
    for index in range(len(cuad["test"]["id"])):
      if (cuad["test"]["id"][index]==qid):
        found = True
        break
    
    question_curr = cuad["test"]["question"][index]
    context_curr = cuad["test"]["context"][index]

    # Loading the model 
    question_answering = pipeline(task="question-answering",model="/content/drive/MyDrive/CSI5386 : Assignment 2/trained-models/roberta-base-CUAD/checkpoint-8000",tokenizer="/content/drive/MyDrive/CSI5386 : Assignment 2/trained-models/roberta-base-CUAD/checkpoint-8000",batch_size=20,device=torch.cuda.current_device(),num_workers=20)

    result = question_answering(question=question_curr, context=context_curr)
    
    return result

In [7]:
# these functions are heavily influenced by the HF squad_metrics.py script
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        #print("TEXT : "+text)
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

def get_gold_answers(example):
    """helper function that retrieves all possible true answers from a squad2.0 example"""
    
    if (len(example["text"])==0):
      gold_answers = [""]
    else:
      gold_answers = example["text"]

    # if gold_answers doesn't exist it's because this is a negative example - 
    # the only correct answer is an empty string
    
        
    return gold_answers

## Evaluation of model
Using the validation set 

In [None]:
from tqdm import tqdm
import json

# Creating a JSON file to store the rsults
# filename = "/content/drive/MyDrive/Question-Answering-main/trained_models/roberta-base-CUAD-2/predictions-roberta-base-cuad.json"
filename = "/content/drive/MyDrive/CSI5386 : Assignment 2/trained-models/roberta-base-CUAD/predictions-roberta-base-cuad-checkpoint-8000.json"

lst = json.load(open(filename))
# Initialiazing the varibalers for the scores
average_f1 = 0
average_em = 0

# Looping through the testing set, using a progress bar. We will also save the prediction to a JSON file to avoid data loss
with tqdm(
    bar_format="{postfix} | Elapsed: {elapsed} | {rate_fmt}",
    postfix=average_f1,
) as t:
    
    for index in range(1113,len(cuad["test"]["title"])):
      
      # Retrieving the predictions
      prediction = get_prediction(cuad["test"]["id"][index])
      example = cuad["test"]["answers"][index]

      # Retrieving the golg answer
      gold_answers = get_gold_answers(example)

      # Computing the scores
      em_score = max((compute_exact_match(prediction["answer"], answer)) for answer in gold_answers)
      f1_score = max((compute_f1(prediction["answer"], answer)) for answer in gold_answers)

      # Keeping track of the Exact Matcb Score and F1 score
      average_f1 = f1_score + average_f1
      average_em = em_score + average_em

      # Updating the progress bar
      t.postfix = "Index : "+str((index))+"/"+str(len(cuad["test"]["title"]))+ " | F1 : "+str((average_f1/(index+1)))+" | EM : "+str(average_em/(index+1)) 
      t.update()

      # Saving the prediction to the JSON file
      # with open(filename,"r") as file:
      #   predictionJSON=json.load(file)

      lst.append({cuad["test"]["id"][index]:prediction["answer"],"score":prediction["score"]})
      
      with open(filename, "w") as file:
        json.dump(lst, file,indent=2)


, Index : 2997/4182 | F1 : 0.1707475321733802 | EM : 0.1534356237491661 | Elapsed: 5:51:37 | 16.05s/it  