<a href="https://colab.research.google.com/github/bhadreshpsavani/UnderstandingNLP/blob/master/AnalysingPretrainedLongFormerModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Performance of Longformer on SQAUD2

In [4]:
!pip install -q transformers
!pip install -q datasets

In [5]:
from transformers import squad_convert_examples_to_features
from transformers.data.processors.squad import SquadV2Processor, SquadFeatures

In [6]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
tokenizer = AutoTokenizer.from_pretrained("mrm8488/longformer-base-4096-finetuned-squadv2", use_fast=False)
model = AutoModelForQuestionAnswering.from_pretrained("mrm8488/longformer-base-4096-finetuned-squadv2")

In [7]:
processor = SquadV2Processor()
valid_examples = processor.get_train_examples("./", filename="dev-v2.0.json")

100%|██████████| 35/35 [00:04<00:00,  8.66it/s]


In [8]:
valid_features, valid_dataset = squad_convert_examples_to_features(
            examples=valid_examples,
            tokenizer=tokenizer,
            max_seq_length=512,
            doc_stride=128,
            max_query_length=64,
            is_training=True,
            return_dataset="pt")

convert squad examples to features: 100%|██████████| 11873/11873 [01:26<00:00, 137.33it/s]
add example index and unique id: 100%|██████████| 11873/11873 [00:00<00:00, 837590.97it/s]


In [9]:
valid_features[0].input_ids
valid_features[0].start_position
valid_features[0].end_position

53

In [10]:
## SQuAD evaluation script. Modifed slightly for this notebook
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def evaluate(gold_answers, predictions):
    f1 = exact_match = total = 0

    for ground_truths, prediction in zip(gold_answers, predictions):
        normalize_answer()
        total += 1
        exact_match +=exact_match_score(prediction, ground_truths)
        f1 +=f1_score(prediction, ground_truths)
    
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}

In [11]:
from tqdm.notebook import tqdm
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.cuda.is_available()

True

In [12]:
model.to(device)

LongformerForQuestionAnswering(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0): LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_gl

In [13]:
for example in tqdm(valid_dataset):
  print(example[0][example[3]:example[4]+1])
  break

HBox(children=(FloatProgress(value=0.0, max=12006.0), HTML(value='')))

tensor([  11, 1470])


In [14]:
predictions=[]
answers=[]
for example in tqdm(valid_dataset):
    output = model(input_ids=torch.reshape(example[0], [1, -1]).to(device), attention_mask=torch.reshape(example[1], [1, -1]).to(device))
    start_index = torch.argmax(output['start_logits'])
    end_index = torch.argmax(output['end_logits'])
    ans_ids = example[0][start_index :end_index+1]
    answer = tokenizer.decode(ans_ids)
    answer = tokenizer.clean_up_tokenization(answer)
    predictions.append(answer.strip())

HBox(children=(FloatProgress(value=0.0, max=12006.0), HTML(value='')))

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  sep_token_indices = (input_ids == sep_token_id).nonzero()





In [24]:
answers=[]
for example in tqdm(valid_dataset):
  ori_ans_ids = example[0][example[3]+1:example[4]+1]
  ori_answer = tokenizer.decode(ori_ans_ids)
  ori_answer = tokenizer.clean_up_tokenization(ori_answer)
  answers.append(ori_answer.strip())

HBox(children=(FloatProgress(value=0.0, max=12006.0), HTML(value='')))




In [25]:
answers[:15]

['France',
 '10th and 11th',
 'Denmark, Iceland and',
 'Rollo',
 '10th century',
 '',
 '',
 '',
 '',
 'William the Conqueror',
 'Richard',
 '',
 '',
 '',
 '']

In [17]:
new_preds = [pred if pred!='<s>' else '' for pred in predictions]
new_preds[:15]

['France',
 '10th and 11th centuries',
 '',
 'Rollo',
 '10th',
 '',
 '',
 '',
 '',
 'William the Conqueror',
 'Richard I of Normandy',
 'Catholic',
 '',
 '',
 '']

In [32]:
"""
predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22', 'no_answer_probability': 0.}]
references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}]
"""
def formatter(new_preds, answers):
  id = 10000000
  predictions = []
  references = []
  for i in range(len(new_preds)):
    no_answer_probability = 0 if answers[i]!='' else 1.0
    pred = {'prediction_text':new_preds[i], 'id': id+i, 'no_answer_probability' : no_answer_probability}
    ref = {'answers': {'answer_start': [valid_dataset[i][3]], 'text': [answers[i]]}, 'id': id+1}
    predictions.append(pred)
    references.append(ref)
  return predictions, references

In [33]:
predictions , references =  formatter(new_preds, answers)
references

[{'answers': {'answer_start': [tensor(52)], 'text': ['France']},
  'id': 10000001},
 {'answers': {'answer_start': [tensor(38)], 'text': ['10th and 11th']},
  'id': 10000001},
 {'answers': {'answer_start': [tensor(76)], 'text': ['Denmark, Iceland and']},
  'id': 10000001},
 {'answers': {'answer_start': [tensor(84)], 'text': ['Rollo']},
  'id': 10000001},
 {'answers': {'answer_start': [tensor(163)], 'text': ['10th century']},
  'id': 10000001},
 {'answers': {'answer_start': [tensor(0)], 'text': ['']}, 'id': 10000001},
 {'answers': {'answer_start': [tensor(0)], 'text': ['']}, 'id': 10000001},
 {'answers': {'answer_start': [tensor(0)], 'text': ['']}, 'id': 10000001},
 {'answers': {'answer_start': [tensor(0)], 'text': ['']}, 'id': 10000001},
 {'answers': {'answer_start': [tensor(208)],
   'text': ['William the Conqueror']},
  'id': 10000001},
 {'answers': {'answer_start': [tensor(123)], 'text': ['Richard']},
  'id': 10000001},
 {'answers': {'answer_start': [tensor(50)], 'text': ['']}, 'id':

In [34]:
from datasets import load_metric

In [35]:
squad_metric = load_metric('squad_v2')
score = squad_metric.compute(predictions=predictions, references=references)

ArrowTypeError: ignored

In [23]:
squad_metric

Metric(name: "squad_v2", features: {'predictions': {'id': Value(dtype='string', id=None), 'prediction_text': Value(dtype='string', id=None), 'no_answer_probability': Value(dtype='float32', id=None)}, 'references': {'id': Value(dtype='string', id=None), 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}}, usage: """
Computes SQuAD v2 scores (F1 and EM).
Args:
    predictions: List of triple for question-answers to score with the following elements:
        - the question-answer 'id' field as given in the references (see below)
        - the text of the answer
        - the probability that the question has no answer
    references: List of question-answers dictionaries with the following key-values:
            - 'id': id of the question-answer pair (see above),
            - 'answers': a list of Dict {'text': text of the answer as a string}
    no_answer_threshold: float
        Probability threshold