<a href="https://colab.research.google.com/github/bhadreshpsavani/UnderstandingNLP/blob/master/Notebooks/Finetuning_DeBERTa_SQUAD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q git+https://github.com/huggingface/transformers.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 890kB 5.3MB/s 
[K     |████████████████████████████████| 3.2MB 10.4MB/s 
[?25h  Building wheel for transformers (PEP 517) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
!pip install -q datasets

[K     |████████████████████████████████| 184kB 5.2MB/s 
[K     |████████████████████████████████| 245kB 8.3MB/s 
[K     |████████████████████████████████| 20.7MB 1.5MB/s 
[K     |████████████████████████████████| 102kB 10.1MB/s 
[?25h

# Get Data:

In [3]:
# This flag is the difference between SQUAD v1 or 2 (if you're using another dataset, it indicates if impossible
# answers are allowed or not).
squad_v2 = True
model_checkpoint = 'microsoft/deberta-base'
batch_size = 16
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.

In [4]:
from datasets import load_dataset, load_metric
datasets = load_dataset("squad_v2" if squad_v2 else "squad")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1806.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=963.0, style=ProgressStyle(description_…


Downloading and preparing dataset squad_v2/squad_v2 (download: 44.34 MiB, generated: 122.57 MiB, post-processed: Unknown size, total: 166.91 MiB) to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/9cac55034b086140f0649ecb5c604d09d7da2f2f5b73a90caa2e2bcc1f5cac09...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=9551051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=800683.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/9cac55034b086140f0649ecb5c604d09d7da2f2f5b73a90caa2e2bcc1f5cac09. Subsequent calls will reuse this data.


In [5]:
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [6]:
# load train and validation split of squad
train_dataset  = datasets['train']
valid_dataset = datasets['validation']

In [7]:
answers = train_dataset.map(lambda row: row['answers'])

HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




## Convert SQUAD2.0 Data

In [8]:
from transformers import DebertaTokenizer
tokenizer = DebertaTokenizer.from_pretrained(model_checkpoint)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3917897.0, style=ProgressStyle(descript…




In [9]:
example = train_dataset[1]
input_pairs = [example['question'], example['context']]

In [10]:
def find_sublist_indices(haystack, needle):
    if not needle:
        return
    # just optimization
    lengthneedle = len(needle)
    firstneedle = needle[0]
    restneedle = needle[1:]
    for idx, item in enumerate(haystack):
        if item == firstneedle:
            if haystack[idx+1:idx+lengthneedle] == restneedle:
                yield tuple(range(idx, idx+lengthneedle))

In [11]:
# Tokenize our training dataset
def convert_to_features(example):

  # Tokenize contexts and questions (as pairs of inputs)
  encodings = tokenizer(example['question'],example['context'], pad_to_max_length=True, truncation=True, max_length=512)
  context_encodings = tokenizer.encode(example['context'], add_special_tokens=False)

  print(example['answers']['text'][0])
  # we need to add space for making sure to get exact match in context after encoding
  try:
    answer_encoding = tokenizer.encode(' '+example['answers']['text'][0], add_special_tokens=False) 
    answer_indexes = list(find_sublist_indices(context_encodings, answer_encoding))[0]
  except IndexError:
    answer_encoding = tokenizer.encode(example['answers']['text'][0], add_special_tokens=False) 
    answer_indexes = list(find_sublist_indices(context_encodings, answer_encoding))[0]
    
  start_positions_context, end_positions_context = answer_indexes[0], answer_indexes[-1]

  # here we will compute the start and end position of the answer in the whole example
  # as the example is encoded like this <s> question</s></s> context</s>
  # and we know the postion of the answer in the context
  # we can just find out the index of the sep token and then add that to position + 1 (+1 because there are two sep tokens)
  # this will give us the position of the answer span in whole example 
  sep_idx = encodings['input_ids'].index(tokenizer.sep_token_id)
  start_positions = start_positions_context + sep_idx + 1
  end_positions = end_positions_context + sep_idx + 1

  

  if end_positions > 512:
        start_positions, end_positions = 0, 0

  if ' ' + example['answers']['text'][0]!=tokenizer.decode(encodings['input_ids'][start_positions:end_positions+1]):
    print("Correct Answer", ' ' + example['answers']['text'][0])
    print(start_positions_context, end_positions_context, sep_idx, start_positions, end_positions)
    print("Answer", tokenizer.decode(encodings['input_ids'][start_positions:end_positions+1]))

  encodings.update({'start_positions': start_positions,
                    'end_positions': end_positions,
                    'attention_mask': encodings['attention_mask']})
  return encodings

## Problems:
1. Getting Start and End Index after tokenization - Solved
2. Implement Sliding Window
3. Unanswerable Question

In [12]:
convert_to_features(train_dataset[2])



2003


IndexError: ignored

#### Comparing With Fast Bert

In [None]:
# model_checkpoint = "distilbert-base-uncased"
# from transformers import AutoTokenizer
# fast_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# def get_correct_alignement(context, answer):
#     """ Some original examples in SQuAD have indices wrong by 1 or 2 character. We test and fix this here. """
#     gold_text = answer['text'][0]
#     start_idx = answer['answer_start'][0]
#     end_idx = start_idx + len(gold_text)
#     if context[start_idx:end_idx] == gold_text:
#         return start_idx, end_idx       # When the gold label position is good
#     elif context[start_idx-1:end_idx-1] == gold_text:
#         return start_idx-1, end_idx-1   # When the gold label is off by one character
#     elif context[start_idx-2:end_idx-2] == gold_text:
#         return start_idx-2, end_idx-2   # When the gold label is off by two character
#     else:
#         raise ValueError()

# # Tokenize our training dataset
# def convert_to_features(example):
#     # Tokenize contexts and questions (as pairs of inputs)
#     input_pairs = [example['question'], example['context']]
#     encodings = fast_tokenizer.encode_plus(input_pairs, pad_to_max_length=True, max_length=512)
#     context_encodings = fast_tokenizer.encode_plus(example['context'])
    

#     # Compute start and end tokens for labels using Transformers's fast tokenizers alignement methodes.
#     # this will give us the position of answer span in the context text
#     start_idx, end_idx = get_correct_alignement(example['context'], example['answers'])
#     start_positions_context = context_encodings.char_to_token(start_idx)
#     end_positions_context = context_encodings.char_to_token(end_idx-1)

#     # here we will compute the start and end position of the answer in the whole example
#     # as the example is encoded like this <s> question</s></s> context</s>
#     # and we know the postion of the answer in the context
#     # we can just find out the index of the sep token and then add that to position + 1 (+1 because there are two sep tokens)
#     # this will give us the position of the answer span in whole example 
#     sep_idx = encodings['input_ids'].index(fast_tokenizer.sep_token_id)
#     start_positions = start_positions_context + sep_idx 
#     end_positions = end_positions_context + sep_idx

#     # print(start_positions_context, end_positions_context, sep_idx, start_positions, end_positions)

#     # print("Correct Answer", ' ' + example['answers']['text'][0])
#     # print("Answer", fast_tokenizer.decode(encodings['input_ids'][start_positions:end_positions+1]))

#     if end_positions > 512:
#       start_positions, end_positions = 0, 0

#     encodings.update({'start_positions': start_positions,
#                       'end_positions': end_positions,
#                       'attention_mask': encodings['attention_mask']})
#     return encodings

In [None]:
for i in range(len(train_dataset)):
  print(i)
  convert_to_features(train_dataset[i])

0
1
2




IndexError: ignored

In [None]:
new_train_dataset = train_dataset.map(convert_to_features)
new_valid_dataset = valid_dataset.map(convert_to_features, load_from_cache_file=False)

# set the tensor type and the columns which the dataset should return
columns = ['input_ids', 'attention_mask', 'start_positions', 'end_positions']
train_dataset.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=columns)



HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))

IndexError: ignored

## Sliding Window: