In [None]:
# Mount Google Drive at /content/gdrive; the model checkpoint paths used later
# in this notebook live under that mount point.
from google.colab import drive

drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
!pip install datasets
!pip install transformers
!pip install nlpaug

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Collec

### Imports

In [None]:
import requests
import json
import torch
import os
from tqdm import tqdm
import sys
import random
from copy import deepcopy
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

### Download Dataset


In [None]:
from datasets import load_dataset

# Load the "squad" dataset; later cells index it by split name
# ('train' / 'validation').
squad = load_dataset("squad")

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

### Read Data, Adding Noise and Preprocessing

In [None]:
def read_data(squad, split):
  """Collect the contexts, questions and answers of one dataset split.

  Args:
    squad: Mapping from split name to an iterable of examples. Every example
      is indexed with the keys 'context', 'question' and 'answers'.
    split: Name of the split to read.

  Returns:
    A tuple of three parallel lists: (contexts, questions, answers).
  """
  fields = ('context', 'question', 'answers')
  columns = tuple([] for _ in fields)

  # One pass over the split, appending each field to its own column.
  for example in squad[split]:
    for column, field in zip(columns, fields):
      column.append(example[field])

  contexts, questions, answers = columns
  return contexts, questions, answers

In [None]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

# Noise funcs 

# FOR QA ONLY
# Character edit operations that get_action can return
char_action = ['insert', 'substitute', 'delete', 'swap']


def get_action(type):
    """Return one entry of char_action picked at random.

    The `type` argument is accepted but not used by the selection.
    """
    action = random.choice(char_action)
    return action

def add_noise(question, p=0.7):
    """
    Augment a question with character-level or word-level noise.

    Exactly one augmenter is applied per call: with probability ``p`` a
    character-level one (keyboard typos or a random character edit),
    otherwise a word-level one (misspelling or synonym replacement).

    Args:
        question (str): The original question.
        p (float): The probability of applying the char level augmentation.

    Returns:
        str: The augmented question, or the original question unchanged when
        the augmenter produces no output.
    """
    # Factories rather than instances: only the augmenter that is actually
    # picked gets constructed, instead of building all four on every call.
    char_augmenters = [
        lambda: nac.KeyboardAug(aug_char_p=0.2, aug_word_p=0.2, include_special_char=False, include_numeric=False),
        lambda: nac.RandomCharAug(action=get_action("char"), aug_char_p=0.1, aug_word_p=0.1),
    ]
    word_augmenters = [
        lambda: naw.SpellingAug(),
        lambda: naw.SynonymAug(),
    ]

    # Character-level augmentation with probability p, word-level otherwise
    candidates = char_augmenters if random.random() < p else word_augmenters
    aug = random.choice(candidates)()
    noisy_text = aug.augment(question)

    # augment() may hand back either a plain string or a list of strings
    # depending on the installed nlpaug release; indexing a string with [0]
    # would silently return only its first character.
    if isinstance(noisy_text, str):
        return noisy_text
    # An empty result (e.g. for an empty question) falls back to the input.
    return noisy_text[0] if noisy_text else question


In [None]:
# Load the full train and validation splits as parallel lists
train_contexts, train_questions, train_answers = read_data(squad, 'train')
valid_contexts, valid_questions, valid_answers = read_data(squad, 'validation')


# Set noisy to False to leave every question untouched
noisy = True
noise_percent = 0.1
if noisy:

  # Train questions are handled before validation questions; each question is
  # independently replaced by a noisy version with probability noise_percent.
  for question_list in (train_questions, valid_questions):
    for i, question in enumerate(question_list):
      if random.random() < noise_percent:
        noisy_question = add_noise(question)
        question_list[i] = noisy_question

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
import random

# Keep a random 10% of the training split.
# The indices are drawn against the in-memory lists produced by read_data
# (and noised when `noisy` is True) rather than re-reading squad['train']:
# rebuilding the lists from the raw dataset would throw the injected noise away.
train_size = min(int(0.1 * len(squad['train'])), len(train_questions))
train_indices = random.sample(range(len(train_questions)), train_size)

# The three lists are parallel, so the same indices select matching rows.
train_contexts = [train_contexts[idx] for idx in train_indices]
train_questions = [train_questions[idx] for idx in train_indices]
train_answers = [train_answers[idx] for idx in train_indices]

In [None]:
import random

# Keep a random 10% of the validation split.
# The indices are drawn against the in-memory lists produced by read_data
# (and noised when `noisy` is True) rather than re-reading squad['validation']:
# rebuilding the lists from the raw dataset would throw the injected noise away.
valid_size = min(int(0.1 * len(squad['validation'])), len(valid_questions))
valid_indices = random.sample(range(len(valid_questions)), valid_size)

# The three lists are parallel, so the same indices select matching rows.
valid_contexts = [valid_contexts[idx] for idx in valid_indices]
valid_questions = [valid_questions[idx] for idx in valid_indices]
valid_answers = [valid_answers[idx] for idx in valid_indices]

In [None]:
def add_end_idx(answers, contexts):
  """Add an 'answer_end' character offset to every answer, in place.

  Only the first entry of answer['text'] / answer['answer_start'] is used.
  The annotated start is sometimes off by one or two characters, so the gold
  text is looked for at the annotated start and then one and two characters
  earlier; the first position where the context matches wins.

  Args:
    answers: List of dicts with list-valued 'text' and 'answer_start' keys.
      Each dict receives an 'answer_end' key holding a one-element list, and
      the first 'answer_start' entry is corrected when a shifted match is found.
    contexts: List of context strings, parallel to `answers`.

  Returns:
    None. `answers` is modified in place.
  """
  for answer, context in zip(answers, contexts):
    gold_text = answer['text'][0]
    start_idx = answer['answer_start'][0]
    end_idx = start_idx + len(gold_text)

    # Default to the annotated offsets so that 'answer_end' is always present,
    # even when the gold text cannot be located near the annotated start.
    answer['answer_end'] = [end_idx]

    # sometimes squad answers are off by a character or two so we fix this
    for shift in (0, 1, 2):
      shifted_start = start_idx - shift
      if shifted_start < 0:
        # A negative start would wrap around to the end of the context.
        break
      if context[shifted_start:end_idx - shift] == gold_text:
        if shift:
          # Keep the list shape: later code reads answer['answer_start'][0].
          answer['answer_start'] = [shifted_start] + list(answer['answer_start'][1:])
        answer['answer_end'] = [end_idx - shift]
        break



In [None]:
# Attach 'answer_end' offsets to the answer dicts of both splits (in place).
add_end_idx(train_answers, train_contexts)
add_end_idx(valid_answers, valid_contexts)

### Tokenization


In [None]:
from transformers import BertTokenizerFast
from transformers import ElectraTokenizerFast, ElectraForQuestionAnswering

# Load the tokenizer from the same checkpoint that the model definition cell
# fine-tunes ('google/electra-base-discriminator'), so tokenizer and model
# always come from one matching pair.
tokenizer = ElectraTokenizerFast.from_pretrained('google/electra-base-discriminator')



Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
# Tokenize (context, question) pairs: contexts are passed as the first
# sequence and questions as the second, with truncation and padding enabled.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
valid_encodings = tokenizer(valid_contexts, valid_questions, truncation=True, padding=True)

In [None]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
tokenizer.decode(train_encodings['input_ids'][0])

'[CLS] there are three ways an existing treaty can be amended. first, formal amendment requires state parties to the treaty to go through the ratification process all over again. the re - negotiation of treaty provisions can be long and protracted, and often some parties to the original treaty will not become parties to the amended treaty. when determining the legal obligations of states, one party to the original treaty and one a party to the amended treaty, the states will only be bound by the terms they both agreed upon. treaties can also be amended informally by the treaty executive council when the changes are only procedural, technical change in customary international law can also amend a treaty, where state behavior evinces a new interpretation of the legal obligations under the treaty. minor corrections to a treaty may be adopted by a proces - verbal ; but a proces - verbal is generally reserved for changes to rectify obvious errors in the text adopted, i. e. where the text ad

In [None]:
def add_token_positions(encodings, answers):
  """Store token-level answer spans on `encodings`.

  For example i, the first 'answer_start' character offset and the last
  answer character ('answer_end' minus one) are mapped to token indices with
  encodings.char_to_token. The results are written under the keys
  'start_positions' and 'end_positions' via encodings.update.

  Args:
    encodings: Object providing char_to_token(example_idx, char_idx) and update().
    answers: List of dicts with list-valued 'answer_start' and 'answer_end'.
  """
  def _to_token(example_idx, char_idx):
    token_idx = encodings.char_to_token(example_idx, char_idx)
    # None means the answer passage has been truncated away; the tokenizer's
    # model_max_length is stored in its place.
    if token_idx is None:
      return tokenizer.model_max_length
    return token_idx

  spans = [
    (_to_token(i, answer['answer_start'][0]), _to_token(i, answer['answer_end'][0] - 1))
    for i, answer in enumerate(answers)
  ]
  start_positions = [start for start, _ in spans]
  end_positions = [end for _, end in spans]

  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})



In [None]:
# Add 'start_positions' / 'end_positions' to both sets of encodings (in place).
add_token_positions(train_encodings, train_answers)
add_token_positions(valid_encodings, valid_answers)

In [None]:
train_encodings['start_positions'][:10]

KeyError: ignored

In [None]:
class SQuAD_Dataset(torch.utils.data.Dataset):
  """Dataset wrapper around tokenizer encodings, one item per encoded example."""

  def __init__(self, encodings):
    # Mapping of feature name -> per-example values; its `input_ids`
    # attribute is what __len__ measures.
    self.encodings = encodings

  def __len__(self):
    """Number of examples, i.e. the length of `encodings.input_ids`."""
    return len(self.encodings.input_ids)

  def __getitem__(self, idx):
    """Return example `idx` as a dict of tensors keyed by feature name."""
    item = {}
    for key, val in self.encodings.items():
      item[key] = torch.tensor(val[idx])
    return item

In [None]:
# Wrap the encodings of each split so they can be fed to a DataLoader.
train_dataset = SQuAD_Dataset(train_encodings)
valid_dataset = SQuAD_Dataset(valid_encodings)

In [None]:
from torch.utils.data import DataLoader

# Define the dataloaders: batches of 16; only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16)

### Model definition

In [None]:
from transformers import BertForQuestionAnswering, AutoModel

# Question-answering model initialised from the ELECTRA base discriminator
# checkpoint. The load log below reports the qa_outputs weights as newly
# initialised, i.e. the QA head is trained from scratch during fine-tuning.
model = ElectraForQuestionAnswering.from_pretrained('google/electra-base-discriminator')


Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['qa_outputs.weight', 'qa_outputs.

### Training

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Working on {device}')

Working on cuda


In [None]:
import os

from torch.optim import AdamW
from sklearn.metrics import f1_score

N_EPOCHS = 5
# torch.optim.AdamW stands in for the deprecated transformers.AdamW. eps and
# weight_decay are passed explicitly with the defaults of the transformers
# implementation, so the optimisation settings stay the same as before.
optim = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
best_val_loss = float("inf")

# Checkpoint directory; created up front so the first torch.save cannot fail
# on a missing folder after a full epoch of training.
path = '/content/gdrive/MyDrive/NLP Project/models/ELECTRA_QA_NOISY10'
os.makedirs(path, exist_ok=True)

model.to(device)
model.train()

for epoch in range(N_EPOCHS):
  loop = tqdm(train_loader, leave=True)
  for batch in loop:
    optim.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)
    # Passing the gold positions makes the model return the loss as outputs[0]
    outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    loss = outputs[0]
    loss.backward()
    optim.step()

    loop.set_description(f'Epoch {epoch+1}')
    loop.set_postfix(loss=loss.item())

  # Evaluate the model on the validation set
  model.eval()
  val_loss = 0
  predictions = []
  true_labels = []
  with torch.no_grad():
    for batch in valid_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      start_positions = batch['start_positions'].to(device)
      end_positions = batch['end_positions'].to(device)
      outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
      loss = outputs[0]
      val_loss += loss.item()
      start_logits = outputs[1]
      end_logits = outputs[2]
      start_preds = start_logits.argmax(dim=1)
      end_preds = end_logits.argmax(dim=1)
      # (start, end) token index pairs, kept for inspection after the epoch
      predictions.extend(list(zip(start_preds.cpu().numpy(), end_preds.cpu().numpy())))
      true_labels.extend(list(zip(start_positions.cpu().numpy(), end_positions.cpu().numpy())))
  # Mean loss per validation batch (guarded against an empty loader)
  val_loss /= max(len(valid_loader), 1)

  # Only the weights with the lowest validation loss so far are kept on disk
  if val_loss < best_val_loss:
    best_val_loss = val_loss
    torch.save(model.state_dict(), path+'/model_parameters.pth')

  print(val_loss,best_val_loss)

  # Set the model back to training mode
  model.train()


Epoch 1: 100%|██████████| 548/548 [13:28<00:00,  1.48s/it, loss=0.974]


1.1534593303812974 1.1534593303812974


Epoch 2: 100%|██████████| 548/548 [13:32<00:00,  1.48s/it, loss=1.25]


1.1352645421901078 1.1352645421901078


Epoch 3: 100%|██████████| 548/548 [13:30<00:00,  1.48s/it, loss=0.378]


1.1847266999226231 1.1352645421901078


Epoch 4: 100%|██████████| 548/548 [13:31<00:00,  1.48s/it, loss=0.744]


1.316502617519182 1.1352645421901078


Epoch 5: 100%|██████████| 548/548 [13:31<00:00,  1.48s/it, loss=0.57]


1.4126717057843596 1.1352645421901078


### Evaluation

In [None]:
import random
from datasets import load_dataset
from torch.utils.data import DataLoader

squad_test = load_dataset("squad")
validation_split = squad_test['validation']

# Evaluate on a random 20% of the validation split. The index range and the
# examples both come from the same dataset object, and the lists are built
# directly from the sampled rows (no full pass over the split is needed first).
valid_size = int(0.2 * len(validation_split))
valid_indices = random.sample(range(len(validation_split)), valid_size)
valid_contexts = []
valid_questions = []
valid_answers = []

for idx in valid_indices:
  group = validation_split[idx]
  valid_contexts.append(group['context'])
  valid_questions.append(group['question'])
  valid_answers.append(group['answers'])

# Same preprocessing pipeline as for training: character end offsets,
# tokenization, token-level answer positions, dataset and loader.
add_end_idx(valid_answers, valid_contexts)
valid_encodings = tokenizer(valid_contexts, valid_questions, truncation=True, padding=True)
add_token_positions(valid_encodings, valid_answers)
valid_dataset = SQuAD_Dataset(valid_encodings)
valid_loader = DataLoader(valid_dataset, batch_size=16)



  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from collections import Counter
path = '/content/gdrive/MyDrive/NLP Project/models/ELECTRA_QA_NOISY10'
# map_location lets a checkpoint saved from a GPU run load on a CPU-only runtime
model.load_state_dict(torch.load(path+'/model_parameters.pth', map_location=device))

model = model.to(device)
model.eval()

# Position accuracy is accumulated as counts so every example weighs the same,
# even when the last batch is smaller than the others.
correct_positions = 0
total_positions = 0
em_scores = []
f1_scores = []

for batch in tqdm(valid_loader):
  with torch.no_grad():
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_true = batch['start_positions'].to(device)
    end_true = batch['end_positions'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask)

    start_pred = torch.argmax(outputs['start_logits'], dim=1)
    end_pred = torch.argmax(outputs['end_logits'], dim=1)

    # Start and end positions are scored independently
    correct_positions += (start_pred == start_true).sum().item()
    correct_positions += (end_pred == end_true).sum().item()
    total_positions += 2 * len(start_pred)

    # Calculate EM score and F1 score
    for i in range(len(start_true)):
        pred_span = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i] + 1], skip_special_tokens=True)
        true_span = tokenizer.decode(input_ids[i][start_true[i]:end_true[i] + 1], skip_special_tokens=True)
        em_scores.append(int(pred_span == true_span))

        # Compute F1 score over whitespace-separated words of the decoded spans
        pred_words = pred_span.split()
        true_words = true_span.split()
        common = Counter(pred_words) & Counter(true_words)
        num_common = sum(common.values())
        precision = num_common / max(len(pred_words), 1)
        recall = num_common / max(len(true_words), 1)
        f1 = (2 * precision * recall) / max((precision + recall), 1e-8)
        f1_scores.append(f1)


# max(..., 1) keeps an empty loader from raising ZeroDivisionError
acc = correct_positions / max(total_positions, 1)
em_score = sum(em_scores) / max(len(em_scores), 1)
f1_score = sum(f1_scores) / max(len(f1_scores), 1)

print("\n\nAccuracy:", acc)
print("EM score:", em_score)
print("F1 score:", f1_score)

# Show true vs. predicted spans for the examples of the last batch
print("\n\nT/P\tanswer_start\tanswer_end\n")
for i in range(len(start_true)):
  pred_span = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i]+1], skip_special_tokens=True)
  true_span = tokenizer.decode(input_ids[i][start_true[i]:end_true[i]+1], skip_special_tokens=True)
  print(f"true\t{start_true[i]}\t{end_true[i]}\t{true_span}\n"
        f"pred\t{start_pred[i]}\t{end_pred[i]}\t{pred_span}\n")

100%|██████████| 133/133 [01:08<00:00,  1.94it/s]



Accuracy: 0.6830357142857143
EM score: 0.5553453169347209
F1 score: 0.7474771729282311


T/P	answer_start	answer_end

true	107	112	superintendent of new york city schools
pred	107	112	superintendent of new york city schools

true	97	98	300 acres
pred	97	97	300






### Evaluation on Noisy Data

In [None]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

# Noise funcs 

# FOR QA ONLY
# Character edit operations that get_action can return
char_action = ['insert', 'substitute', 'delete', 'swap']


def get_action(type):
    """Return one entry of char_action picked at random.

    The `type` argument is accepted but not used by the selection.
    """
    action = random.choice(char_action)
    return action

def add_noise(question, p=0.7):
    """
    Augment a question with character-level or word-level noise.

    Exactly one augmenter is applied per call: with probability ``p`` a
    character-level one (keyboard typos or a random character edit),
    otherwise a word-level one (misspelling or synonym replacement).

    Args:
        question (str): The original question.
        p (float): The probability of applying the char level augmentation.

    Returns:
        str: The augmented question, or the original question unchanged when
        the augmenter produces no output.
    """
    # Factories rather than instances: only the augmenter that is actually
    # picked gets constructed, instead of building all four on every call.
    char_augmenters = [
        lambda: nac.KeyboardAug(aug_char_p=0.2, aug_word_p=0.2, include_special_char=False, include_numeric=False),
        lambda: nac.RandomCharAug(action=get_action("char"), aug_char_p=0.1, aug_word_p=0.1),
    ]
    word_augmenters = [
        lambda: naw.SpellingAug(),
        lambda: naw.SynonymAug(),
    ]

    # Character-level augmentation with probability p, word-level otherwise
    candidates = char_augmenters if random.random() < p else word_augmenters
    aug = random.choice(candidates)()
    noisy_text = aug.augment(question)

    # augment() may hand back either a plain string or a list of strings
    # depending on the installed nlpaug release; indexing a string with [0]
    # would silently return only its first character.
    if isinstance(noisy_text, str):
        return noisy_text
    # An empty result (e.g. for an empty question) falls back to the input.
    return noisy_text[0] if noisy_text else question


In [None]:
import random
from datasets import load_dataset
from torch.utils.data import DataLoader

squad_test = load_dataset("squad")
validation_split = squad_test['validation']

# Fixed seed so the noise level, the sampled subset and the noised questions
# are drawn from a reproducible random stream.
random.seed(42)
random_noise = random.uniform(0.05, 0.15)
print(random_noise)

# Sample a random 20% of the validation split first ...
valid_size = int(0.2 * len(validation_split))
valid_indices = random.sample(range(len(validation_split)), valid_size)
valid_contexts = []
valid_questions = []
valid_answers = []

for idx in valid_indices:
  group = validation_split[idx]
  valid_contexts.append(group['context'])
  valid_questions.append(group['question'])
  valid_answers.append(group['answers'])

# ... and only then inject the noise, into the questions that are actually
# evaluated. Noising a list that is rebuilt from the clean dataset afterwards
# would leave the evaluation running on clean questions.
noisy = True
noise_percent = random_noise
if noisy:

  for i, question in enumerate(valid_questions):

    if random.random() < noise_percent:
      noisy_question = add_noise(question)
      valid_questions[i] = noisy_question

  print('added noise')

add_end_idx(valid_answers, valid_contexts)
valid_encodings = tokenizer(valid_contexts, valid_questions, truncation=True, padding=True)
add_token_positions(valid_encodings, valid_answers)
valid_dataset = SQuAD_Dataset(valid_encodings)
valid_loader = DataLoader(valid_dataset, batch_size=16)



  0%|          | 0/2 [00:00<?, ?it/s]

0.11394267984578837
added noise


In [None]:
# Imported here so this cell does not depend on an earlier cell having run
from collections import Counter

path = '/content/gdrive/MyDrive/NLP Project/models/ELECTRA_QA_NOISY10'
# map_location lets a checkpoint saved from a GPU run load on a CPU-only runtime
model.load_state_dict(torch.load(path+'/model_parameters.pth', map_location=device))

model = model.to(device)
model.eval()

# Position accuracy is accumulated as counts so every example weighs the same,
# even when the last batch is smaller than the others.
correct_positions = 0
total_positions = 0
em_scores = []
f1_scores = []

for batch in tqdm(valid_loader):
  with torch.no_grad():
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_true = batch['start_positions'].to(device)
    end_true = batch['end_positions'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask)

    start_pred = torch.argmax(outputs['start_logits'], dim=1)
    end_pred = torch.argmax(outputs['end_logits'], dim=1)

    # Start and end positions are scored independently
    correct_positions += (start_pred == start_true).sum().item()
    correct_positions += (end_pred == end_true).sum().item()
    total_positions += 2 * len(start_pred)

    # Calculate EM score and F1 score
    for i in range(len(start_true)):
        pred_span = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i] + 1], skip_special_tokens=True)
        true_span = tokenizer.decode(input_ids[i][start_true[i]:end_true[i] + 1], skip_special_tokens=True)
        em_scores.append(int(pred_span == true_span))

        # Compute F1 score over whitespace-separated words of the decoded spans
        pred_words = pred_span.split()
        true_words = true_span.split()
        common = Counter(pred_words) & Counter(true_words)
        num_common = sum(common.values())
        precision = num_common / max(len(pred_words), 1)
        recall = num_common / max(len(true_words), 1)
        f1 = (2 * precision * recall) / max((precision + recall), 1e-8)
        f1_scores.append(f1)


# max(..., 1) keeps an empty loader from raising ZeroDivisionError
acc = correct_positions / max(total_positions, 1)
em_score = sum(em_scores) / max(len(em_scores), 1)
f1_score = sum(f1_scores) / max(len(f1_scores), 1)

print("\n\nAccuracy:", acc)
print("EM score:", em_score)
print("F1 score:", f1_score)

# Show true vs. predicted spans for the examples of the last batch
print("\n\nT/P\tanswer_start\tanswer_end\n")
for i in range(len(start_true)):
  pred_span = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i]+1], skip_special_tokens=True)
  true_span = tokenizer.decode(input_ids[i][start_true[i]:end_true[i]+1], skip_special_tokens=True)
  print(f"true\t{start_true[i]}\t{end_true[i]}\t{true_span}\n"
        f"pred\t{start_pred[i]}\t{end_pred[i]}\t{pred_span}\n")


100%|██████████| 133/133 [01:14<00:00,  1.79it/s]



Accuracy: 0.6891447368421053
EM score: 0.5577105014191107
F1 score: 0.7473474812115412


T/P	answer_start	answer_end

true	42	45	february 1, 2016
pred	42	45	february 1, 2016

true	144	144	1992
pred	144	144	1992




