# **Question Answering with BERT**
on SQuAD 1.0.  


In [1]:
import requests
import json
import torch
import os
from tqdm import tqdm
import sys
import random
from copy import deepcopy
import random
import numpy as np
from collections import Counter
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install datasets
!pip install transformers
!pip install nlpaug

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collec

In [3]:
def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)


set_seed(42)

### **Download SQuAD 1.0 ⬇️**

SQuAD consists of two json files.

* train dataset 
* validation dataset

In [4]:
from datasets import load_dataset

squad = load_dataset("squad")

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
len(squad['train']), len(squad['validation'])

(87599, 10570)

### **Get data 📁** 

After we got a taste of the jsons files data format let's extract our data and store them into some data structures.

In [6]:
def read_data(squad, split):

  size = int(0.2 * len(squad[split]))
  indices = random.sample(range(len(squad[split])), size)

  contexts = []
  questions = []
  answers = []

  for idx in indices:
    data = squad[split][idx]
    contexts.append(data['context'])
    questions.append(data['question'])
    answers.append(data['answers'])

  return contexts, questions, answers

### Adding Noise

In [7]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

# Noise funcs 

# FOR QA ONLY
char_action = ['insert',
        'substitute',
        'delete',
        'swap',
]

def get_action(type):
    return random.choice(char_action)

def add_noise(question, p=0.7):
    """
    Augment a tweet with character-level and word-level noise.

    Args:
        tweet (str): The original tweet.
        p (float): The probability of applying the char level augmentation.

    Returns:
        str: The augmented question.
    """
    # Define a list of character-level augmentation techniques
    char_augmenters = [
        nac.KeyboardAug(aug_char_p=0.2, aug_word_p=0.2, include_special_char=False, include_numeric=False),
        nac.RandomCharAug(action=get_action("char"), aug_char_p=0.1, aug_word_p=0.1),
    ]

    # Define a list of word-level augmentation techniques
    word_augmenters = [
        naw.SpellingAug(),
        naw.SynonymAug(),
    ]

    # Randomly apply a character-level or word-level augmentation with probability p
    if random.random() < p:
        aug = random.choice(char_augmenters)
        noisy_text = aug.augment(question)
    else:
        aug = random.choice(word_augmenters)
        noisy_text = aug.augment(question)
      
    return noisy_text[0]


In [8]:
# train validation splits
train_contexts, train_questions, train_answers = read_data(squad, 'train')
valid_contexts, valid_questions, valid_answers = read_data(squad, 'validation')


# Make False if no noise to be added
noisy = True
noise_percent = 0.2
if noisy:

  for i, question in enumerate(train_questions):

    if random.random() < noise_percent:
      noisy_question = add_noise(question)
      train_questions[i] = noisy_question

  for i, question in enumerate(valid_questions):

    if random.random() < noise_percent:
      noisy_question = add_noise(question)
      valid_questions[i] = noisy_question

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
len(train_contexts)

17519

In [9]:
# print a random question and answer
print(f'There are {len(train_questions)} questions')
print(train_questions[0])
print(train_answers[0])

There are 17519 questions
What poet wrote a long poem describing Roman religious holidays?
{'text': ['Ovid'], 'answer_start': [346]}


In [10]:
print(train_answers[0])

{'text': ['Ovid'], 'answer_start': [346]}


In [11]:
def add_end_idx(answers, contexts):
  for answer, context in zip(answers, contexts):
    gold_text = answer['text'][0]
    start_idx = answer['answer_start'][0]
    end_idx = start_idx + len(gold_text)

    # sometimes squad answers are off by a character or two so we fix this
    if context[start_idx:end_idx] == gold_text:
      answer['answer_end'] = [end_idx]
    elif context[start_idx-1:end_idx-1] == gold_text:
      answer['answer_start'] = start_idx - 1
      answer['answer_end'] = [end_idx - 1]     # When the gold label is off by one character
    elif context[start_idx-2:end_idx-2] == gold_text:
      answer['answer_start'] = start_idx - 2
      answer['answer_end'] = [end_idx - 2]     # When the gold label is off by two characters


add_end_idx(train_answers, train_contexts)
add_end_idx(valid_answers, valid_contexts)

In [12]:
# You can see that now we get the answer_end also
print(train_questions[0])
print(train_answers[0])

What poet wrote a long poem describing Roman religious holidays?
{'text': ['Ovid'], 'answer_start': [346], 'answer_end': [350]}


### **Tokenization 🔢**

As we know we have to tokenize our data in form that is acceptable for the BERT model. We are going to use the `BertTokenizerFast` instead of `BertTokenizer` as the first one is much faster. Since we are going to train our model in batches we need to set `padding=True`.

In [13]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
valid_encodings = tokenizer(valid_contexts, valid_questions, truncation=True, padding=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Let's see what we got after tokenizing our data.

In [14]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [15]:
def add_token_positions(encodings, answers):
  start_positions = []
  end_positions = []
  for i in range(len(answers)):
    start_positions.append(encodings.char_to_token(i, answers[i]['answer_start'][0]))
    end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'][0] - 1))

    # if start position is None, the answer passage has been truncated
    if start_positions[-1] is None:
      start_positions[-1] = tokenizer.model_max_length
    if end_positions[-1] is None:
      end_positions[-1] = tokenizer.model_max_length

  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(valid_encodings, valid_answers)

In [16]:
train_encodings['start_positions'][:10]

[71, 12, 44, 144, 139, 127, 49, 87, 21, 39]

### **Dataset definition 🗄️**

We have to define our dataset using the PyTorch Dataset class from `torch.utils` in order create our dataloaders after that.

In [17]:
class SQuAD_Dataset(torch.utils.data.Dataset):
  def __init__(self, encodings):
    self.encodings = encodings
  def __getitem__(self, idx):
    return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  def __len__(self):
    return len(self.encodings.input_ids)

In [18]:
train_dataset = SQuAD_Dataset(train_encodings)
valid_dataset = SQuAD_Dataset(valid_encodings)

### **Dataloaders 🔁**

In [19]:
from torch.utils.data import DataLoader

# Define the dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32)

In [None]:
len(train_loader), len(valid_loader)

(1095, 67)

In [None]:
len(train_contexts), len(valid_contexts)

(17519, 2114)

## **Fine-Tuning ⚙️**

### **Model definition 🤖**

We are going to use the `bert-case-uncased` from the huggingface transformers.

In [20]:
from transformers import BertForQuestionAnswering, AutoModel

model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
# model = AutoModel.from_pretrained('bert-base-uncased')

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

### **Training 🏋️‍♂️**

Μy choices for some parameters:

* Use of `AdamW` which is a stochastic optimization method that modifies the typical implementation of weight decay in Adam, by decoupling weight decay from the gradient update. This helps to avoid overfitting which is necessary in this case were the model is very complex.

* Set the `lr=5e-5` as I read that this is the best value for the learning rate for this task.

In [21]:
# Check on the available device - use GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Working on {device}')

Working on cuda


In [None]:
# from sklearn.metrics import classification_report, f1_score
# import torch
# import numpy as np

# from transformers import AdamW

# N_EPOCHS = 3
# optim = AdamW(model.parameters(), lr=1e-5)

# model.to(device)
# model.train()
# best_epoch = 0
# best_val_loss = float("inf")
# for epoch in range(N_EPOCHS):

#   loop = tqdm(train_loader, leave=True)
#   for batch in loop:
#     optim.zero_grad()
#     input_ids = batch['input_ids'].to(device)
#     attention_mask = batch['attention_mask'].to(device)
#     start_positions = batch['start_positions'].to(device)
#     end_positions = batch['end_positions'].to(device)
#     outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
#     loss = outputs[0]
#     loss.backward()
#     optim.step()

#     loop.set_description(f'Epoch {epoch+1}')
#     loop.set_postfix(loss=loss.item())
  

#   model.eval()
#   predictions, true_labels = [], []
#   val_loss = 0

#   with torch.no_grad():
#     for batch in valid_loader:
#       input_ids = batch['input_ids'].to(device)
#       attention_mask = batch['attention_mask'].to(device)
#       start_positions = batch['start_positions'].to(device)
#       end_positions = batch['end_positions'].to(device)

#       outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
      
#       loss = outputs[0]
#       val_loss += loss.item()

#       start_logits, end_logits = outputs[1], outputs[2]
#       start_logits = torch.argmax(start_logits, dim=1)
#       end_logits = torch.argmax(end_logits, dim=1)

#       predictions.extend([start_logits[i].item(), end_logits[i].item()] for i in range(len(start_logits)))
#       true_labels.extend([start_positions[i].item(), end_positions[i].item()] for i in range(len(start_positions)))

#     val_loss /= len(valid_loader)

#   # # Flatten the true labels and predictions
#   # true_labels_flat = np.array(true_labels).reshape(-1, 2)
#   # predictions_flat = np.array(predictions).reshape(-1, 2)

#   # report = classification_report(true_labels, predictions, digits=4, output_dict=True)
#   # f1 = report['weighted avg']['f1-score']
#   # precision = report['weighted avg']['precision']
#   # recall = report['weighted avg']['recall']
#   # support = report['weighted avg']['support']

#   # print(f'Epoch {epoch+1} evaluation report:')
#   # print(classification_report(true_labels, predictions, digits=4))

# #   if val_loss < best_val_loss:
# #     best_val_loss = val_loss
# #     path = '/content/gdrive/MyDrive/NLP Project/models/BERT_QA_clean20'
# #     torch.save(model.state_dict(), path+'/model_parameters.pth')

# # print(f'Best F1 score: {best_f1}')

#   if val_loss < best_val_loss:
#     best_epoch = epoch + 1
#     best_val_loss = val_loss
#     # torch.save(model.state_dict(), "t5_sentiment_model.pt")
#     path = '/content/gdrive/MyDrive/NLP Project/models/BERT_QA_noise20'
#     torch.save(model.state_dict(), path+'/model_parameters.pth')
  
#   print(f"Epoch {epoch + 1} - Val Loss: {val_loss:.4f}. \t Current best epoch is {best_epoch} with val loss - {best_val_loss:.4f}")


Epoch 1: 100%|██████████| 1095/1095 [26:53<00:00,  1.47s/it, loss=1.47]


Epoch 1 - Val Loss: 1.6464. 	 Current best epoch is 1 with val loss - 1.6464


Epoch 2: 100%|██████████| 1095/1095 [26:04<00:00,  1.43s/it, loss=1.1]


Epoch 2 - Val Loss: 1.3816. 	 Current best epoch is 2 with val loss - 1.3816


Epoch 3: 100%|██████████| 1095/1095 [26:03<00:00,  1.43s/it, loss=0.5]


Epoch 3 - Val Loss: 1.4947. 	 Current best epoch is 2 with val loss - 1.3816


**Save the model in my drive in order not to run it each time**

**Load model for Evaluations**

In [22]:
# Load model for evaluations

path = '/content/gdrive/MyDrive/NLP Project/models/BERT_QA_noise20'
model.load_state_dict(torch.load(path+'/model_parameters.pth'))

model = model.to(device)

### **Testing ✅**

We are evaluating the model on the validation set by checking the model's predictions for the answer's start and end indexes and comparing with the true ones.

In [23]:
model.eval()

acc = []
em_score = []
f1_score = []

for batch in tqdm(valid_loader):
  with torch.no_grad():
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_true = batch['start_positions'].to(device)
    end_true = batch['end_positions'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask)

    start_pred = torch.argmax(outputs['start_logits'], dim=1)
    end_pred = torch.argmax(outputs['end_logits'], dim=1)

    acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
    acc.append(((end_pred == end_true).sum()/len(end_pred)).item())

    # Calculate EM score and F1 score
    for i in range(len(start_true)):
        pred_span = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i] + 1], skip_special_tokens=True)
        true_span = tokenizer.decode(input_ids[i][start_true[i]:end_true[i] + 1], skip_special_tokens=True)
        em_score.append(int(pred_span == true_span))

        # Compute F1 score
        common = Counter(pred_span.split()) & Counter(true_span.split())
        num_common = sum(common.values())
        precision = num_common / max(len(pred_span.split()), 1)
        recall = num_common / max(len(true_span.split()), 1)
        f1 = (2 * precision * recall) / max((precision + recall), 1e-8)
        f1_score.append(f1)


acc = sum(acc) / len(acc)
em_score = sum(em_score) / len(em_score)
f1_score = sum(f1_score) / len(f1_score)

print("\n\nAccuracy:", acc)
print("EM score:", em_score)
print("F1 score:", f1_score)

print("\n\nT/P\tanswer_start\tanswer_end\n")
for i in range(len(start_true)):
  pred_span = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i]+1], skip_special_tokens=True)
  true_span = tokenizer.decode(input_ids[i][start_true[i]:end_true[i]+1], skip_special_tokens=True)
  print(f"true\t{start_true[i]}\t{end_true[i]}\t{true_span}\n"
        f"pred\t{start_pred[i]}\t{end_pred[i]}\t{pred_span}\n")


100%|██████████| 67/67 [01:07<00:00,  1.01s/it]



Accuracy: 0.6231343283582089
EM score: 0.48628192999053926
F1 score: 0.660531284713473


T/P	answer_start	answer_end

true	1	3	downtown san diego
pred	1	3	downtown san diego

true	100	104	five times lower viewership
pred	100	101	five times






### Evaluation on Noisy Data

In [24]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

# Noise funcs 

# FOR QA ONLY
char_action = ['insert',
        'substitute',
        'delete',
        'swap',
]

def get_action(type):
    return random.choice(char_action)

def add_noise(question, p=0.7):
    """
    Augment a tweet with character-level and word-level noise.

    Args:
        tweet (str): The original tweet.
        p (float): The probability of applying the char level augmentation.

    Returns:
        str: The augmented question.
    """
    # Define a list of character-level augmentation techniques
    char_augmenters = [
        nac.KeyboardAug(aug_char_p=0.2, aug_word_p=0.2, include_special_char=False, include_numeric=False),
        nac.RandomCharAug(action=get_action("char"), aug_char_p=0.1, aug_word_p=0.1),
    ]

    # Define a list of word-level augmentation techniques
    word_augmenters = [
        naw.SpellingAug(),
        naw.SynonymAug(),
    ]

    # Randomly apply a character-level or word-level augmentation with probability p
    if random.random() < p:
        aug = random.choice(char_augmenters)
        noisy_text = aug.augment(question)
    else:
        aug = random.choice(word_augmenters)
        noisy_text = aug.augment(question)
      
    return noisy_text[0]


In [25]:
import random
from datasets import load_dataset
from torch.utils.data import DataLoader

squad_test = load_dataset("squad")
valid_contexts, valid_questions, valid_answers = read_data(squad_test, 'validation')

import random
random.seed(42)
random_noise = random.uniform(0.05, 0.15)
print(random_noise)

noisy = True
noise_percent = random_noise
if noisy:

  for i, question in enumerate(valid_questions):

    if random.random() < noise_percent:
      noisy_question = add_noise(question)
      valid_questions[i] = noisy_question
      
  print('added noise')

valid_size = int(0.2 * len(squad_test['validation']))
valid_indices = random.sample(range(len(squad['validation'])), valid_size)
valid_contexts = []
valid_questions = []
valid_answers = []

for idx in valid_indices:
  group = squad_test['validation'][idx]
  valid_contexts.append(group['context'])
  valid_questions.append(group['question'])
  valid_answers.append(group['answers'])

add_end_idx(valid_answers, valid_contexts)
valid_encodings = tokenizer(valid_contexts, valid_questions, truncation=True, padding=True)
add_token_positions(valid_encodings, valid_answers)
valid_dataset = SQuAD_Dataset(valid_encodings)
valid_loader = DataLoader(valid_dataset, batch_size=16)



  0%|          | 0/2 [00:00<?, ?it/s]

0.11394267984578837
added noise


In [26]:
path = '/content/gdrive/MyDrive/NLP Project/models/BERT_QA_noise20'
model.load_state_dict(torch.load(path+'/model_parameters.pth'))

model = model.to(device)
model.eval()

acc = []
em_score = []
f1_score = []

for batch in tqdm(valid_loader):
  with torch.no_grad():
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_true = batch['start_positions'].to(device)
    end_true = batch['end_positions'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask)

    start_pred = torch.argmax(outputs['start_logits'], dim=1)
    end_pred = torch.argmax(outputs['end_logits'], dim=1)

    acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
    acc.append(((end_pred == end_true).sum()/len(end_pred)).item())

    # Calculate EM score and F1 score
    for i in range(len(start_true)):
        pred_span = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i] + 1], skip_special_tokens=True)
        true_span = tokenizer.decode(input_ids[i][start_true[i]:end_true[i] + 1], skip_special_tokens=True)
        em_score.append(int(pred_span == true_span))

        # Compute F1 score
        common = Counter(pred_span.split()) & Counter(true_span.split())
        num_common = sum(common.values())
        precision = num_common / max(len(pred_span.split()), 1)
        recall = num_common / max(len(true_span.split()), 1)
        f1 = (2 * precision * recall) / max((precision + recall), 1e-8)
        f1_score.append(f1)


acc = sum(acc) / len(acc)
em_score = sum(em_score) / len(em_score)
f1_score = sum(f1_score) / len(f1_score)

print("\n\nAccuracy:", acc)
print("EM score:", em_score)
print("F1 score:", f1_score)

print("\n\nT/P\tanswer_start\tanswer_end\n")
for i in range(len(start_true)):
  pred_span = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i]+1], skip_special_tokens=True)
  true_span = tokenizer.decode(input_ids[i][start_true[i]:end_true[i]+1], skip_special_tokens=True)
  print(f"true\t{start_true[i]}\t{end_true[i]}\t{true_span}\n"
        f"pred\t{start_pred[i]}\t{end_pred[i]}\t{pred_span}\n")


100%|██████████| 133/133 [01:10<00:00,  1.88it/s]



Accuracy: 0.6322838345864662
EM score: 0.5
F1 score: 0.6762535531041836


T/P	answer_start	answer_end

true	6	10	phagocytic cells
pred	6	10	phagocytic cells

true	4	9	student motivation and attitudes towards school
pred	4	9	student motivation and attitudes towards school






## EXTRA STUFF


In [None]:
def get_prediction(context, question):
  inputs = tokenizer.encode_plus(question, context, return_tensors='pt').to(device)
  outputs = model(**inputs)
  
  answer_start = torch.argmax(outputs[0])  
  answer_end = torch.argmax(outputs[1]) + 1 
  
  answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))
  
  return answer

def normalize_text(s):
  """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
  import string, re
  def remove_articles(text):
    regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    return re.sub(regex, " ", text)
  def white_space_fix(text):
    return " ".join(text.split())
  def remove_punc(text):
    exclude = set(string.punctuation)
    return "".join(ch for ch in text if ch not in exclude)
  def lower(text):
    return text.lower()

  return white_space_fix(remove_articles(remove_punc(lower(s))))

def exact_match(prediction, truth):
    return bool(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
  pred_tokens = normalize_text(prediction).split()
  truth_tokens = normalize_text(truth).split()
  
  # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
  if len(pred_tokens) == 0 or len(truth_tokens) == 0:
    return int(pred_tokens == truth_tokens)
  
  common_tokens = set(pred_tokens) & set(truth_tokens)
  
  # if there are no common tokens then f1 = 0
  if len(common_tokens) == 0:
    return 0
  
  prec = len(common_tokens) / len(pred_tokens)
  rec = len(common_tokens) / len(truth_tokens)
  
  return round(2 * (prec * rec) / (prec + rec), 2)
  
def question_answer(context, question,answer):
  prediction = get_prediction(context,question)
  em_score = exact_match(prediction, answer)
  f1_score = compute_f1(prediction, answer)

  print(f'Question: {question}')
  print(f'Prediction: {prediction}')
  print(f'True Answer: {answer}')
  print(f'Exact match: {em_score}')
  print(f'F1 score: {f1_score}\n')

**Beyoncé**

In [None]:
context = """Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, 
          songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing 
          and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. 
          Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. 
          Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, 
          earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy"."""


questions = ["For whom the passage is talking about?",
             "When did Beyonce born?",
             "Where did Beyonce born?",
             "What is Beyonce's nationality?",
             "Who was the Destiny's group manager?",
             "What name has the Beyoncé's debut album?",
             "How many Grammy Awards did Beyonce earn?",
             "When did the Beyoncé's debut album release?",
             "Who was the lead singer of R&B girl-group Destiny's Child?"]

answers = ["Beyonce Giselle Knowles - Carter", "September 4, 1981", "Houston, Texas", 
           "American", "Mathew Knowles", "Dangerously in Love", "five", "2003", 
           "Beyonce Giselle Knowles - Carter"]

for question, answer in zip(questions, answers):
  question_answer(context, question, answer)

Question: For whom the passage is talking about?
Prediction: destiny ' s child
True Answer: Beyonce Giselle Knowles - Carter
Exact match: False
F1 score: 0

Question: When did Beyonce born?
Prediction: 1981
True Answer: September 4, 1981
Exact match: False
F1 score: 0.5

Question: Where did Beyonce born?
Prediction: houston, texas
True Answer: Houston, Texas
Exact match: True
F1 score: 1.0

Question: What is Beyonce's nationality?
Prediction: 
True Answer: American
Exact match: False
F1 score: 0

Question: Who was the Destiny's group manager?
Prediction: 
True Answer: Mathew Knowles
Exact match: False
F1 score: 0

Question: What name has the Beyoncé's debut album?
Prediction: destiny ' s child
True Answer: Dangerously in Love
Exact match: False
F1 score: 0

Question: How many Grammy Awards did Beyonce earn?
Prediction: 
True Answer: five
Exact match: False
F1 score: 0

Question: When did the Beyoncé's debut album release?
Prediction: 
True Answer: 2003
Exact match: False
F1 score: 0

Q

**Athens**

In [None]:
context = """Athens is the capital and largest city of Greece. Athens dominates the Attica region and is one of the world's oldest cities, 
             with its recorded history spanning over 3,400 years and its earliest human presence starting somewhere between the 11th and 7th millennium BC.
             Classical Athens was a powerful city-state. It was a center for the arts, learning and philosophy, and the home of Plato's Academy and Aristotle's Lyceum.
             It is widely referred to as the cradle of Western civilization and the birthplace of democracy, largely because of its cultural and political impact on the European continent—particularly Ancient Rome.
             In modern times, Athens is a large cosmopolitan metropolis and central to economic, financial, industrial, maritime, political and cultural life in Greece. 
             In 2021, Athens' urban area hosted more than three and a half million people, which is around 35% of the entire population of Greece.
             Athens is a Beta global city according to the Globalization and World Cities Research Network, and is one of the biggest economic centers in Southeastern Europe. 
             It also has a large financial sector, and its port Piraeus is both the largest passenger port in Europe, and the second largest in the world."""

questions = ["Which is the largest city in Greece?",
             "For what was the Athens center?",
             "Which city was the home of Plato's Academy?"]

answers = ["Athens", "center for the arts, learning and philosophy", "Athens"]

for question, answer in zip(questions, answers):
  question_answer(context, question, answer)

Question: Which is the largest city in Greece?
Prediction: athens
True Answer: Athens
Exact match: True
F1 score: 1.0

Question: For what was the Athens center?
Prediction: arts, learning and philosophy
True Answer: center for the arts, learning and philosophy
Exact match: False
F1 score: 0.8

Question: Which city was the home of Plato's Academy?
Prediction: athens is the capital and largest city of greece
True Answer: Athens
Exact match: False
F1 score: 0.22



**Angelos**

In [None]:
context = """Angelos Poulis was born on 8 April 2001 in Nicosia, Cyprus. He is half Cypriot and half Greek. 
            He is currently studying at the Department of Informatics and Telecommunications of the University of Athens in Greece. 
            His scientific interests are in the broad field of Artificial Intelligence and he loves to train neural networks! 
            Okay, I'm Angelos and I'll stop talking about me right now."""

questions = ["When did Angelos born?",
             "In what university is Angelos studying now?",
             "What is Angelos' nationality?",
             "What are his scientific interests?",
             "What I will do right now?"]

answers = ["8 April 2001", "University of Athens", 
           "half Cypriot and half Greek", "Artificial Intelligence", 
           "stop talking about me"]

for question, answer in zip(questions, answers):
  question_answer(context, question, answer)

Question: When did Angelos born?
Prediction: 8 april 2001
True Answer: 8 April 2001
Exact match: True
F1 score: 1.0

Question: In what university is Angelos studying now?
Prediction: 
True Answer: University of Athens
Exact match: False
F1 score: 0

Question: What is Angelos' nationality?
Prediction: cypriot and half greek
True Answer: half Cypriot and half Greek
Exact match: False
F1 score: 0.89

Question: What are his scientific interests?
Prediction: artificial intelligence
True Answer: Artificial Intelligence
Exact match: True
F1 score: 1.0

Question: What I will do right now?
Prediction: artificial intelligence
True Answer: stop talking about me
Exact match: False
F1 score: 0



## **Summary (and some Questions & Answers) 🧐**

**Technical details:**
* **Model used:** `bert-base-uncased`
* **Dataset:** The Stanford Question Answering Dataset (SQuAD)  
* **Run time:** ~ 4 hours on the Tesla P100 GPU for `N_EPOCHS = 3`. Each epoch took about 1 hour and 15 minutes for training. I think if we run the model for at least `N_EPOCHS = 5` we can get even better results, but what we got for 3 epochs is already very good!

**Conclusion:** We can say that training the model for just 3 epochs, which took about 4 hours on the Tesla P100 GPU, gives us pretty good results. The model can also answer quite well to questions about contents it hasn't seen before and I can say this because I gave it a passage for myself!

Some *example questions and answers* we get are the following:

**About Athens:**

> **Question:** Which is the largest city in Greece?  
  **Prediction:** athens  
  **True Answer:** Athens  
  **Exact match:** True  
  **F1 score:** 1.0  

> **Question:** For what was the Athens center?  
  **Prediction:** center for the arts, learning and philosophy  
  **True Answer:** center for the arts, learning and philosophy  
  **Exact match:** True  
  **F1 score:** 1.0  

**About Beyoncé:**

> **Question:** When did Beyonce born?  
  **Prediction:** september 4, 1981  
  **True Answer:** September 4, 1981  
  **Exact Match:** True	 
  **F1 score:** 1.0

> **Question:** What name has the Beyoncé's debut album?  
  **Prediction:** dangerously in love  
  **True Answer:** Dangerously in Love   
  **Exact Match:** True  
  **F1 score:** 1.0

> **Question:** How many Grammy Awards did Beyonce earn?  
  **Prediction:** five  
  **True Answer:** five  
  **Exact Match:** True  
  **F1 score:** 1.0


> **Question:** When did the Beyoncé's debut album release?  
  **Prediction:** 2003  
  **True Answer:** 2003  
  **Exact Match:** True  
  **F1 score:** 1.0


> **Question:** Who was the lead singer of R&B girl-group Destiny's Child?  
  **Prediction:** beyonce giselle knowles - carter  
  **True Answer:** Beyonce Giselle Knowles - Carter  
  **Exact Match:** True  
  **F1 score:** 1.0


**About Angelos:**

> **Question:** When did Angelos born?  
  **Prediction:** 8 april 2001  
  **True Answer:** 8 April 2001  
  **Exact match:** True  
  **F1 score:** 1.0

> **Question:** In what university is Angelos studying now?  
  **Prediction:** university of athens  
  **True Answer:** University of Athens  
  **Exact match:** True    
  **F1 score:** 1.0

> **Question:** What is Angelos' nationality?  
  **Prediction:** half cypriot and half greek.  
  **True Answer:** half Cypriot and half Greek   
  **Exact match:** True  
  **F1 score:** 0.8

> **Question:** What are his scientific interests?  
  **Prediction:** artificial intelligence  
  **True Answer:** Artificial Intelligence    
  **Exact match:** True  
  **F1 score:** 1.0

> **Question:** What I will do right now?  
  **Prediction:** stop talking about me  
  **True Answer:** stop talking about me  
  **Exact match:** True  
  **F1 score:** 1.0
