# 1. Load the SQUAD 2.0

In [26]:
%%capture
!pip install transformers

In [27]:
pip install pandas



In [28]:

%%capture
import json
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForQuestionAnswering
import time

In [29]:
is_cuda = torch.cuda.is_available()

if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [30]:
%%capture
!mkdir squad
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

# Get the Data and store it , using only 50 samples

In [50]:
import json
from pathlib import Path

# Define the paths to training and validation datasets
train_path = Path('squad/train-v2.0.json')
val_path = Path('squad/dev-v2.0.json')

# Function to load SQuAD data and extract samples
def load_squad_data(path, exclude=None, num_samples=22):
    with open(path, 'r') as f:
        squad_dict = json.load(f)

    texts = []
    questions = []
    answers = []

    count = 0  # Counter to limit the number of samples
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                if not qa["is_impossible"]:  # Exclude unanswerable questions
                    question = qa['question']
                    answer = qa['answers'][0]  # Use the first answer

                    # Check for duplicates in the exclude set
                    if exclude and (context, question) in exclude:
                        continue

                    texts.append(context)
                    questions.append(question)
                    answers.append(answer)
                    count += 1
                    if count >= num_samples:  # Stop after reaching the desired number of samples
                        return texts, questions, answers

    return texts, questions, answers

# Load training data
train_texts, train_queries, train_answers = load_squad_data(train_path, num_samples=50)  # Load 22 training samples
train_set = set(zip(train_texts, train_queries))  # Create a set of (context, question) pairs

# Load validation data excluding training data
val_texts, val_queries, val_answers = load_squad_data(val_path, exclude=train_set, num_samples=50)



In [51]:
import json

# Define the path to the SQuAD 2.0 training data
path = "squad/train-v2.0.json"

# Load and preprocess the SQuAD 2.0 data into the desired structure
def load_squad_samples(path, num_samples=50):
    with open(path, 'r') as f:
        squad_dict = json.load(f)

    samples = []  # List to store the structured data
    count = 0     # Counter to limit the number of samples

    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            questions = []  # Collect questions and answers for this context
            for qa in passage['qas']:
                if not qa["is_impossible"]:  # Exclude unanswerable questions
                    questions.append({
                        "question": qa['question'],
                        "answer": qa['answers'][0]['text']  # Use the first answer
                    })
                    count += 1
                    if count >= num_samples:  # Stop after reaching the desired number of samples
                        samples.append({"context": context, "questions": questions})
                        return samples

            # Add context with all its questions if within sample limit
            if questions:
                samples.append({"context": context, "questions": questions})

    return samples

# Extract 22 samples
samples = load_squad_samples(path, num_samples=50)


# 2. Display a few raw QnA data samples.

In [54]:
# Extract 22 samples
train_texts, train_queries, train_answers = load_squad_data(path, num_samples=50)

# Display the number of extracted samples
print(f"Extracted {len(train_texts)} samples.")

# Display the first few samples for verification
for i in range(50):  # Display the samples
    print(f"Sample {i+1}:")
    print(f"Context: {train_texts[i][:150]}...")  # Truncated for readability
    print(f"Question: {train_queries[i]}")
    print(f"Answer: {train_answers[i]['text']}")
    print("-" * 80)

Extracted 50 samples.
Sample 1:
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Bor...
Question: When did Beyonce start becoming popular?
Answer: in the late 1990s
--------------------------------------------------------------------------------
Sample 2:
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Bor...
Question: What areas did Beyonce compete in when she was growing up?
Answer: singing and dancing
--------------------------------------------------------------------------------
Sample 3:
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Bor...
Question: When did Beyonce leave Destiny's Child and become a solo singer?
Answer: 2003
-----------------------------------

# 3 - Preprocessing

In [55]:
for answer, text in zip(train_answers, train_texts):
    real_answer = answer['text']
    start_idx = answer['answer_start']
    # Get the real end index
    end_idx = start_idx + len(real_answer)

    # Deal with the problem of 1 or 2 more characters
    if text[start_idx:end_idx] == real_answer:
        answer['answer_end'] = end_idx
    # When the real answer is more by one character
    elif text[start_idx-1:end_idx-1] == real_answer:
        answer['answer_start'] = start_idx - 1
        answer['answer_end'] = end_idx - 1
    # When the real answer is more by two characters
    elif text[start_idx-2:end_idx-2] == real_answer:
        answer['answer_start'] = start_idx - 2
        answer['answer_end'] = end_idx - 2

In [56]:
for answer, text in zip(val_answers, val_texts):
    real_answer = answer['text']
    start_idx = answer['answer_start']
    # Get the real end index
    end_idx = start_idx + len(real_answer)

    # Deal with the problem of 1 or 2 more characters
    if text[start_idx:end_idx] == real_answer:
        answer['answer_end'] = end_idx
    # When the real answer is more by one character
    elif text[start_idx-1:end_idx-1] == real_answer:
        answer['answer_start'] = start_idx - 1
        answer['answer_end'] = end_idx - 1
    # When the real answer is more by two characters
    elif text[start_idx-2:end_idx-2] == real_answer:
        answer['answer_start'] = start_idx - 2
        answer['answer_end'] = end_idx - 2


In [57]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [58]:
train_encodings = tokenizer(train_texts, train_queries, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, val_queries, truncation=True, padding=True)

In [59]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []

    count = 0

    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length

        # if end position is None, the 'char_to_token' function points to the space after the correct token, so add - 1
        if end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - 1)

            # if end position is still None the answer passage has been truncated
            if end_positions[-1] is None:
                count += 1
                end_positions[-1] = tokenizer.model_max_length
    print(count)

    # Update the data in dictionary
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

0
0


In [60]:
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

In [61]:

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [62]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=True)

 # Again 3- Preprocessing and cleaning the data

In [63]:
def predict(context, query):
    # Tokenize the inputs
    inputs = tokenizer.encode_plus(query, context, return_tensors='pt')

    # Ensure inputs are on the same device as the model
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Perform inference
    outputs = model(**inputs)
    answer_start = torch.argmax(outputs[0])  # Start position
    answer_end = torch.argmax(outputs[1]) + 1  # End position

    return tokenizer.decode(inputs['input_ids'][0][answer_start:answer_end])


def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re
    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    common_tokens = set(pred_tokens) & set(truth_tokens)

    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0

    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    return 2 * (prec * rec) / (prec + rec)

In [64]:
def give_an_answer(context,query,answer):
    prediction = predict(context,query)
    em_score = compute_exact_match(prediction, answer)
    f1_score = compute_f1(prediction, answer)
    print(f"Question: {query}")
    print(f"Prediction: {prediction}")
    print(f"True Answer: {answer}")
    print(f"EM: {em_score}")
    print(f"F1: {f1_score}")
    print("\n")

In [65]:
import re
import string

def normalize_text(s):
    """Lower text, remove punctuation and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, ground_truth):
    return int(normalize_text(prediction) == normalize_text(ground_truth))

def compute_f1(prediction, ground_truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(ground_truth).split()
    common = set(pred_tokens) & set(truth_tokens)
    if not common:
        return 0
    prec = len(common) / len(pred_tokens)
    rec = len(common) / len(truth_tokens)
    return 2 * (prec * rec) / (prec + rec)


# 4-Train the BERT QnA model. Evaluate the model.

In [66]:
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased').to(device)

In [67]:
from transformers import AdamW
optim = AdamW(model.parameters(), lr=5e-5)

epochs = 3

In [68]:
whole_train_eval_time = time.time()

train_losses = []
val_losses = []

print_every = 1000

for epoch in range(epochs):
    epoch_time = time.time()

    # Set model in train mode
    model.train()
    loss_of_epoch = 0

    print("Train")

    for batch_idx,batch in enumerate(train_loader):
        optim.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        # do a backwards pass
        loss.backward()
        # update the weights
        optim.step()
        # Find the total loss
        loss_of_epoch += loss.item()

        if (batch_idx+1) % print_every == 0:
            print("Batch {:} / {:}".format(batch_idx+1,len(train_loader)),"\nLoss:", round(loss.item(),1),"\n")

    loss_of_epoch /= len(train_loader)
    train_losses.append(loss_of_epoch)

    ##########Evaluation##################

    # Set model in evaluation mode
    model.eval()

    print("Evaluate")

    loss_of_epoch = 0

    for batch_idx,batch in enumerate(val_loader):

        with torch.no_grad():

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            loss = outputs[0]
            # Find the total loss
            loss_of_epoch += loss.item()

        if (batch_idx+1) % print_every == 0:
            print("Batch {:} / {:}".format(batch_idx+1,len(val_loader)),"\nLoss:", round(loss.item(),1),"\n")

    loss_of_epoch /= len(val_loader)
    val_losses.append(loss_of_epoch)

    # Print each epoch's time and train/val loss

    print("\n-------Epoch ", epoch+1,
          "-------"
          "\nTraining Loss:", train_losses[-1],
          "\nValidation Loss:", val_losses[-1],
          "\nTime: ",(time.time() - epoch_time),
          "\n-----------------------",
          "\n\n")

print("Total training and evaluation time: ", (time.time() - whole_train_eval_time))

Train
Evaluate

-------Epoch  1 -------
Training Loss: 5.108685493469238 
Validation Loss: 5.023641109466553 
Time:  1.496729850769043 
----------------------- 


Train
Evaluate

-------Epoch  2 -------
Training Loss: 3.909039088657924 
Validation Loss: 4.656170095716204 
Time:  1.5128142833709717 
----------------------- 


Train
Evaluate

-------Epoch  3 -------
Training Loss: 3.2849630968911305 
Validation Loss: 4.489402089800153 
Time:  1.5217924118041992 
----------------------- 


Total training and evaluation time:  4.537001609802246


# 4. Train the BERT QnA model. Evaluate the model

In [69]:
# Load the fine-tuned model and tokenizer
from transformers import BertTokenizerFast, BertForQuestionAnswering
import torch

# Ensure model and tokenizer are loaded
tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prediction function
def predict(context, question):
    encoding = tokenizer.encode_plus(question, context, return_tensors='pt').to(device)
    outputs = model(**encoding)
    start_idx = torch.argmax(outputs.start_logits)
    end_idx = torch.argmax(outputs.end_logits) + 1
    answer = tokenizer.decode(encoding['input_ids'][0][start_idx:end_idx])
    return answer

# Evaluation metrics
def compute_exact_match(prediction, ground_truth):
    return int(prediction.strip().lower() == ground_truth.strip().lower())

def compute_f1(prediction, ground_truth):
    pred_tokens = prediction.strip().lower().split()
    truth_tokens = ground_truth.strip().lower().split()
    common_tokens = set(pred_tokens) & set(truth_tokens)
    if not common_tokens:
        return 0
    precision = len(common_tokens) / len(pred_tokens)
    recall = len(common_tokens) / len(truth_tokens)
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

# 5. Perform an Inference and show the predicted vs ground truth answers.

In [70]:

question_number = 1
total_f1 = 0
num_questions = 0

for sample in samples:
    print(f"\nContext: {sample['context'][:150]}...\n")  # Print a truncated context
    for q in sample['questions']:
        prediction = predict(sample["context"], q["question"])
        exact_match = compute_exact_match(prediction, q["answer"])
        f1_score = compute_f1(prediction, q["answer"])

        print(f"Question {question_number}: {q['question']}")
        print(f"Prediction: {prediction}")
        print(f"Ground Truth: {q['answer']}")
        print(f"Exact Match: {exact_match}")
        print(f"F1 Score: {f1_score}")
        print("-" * 50)

        total_f1 += f1_score
        num_questions += 1
        question_number += 1

average_f1 = total_f1 / num_questions if num_questions > 0 else 0
print(f"\nAverage F1 Score: {average_f1:.2f}")



Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Bor...

Question 1: When did Beyonce start becoming popular?
Prediction: late 1990s
Ground Truth: in the late 1990s
Exact Match: 0
F1 Score: 0.6666666666666666
--------------------------------------------------
Question 2: What areas did Beyonce compete in when she was growing up?
Prediction: singing and dancing
Ground Truth: singing and dancing
Exact Match: 1
F1 Score: 1.0
--------------------------------------------------
Question 3: When did Beyonce leave Destiny's Child and become a solo singer?
Prediction: 2003
Ground Truth: 2003
Exact Match: 1
F1 Score: 1.0
--------------------------------------------------
Question 4: In what city and state did Beyonce  grow up? 
Prediction: houston
Ground Truth: Houston, Texas
Exact Match: 0
F1 Score: 0
--------------------------------------------------
Question 5: In which decade did Beyon

The accuracy is approx 83% , here better preprocessing steps and a larger dataset might increase the model accuracy or more like help in giving the right answer