In [1]:
!pip install torch torchtext transformers
!pip install torch datasets

[0m

In [2]:
import random
from datasets import load_dataset

min_samples=50
# Load the SQuAD 2.0 dataset
def load_squad2_dataset(min_samples):
    squad_dataset = load_dataset("squad_v2")

    qna_pairs = []
    for data in squad_dataset["train"]:
        if data["answers"]["text"]:
            qna_pairs.append({
                "context": data["context"],
                "question": data["question"],
                "answer": data["answers"]["text"][0]  # Taking the first answer
            })
        if len(qna_pairs) >= min_samples:
            break
    return qna_pairs

qna_pairs = load_squad2_dataset(min_samples)

#  Displaying samples
def display_samples(samples, num_samples=5):
    print(f"Displaying {num_samples} random QnA samples:\n")
    random_samples = random.sample(samples, min(num_samples, len(samples)))
    for i, sample in enumerate(random_samples, 1):
        print(f"Sample {i}:")
        print(f"Context: {sample['context']}")
        print(f"Question: {sample['question']}")
        print(f"Answer: {sample['answer']}")
        print("-" * 80)

display_samples(qna_pairs, num_samples=5)

  from .autonotebook import tqdm as notebook_tqdm


Displaying 5 random QnA samples:

Sample 1:
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Question: In which decade did Beyonce become famous?
Answer: late 1990s
--------------------------------------------------------------------------------
Sample 2:
Context: A self-described "modern-day feminist", Beyoncé creates songs that are often cha

In [3]:
from transformers import BertTokenizerFast

# Loading the tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def preprocess_bert_format(qna_pairs, max_length=384):
    """
    Preprocess the dataset into BERT-compatible format.

    Args:
        qna_pairs (list): List of question-answer pairs.
        max_length (int): Maximum length for input tokens.

    Returns:
        processed_data (list): List of dictionaries containing BERT-compatible inputs.
    """
    processed_data = []
    for item in qna_pairs:
        context = item["context"]
        question = item["question"]
        answer = item["answer"]

        # Tokenizing
        encoding = tokenizer(
            question,
            context,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_offsets_mapping=True,
            return_tensors="pt"
        )

        start_char_idx = context.find(answer)
        end_char_idx = start_char_idx + len(answer)
        offsets = encoding["offset_mapping"][0]
        start_token_idx = None
        end_token_idx = None

        for idx, (start, end) in enumerate(offsets):
            if start <= start_char_idx < end:
                start_token_idx = idx
            if start < end_char_idx <= end:
                end_token_idx = idx

        if start_token_idx is None or end_token_idx is None:
            continue

        processed_item = {
            "input_ids": encoding["input_ids"][0],
            "attention_mask": encoding["attention_mask"][0],
            "start_token_idx": start_token_idx,
            "end_token_idx": end_token_idx,
        }
        processed_data.append(processed_item)
    
    return processed_data

# Preprocessing
processed_qna_pairs = preprocess_bert_format(qna_pairs)
print("Sample Preprocessed Data:")
print(f"Input IDs: {processed_qna_pairs[0]['input_ids']}")
print(f"Attention Mask: {processed_qna_pairs[0]['attention_mask']}")
print(f"Start Token Index: {processed_qna_pairs[0]['start_token_idx']}")
print(f"End Token Index: {processed_qna_pairs[0]['end_token_idx']}")


Sample Preprocessed Data:
Input IDs: tensor([  101,  2043,  2106, 20773,  2707,  3352,  2759,  1029,   102, 20773,
        21025, 19358, 22815,  1011,  5708,  1006,  1013, 12170, 23432, 29715,
         3501, 29678, 12325, 29685,  1013, 10506,  1011, 10930,  2078,  1011,
         2360,  1007,  1006,  2141,  2244,  1018,  1010,  3261,  1007,  2003,
         2019,  2137,  3220,  1010,  6009,  1010,  2501,  3135,  1998,  3883,
         1012,  2141,  1998,  2992,  1999,  5395,  1010,  3146,  1010,  2016,
         2864,  1999,  2536,  4823,  1998,  5613,  6479,  2004,  1037,  2775,
         1010,  1998,  3123,  2000,  4476,  1999,  1996,  2397,  4134,  2004,
         2599,  3220,  1997,  1054,  1004,  1038,  2611,  1011,  2177, 10461,
         1005,  1055,  2775,  1012,  3266,  2011,  2014,  2269,  1010, 25436,
        22815,  1010,  1996,  2177,  2150,  2028,  1997,  1996,  2088,  1005,
         1055,  2190,  1011,  4855,  2611,  2967,  1997,  2035,  2051,  1012,
         2037, 14221,  2387

In [4]:
print(type(tokenizer))


<class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>


In [5]:
from transformers import BertForQuestionAnswering, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
from tqdm import tqdm

# Use GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class QnADataset(Dataset):
    def __init__(self, processed_data):
        self.data = processed_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {
            "input_ids": self.data[idx]["input_ids"],
            "attention_mask": self.data[idx]["attention_mask"],
            "start_token_idx": self.data[idx]["start_token_idx"],
            "end_token_idx": self.data[idx]["end_token_idx"],
        }

processed_dataset = QnADataset(processed_qna_pairs)
train_loader = DataLoader(processed_dataset, batch_size=8, shuffle=True)

# Loading the BERT model
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

  warn(
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from sklearn.metrics import accuracy_score

# Evaluation function
def evaluate(model, dataloader):
    model.eval()
    total = 0
    correct_start = 0
    correct_end = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            start_positions = batch["start_token_idx"].to(device)
            end_positions = batch["end_token_idx"].to(device)

            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )

            start_predictions = torch.argmax(outputs.start_logits, dim=1)
            end_predictions = torch.argmax(outputs.end_logits, dim=1)

            # Calculating metrics
            total += start_positions.size(0)
            correct_start += (start_predictions == start_positions).sum().item()
            correct_end += (end_predictions == end_positions).sum().item()

    start_accuracy = correct_start / total
    end_accuracy = correct_end / total
    return start_accuracy, end_accuracy
    
epochs=15
def train_and_evaluate(model, train_loader, val_loader, optimizer, epochs):
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}")
        model.train()
        epoch_loss = 0

        for batch in tqdm(train_loader, desc="Training"):
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            start_positions = batch["start_token_idx"].to(device)
            end_positions = batch["end_token_idx"].to(device)

            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions,
            )

            loss = outputs.loss
            epoch_loss += loss.item()

            # Backward pass
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1} Loss: {epoch_loss:.4f}")
        start_acc, end_acc = evaluate(model, val_loader)
        print(f"Validation Start Accuracy: {start_acc:.4f}")
        print(f"Validation End Accuracy: {end_acc:.4f}")

val_loader = DataLoader(processed_dataset, batch_size=8, shuffle=False)
train_and_evaluate(model, train_loader, val_loader, optimizer, epochs)
model.save_pretrained("./bert-qna-model")


Epoch 1


Training: 100%|██████████| 7/7 [00:00<00:00, 10.84it/s]


Epoch 1 Loss: 38.1839


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 56.10it/s]


Validation Start Accuracy: 0.1800
Validation End Accuracy: 0.2000
Epoch 2


Training: 100%|██████████| 7/7 [00:00<00:00, 15.33it/s]


Epoch 2 Loss: 28.1539


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 57.20it/s]


Validation Start Accuracy: 0.2200
Validation End Accuracy: 0.2000
Epoch 3


Training: 100%|██████████| 7/7 [00:00<00:00, 15.44it/s]


Epoch 3 Loss: 23.5454


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 56.10it/s]


Validation Start Accuracy: 0.2600
Validation End Accuracy: 0.2000
Epoch 4


Training: 100%|██████████| 7/7 [00:00<00:00, 15.49it/s]


Epoch 4 Loss: 19.9465


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 56.32it/s]


Validation Start Accuracy: 0.2200
Validation End Accuracy: 0.3000
Epoch 5


Training: 100%|██████████| 7/7 [00:00<00:00, 15.55it/s]


Epoch 5 Loss: 18.7883


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 53.74it/s]


Validation Start Accuracy: 0.2400
Validation End Accuracy: 0.3000
Epoch 6


Training: 100%|██████████| 7/7 [00:00<00:00, 15.63it/s]


Epoch 6 Loss: 19.4523


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 57.42it/s]


Validation Start Accuracy: 0.2200
Validation End Accuracy: 0.2400
Epoch 7


Training: 100%|██████████| 7/7 [00:00<00:00, 15.62it/s]


Epoch 7 Loss: 16.1742


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 54.74it/s]


Validation Start Accuracy: 0.3400
Validation End Accuracy: 0.4200
Epoch 8


Training: 100%|██████████| 7/7 [00:00<00:00, 15.61it/s]


Epoch 8 Loss: 15.4323


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 56.62it/s]


Validation Start Accuracy: 0.3000
Validation End Accuracy: 0.3200
Epoch 9


Training: 100%|██████████| 7/7 [00:00<00:00, 15.59it/s]


Epoch 9 Loss: 14.8057


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 56.80it/s]


Validation Start Accuracy: 0.3200
Validation End Accuracy: 0.4800
Epoch 10


Training: 100%|██████████| 7/7 [00:00<00:00, 15.61it/s]


Epoch 10 Loss: 12.7204


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 56.94it/s]


Validation Start Accuracy: 0.4600
Validation End Accuracy: 0.5400
Epoch 11


Training: 100%|██████████| 7/7 [00:00<00:00, 15.65it/s]


Epoch 11 Loss: 11.4550


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 52.95it/s]


Validation Start Accuracy: 0.6400
Validation End Accuracy: 0.6000
Epoch 12


Training: 100%|██████████| 7/7 [00:00<00:00, 15.70it/s]


Epoch 12 Loss: 11.4946


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 53.64it/s]


Validation Start Accuracy: 0.7200
Validation End Accuracy: 0.8000
Epoch 13


Training: 100%|██████████| 7/7 [00:00<00:00, 15.50it/s]


Epoch 13 Loss: 11.8126


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 54.86it/s]


Validation Start Accuracy: 0.6400
Validation End Accuracy: 0.6200
Epoch 14


Training: 100%|██████████| 7/7 [00:00<00:00, 15.45it/s]


Epoch 14 Loss: 8.6336


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 57.21it/s]


Validation Start Accuracy: 0.7200
Validation End Accuracy: 0.7000
Epoch 15


Training: 100%|██████████| 7/7 [00:00<00:00, 15.70it/s]


Epoch 15 Loss: 7.6855


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 56.65it/s]


Validation Start Accuracy: 0.8400
Validation End Accuracy: 0.8400


In [7]:
def perform_inference_fixed_v3(model, dataloader, tokenizer, num_samples=5):
    """
    Perform inference and display predicted vs. ground truth answers with better handling.

    Args:
        model: Trained BERT model for QnA.
        dataloader: DataLoader for the test/validation set.
        tokenizer: Tokenizer used for BERT.
        num_samples: Number of examples to display.

    Returns:
        None
    """
    model.eval()
    predictions = []
    ground_truths = []
    contexts = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Performing Inference"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            start_positions = batch["start_token_idx"].to(device)
            end_positions = batch["end_token_idx"].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            start_predictions = torch.argmax(outputs.start_logits, dim=1)
            end_predictions = torch.argmax(outputs.end_logits, dim=1)

            for i in range(input_ids.size(0)):
                tokens = tokenizer.convert_ids_to_tokens(input_ids[i])
                context_tokens = tokens[:attention_mask[i].sum().item()]  # Remove padding
                context = tokenizer.convert_tokens_to_string(context_tokens)

                # Prediction answer
                if (
                    start_predictions[i] < len(context_tokens)
                    and end_predictions[i] < len(context_tokens)
                    and start_predictions[i] <= end_predictions[i]
                ):
                    predicted_answer = tokenizer.convert_tokens_to_string(
                        context_tokens[start_predictions[i]:end_predictions[i] + 1]
                    ).strip()
                else:
                    predicted_answer = "[INVALID PREDICTION]"

                # Ground truth answer
                if start_positions[i] < len(context_tokens) and end_positions[i] < len(context_tokens):
                    ground_truth_answer = tokenizer.convert_tokens_to_string(
                        context_tokens[start_positions[i]:end_positions[i] + 1]
                    ).strip()
                else:
                    ground_truth_answer = "[INVALID GROUND TRUTH]"

                contexts.append(context)
                predictions.append(predicted_answer)
                ground_truths.append(ground_truth_answer)

                if len(predictions) >= num_samples:
                    break
            if len(predictions) >= num_samples:
                break

    for i in range(len(predictions)):
        print(f"Example {i + 1}")
        print(f"Context: {contexts[i]}")
        print(f"Predicted Answer: {predictions[i]}")
        print(f"Ground Truth Answer: {ground_truths[i]}")
        print("-" * 80)

perform_inference_fixed_v3(model, val_loader, tokenizer, num_samples=5)


Performing Inference:   0%|          | 0/7 [00:00<?, ?it/s]

Example 1
Context: [CLS] when did beyonce start becoming popular? [SEP] beyonce giselle knowles - carter ( / biːˈjɒnseɪ / bee - yon - say ) ( born september 4, 1981 ) is an american singer, songwriter, record producer and actress. born and raised in houston, texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of r & b girl - group destiny ' s child. managed by her father, mathew knowles, the group became one of the world ' s best - selling girl groups of all time. their hiatus saw the release of beyonce ' s debut album, dangerously in love ( 2003 ), which established her as a solo artist worldwide, earned five grammy awards and featured the billboard hot 100 number - one singles " crazy in love " and " baby boy ". [SEP]
Predicted Answer: late 1990s
Ground Truth Answer: in the late 1990s
--------------------------------------------------------------------------------
Example 2
Context: [CLS] what areas did beyonc




In [9]:
def perform_inference_fixed_v3(model, dataloader, tokenizer, num_samples=20):
    """
    Perform inference and display predicted vs. ground truth answers with better handling.

    Args:
        model: Trained BERT model for QnA.
        dataloader: DataLoader for the test/validation set.
        tokenizer: Tokenizer used for BERT.
        num_samples: Number of examples to display.

    Returns:
        None
    """
    model.eval()
    predictions = []
    ground_truths = []
    contexts = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Performing Inference"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            start_positions = batch["start_token_idx"].to(device)
            end_positions = batch["end_token_idx"].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            start_predictions = torch.argmax(outputs.start_logits, dim=1)
            end_predictions = torch.argmax(outputs.end_logits, dim=1)

            for i in range(input_ids.size(0)):
                tokens = tokenizer.convert_ids_to_tokens(input_ids[i])
                context_tokens = tokens[:attention_mask[i].sum().item()]  # Remove padding
                context = tokenizer.convert_tokens_to_string(context_tokens)

                # Prediction answer
                if (
                    start_predictions[i] < len(context_tokens)
                    and end_predictions[i] < len(context_tokens)
                    and start_predictions[i] <= end_predictions[i]
                ):
                    predicted_answer = tokenizer.convert_tokens_to_string(
                        context_tokens[start_predictions[i]:end_predictions[i] + 1]
                    ).strip()
                else:
                    predicted_answer = "[INVALID PREDICTION]"

                # Ground truth answer
                if start_positions[i] < len(context_tokens) and end_positions[i] < len(context_tokens):
                    ground_truth_answer = tokenizer.convert_tokens_to_string(
                        context_tokens[start_positions[i]:end_positions[i] + 1]
                    ).strip()
                else:
                    ground_truth_answer = "[INVALID GROUND TRUTH]"

                contexts.append(context)
                predictions.append(predicted_answer)
                ground_truths.append(ground_truth_answer)

                if len(predictions) >= num_samples:
                    break
            if len(predictions) >= num_samples:
                break

    for i in range(len(predictions)):
        print(f"Example {i + 1}")
        print(f"Context: {contexts[i]}")
        print(f"Predicted Answer: {predictions[i]}")
        print(f"Ground Truth Answer: {ground_truths[i]}")
        print("-" * 80)

perform_inference_fixed_v3(model, val_loader, tokenizer, num_samples=20)

Performing Inference:  29%|██▊       | 2/7 [00:00<00:01,  4.34it/s]

Example 1
Context: [CLS] when did beyonce start becoming popular? [SEP] beyonce giselle knowles - carter ( / biːˈjɒnseɪ / bee - yon - say ) ( born september 4, 1981 ) is an american singer, songwriter, record producer and actress. born and raised in houston, texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of r & b girl - group destiny ' s child. managed by her father, mathew knowles, the group became one of the world ' s best - selling girl groups of all time. their hiatus saw the release of beyonce ' s debut album, dangerously in love ( 2003 ), which established her as a solo artist worldwide, earned five grammy awards and featured the billboard hot 100 number - one singles " crazy in love " and " baby boy ". [SEP]
Predicted Answer: late 1990s
Ground Truth Answer: in the late 1990s
--------------------------------------------------------------------------------
Example 2
Context: [CLS] what areas did beyonc




Experimented with 50 samples by training the model with 15 epochs it predicted correctly for 5 samples but when I checked for 20 samples, some predictions are wrong which is because the samples are minimal and training epochs are minimal so below experimented with 70 samples and trained model with 25 epochs.

In [10]:
import random
from datasets import load_dataset

min_samples=70
# Loading the SQuAD 2.0 dataset 
def load_squad2_dataset(min_samples):
    squad_dataset = load_dataset("squad_v2")

    qna_pairs = []
    for data in squad_dataset["train"]:
        if data["answers"]["text"]:
            qna_pairs.append({
                "context": data["context"],
                "question": data["question"],
                "answer": data["answers"]["text"][0]  # Taking the first answer
            })
        if len(qna_pairs) >= min_samples:
            break
    return qna_pairs

qna_pairs = load_squad2_dataset(min_samples)

# Displaying a few samples
def display_samples(samples, num_samples=5):
    print(f"Displaying {num_samples} random QnA samples:\n")
    random_samples = random.sample(samples, min(num_samples, len(samples)))
    for i, sample in enumerate(random_samples, 1):
        print(f"Sample {i}:")
        print(f"Context: {sample['context']}")
        print(f"Question: {sample['question']}")
        print(f"Answer: {sample['answer']}")
        print("-" * 80)

display_samples(qna_pairs, num_samples=5)

Displaying 5 random QnA samples:

Sample 1:
Context: Beyoncé attended St. Mary's Elementary School in Fredericksburg, Texas, where she enrolled in dance classes. Her singing talent was discovered when dance instructor Darlette Johnson began humming a song and she finished it, able to hit the high-pitched notes. Beyoncé's interest in music and performing continued after winning a school talent show at age seven, singing John Lennon's "Imagine" to beat 15/16-year-olds. In fall of 1990, Beyoncé enrolled in Parker Elementary School, a music magnet school in Houston, where she would perform with the school's choir. She also attended the High School for the Performing and Visual Arts and later Alief Elsik High School. Beyoncé was also a member of the choir at St. John's United Methodist Church as a soloist for two years.
Question: Beyonce moved to which town after she left her first elementary school?
Answer: Houston
---------------------------------------------------------------------------

In [11]:
from transformers import BertTokenizerFast

# Loading the tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def preprocess_bert_format(qna_pairs, max_length=384):
    """
    Preprocess the dataset into BERT-compatible format.

    Args:
        qna_pairs (list): List of question-answer pairs.
        max_length (int): Maximum length for input tokens.

    Returns:
        processed_data (list): List of dictionaries containing BERT-compatible inputs.
    """
    processed_data = []
    for item in qna_pairs:
        context = item["context"]
        question = item["question"]
        answer = item["answer"]

        encoding = tokenizer(
            question,
            context,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_offsets_mapping=True,
            return_tensors="pt"
        )

        start_char_idx = context.find(answer)
        end_char_idx = start_char_idx + len(answer)

        offsets = encoding["offset_mapping"][0]
        start_token_idx = None
        end_token_idx = None

        for idx, (start, end) in enumerate(offsets):
            if start <= start_char_idx < end:
                start_token_idx = idx
            if start < end_char_idx <= end:
                end_token_idx = idx

        if start_token_idx is None or end_token_idx is None:
            continue

        processed_item = {
            "input_ids": encoding["input_ids"][0],
            "attention_mask": encoding["attention_mask"][0],
            "start_token_idx": start_token_idx,
            "end_token_idx": end_token_idx,
        }
        processed_data.append(processed_item)
    
    return processed_data

# Preprocessing
processed_qna_pairs = preprocess_bert_format(qna_pairs)
print("Sample Preprocessed Data:")
print(f"Input IDs: {processed_qna_pairs[0]['input_ids']}")
print(f"Attention Mask: {processed_qna_pairs[0]['attention_mask']}")
print(f"Start Token Index: {processed_qna_pairs[0]['start_token_idx']}")
print(f"End Token Index: {processed_qna_pairs[0]['end_token_idx']}")

Sample Preprocessed Data:
Input IDs: tensor([  101,  2043,  2106, 20773,  2707,  3352,  2759,  1029,   102, 20773,
        21025, 19358, 22815,  1011,  5708,  1006,  1013, 12170, 23432, 29715,
         3501, 29678, 12325, 29685,  1013, 10506,  1011, 10930,  2078,  1011,
         2360,  1007,  1006,  2141,  2244,  1018,  1010,  3261,  1007,  2003,
         2019,  2137,  3220,  1010,  6009,  1010,  2501,  3135,  1998,  3883,
         1012,  2141,  1998,  2992,  1999,  5395,  1010,  3146,  1010,  2016,
         2864,  1999,  2536,  4823,  1998,  5613,  6479,  2004,  1037,  2775,
         1010,  1998,  3123,  2000,  4476,  1999,  1996,  2397,  4134,  2004,
         2599,  3220,  1997,  1054,  1004,  1038,  2611,  1011,  2177, 10461,
         1005,  1055,  2775,  1012,  3266,  2011,  2014,  2269,  1010, 25436,
        22815,  1010,  1996,  2177,  2150,  2028,  1997,  1996,  2088,  1005,
         1055,  2190,  1011,  4855,  2611,  2967,  1997,  2035,  2051,  1012,
         2037, 14221,  2387

In [12]:
from transformers import BertForQuestionAnswering, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
from tqdm import tqdm

# Use GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class QnADataset(Dataset):
    def __init__(self, processed_data):
        self.data = processed_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {
            "input_ids": self.data[idx]["input_ids"],
            "attention_mask": self.data[idx]["attention_mask"],
            "start_token_idx": self.data[idx]["start_token_idx"],
            "end_token_idx": self.data[idx]["end_token_idx"],
        }

processed_dataset = QnADataset(processed_qna_pairs)
train_loader = DataLoader(processed_dataset, batch_size=8, shuffle=True)

# Loading the BERT model 
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from sklearn.metrics import accuracy_score

# Evaluation Function
def evaluate(model, dataloader):
    model.eval()
    total = 0
    correct_start = 0
    correct_end = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            start_positions = batch["start_token_idx"].to(device)
            end_positions = batch["end_token_idx"].to(device)

            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )

            start_predictions = torch.argmax(outputs.start_logits, dim=1)
            end_predictions = torch.argmax(outputs.end_logits, dim=1)

            # Calculating metrics
            total += start_positions.size(0)
            correct_start += (start_predictions == start_positions).sum().item()
            correct_end += (end_predictions == end_positions).sum().item()

    start_accuracy = correct_start / total
    end_accuracy = correct_end / total
    return start_accuracy, end_accuracy
    
epochs=25
def train_and_evaluate(model, train_loader, val_loader, optimizer, epochs):
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}")
        model.train()
        epoch_loss = 0

        for batch in tqdm(train_loader, desc="Training"):
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            start_positions = batch["start_token_idx"].to(device)
            end_positions = batch["end_token_idx"].to(device)

            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions,
            )

            loss = outputs.loss
            epoch_loss += loss.item()

            # Backward pass
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1} Loss: {epoch_loss:.4f}")

        start_acc, end_acc = evaluate(model, val_loader)
        print(f"Validation Start Accuracy: {start_acc:.4f}")
        print(f"Validation End Accuracy: {end_acc:.4f}")

val_loader = DataLoader(processed_dataset, batch_size=8, shuffle=False)
train_and_evaluate(model, train_loader, val_loader, optimizer, epochs)
model.save_pretrained("./bert-qna-model")

Epoch 1


Training: 100%|██████████| 9/9 [00:01<00:00,  7.08it/s]


Epoch 1 Loss: 48.4952


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.87it/s]


Validation Start Accuracy: 0.1429
Validation End Accuracy: 0.1714
Epoch 2


Training: 100%|██████████| 9/9 [00:01<00:00,  8.79it/s]


Epoch 2 Loss: 35.9979


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.80it/s]


Validation Start Accuracy: 0.2000
Validation End Accuracy: 0.2000
Epoch 3


Training: 100%|██████████| 9/9 [00:00<00:00,  9.16it/s]


Epoch 3 Loss: 28.9773


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 15.80it/s]


Validation Start Accuracy: 0.2000
Validation End Accuracy: 0.2286
Epoch 4


Training: 100%|██████████| 9/9 [00:00<00:00,  9.23it/s]


Epoch 4 Loss: 24.9503


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.79it/s]


Validation Start Accuracy: 0.2571
Validation End Accuracy: 0.2571
Epoch 5


Training: 100%|██████████| 9/9 [00:00<00:00,  9.26it/s]


Epoch 5 Loss: 22.6067


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.75it/s]


Validation Start Accuracy: 0.2143
Validation End Accuracy: 0.2714
Epoch 6


Training: 100%|██████████| 9/9 [00:00<00:00,  9.21it/s]


Epoch 6 Loss: 21.6277


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.65it/s]


Validation Start Accuracy: 0.2714
Validation End Accuracy: 0.3286
Epoch 7


Training: 100%|██████████| 9/9 [00:00<00:00,  9.22it/s]


Epoch 7 Loss: 20.7461


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.77it/s]


Validation Start Accuracy: 0.3857
Validation End Accuracy: 0.3143
Epoch 8


Training: 100%|██████████| 9/9 [00:00<00:00,  9.26it/s]


Epoch 8 Loss: 19.2677


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.81it/s]


Validation Start Accuracy: 0.5286
Validation End Accuracy: 0.4286
Epoch 9


Training: 100%|██████████| 9/9 [00:00<00:00,  9.15it/s]


Epoch 9 Loss: 16.2286


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.00it/s]


Validation Start Accuracy: 0.5571
Validation End Accuracy: 0.5286
Epoch 10


Training: 100%|██████████| 9/9 [00:00<00:00,  9.18it/s]


Epoch 10 Loss: 14.1342


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.41it/s]


Validation Start Accuracy: 0.5714
Validation End Accuracy: 0.6143
Epoch 11


Training: 100%|██████████| 9/9 [00:00<00:00,  9.16it/s]


Epoch 11 Loss: 11.7349


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.58it/s]


Validation Start Accuracy: 0.6857
Validation End Accuracy: 0.7571
Epoch 12


Training: 100%|██████████| 9/9 [00:00<00:00,  9.01it/s]


Epoch 12 Loss: 9.6928


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.39it/s]


Validation Start Accuracy: 0.7714
Validation End Accuracy: 0.8000
Epoch 13


Training: 100%|██████████| 9/9 [00:01<00:00,  8.87it/s]


Epoch 13 Loss: 7.8872


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.91it/s]


Validation Start Accuracy: 0.8714
Validation End Accuracy: 0.8714
Epoch 14


Training: 100%|██████████| 9/9 [00:00<00:00,  9.25it/s]


Epoch 14 Loss: 6.2065


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.91it/s]


Validation Start Accuracy: 0.9429
Validation End Accuracy: 0.9571
Epoch 15


Training: 100%|██████████| 9/9 [00:00<00:00,  9.03it/s]


Epoch 15 Loss: 3.6648


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.41it/s]


Validation Start Accuracy: 0.9714
Validation End Accuracy: 0.9429
Epoch 16


Training: 100%|██████████| 9/9 [00:00<00:00,  9.30it/s]


Epoch 16 Loss: 4.2308


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.29it/s]


Validation Start Accuracy: 0.9857
Validation End Accuracy: 0.9429
Epoch 17


Training: 100%|██████████| 9/9 [00:00<00:00,  9.33it/s]


Epoch 17 Loss: 2.8129


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 15.80it/s]


Validation Start Accuracy: 0.9571
Validation End Accuracy: 0.9857
Epoch 18


Training: 100%|██████████| 9/9 [00:01<00:00,  8.69it/s]


Epoch 18 Loss: 2.7466


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.75it/s]


Validation Start Accuracy: 0.9857
Validation End Accuracy: 0.9857
Epoch 19


Training: 100%|██████████| 9/9 [00:00<00:00,  9.19it/s]


Epoch 19 Loss: 2.0494


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 15.71it/s]


Validation Start Accuracy: 0.9857
Validation End Accuracy: 0.9571
Epoch 20


Training: 100%|██████████| 9/9 [00:00<00:00,  9.16it/s]


Epoch 20 Loss: 1.2950


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.91it/s]


Validation Start Accuracy: 0.9857
Validation End Accuracy: 0.9714
Epoch 21


Training: 100%|██████████| 9/9 [00:00<00:00,  9.31it/s]


Epoch 21 Loss: 2.0920


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.80it/s]


Validation Start Accuracy: 0.9857
Validation End Accuracy: 0.9857
Epoch 22


Training: 100%|██████████| 9/9 [00:01<00:00,  8.72it/s]


Epoch 22 Loss: 1.1280


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.27it/s]


Validation Start Accuracy: 1.0000
Validation End Accuracy: 0.9857
Epoch 23


Training: 100%|██████████| 9/9 [00:00<00:00,  9.29it/s]


Epoch 23 Loss: 0.8236


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.86it/s]


Validation Start Accuracy: 1.0000
Validation End Accuracy: 0.9857
Epoch 24


Training: 100%|██████████| 9/9 [00:00<00:00,  9.30it/s]


Epoch 24 Loss: 0.6878


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.94it/s]


Validation Start Accuracy: 1.0000
Validation End Accuracy: 0.9857
Epoch 25


Training: 100%|██████████| 9/9 [00:00<00:00,  9.28it/s]


Epoch 25 Loss: 0.8189


Evaluating: 100%|██████████| 9/9 [00:00<00:00, 16.42it/s]


Validation Start Accuracy: 1.0000
Validation End Accuracy: 0.9857


In [15]:
def perform_inference_fixed_v3(model, dataloader, tokenizer, num_samples=20):
    """
    Perform inference and display predicted vs. ground truth answers with better handling.

    Args:
        model: Trained BERT model for QnA.
        dataloader: DataLoader for the test/validation set.
        tokenizer: Tokenizer used for BERT.
        num_samples: Number of examples to display.

    Returns:
        None
    """
    model.eval()
    predictions = []
    ground_truths = []
    contexts = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Performing Inference"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            start_positions = batch["start_token_idx"].to(device)
            end_positions = batch["end_token_idx"].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            start_predictions = torch.argmax(outputs.start_logits, dim=1)
            end_predictions = torch.argmax(outputs.end_logits, dim=1)

            for i in range(input_ids.size(0)):
                tokens = tokenizer.convert_ids_to_tokens(input_ids[i])
                context_tokens = tokens[:attention_mask[i].sum().item()]  # Remove padding
                context = tokenizer.convert_tokens_to_string(context_tokens)

                # Predicted answer
                if (
                    start_predictions[i] < len(context_tokens)
                    and end_predictions[i] < len(context_tokens)
                    and start_predictions[i] <= end_predictions[i]
                ):
                    predicted_answer = tokenizer.convert_tokens_to_string(
                        context_tokens[start_predictions[i]:end_predictions[i] + 1]
                    ).strip()
                else:
                    predicted_answer = "[INVALID PREDICTION]"

                # Ground truth answer
                if start_positions[i] < len(context_tokens) and end_positions[i] < len(context_tokens):
                    ground_truth_answer = tokenizer.convert_tokens_to_string(
                        context_tokens[start_positions[i]:end_positions[i] + 1]
                    ).strip()
                else:
                    ground_truth_answer = "[INVALID GROUND TRUTH]"

                contexts.append(context)
                predictions.append(predicted_answer)
                ground_truths.append(ground_truth_answer)

                if len(predictions) >= num_samples:
                    break
            if len(predictions) >= num_samples:
                break

    for i in range(len(predictions)):
        print(f"Example {i + 1}")
        print(f"Context: {contexts[i]}")
        print(f"Predicted Answer: {predictions[i]}")
        print(f"Ground Truth Answer: {ground_truths[i]}")
        print("-" * 80)

perform_inference_fixed_v3(model, val_loader, tokenizer, num_samples=20)

Performing Inference:  22%|██▏       | 2/9 [00:00<00:01,  5.66it/s]

Example 1
Context: [CLS] when did beyonce start becoming popular? [SEP] beyonce giselle knowles - carter ( / biːˈjɒnseɪ / bee - yon - say ) ( born september 4, 1981 ) is an american singer, songwriter, record producer and actress. born and raised in houston, texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of r & b girl - group destiny ' s child. managed by her father, mathew knowles, the group became one of the world ' s best - selling girl groups of all time. their hiatus saw the release of beyonce ' s debut album, dangerously in love ( 2003 ), which established her as a solo artist worldwide, earned five grammy awards and featured the billboard hot 100 number - one singles " crazy in love " and " baby boy ". [SEP]
Predicted Answer: in the late 1990s
Ground Truth Answer: in the late 1990s
--------------------------------------------------------------------------------
Example 2
Context: [CLS] what areas did




For samples 70 and training model with 25 epochs it predicted all the samples correctly, we can see predicted and ground truth answers are same.