In [1]:
import torch
from transformers import BertForQuestionAnswering, BertTokenizerFast, BertModel
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np
import nltk

# Download NLTK data for BLEU score if needed
nltk.download('punkt')

# Load the model and tokenizer
model_path = 'D:/Rajesh/Rajesh/Personal/AISanDiego/520-NLP/Project/saved_model/'
model = BertForQuestionAnswering.from_pretrained(model_path)
embedding_model = BertModel.from_pretrained('bert-base-uncased')  # Use BertModel for embeddings
tokenizer = BertTokenizerFast.from_pretrained(model_path)

# Function to compute cosine similarity using hidden states (embeddings)
def compute_cosine_similarity(text1, text2):
    # Tokenize the inputs
    inputs_1 = tokenizer(text1, return_tensors="pt", truncation=True, padding=True)
    inputs_2 = tokenizer(text2, return_tensors="pt", truncation=True, padding=True)

    # Get hidden states (sentence embeddings) from embedding model
    with torch.no_grad():
        outputs_1 = embedding_model(**inputs_1)
        outputs_2 = embedding_model(**inputs_2)

    # Extract the CLS token embeddings
    embedding_1 = outputs_1.last_hidden_state[:, 0, :].numpy()
    embedding_2 = outputs_2.last_hidden_state[:, 0, :].numpy()

    # Compute cosine similarity between the embeddings
    similarity = cosine_similarity(embedding_1, embedding_2)[0][0]
    return similarity

# Function to align token lengths by padding shorter lists with dummy tokens
def align_token_lengths(true_tokens, pred_tokens):
    max_len = max(len(true_tokens), len(pred_tokens))
    true_tokens += ['[PAD]'] * (max_len - len(true_tokens))
    pred_tokens += ['[PAD]'] * (max_len - len(pred_tokens))
    return true_tokens, pred_tokens

# Function to answer questions and calculate metrics
def answer_question_with_metrics(question, context, ground_truth):
    inputs = tokenizer(question, context, return_tensors='pt', truncation=True, max_length=512)

    # Generate the answer using the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the most likely beginning and end of the answer span
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    # Convert the tokens to text
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

    # Capitalize the first letter of the answer
    answer = answer[0].upper() + answer[1:]

    # Tokenize predicted answer and ground truth for token-level comparison
    pred_tokens = tokenizer.tokenize(answer)
    true_tokens = tokenizer.tokenize(ground_truth)

    # Align token lengths to avoid length mismatches
    true_tokens, pred_tokens = align_token_lengths(true_tokens, pred_tokens)

    # Calculate token-level precision, recall, F1
    precision = precision_score(true_tokens, pred_tokens, average='micro') if true_tokens else 0
    recall = recall_score(true_tokens, pred_tokens, average='micro') if true_tokens else 0
    f1 = f1_score(true_tokens, pred_tokens, average='micro') if true_tokens else 0

    # Token accuracy
    token_accuracy = sum([p == t for p, t in zip(pred_tokens, true_tokens)]) / len(true_tokens) if true_tokens else 0

    # Calculate BLEU score
    smoothing_fn = SmoothingFunction().method1
    bleu_score = sentence_bleu([true_tokens], pred_tokens, smoothing_function=smoothing_fn)

    # Calculate Cosine Similarity
    cosine_sim = compute_cosine_similarity(answer, ground_truth)

    return answer, precision, recall, f1, token_accuracy, bleu_score, cosine_sim

# Example context and questions
context = ("Transformers are a type of deep learning model introduced in the paper 'Attention is All You Need' by Vaswani et al. "
           "in 2017. Unlike traditional recurrent neural networks (RNNs), Transformers rely entirely on self-attention mechanisms "
           "to capture the relationships between different parts of a sequence. This allows Transformers to process data in parallel, "
           "making them much faster and more efficient for tasks like natural language processing. Transformers are the foundation for "
           "many state-of-the-art models, such as BERT, GPT, and T5. They are used in a variety of applications, including machine translation, "
           "text generation, and question answering.")

questions = [
    "Name some models that are based on the Transformer architecture",
    "What mechanism does the Transformer rely on",
    "Why are Transformers faster than traditional RNNs?",
    "What happens if self-attention mechanism captures the relationship between different parts of a sequence?",
    "State some applications of Transformers?",
    "Who introduced self Attention?"
]

ground_truths = [
    "BERT, GPT, and T5",
    "Self-attention mechanisms",
    "Transformers rely entirely on self-attention mechanisms, which captures the relationship between different parts of a sentence",
    "Transformers can process data in parallel",
    "Machine translation, text generation, and question answering",
    "Vaswani et al."
]

# Initialize totals for overall metrics
total_precision, total_recall, total_f1, total_accuracy, total_bleu, total_cosine_sim = 0, 0, 0, 0, 0, 0
num_questions = len(questions)

# Store the results
for question, truth in zip(questions, ground_truths):
    answer, precision, recall, f1, token_accuracy, bleu_score, cosine_sim = answer_question_with_metrics(question, context, truth)
    
    # Print individual results
    print(f"Question: {question}")
    print(f"Predicted Answer: {answer}")
    print(f"Ground Truth: {truth}")
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}, Accuracy: {token_accuracy:.2f}")
    print(f"BLEU Score: {bleu_score:.2f}, Cosine Similarity: {cosine_sim:.2f}\n")

    # Accumulate the metrics for overall calculation
    total_precision += precision
    total_recall += recall
    total_f1 += f1
    total_accuracy += token_accuracy
    total_bleu += bleu_score
    total_cosine_sim += cosine_sim

# Calculate and print the consolidated metrics
avg_precision = total_precision / num_questions
avg_recall = total_recall / num_questions
avg_f1 = total_f1 / num_questions
avg_accuracy = total_accuracy / num_questions
avg_bleu = total_bleu / num_questions
avg_cosine_sim = total_cosine_sim / num_questions

print(f"Consolidated Precision: {avg_precision:.2f}")
print(f"Consolidated Recall: {avg_recall:.2f}")
print(f"Consolidated F1 Score: {avg_f1:.2f}")
print(f"Consolidated Token-Level Accuracy: {avg_accuracy:.2f}")
print(f"Consolidated BLEU Score: {avg_bleu:.2f}")
print(f"Consolidated Cosine Similarity: {avg_cosine_sim:.2f}")


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\CZ0068\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Question: Name some models that are based on the Transformer architecture
Predicted Answer: Bert, gpt, and t5
Ground Truth: BERT, GPT, and T5
Precision: 1.00, Recall: 1.00, F1 Score: 1.00, Accuracy: 1.00
BLEU Score: 1.00, Cosine Similarity: 1.00

Question: What mechanism does the Transformer rely on
Predicted Answer: Self - attention mechanisms
Ground Truth: Self-attention mechanisms
Precision: 1.00, Recall: 1.00, F1 Score: 1.00, Accuracy: 1.00
BLEU Score: 1.00, Cosine Similarity: 1.00

Question: Why are Transformers faster than traditional RNNs?
Predicted Answer: Transformers rely entirely on self - attention mechanisms to capture the relationships between different parts of a sequence
Ground Truth: Transformers rely entirely on self-attention mechanisms, which captures the relationship between different parts of a sentence
Precision: 0.42, Recall: 0.42, F1 Score: 0.42, Accuracy: 0.42
BLEU Score: 0.57, Cosine Similarity: 0.94

Question: What happens if self-attention mechanism capture

When evaluating language models using strict token-level metrics like precision, recall, F1 score, and accuracy does not give meaningful results. These metrics focus on exact word matches between the predicted and ground truth answers, which can lead to low scores even if the answers are semantically correct but have slight variations in wording. We are reporting these token level metrics for completeness

### Explanation of the metrics:
1. **Precision, Recall, and F1 Score (Token-level):**
   - **Precision** measures how many of the predicted tokens are correct compared to the ground truth.
   - **Recall** measures how many of the ground truth tokens are captured in the prediction.
   - **F1 Score** is the harmonic mean of precision and recall.
   - These metrics work well for tasks where exact matches are required (e.g., named entity recognition) but are not ideal for tasks like question answering where meaning matters more than exact word matches.

   In our case:
   - **For "Why are Transformers faster than traditional RNNs?"**:
     - The predicted and ground truth answers are semantically the same, but the difference in phrases like "parts of a sequence" vs. "parts of a sentence" results in lower precision, recall, and F1 scores.
   - **For "What happens if self-attention mechanism captures...?"**:
     - The difference between "Allows transformers to process data in parallel" and "Transformers can process data in parallel" leads to 0 token-level scores, even though the meaning is correct.

2. **BLEU Score**:
   - BLEU focuses on the n-gram overlap between predicted and ground truth answers. It can give some credit for partial matches but doesn't handle paraphrasing well. Hence, it gives a moderate score (e.g., 0.57 and 0.43), indicating partial overlap but not full matches.

3. **Cosine Similarity**:
   - Cosine similarity between embeddings gives a much better indication of semantic similarity. High scores (e.g., 0.94 and 0.87) indicate that the predicted and ground truth answers are semantically very similar, despite minor word differences.

### What this means:
The token-level metrics (precision, recall, F1, accuracy) are giving a misleading picture in this case because they penalize minor wording differences. BLEU score provides some improvement, but cosine similarity gives the most accurate reflection of how similar the answers are in meaning.

### Next Steps:
- We will **focus on semantic metrics** like cosine similarity that focus on meaning rather than exact matches..

In [9]:
import torch
from transformers import BertTokenizerFast, BertForQuestionAnswering, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk import edit_distance
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import numpy as np

# Load BERT model and tokenizer
model_path = 'D:/Rajesh/Rajesh/Personal/AISanDiego/520-NLP/Project/saved_model/'
model = BertForQuestionAnswering.from_pretrained(model_path)
embedding_model = BertModel.from_pretrained(model_path)  # For embeddings
tokenizer = BertTokenizerFast.from_pretrained(model_path)

# Example context and questions
context = ("Transformers are a type of deep learning model introduced in the paper 'Attention is All You Need' by Vaswani et al. "
           "in 2017. Unlike traditional recurrent neural networks (RNNs), Transformers rely entirely on self-attention mechanisms "
           "to capture the relationships between different parts of a sequence. This allows Transformers to process data in parallel, "
           "making them much faster and more efficient for tasks like natural language processing. Transformers are the foundation for "
           "many state-of-the-art models, such as BERT, GPT, and T5. They are used in a variety of applications, including machine translation, "
           "text generation, and question answering.")

questions = [
    "Name some models that are based on the Transformer architecture",
    "What mechanism does the Transformer rely on",
    "Why are Transformers faster than traditional RNNs?",
    "What happens if self-attention mechanism captures the relationship between different parts of a sequence?",
    "State some applications of Transformers?",
    "Who introduced self Attention?"
]

ground_truths = [
    "BERT, GPT, and T5",
    "Self-attention mechanisms",
    "Transformers rely entirely on self-attention mechanisms, which captures the relationship between different parts of a sentence",
    "Transformers can process data in parallel",
    "Machine translation, text generation, and question answering",
    "Vaswani et al."
]

# Function to extract answer from model
def answer_question(question, context):
    inputs = tokenizer.encode_plus(question, context, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the most likely beginning and end of answer
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

    return answer

# Function to compute metrics
def compute_metrics(pred_answer, true_answer):
    # Tokenize the predicted and true answers
    pred_tokens = tokenizer.tokenize(pred_answer.lower())
    true_tokens = tokenizer.tokenize(true_answer.lower())

    # Token-based Precision, Recall, and F1
    common_tokens = set(pred_tokens) & set(true_tokens)
    precision = len(common_tokens) / len(pred_tokens) if pred_tokens else 0
    recall = len(common_tokens) / len(true_tokens) if true_tokens else 0
    f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0

    # BLEU Score with smoothing
    smoothing_fn = SmoothingFunction().method1
    bleu = sentence_bleu([true_answer.split()], pred_answer.split(), smoothing_function=smoothing_fn)

    # Levenshtein Distance
    levenshtein_dist = edit_distance(pred_answer, true_answer)
    levenshtein_score = 1 - (levenshtein_dist / max(len(pred_answer), len(true_answer)))

    # ROUGE Score
    rouge_scorer_inst = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    rouge_scores = rouge_scorer_inst.score(true_answer, pred_answer)

    # BERTScore
    P, R, F1 = bert_score([pred_answer], [true_answer], lang="en", verbose=False)

    # Cosine Similarity using BERT embeddings
    pred_emb = np.mean(embedding_model.embeddings.word_embeddings(torch.tensor(tokenizer.convert_tokens_to_ids(pred_tokens))).detach().numpy(), axis=0)
    true_emb = np.mean(embedding_model.embeddings.word_embeddings(torch.tensor(tokenizer.convert_tokens_to_ids(true_tokens))).detach().numpy(), axis=0)
    cosine_sim = cosine_similarity([pred_emb], [true_emb])[0][0]

    # Jaccard Similarity (word overlap)
    pred_set = set(pred_answer.split())
    true_set = set(true_answer.split())
    jaccard_sim = len(pred_set & true_set) / len(pred_set | true_set) if pred_set | true_set else 0.0

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'bleu': bleu,
        'levenshtein': levenshtein_score,
        'rouge-1': rouge_scores['rouge1'].fmeasure,
        'rouge-L': rouge_scores['rougeL'].fmeasure,
        'bertscore_f1': F1.mean().item(),
        'cosine_similarity': cosine_sim,
        'jaccard_similarity': jaccard_sim
    }

# Evaluate the model on the questions
metrics_summary = {
    'precision': [],
    'recall': [],
    'f1': [],
    'bleu': [],
    'levenshtein': [],
    'rouge-1': [],
    'rouge-L': [],
    'bertscore_f1': [],
    'cosine_similarity': [],
    'jaccard_similarity': []
}

for question, truth in zip(questions, ground_truths):
    pred_answer = answer_question(question, context)
    print(f"Question: {question}")
    print(f"Predicted Answer: {pred_answer}")
    print(f"Ground Truth: {truth}")

    metrics = compute_metrics(pred_answer, truth)

    for key in metrics_summary:
        metrics_summary[key].append(metrics[key])

    # Print individual metrics
    print(f"Precision: {metrics['precision']:.2f}, Recall: {metrics['recall']:.2f}, F1 Score: {metrics['f1']:.2f}")
    print(f"BLEU Score: {metrics['bleu']:.2f}, Levenshtein Score: {metrics['levenshtein']:.2f}")
    print(f"ROUGE-1: {metrics['rouge-1']:.2f}, ROUGE-L: {metrics['rouge-L']:.2f}")
    print(f"BERTScore F1: {metrics['bertscore_f1']:.2f}, Cosine Similarity: {metrics['cosine_similarity']:.2f}")
    print(f"Jaccard Similarity: {metrics['jaccard_similarity']:.2f}\n")

# Consolidate metrics
print("\n--- Consolidated Metrics ---")
for key, values in metrics_summary.items():
    avg_value = sum(values) / len(values)
    print(f"{key.capitalize().replace('_', ' ')}: {avg_value:.2f}")


Some weights of BertModel were not initialized from the model checkpoint at /content/drive/My Drive/SQuAD_datasets/saved_model/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Question: Name some models that are based on the Transformer architecture
Predicted Answer: bert, gpt, and t5
Ground Truth: BERT, GPT, and T5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 0.88, Recall: 0.88, F1 Score: 0.88
BLEU Score: 0.08, Levenshtein Score: 0.53
ROUGE-1: 1.00, ROUGE-L: 1.00
BERTScore F1: 0.94, Cosine Similarity: 1.00
Jaccard Similarity: 0.14

Question: What mechanism does the Transformer rely on
Predicted Answer: self - attention mechanisms
Ground Truth: Self-attention mechanisms


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 1.00, Recall: 1.00, F1 Score: 1.00
BLEU Score: 0.08, Levenshtein Score: 0.89
ROUGE-1: 1.00, ROUGE-L: 1.00
BERTScore F1: 0.95, Cosine Similarity: 1.00
Jaccard Similarity: 0.20

Question: Why are Transformers faster than traditional RNNs?
Predicted Answer: transformers rely entirely on self - attention mechanisms to capture the relationships between different parts of a sequence
Ground Truth: Transformers rely entirely on self-attention mechanisms, which captures the relationship between different parts of a sentence


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 0.78, Recall: 0.74, F1 Score: 0.76
BLEU Score: 0.28, Levenshtein Score: 0.90
ROUGE-1: 0.88, ROUGE-L: 0.88
BERTScore F1: 0.95, Cosine Similarity: 0.98
Jaccard Similarity: 0.36

Question: What happens if self-attention mechanism captures the relationship between different parts of a sequence?
Predicted Answer: allows transformers to process data in parallel
Ground Truth: Transformers can process data in parallel


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 0.71, Recall: 0.83, F1 Score: 0.77
BLEU Score: 0.41, Levenshtein Score: 0.77
ROUGE-1: 0.77, ROUGE-L: 0.77
BERTScore F1: 0.94, Cosine Similarity: 0.95
Jaccard Similarity: 0.44

Question: State some applications of Transformers?
Predicted Answer: machine translation, text generation, and question answering
Ground Truth: Machine translation, text generation, and question answering


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 0.89, Recall: 0.89, F1 Score: 0.89
BLEU Score: 0.81, Levenshtein Score: 0.98
ROUGE-1: 1.00, ROUGE-L: 1.00
BERTScore F1: 1.00, Cosine Similarity: 1.00
Jaccard Similarity: 0.75

Question: Who introduced self Attention?
Predicted Answer: vaswani et al.
Ground Truth: Vaswani et al.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 1.00, Recall: 1.00, F1 Score: 1.00
BLEU Score: 0.24, Levenshtein Score: 0.93
ROUGE-1: 1.00, ROUGE-L: 1.00
BERTScore F1: 1.00, Cosine Similarity: 1.00
Jaccard Similarity: 0.50


--- Consolidated Metrics ---
Precision: 0.88
Recall: 0.89
F1: 0.88
Bleu: 0.32
Levenshtein: 0.83
Rouge-1: 0.94
Rouge-l: 0.94
Bertscore f1: 0.96
Cosine similarity: 0.99
Jaccard similarity: 0.40


Here’s an interpretation of the consolidated metrics we obtained for the model:

### 1. **Precision: 0.88**
   - **Meaning**: Of all the tokens the model predicted in its answers, 88% were correct.
   - **Interpretation**: The model is fairly precise in predicting relevant tokens, meaning it is good at predicting correct information without including too much irrelevant content.

### 2. **Recall: 0.89**
   - **Meaning**: Of all the relevant tokens from the ground truth answers, the model correctly retrieved 89%.
   - **Interpretation**: The model does a good job at recalling the necessary details from the ground truth, slightly better than precision, which means it's retrieving most relevant tokens.

### 3. **F1 Score: 0.88**
   - **Meaning**: This is the harmonic mean of precision and recall, reflecting a balance between the two.
   - **Interpretation**: A high F1 score indicates the model performs well overall, balancing between not predicting too much irrelevant information (precision) and still retrieving most of the relevant tokens (recall).

### 4. **BLEU Score: 0.32**
   - **Meaning**: BLEU (Bilingual Evaluation Understudy) evaluates how closely the predicted answer matches the ground truth, considering n-gram overlaps.
   - **Interpretation**: A BLEU score of 0.32 indicates that while the model is capturing some of the n-grams (word sequences), it's not perfectly aligned with the structure of the ground truth. This score tends to be lower when predictions deviate in word choice or order from the expected answer.

### 5. **Levenshtein Score: 0.83**
   - **Meaning**: The Levenshtein distance measures how many edits (insertions, deletions, or substitutions) are needed to transform the predicted answer into the ground truth. A score of 0.83 indicates an 83% match.
   - **Interpretation**: This high score suggests that the predicted answers are quite close to the ground truth in terms of word similarity, with only minor changes needed.

### 6. **ROUGE-1: 0.94 & ROUGE-L: 0.94**
   - **Meaning**: ROUGE-1 measures the overlap of unigrams (individual words), while ROUGE-L evaluates the longest common subsequence between the prediction and ground truth.
   - **Interpretation**: A score of 0.94 for both indicates that the model’s answers contain a significant amount of the same words and sequences as the ground truth, showing strong alignment.

### 7. **BERTScore F1: 0.96**
   - **Meaning**: BERTScore evaluates the semantic similarity between the predicted and true answers using embeddings. A score of 0.96 indicates very high semantic similarity.
   - **Interpretation**: The model’s predictions are highly aligned semantically with the ground truth, even if they may not match perfectly word-for-word.

### 8. **Cosine Similarity: 0.99**
   - **Meaning**: Cosine similarity compares the vector embeddings of the predicted and true answers. A value of 0.99 indicates that the vectors are almost identical.
   - **Interpretation**: The predictions are nearly identical to the true answers in terms of overall meaning and representation in embedding space.

### 9. **Jaccard Similarity: 0.40**
   - **Meaning**: Jaccard similarity measures the overlap of unique words between the predicted and ground truth answers. A score of 0.40 indicates only a moderate amount of overlap in word choice.
   - **Interpretation**: The model's predicted answers contain only 40% of the same unique words as the ground truth. This suggests that while the answers are semantically correct (based on BERTScore and cosine similarity), the exact word usage differs more significantly.

### **Overall Interpretation**:
The model performs very well on semantic metrics like **BERTScore** and **Cosine Similarity**, indicating that it captures the meaning of the answers quite well. The high **ROUGE** scores also show strong word overlap with the ground truth. However, the **BLEU** and **Jaccard** scores are somewhat lower, suggesting that the exact wording or phrasing of the model’s responses may deviate from the expected ground truth, even though the meaning remains highly accurate.

In summary, while the model might not always match the exact wording of the expected answers, it is highly effective in capturing the underlying meaning.