In [None]:
#TESTING:
!pip install datasets
!pip install rouge_score
!pip install accelerate
from datasets import load_dataset, load_metric
from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn.functional as F
from rouge_score import rouge_scorer
import json
import math
from sklearn.metrics import roc_auc_score
import numpy as np
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


print("JAI MATA DI!")
# Load the raw SQuAD v2.0 training file; `squad_examples` is the list stored
# under its top-level 'data' key (each entry carries a 'paragraphs' list).
input_file_path = 'train-v2.0.json'
with open(input_file_path, 'r') as f:
  squad_data = json.load(f)

squad_examples = squad_data['data']

# ROUGE metric object used later to label generated answers as correct/incorrect.
rouge_metric = load_metric("rouge")

# Extractive QA model: produces the candidate answer spans and is also the
# model queried by calculate_ptrue.
qa_model_name = "deepset/roberta-base-squad2"
qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)

# NLI model: used by check_entailment to decide whether two answers belong
# to the same cluster.
nli_model_name = "facebook/bart-large-mnli"
nli_model = AutoModelForSequenceClassification.from_pretrained(nli_model_name)
nli_tokenizer = AutoTokenizer.from_pretrained(nli_model_name)

# Causal language model: passed to calculate_seq_log_prob to score answers.
clm_model_name = "gpt2"
clm_model = AutoModelForCausalLM.from_pretrained(clm_model_name)
clm_tokenizer = AutoTokenizer.from_pretrained(clm_model_name)


# Move all three models to the selected device (GPU when available).
qa_model.to(device)
nli_model.to(device)
clm_model.to(device)

def generate_answers(question, context, top_n=5):
    """Return the `top_n` highest-scoring answer spans for a question.

    The question/context pair is run through the extractive QA model and
    every span of at most 30 tokens is scored as
    ``P(start) * P(end)``.  Spans are taken over the whole encoded input,
    so a span may cover question or special tokens.

    Args:
        question: Question text.
        context: Passage the answer should be extracted from.
        top_n: Maximum number of spans to return.

    Returns:
        A list of ``(answer_text, score)`` tuples ordered by descending
        score, where ``score`` is a Python float.
    """
    inputs = qa_tokenizer(question, context, truncation=True, max_length=512, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = qa_model(**inputs)
    start_probs = F.softmax(outputs.start_logits, dim=1).squeeze(0)
    end_probs = F.softmax(outputs.end_logits, dim=1).squeeze(0)
    max_answer_length = 30

    # Score every (start, end) pair at once instead of building one device
    # scalar per span in Python: span_scores[s, e] = P(start=s) * P(end=e).
    seq_len = start_probs.shape[0]
    span_scores = start_probs.unsqueeze(1) * end_probs.unsqueeze(0)

    # A span is valid when 0 <= end - start < max_answer_length.
    positions = torch.arange(seq_len, device=span_scores.device)
    span_length = positions.unsqueeze(0) - positions.unsqueeze(1)
    valid = (span_length >= 0) & (span_length < max_answer_length)

    # Probabilities are non-negative, so -1 sorts below every valid span.
    span_scores = span_scores.masked_fill(~valid, -1.0)
    k = max(0, min(top_n, int(valid.sum().item())))
    top_scores, flat_indices = torch.topk(span_scores.flatten(), k)

    input_ids = inputs["input_ids"][0]
    answers = []
    for score, flat_index in zip(top_scores.tolist(), flat_indices.tolist()):
        start_idx, end_idx = divmod(flat_index, seq_len)
        answer = qa_tokenizer.decode(input_ids[start_idx:end_idx + 1])
        answers.append((answer, score))
    return answers


# CHECKING THE SEMANTIC SIMILARITY USING THE BIDIRECTIONAL ENTAILMENT ALGORITHM. RESEARCH PAPER: https://arxiv.org/pdf/1911.00681.pdf
# Bi-directional entailment involves checking whether two texts (typically a hypothesis and a premise) can entail each other, implying a deep semantic similarity or paraphrase relationship.
# Mathematical Basis of Bi-directional Entailment:
# To implement bi-directional entailment, each text is considered both as a hypothesis and a premise against the other text. This involves two checks:
# Forward Entailment: Whether the premise (first text) semantically entails the hypothesis (second text).
# Backward Entailment: Whether the hypothesis (second text) semantically entails the premise (first text).
# If both conditions are met, the texts are considered semantically equivalent, akin to paraphrases. Mathematically, this is often represented using probabilities derived from a model trained on entailment tasks, such as those derived from the MNLI dataset using a BERT model.

# Given probabilities of entailment (P) from a softmax layer for both forward and backward directions, the odds of entailment are calculated as:
# Odds = P / (1 - P)
# The final score for bi-directional entailment could be the product of the odds for both directions, ensuring that high entailment probabilities in both directions yield a higher score.
def check_entailment(premise, hypothesis):
    """Return the probability that `premise` entails `hypothesis`.

    The neutral class is discarded and the remaining contradiction and
    entailment logits are renormalised with a softmax, so the result is
    P(entailment | entailment or contradiction).

    Args:
        premise: Text assumed to be true.
        hypothesis: Text tested against the premise.

    Returns:
        Entailment probability as a float in [0, 1].
    """
    inputs = nli_tokenizer(premise, hypothesis, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = nli_model(**inputs)
    logits = outputs.logits
    # facebook/bart-large-mnli orders its labels as
    # 0 = contradiction, 1 = neutral, 2 = entailment.  After slicing to
    # [contradiction, entailment] the entailment probability is column 1;
    # column 0 would be the contradiction probability.
    entail_contradiction_logits = logits[:, [0, 2]]
    probs = F.softmax(entail_contradiction_logits, dim=1)
    entail_prob = probs[:, 1].item()
    return entail_prob

def cluster_answers(answers, question, threshold=0.4):
    """Greedily group answers that entail each other in both directions.

    Each answer is compared with the first member (the representative) of
    every existing cluster, in order.  It joins the first cluster for which
    both entailment directions score above `threshold`; otherwise it starts
    a new cluster.

    Args:
        answers: Sequence of entries whose first element is the answer text
            (e.g. ``(answer, confidence)`` tuples).
        question: Question text, prepended to the premise of each check.
        threshold: Minimum entailment probability required in each
            direction for two answers to share a cluster.

    Returns:
        A list of clusters, each a list of answer strings.
    """
    clusters = []
    for entry in answers:
        answer = entry[0]
        for cluster in clusters:
            representative_answer = cluster[0]
            forward_entail_prob = check_entailment(question + " " + representative_answer, answer)
            if forward_entail_prob <= threshold:
                # The backward check cannot rescue this pair; skip the extra NLI call.
                continue
            backward_entail_prob = check_entailment(question + " " + answer, representative_answer)
            if backward_entail_prob > threshold:
                cluster.append(answer)
                break
        else:
            # No existing cluster accepted the answer.
            clusters.append([answer])
    return clusters

def calculate_average_clusters(all_question_data):
    """Average the cluster count separately for correct and incorrect questions.

    A question counts as correct when at least one of its generated answers
    has a truthy ``"correct"`` field.

    Args:
        all_question_data: Iterable of dicts with ``"clusters"`` and
            ``"generated_answers"`` entries.

    Returns:
        ``(average_correct_clusters, average_incorrect_clusters)``; a group
        with no questions yields 0.
    """
    # Index 1 collects correct questions, index 0 the incorrect ones.
    cluster_totals = [0, 0]
    question_counts = [0, 0]
    for entry in all_question_data:
        cluster_count = len(entry["clusters"])
        has_correct = any(candidate["correct"] for candidate in entry["generated_answers"])
        bucket = 1 if has_correct else 0
        cluster_totals[bucket] += cluster_count
        question_counts[bucket] += 1

    def _mean(total, count):
        if count > 0:
            return total / count
        return 0

    return (
        _mean(cluster_totals[1], question_counts[1]),
        _mean(cluster_totals[0], question_counts[0]),
    )

def calculate_ptrue(question, generated_answers, top_n=5):
    """Score how strongly the QA model selects "True" in a p(True) prompt.

    A prompt listing the brainstormed answers and asking whether the
    top-ranked answer is "(A) True (B) False" is fed to the extractive QA
    model.  The returned value is the softmax probability, taken over all
    spans that contain the template's "True" token, of the best such span.

    Args:
        question: Question text.
        generated_answers: Sequence of ``(answer, confidence)`` pairs,
            best answer first.
        top_n: Number of answers listed as brainstormed ideas.

    Returns:
        A float in (0, 1]; 0.5 when the "True" token is not present in the
        encoded prompt.
    """
    # The candidate under judgement is the highest-ranked answer.  The
    # template previously left a literal "{}" here, so the model was never
    # shown the answer it was asked to judge.
    possible_answer = generated_answers[0][0] if generated_answers else ""

    # Format the prompt
    prompt = f"Question: {question}\nHere are some brainstormed ideas:\n"
    prompt += "".join(answer + "\n" for answer, _ in generated_answers[:top_n])
    prompt += f"Possible Answer: {possible_answer}\nIs the possible answer: (A) True (B) False\nThe possible answer is:"

    # Truncate so an unusually long prompt cannot exceed the model's input size.
    inputs = qa_tokenizer(prompt, truncation=True, max_length=512, return_tensors="pt").to(device)

    # Find the token index of "True" in the prompt.  The last occurrence is
    # the one in "(A) True"; earlier ones can only come from the question
    # or the listed answers.
    true_token_id = qa_tokenizer.convert_tokens_to_ids("ĠTrue")
    true_token_indices = (inputs["input_ids"][0] == true_token_id).nonzero(as_tuple=True)[0]
    if true_token_indices.nelement() == 0:
        print(f"Warning: 'True' token not found in prompt for question: {question}")
        return 0.5
    true_token_index = true_token_indices[-1].item()

    with torch.no_grad():
        outputs = qa_model(**inputs)
    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]

    # Spans containing the token are exactly {start <= t} x {end >= t}, and
    # the span score start_logit + end_logit is separable.  The softmax
    # probability of the best span therefore factorises into the best start
    # probability times the best end probability, with no O(n^2) enumeration.
    best_start_prob = F.softmax(start_logits[: true_token_index + 1], dim=0).max()
    best_end_prob = F.softmax(end_logits[true_token_index:], dim=0).max()
    p_true = (best_start_prob * best_end_prob).item()

    return p_true
def calculate_lexical_similarity(answers, reference_answers):
    """Mean best ROUGE-1 F-measure of the answers against the references.

    For each answer the highest stemmed ROUGE-1 F-measure over all
    references is kept; the result is the mean of those maxima.

    Args:
        answers: Answer strings to score.
        reference_answers: Reference strings to compare against.

    Returns:
        The mean score, or 0.5 when there are no references or no answers.
    """
    if not reference_answers:
        return 0.5

    rouge1 = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    best_per_answer = [
        max(rouge1.score(candidate, reference)['rouge1'].fmeasure for reference in reference_answers)
        for candidate in answers
    ]
    if not best_per_answer:
        return 0.5
    return sum(best_per_answer) / len(best_per_answer)

def calculate_semantic_entropy(cluster_probabilities):
    """Return the Shannon entropy, in bits, of the cluster probabilities.

    Entries that are not strictly positive contribute nothing.
    """
    total = 0
    for mass in cluster_probabilities:
        if not mass > 0:
            continue
        surprisal_bits = math.log2(mass)
        total = total - mass * surprisal_bits
    return total

def calculate_seq_log_prob(question, context, answer, model, tokenizer):
    """Return the mean per-token log-probability of `answer` given the prompt.

    The prompt is ``question + ' ' + context``.  Prompt and answer are
    encoded separately, concatenated, and scored with a single forward
    pass of the causal language model; only the answer tokens contribute
    to the average.

    Args:
        question: Question text.
        context: Passage text.
        answer: Answer text to score.
        model: Causal LM whose call returns an object with ``.logits``.
        tokenizer: Tokenizer matching `model`.

    Returns:
        Mean natural-log probability of the answer tokens as a float, or
        0.0 when there is no answer token to score.
    """
    input_text = question + ' ' + context
    output_text = answer

    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    output_ids = tokenizer.encode(output_text, return_tensors='pt').to(device)

    num_answer_tokens = output_ids.shape[-1]
    if num_answer_tokens == 0:
        # Nothing to score; avoids dividing by zero.
        return 0.0

    # Keep the prompt short enough that prompt + answer fits the model's
    # position limit.  The tail of the context is dropped so the question,
    # which comes first, is preserved.
    max_positions = getattr(getattr(model, "config", None), "max_position_embeddings", None)
    if max_positions is not None:
        prompt_budget = max(max_positions - num_answer_tokens, 0)
        input_ids = input_ids[:, :prompt_budget]

    # Condition on the prompt: the answer is scored as a continuation of it
    # rather than in isolation.
    full_ids = torch.cat([input_ids.long(), output_ids.long()], dim=-1)
    with torch.no_grad():
        logits = model(full_ids).logits

    # Logits at position p predict token p + 1, so the token at position p
    # is scored by row p - 1.  With an empty prompt the very first answer
    # token has no predecessor and cannot be scored.
    first_scored = max(input_ids.shape[-1], 1)
    targets = full_ids[0, first_scored:]
    if targets.numel() == 0:
        return 0.0
    log_probs = F.log_softmax(logits[0, first_scored - 1:-1, :], dim=-1)
    token_log_probs = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)

    # Average over the tokens that were actually scored.
    seq_log_prob = token_log_probs.mean().item()
    return seq_log_prob

all_question_data = []
correct_output = []
max_iterations = 1
all_context_data=[]
# These feed the reported AUROC scores, so every one of them must start at
# zero; a non-zero seed would be added straight onto the metric.
auroc_ptrue =0
auroc_lexical =0
auroc_entropy =0
x=0

# Generate, label and cluster answers for the first `max_iterations` SQuAD
# articles.  One context_data record is produced per article.
for context_idx, squad_example in enumerate(squad_examples):
    print("iteration number: ",x)
    if x>= max_iterations:
        break
    x+=1
    context_data = {"context_index": context_idx, "questions": []}
    print("PARA ITERATION")
    for k, paragraph in enumerate(squad_example['paragraphs'], start=1):
        # Progress marker every ten paragraphs.
        if k % 10 == 0:
            print(k)
        context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']
            reference_answers = [answer['text'] for answer in qa['answers']]
            if not reference_answers:
                # Questions without references cannot be labelled and are
                # never stored, so skip them before running any model.
                continue
            question_data = {"question": question, "reference_answers": reference_answers, "generated_answers": []}

            # Generate answers using the model (on GPU)
            model_answers_with_confidence = generate_answers(question, context, top_n=5)

            # Evaluate and store correct/incorrect labels
            for answer, confidence in model_answers_with_confidence:
                # Score against each reference separately and keep the best
                # match: the metric pairs predictions and references
                # one-to-one, so a single prediction cannot be passed with
                # several references in one call.
                rouge_l_score = max(
                    rouge_metric.compute(predictions=[answer], references=[reference], rouge_types=["rougeL"])["rougeL"].mid.fmeasure
                    for reference in reference_answers
                )
                correct_output.append(1 if rouge_l_score > 0.3 else 0)
                question_data["generated_answers"].append({"answer": answer, "confidence": confidence, "rougeL_score": rouge_l_score, "correct": correct_output[-1]})

            # Calculate lexical similarity, p(True) and the answer clusters,
            # then store the question.
            question_data["lexical_similarity"] = calculate_lexical_similarity([answer[0] for answer in model_answers_with_confidence], reference_answers)
            question_data["p_true"] = calculate_ptrue(question, model_answers_with_confidence, top_n=5)
            clusters = cluster_answers(model_answers_with_confidence, question)
            question_data["clusters"] = clusters
            context_data["questions"].append(question_data)

    all_context_data.append(context_data)

# Flatten every stored question across all contexts and compute the two
# global cluster averages from them.
flattened_questions = []
for context_data in all_context_data:
    flattened_questions.extend(context_data["questions"])
average_correct_clusters, average_incorrect_clusters = calculate_average_clusters(flattened_questions)

# Attach the same global averages to each context record.
for context_data in all_context_data:
    context_data.update({
        "average_correct_clusters": average_correct_clusters,
        "average_incorrect_clusters": average_incorrect_clusters,
    })

# Persist the clustering results as JSON.
with open("context_question_clustering_data.json", "w") as f:
    json.dump(all_context_data, f, indent=4)

# Iterate over the data and calculate sequence log probabilities
with open("context_question_clustering_data.json", "r") as f:
    all_context_data = json.load(f)
for context_data in all_context_data:
    context_index = context_data["context_index"]
    paragraphs = squad_examples[context_index]["paragraphs"]

    # The stored records only carry the article index, so recover each
    # question's own paragraph from the article.  Scoring every question
    # against the first paragraph would pair most of them with the wrong
    # passage.
    context_by_question = {}
    for paragraph in paragraphs:
        for qa in paragraph["qas"]:
            context_by_question.setdefault(qa["question"], paragraph["context"])

    for question_data in context_data["questions"]:
        question = question_data["question"]
        context = context_by_question.get(question, paragraphs[0]["context"])
        generated_answers = question_data["generated_answers"]
        log_probs = []
        for answer_dict in generated_answers:
            answer = answer_dict["answer"]
            log_prob = calculate_seq_log_prob(question, context, answer, clm_model, clm_tokenizer)
            log_probs.append(log_prob)
            answer_dict["log_prob"] = log_prob
        # A question without generated answers has no average to report.
        avg_log_prob = sum(log_probs) / len(log_probs) if log_probs else None
        question_data["avg_log_prob"] = avg_log_prob
with open("context_question_clustering_data.json", "w") as f:
    json.dump(all_context_data, f, indent=4)

# Calculate semantic entropy for each question and store in a new JSON file
all_question_entropy_data = []
with open("context_question_clustering_data.json", "r") as f:
    all_context_data = json.load(f)
for context_data in all_context_data:
    for question_data in context_data["questions"]:
        # Confidence of every generated answer, keyed by its text.
        answer_confidences = {}
        for generated in question_data["generated_answers"]:
            answer_confidences[generated["answer"]] = generated["confidence"]

        # Mass of a cluster = summed confidence of its members; the masses
        # are normalised by their total to give cluster probabilities.
        clusters = question_data["clusters"]
        cluster_masses = [
            sum(answer_confidences[member] for member in cluster) for cluster in clusters
        ]
        total_prob = sum(cluster_masses)
        cluster_probabilities = [mass / total_prob for mass in cluster_masses]

        question_data["semantic_entropy"] = calculate_semantic_entropy(cluster_probabilities)
        all_question_entropy_data.append(question_data)
with open("question_entropy_data.json", "w") as f:
    json.dump(all_question_entropy_data, f, indent=4)

# Build one row per generated answer: the question-level scores are repeated
# for each answer and paired with that answer's correct/incorrect label.
ptrue_values = []
lexical_similarity_values = []
entropy_values = []
correct_labels = []
for question_data in all_question_entropy_data:
    for answer in question_data["generated_answers"]:
        ptrue_values.append(question_data["p_true"])
        lexical_similarity_values.append(question_data["lexical_similarity"])
        entropy_values.append(question_data["semantic_entropy"])
        correct_labels.append(answer["correct"])

# Calculate AUROC scores.  Each score is assigned directly: accumulating
# onto a pre-seeded variable would add that seed to the reported metric.
auroc_ptrue = roc_auc_score(correct_labels, ptrue_values)
auroc_lexical = roc_auc_score(correct_labels, lexical_similarity_values)
# Entropy measures uncertainty (higher = less likely correct), while
# roc_auc_score expects larger scores for the positive class, so it is
# negated before scoring against the "correct" labels.
auroc_entropy = roc_auc_score(correct_labels, [-entropy for entropy in entropy_values])

# Store AUROC scores in a dictionary
auroc_scores = {
    "p_true": auroc_ptrue,
    "lexical_similarity": auroc_lexical,
    "semantic_entropy": auroc_entropy
}

# Save AUROC scores to a JSON file
with open("auroc_scores.json", "w") as f:
    json.dump(auroc_scores, f, indent=4)

# Print AUROC scores
print("AUROC Scores:")
for metric, score in auroc_scores.items():
    print(f"{metric}: {score:.4f}")




JAI MATA DI!


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


iteration number:  0
PARA ITERATION
10
20
30
40
50
60


In [None]:
# Calculate semantic entropy for each question and store in a new JSON file
all_question_entropy_data = []
with open("context_question_clustering_data.json", "r") as f:
    all_context_data = json.load(f)
for context_data in all_context_data:
    for question_data in context_data["questions"]:
        # Confidence of every generated answer, keyed by its text.
        answer_confidences = {}
        for generated in question_data["generated_answers"]:
            answer_confidences[generated["answer"]] = generated["confidence"]

        # Mass of a cluster = summed confidence of its members; the masses
        # are normalised by their total to give cluster probabilities.
        clusters = question_data["clusters"]
        cluster_masses = [
            sum(answer_confidences[member] for member in cluster) for cluster in clusters
        ]
        total_prob = sum(cluster_masses)
        cluster_probabilities = [mass / total_prob for mass in cluster_masses]

        question_data["semantic_entropy"] = calculate_semantic_entropy(cluster_probabilities)
        all_question_entropy_data.append(question_data)
with open("question_entropy_data.json", "w") as f:
    json.dump(all_question_entropy_data, f, indent=4)

# Build one row per generated answer: the question-level scores are repeated
# for each answer and paired with that answer's correct/incorrect label.
ptrue_values = []
lexical_similarity_values = []
entropy_values = []
correct_labels = []
for question_data in all_question_entropy_data:
    for answer in question_data["generated_answers"]:
        ptrue_values.append(question_data["p_true"])
        lexical_similarity_values.append(question_data["lexical_similarity"])
        entropy_values.append(question_data["semantic_entropy"])
        correct_labels.append(answer["correct"])

# Calculate AUROC scores
auroc_ptrue = roc_auc_score(correct_labels, ptrue_values)
auroc_lexical = roc_auc_score(correct_labels, lexical_similarity_values)
# Entropy measures uncertainty (higher = less likely correct), while
# roc_auc_score expects larger scores for the positive class, so it is
# negated before scoring against the "correct" labels.
auroc_entropy = roc_auc_score(correct_labels, [-entropy for entropy in entropy_values])

# Store AUROC scores in a dictionary
auroc_scores = {
    "p_true": auroc_ptrue,
    "lexical_similarity": auroc_lexical,
    "semantic_entropy": auroc_entropy
}

# Save AUROC scores to a JSON file
with open("auroc_scores.json", "w") as f:
    json.dump(auroc_scores, f, indent=4)

# Print AUROC scores
print("AUROC Scores:")
for metric, score in auroc_scores.items():
    print(f"{metric}: {score:.4f}")

AUROC Scores:
p_true: 0.5042
lexical_similarity: 0.8203
semantic_entropy: 0.6023


In [None]:
#TESTING:
!pip install datasets
!pip install rouge_score
!pip install accelerate
from datasets import load_dataset, load_metric
from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn.functional as F
from rouge_score import rouge_scorer
import json
import math
from sklearn.metrics import roc_auc_score
import numpy as np
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


print("JAI MATA DI!")
# Load the raw SQuAD v2.0 training file; `squad_examples` is the list stored
# under its top-level 'data' key (each entry carries a 'paragraphs' list).
input_file_path = 'train-v2.0.json'
with open(input_file_path, 'r') as f:
  squad_data = json.load(f)

squad_examples = squad_data['data']

# ROUGE metric object used later to label generated answers as correct/incorrect.
rouge_metric = load_metric("rouge")

# Extractive QA model: produces the candidate answer spans and is also the
# model queried by calculate_ptrue.
qa_model_name = "deepset/roberta-base-squad2"
qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)

# NLI model: used by check_entailment to decide whether two answers belong
# to the same cluster.
nli_model_name = "facebook/bart-large-mnli"
nli_model = AutoModelForSequenceClassification.from_pretrained(nli_model_name)
nli_tokenizer = AutoTokenizer.from_pretrained(nli_model_name)

# Causal language model: passed to calculate_seq_log_prob to score answers.
clm_model_name = "gpt2"
clm_model = AutoModelForCausalLM.from_pretrained(clm_model_name)
clm_tokenizer = AutoTokenizer.from_pretrained(clm_model_name)


# Move all three models to the selected device (GPU when available).
qa_model.to(device)
nli_model.to(device)
clm_model.to(device)

def generate_answers(question, context, top_n=5):
    """Return the `top_n` highest-scoring answer spans for a question.

    The question/context pair is run through the extractive QA model and
    every span of at most 30 tokens is scored as
    ``P(start) * P(end)``.  Spans are taken over the whole encoded input,
    so a span may cover question or special tokens.

    Args:
        question: Question text.
        context: Passage the answer should be extracted from.
        top_n: Maximum number of spans to return.

    Returns:
        A list of ``(answer_text, score)`` tuples ordered by descending
        score, where ``score`` is a Python float.
    """
    inputs = qa_tokenizer(question, context, truncation=True, max_length=512, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = qa_model(**inputs)
    start_probs = F.softmax(outputs.start_logits, dim=1).squeeze(0)
    end_probs = F.softmax(outputs.end_logits, dim=1).squeeze(0)
    max_answer_length = 30

    # Score every (start, end) pair at once instead of building one device
    # scalar per span in Python: span_scores[s, e] = P(start=s) * P(end=e).
    seq_len = start_probs.shape[0]
    span_scores = start_probs.unsqueeze(1) * end_probs.unsqueeze(0)

    # A span is valid when 0 <= end - start < max_answer_length.
    positions = torch.arange(seq_len, device=span_scores.device)
    span_length = positions.unsqueeze(0) - positions.unsqueeze(1)
    valid = (span_length >= 0) & (span_length < max_answer_length)

    # Probabilities are non-negative, so -1 sorts below every valid span.
    span_scores = span_scores.masked_fill(~valid, -1.0)
    k = max(0, min(top_n, int(valid.sum().item())))
    top_scores, flat_indices = torch.topk(span_scores.flatten(), k)

    input_ids = inputs["input_ids"][0]
    answers = []
    for score, flat_index in zip(top_scores.tolist(), flat_indices.tolist()):
        start_idx, end_idx = divmod(flat_index, seq_len)
        answer = qa_tokenizer.decode(input_ids[start_idx:end_idx + 1])
        answers.append((answer, score))
    return answers


# CHECKING THE SEMANTIC SIMILARITY USING THE BIDIRECTIONAL ENTAILMENT ALGORITHM. RESEARCH PAPER: https://arxiv.org/pdf/1911.00681.pdf
# Bi-directional entailment involves checking whether two texts (typically a hypothesis and a premise) can entail each other, implying a deep semantic similarity or paraphrase relationship.
# Mathematical Basis of Bi-directional Entailment:
# To implement bi-directional entailment, each text is considered both as a hypothesis and a premise against the other text. This involves two checks:
# Forward Entailment: Whether the premise (first text) semantically entails the hypothesis (second text).
# Backward Entailment: Whether the hypothesis (second text) semantically entails the premise (first text).
# If both conditions are met, the texts are considered semantically equivalent, akin to paraphrases. Mathematically, this is often represented using probabilities derived from a model trained on entailment tasks, such as those derived from the MNLI dataset using a BERT model.

# Given probabilities of entailment (P) from a softmax layer for both forward and backward directions, the odds of entailment are calculated as:
# Odds = P / (1 - P)
# The final score for bi-directional entailment could be the product of the odds for both directions, ensuring that high entailment probabilities in both directions yield a higher score.
def check_entailment(premise, hypothesis):
    """Return the probability that `premise` entails `hypothesis`.

    The neutral class is discarded and the remaining contradiction and
    entailment logits are renormalised with a softmax, so the result is
    P(entailment | entailment or contradiction).

    Args:
        premise: Text assumed to be true.
        hypothesis: Text tested against the premise.

    Returns:
        Entailment probability as a float in [0, 1].
    """
    inputs = nli_tokenizer(premise, hypothesis, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = nli_model(**inputs)
    logits = outputs.logits
    # facebook/bart-large-mnli orders its labels as
    # 0 = contradiction, 1 = neutral, 2 = entailment.  After slicing to
    # [contradiction, entailment] the entailment probability is column 1;
    # column 0 would be the contradiction probability.
    entail_contradiction_logits = logits[:, [0, 2]]
    probs = F.softmax(entail_contradiction_logits, dim=1)
    entail_prob = probs[:, 1].item()
    return entail_prob

def cluster_answers(answers, question, threshold=0.2):
    """Greedily group answers that entail each other in both directions.

    Each answer is compared with the first member (the representative) of
    every existing cluster, in order.  It joins the first cluster for which
    both entailment directions score above `threshold`; otherwise it starts
    a new cluster.

    Args:
        answers: Sequence of entries whose first element is the answer text
            (e.g. ``(answer, confidence)`` tuples).
        question: Question text, prepended to the premise of each check.
        threshold: Minimum entailment probability required in each
            direction for two answers to share a cluster.

    Returns:
        A list of clusters, each a list of answer strings.
    """
    clusters = []
    for entry in answers:
        answer = entry[0]
        for cluster in clusters:
            representative_answer = cluster[0]
            forward_entail_prob = check_entailment(question + " " + representative_answer, answer)
            if forward_entail_prob <= threshold:
                # The backward check cannot rescue this pair; skip the extra NLI call.
                continue
            backward_entail_prob = check_entailment(question + " " + answer, representative_answer)
            if backward_entail_prob > threshold:
                cluster.append(answer)
                break
        else:
            # No existing cluster accepted the answer.
            clusters.append([answer])
    return clusters

def calculate_average_clusters(all_question_data):
    """Average the cluster count separately for correct and incorrect questions.

    A question counts as correct when at least one of its generated answers
    has a truthy ``"correct"`` field.

    Args:
        all_question_data: Iterable of dicts with ``"clusters"`` and
            ``"generated_answers"`` entries.

    Returns:
        ``(average_correct_clusters, average_incorrect_clusters)``; a group
        with no questions yields 0.
    """
    # Index 1 collects correct questions, index 0 the incorrect ones.
    cluster_totals = [0, 0]
    question_counts = [0, 0]
    for entry in all_question_data:
        cluster_count = len(entry["clusters"])
        has_correct = any(candidate["correct"] for candidate in entry["generated_answers"])
        bucket = 1 if has_correct else 0
        cluster_totals[bucket] += cluster_count
        question_counts[bucket] += 1

    def _mean(total, count):
        if count > 0:
            return total / count
        return 0

    return (
        _mean(cluster_totals[1], question_counts[1]),
        _mean(cluster_totals[0], question_counts[0]),
    )

def calculate_ptrue(question, generated_answers, top_n=5):
    """Score how strongly the QA model selects "True" in a p(True) prompt.

    A prompt listing the brainstormed answers and asking whether the
    top-ranked answer is "(A) True (B) False" is fed to the extractive QA
    model.  The returned value is the softmax probability, taken over all
    spans that contain the template's "True" token, of the best such span.

    Args:
        question: Question text.
        generated_answers: Sequence of ``(answer, confidence)`` pairs,
            best answer first.
        top_n: Number of answers listed as brainstormed ideas.

    Returns:
        A float in (0, 1]; 0.5 when the "True" token is not present in the
        encoded prompt.
    """
    # The candidate under judgement is the highest-ranked answer.  The
    # template previously left a literal "{}" here, so the model was never
    # shown the answer it was asked to judge.
    possible_answer = generated_answers[0][0] if generated_answers else ""

    # Format the prompt
    prompt = f"Question: {question}\nHere are some brainstormed ideas:\n"
    prompt += "".join(answer + "\n" for answer, _ in generated_answers[:top_n])
    prompt += f"Possible Answer: {possible_answer}\nIs the possible answer: (A) True (B) False\nThe possible answer is:"

    # Truncate so an unusually long prompt cannot exceed the model's input size.
    inputs = qa_tokenizer(prompt, truncation=True, max_length=512, return_tensors="pt").to(device)

    # Find the token index of "True" in the prompt.  The last occurrence is
    # the one in "(A) True"; earlier ones can only come from the question
    # or the listed answers.
    true_token_id = qa_tokenizer.convert_tokens_to_ids("ĠTrue")
    true_token_indices = (inputs["input_ids"][0] == true_token_id).nonzero(as_tuple=True)[0]
    if true_token_indices.nelement() == 0:
        print(f"Warning: 'True' token not found in prompt for question: {question}")
        return 0.5
    true_token_index = true_token_indices[-1].item()

    with torch.no_grad():
        outputs = qa_model(**inputs)
    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]

    # Spans containing the token are exactly {start <= t} x {end >= t}, and
    # the span score start_logit + end_logit is separable.  The softmax
    # probability of the best span therefore factorises into the best start
    # probability times the best end probability, with no O(n^2) enumeration.
    best_start_prob = F.softmax(start_logits[: true_token_index + 1], dim=0).max()
    best_end_prob = F.softmax(end_logits[true_token_index:], dim=0).max()
    p_true = (best_start_prob * best_end_prob).item()

    return p_true
def calculate_lexical_similarity(answers, reference_answers):
    """Mean best ROUGE-1 F-measure of the answers against the references.

    For each answer the highest stemmed ROUGE-1 F-measure over all
    references is kept; the result is the mean of those maxima.

    Args:
        answers: Answer strings to score.
        reference_answers: Reference strings to compare against.

    Returns:
        The mean score, or 0.5 when there are no references or no answers.
    """
    if not reference_answers:
        return 0.5

    rouge1 = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    best_per_answer = [
        max(rouge1.score(candidate, reference)['rouge1'].fmeasure for reference in reference_answers)
        for candidate in answers
    ]
    if not best_per_answer:
        return 0.5
    return sum(best_per_answer) / len(best_per_answer)

def calculate_semantic_entropy(cluster_probabilities):
    """Return the Shannon entropy, in bits, of the cluster probabilities.

    Entries that are not strictly positive contribute nothing.
    """
    total = 0
    for mass in cluster_probabilities:
        if not mass > 0:
            continue
        surprisal_bits = math.log2(mass)
        total = total - mass * surprisal_bits
    return total

def calculate_seq_log_prob(question, context, answer, model, tokenizer):
    """Return the mean per-token log-probability of `answer` given the prompt.

    The prompt is ``question + ' ' + context``.  Prompt and answer are
    encoded separately, concatenated, and scored with a single forward
    pass of the causal language model; only the answer tokens contribute
    to the average.

    Args:
        question: Question text.
        context: Passage text.
        answer: Answer text to score.
        model: Causal LM whose call returns an object with ``.logits``.
        tokenizer: Tokenizer matching `model`.

    Returns:
        Mean natural-log probability of the answer tokens as a float, or
        0.0 when there is no answer token to score.
    """
    input_text = question + ' ' + context
    output_text = answer

    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    output_ids = tokenizer.encode(output_text, return_tensors='pt').to(device)

    num_answer_tokens = output_ids.shape[-1]
    if num_answer_tokens == 0:
        # Nothing to score; avoids dividing by zero.
        return 0.0

    # Keep the prompt short enough that prompt + answer fits the model's
    # position limit.  The tail of the context is dropped so the question,
    # which comes first, is preserved.
    max_positions = getattr(getattr(model, "config", None), "max_position_embeddings", None)
    if max_positions is not None:
        prompt_budget = max(max_positions - num_answer_tokens, 0)
        input_ids = input_ids[:, :prompt_budget]

    # Condition on the prompt: the answer is scored as a continuation of it
    # rather than in isolation.
    full_ids = torch.cat([input_ids.long(), output_ids.long()], dim=-1)
    with torch.no_grad():
        logits = model(full_ids).logits

    # Logits at position p predict token p + 1, so the token at position p
    # is scored by row p - 1.  With an empty prompt the very first answer
    # token has no predecessor and cannot be scored.
    first_scored = max(input_ids.shape[-1], 1)
    targets = full_ids[0, first_scored:]
    if targets.numel() == 0:
        return 0.0
    log_probs = F.log_softmax(logits[0, first_scored - 1:-1, :], dim=-1)
    token_log_probs = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)

    # Average over the tokens that were actually scored.
    seq_log_prob = token_log_probs.mean().item()
    return seq_log_prob

all_question_data = []
correct_output = []
max_iterations = 1
all_context_data=[]
x=0

# Generate, label and cluster answers for the first `max_iterations` SQuAD
# articles.  One context_data record is produced per article.
for context_idx, squad_example in enumerate(squad_examples):
    print("iteration number: ",x)
    if x>= max_iterations:
        break
    x+=1
    context_data = {"context_index": context_idx, "questions": []}
    print("PARA ITERATION")
    for k, paragraph in enumerate(squad_example['paragraphs'], start=1):
        # Progress marker every ten paragraphs.
        if k % 10 == 0:
            print(k)
        context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']
            reference_answers = [answer['text'] for answer in qa['answers']]
            if not reference_answers:
                # Questions without references cannot be labelled and are
                # never stored, so skip them before running any model.
                continue
            question_data = {"question": question, "reference_answers": reference_answers, "generated_answers": []}

            # Generate answers using the model (on GPU)
            model_answers_with_confidence = generate_answers(question, context, top_n=5)

            # Evaluate and store correct/incorrect labels
            for answer, confidence in model_answers_with_confidence:
                # Score against each reference separately and keep the best
                # match: the metric pairs predictions and references
                # one-to-one, so a single prediction cannot be passed with
                # several references in one call.
                rouge_l_score = max(
                    rouge_metric.compute(predictions=[answer], references=[reference], rouge_types=["rougeL"])["rougeL"].mid.fmeasure
                    for reference in reference_answers
                )
                correct_output.append(1 if rouge_l_score > 0.3 else 0)
                question_data["generated_answers"].append({"answer": answer, "confidence": confidence, "rougeL_score": rouge_l_score, "correct": correct_output[-1]})

            # Calculate lexical similarity, p(True) and the answer clusters,
            # then store the question.
            question_data["lexical_similarity"] = calculate_lexical_similarity([answer[0] for answer in model_answers_with_confidence], reference_answers)
            question_data["p_true"] = calculate_ptrue(question, model_answers_with_confidence, top_n=5)
            clusters = cluster_answers(model_answers_with_confidence, question)
            question_data["clusters"] = clusters
            context_data["questions"].append(question_data)

    all_context_data.append(context_data)

# Flatten every stored question across all contexts and compute the two
# global cluster averages from them.
flattened_questions = []
for context_data in all_context_data:
    flattened_questions.extend(context_data["questions"])
average_correct_clusters, average_incorrect_clusters = calculate_average_clusters(flattened_questions)

# Attach the same global averages to each context record.
for context_data in all_context_data:
    context_data.update({
        "average_correct_clusters": average_correct_clusters,
        "average_incorrect_clusters": average_incorrect_clusters,
    })

# Persist the clustering results as JSON.
with open("context_question_clustering_data.json", "w") as f:
    json.dump(all_context_data, f, indent=4)

# Iterate over the data and calculate sequence log probabilities
with open("context_question_clustering_data.json", "r") as f:
    all_context_data = json.load(f)
for context_data in all_context_data:
    context_index = context_data["context_index"]
    paragraphs = squad_examples[context_index]["paragraphs"]

    # The stored records only carry the article index, so recover each
    # question's own paragraph from the article.  Scoring every question
    # against the first paragraph would pair most of them with the wrong
    # passage.
    context_by_question = {}
    for paragraph in paragraphs:
        for qa in paragraph["qas"]:
            context_by_question.setdefault(qa["question"], paragraph["context"])

    for question_data in context_data["questions"]:
        question = question_data["question"]
        context = context_by_question.get(question, paragraphs[0]["context"])
        generated_answers = question_data["generated_answers"]
        log_probs = []
        for answer_dict in generated_answers:
            answer = answer_dict["answer"]
            log_prob = calculate_seq_log_prob(question, context, answer, clm_model, clm_tokenizer)
            log_probs.append(log_prob)
            answer_dict["log_prob"] = log_prob
        # A question without generated answers has no average to report.
        avg_log_prob = sum(log_probs) / len(log_probs) if log_probs else None
        question_data["avg_log_prob"] = avg_log_prob
with open("context_question_clustering_data.json", "w") as f:
    json.dump(all_context_data, f, indent=4)

# Calculate semantic entropy for each question and store in a new JSON file
all_question_entropy_data = []
with open("context_question_clustering_data.json", "r") as f:
    all_context_data = json.load(f)
for context_data in all_context_data:
    for question_data in context_data["questions"]:
        clusters = question_data["clusters"]
        # Confidence of every generated answer, keyed by its text.
        answer_confidences = {answer["answer"]: answer["confidence"] for answer in question_data["generated_answers"]}

        # Mass of a cluster = summed confidence of its members.
        cluster_masses = [
            sum(answer_confidences[answer] for answer in cluster) for cluster in clusters
        ]
        total_prob = sum(cluster_masses)
        # Normalise each cluster's mass by the total.  The division must be
        # applied to the summed mass; dividing the cluster list itself by a
        # float raises TypeError.
        cluster_probabilities = [mass / total_prob for mass in cluster_masses]

        semantic_entropy = calculate_semantic_entropy(cluster_probabilities)
        question_data["semantic_entropy"] = semantic_entropy
        all_question_entropy_data.append(question_data)
with open("question_entropy_data.json", "w") as f:
    json.dump(all_question_entropy_data, f, indent=4)

# Build one row per generated answer: the question-level scores are repeated
# for each answer and paired with that answer's correct/incorrect label.
ptrue_values = []
lexical_similarity_values = []
entropy_values = []
correct_labels = []
for question_data in all_question_entropy_data:
    for answer in question_data["generated_answers"]:
        ptrue_values.append(question_data["p_true"])
        lexical_similarity_values.append(question_data["lexical_similarity"])
        entropy_values.append(question_data["semantic_entropy"])
        correct_labels.append(answer["correct"])

# Calculate AUROC scores
auroc_ptrue = roc_auc_score(correct_labels, ptrue_values)
auroc_lexical = roc_auc_score(correct_labels, lexical_similarity_values)
# Entropy measures uncertainty (higher = less likely correct), while
# roc_auc_score expects larger scores for the positive class, so it is
# negated before scoring against the "correct" labels.
auroc_entropy = roc_auc_score(correct_labels, [-entropy for entropy in entropy_values])

# Store AUROC scores in a dictionary
auroc_scores = {
    "p_true": auroc_ptrue,
    "lexical_similarity": auroc_lexical,
    "semantic_entropy": auroc_entropy
}

# Save AUROC scores to a JSON file
with open("auroc_scores.json", "w") as f:
    json.dump(auroc_scores, f, indent=4)

# Print AUROC scores
print("AUROC Scores:")
for metric, score in auroc_scores.items():
    print(f"{metric}: {score:.4f}")




In [None]:
# NOT MAIN CODE
# Scratch cell: load a T5 checkpoint and report how many parameters it has.

from transformers import AutoTokenizer, AutoModelForQuestionAnswering

model_name = "mrm8488/t5-base-finetuned-squadv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): the captured output reports that `qa_outputs.weight` and
# `qa_outputs.bias` are newly initialised when this checkpoint is loaded
# through AutoModelForQuestionAnswering, so the count below includes an
# untrained QA head.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Get the number of parameters
num_params = model.num_parameters()

print(f"Number of parameters in {model_name}: {num_params}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at mrm8488/t5-base-finetuned-squadv2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of parameters in mrm8488/t5-base-finetuned-squadv2: 222905090


In [None]:
!pip install datasets
!pip install rouge_score
!pip install accelerate
from datasets import load_dataset, load_metric
from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn.functional as F
from rouge_score import rouge_scorer
import json
import math
from sklearn.metrics import roc_auc_score
import accelerate
import numpy as np
import matplotlib.pyplot as plt
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# Read the SQuAD v2.0 training file; `squad_examples` is its top-level "data" list.
input_file_path = 'train-v2.0.json'
with open(input_file_path, 'r') as f:
    squad_data = json.load(f)
squad_examples = squad_data['data']

# ROUGE metric object used to score generated answers against references.
rouge_metric = load_metric("rouge")

# Checkpoint names for the three models used by the pipeline.
qa_model_name = "deepset/roberta-base-squad2"
nli_model_name = "facebook/bart-large-mnli"
clm_model_name = "gpt2"

# Question-answering model and tokenizer.
qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)

# Sequence-classification (NLI) model and tokenizer.
nli_model = AutoModelForSequenceClassification.from_pretrained(nli_model_name)
nli_tokenizer = AutoTokenizer.from_pretrained(nli_model_name)

# Causal language model and tokenizer.
clm_model = AutoModelForCausalLM.from_pretrained(clm_model_name)
clm_tokenizer = AutoTokenizer.from_pretrained(clm_model_name)

# Move all three models to the selected device.
qa_model.to(device)
nli_model.to(device)
clm_model.to(device)


In [None]:
def generate_answers(question, context, top_n=5, max_answer_length=30):
    """Return the `top_n` highest-scoring answer spans for a question.

    The question/context pair is encoded with `qa_tokenizer` (truncated to 512
    tokens) and run through `qa_model`. Every span whose end lies between 0 and
    `max_answer_length - 1` tokens after its start is scored as
    softmax(start_logits)[start] * softmax(end_logits)[end].

    Args:
        question: Question text.
        context: Passage text the answer is extracted from.
        top_n: Number of spans to return.
        max_answer_length: Maximum span length in tokens (default 30).

    Returns:
        A list of `(answer_text, score)` tuples sorted by descending score,
        where `answer_text` is the decoded token span and `score` is a float.
    """
    inputs = qa_tokenizer(question, context, truncation=True, max_length=512, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = qa_model(**inputs)
    start_probs = F.softmax(outputs.start_logits, dim=1).squeeze(0)
    end_probs = F.softmax(outputs.end_logits, dim=1).squeeze(0)

    # Score every (start, end) pair at once: span_scores[s, e] = P(start=s) * P(end=e).
    span_scores = start_probs.unsqueeze(1) * end_probs.unsqueeze(0)
    positions = torch.arange(span_scores.size(0), device=span_scores.device)
    span_offsets = positions.unsqueeze(0) - positions.unsqueeze(1)  # [s, e] -> e - s
    valid = (span_offsets >= 0) & (span_offsets < max_answer_length)

    # nonzero() lists the valid pairs in row-major order (by start, then end).
    # Combined with a stable sort, equal scores keep that order.
    start_idxs, end_idxs = valid.nonzero(as_tuple=True)
    candidate_scores = span_scores[start_idxs, end_idxs]
    ranking = torch.sort(candidate_scores, descending=True, stable=True).indices[:top_n]

    token_ids = inputs["input_ids"][0]
    answers = []
    for rank in ranking.tolist():
        start_idx = start_idxs[rank].item()
        end_idx = end_idxs[rank].item()
        answer = qa_tokenizer.decode(token_ids[start_idx:end_idx + 1])
        answers.append((answer, candidate_scores[rank].item()))
    return answers


# CHECKING THE SEMANTIC SIMILARITY USING THE BIDIRECTIONAL ENTAILMENT ALGORITHM. RESEARCH PAPER: https://arxiv.org/pdf/1911.00681.pdf
# Bi-directional entailment involves checking whether two texts (typically a hypothesis and a premise) can entail each other, implying a deep semantic similarity or paraphrase relationship.
# Mathematical Basis of Bi-directional Entailment:
# To implement bi-directional entailment, each text is considered both as a hypothesis and a premise against the other text. This involves two checks:
# Forward Entailment: Whether the premise (first text) semantically entails the hypothesis (second text).
# Backward Entailment: Whether the hypothesis (second text) semantically entails the premise (first text).
# If both conditions are met, the texts are considered semantically equivalent, akin to paraphrases. Mathematically, this is often represented using probabilities derived from a model trained on entailment tasks, such as those derived from the MNLI dataset using a BERT model.

# Given probabilities of entailment (P) from a softmax layer for both forward and backward directions, the odds of entailment are calculated as:
# Odds = P / (1 - P)
# The final score for bi-directional entailment could be the product of the odds for both directions, ensuring that high entailment probabilities in both directions yield a higher score
def check_entailment(premise, hypothesis):
    """Return the probability that `premise` entails `hypothesis`.

    The pair is scored with `nli_model`. The neutral logit is discarded and a
    softmax is taken over the contradiction and entailment logits only, so the
    result is P(entailment) given that the pair is not neutral.

    Args:
        premise: Premise text.
        hypothesis: Hypothesis text.

    Returns:
        The entailment probability as a float in [0, 1].
    """
    inputs = nli_tokenizer(premise, hypothesis, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = nli_model(**inputs)
    logits = outputs.logits
    # facebook/bart-large-mnli orders its classes (contradiction, neutral,
    # entailment), so selecting [0, 2] yields columns (contradiction, entailment).
    entail_contradiction_logits = logits[:, [0, 2]]
    probs = F.softmax(entail_contradiction_logits, dim=1)
    # Column 1 is entailment; column 0 would be the contradiction probability.
    entail_prob = probs[:, 1].item()
    return entail_prob

def cluster_answers(answers, question):
    """Greedily group answers that entail each other in both directions.

    Only the first element of each item in `answers` (the answer text) is used.
    Each answer is compared with the first member of every existing cluster, in
    order. It joins the first cluster for which `check_entailment` exceeds 0.4
    both ways, with the question prepended to the premise. Otherwise it starts a
    new cluster.

    Returns:
        A list of clusters, each a list of answer strings.
    """
    texts = [item[0] for item in answers]
    clusters = []
    for candidate in texts:
        for group in clusters:
            anchor = group[0]
            anchor_to_candidate = check_entailment(question + " " + anchor, candidate)
            candidate_to_anchor = check_entailment(question + " " + candidate, anchor)
            if anchor_to_candidate > 0.4 and candidate_to_anchor > 0.4:
                group.append(candidate)
                break
        else:
            # No existing cluster accepted the candidate.
            clusters.append([candidate])
    return clusters

def calculate_average_clusters(all_question_data):
    """Average the number of clusters over correct and incorrect questions.

    A question counts as correct when any of its `generated_answers` has a
    truthy `"correct"` field.

    Returns:
        `(average_correct_clusters, average_incorrect_clusters)`. A group with
        no questions yields 0.
    """
    # Cluster counts bucketed by whether the question was answered correctly.
    counts_by_outcome = {True: [], False: []}
    for entry in all_question_data:
        cluster_count = len(entry["clusters"])
        answered = any(candidate["correct"] for candidate in entry["generated_answers"])
        counts_by_outcome[answered].append(cluster_count)

    correct_counts = counts_by_outcome[True]
    incorrect_counts = counts_by_outcome[False]
    if correct_counts:
        average_correct_clusters = sum(correct_counts) / len(correct_counts)
    else:
        average_correct_clusters = 0
    if incorrect_counts:
        average_incorrect_clusters = sum(incorrect_counts) / len(incorrect_counts)
    else:
        average_incorrect_clusters = 0
    return average_correct_clusters, average_incorrect_clusters

In [None]:
def calculate_ptrue(question, generated_answers, top_n=5):
    """Score how strongly `qa_model` favours a span covering the word "True".

    A prompt is built from the question, the first `top_n` generated answers and
    a True/False query, then fed to `qa_model`. Among all spans (start <= end)
    that contain the first "ĠTrue" token, each span scores
    start_logit + end_logit. The result is the softmax weight of the best span
    within that set.

    Args:
        question: Question text.
        generated_answers: Sequence of `(answer_text, score)` pairs.
        top_n: Number of answers listed in the prompt.

    Returns:
        A float in (0, 1], or 0.5 when the "ĠTrue" token is not in the prompt.
    """
    # Format the prompt
    prompt = f"Question: {question}\nHere are some brainstormed ideas:\n"
    prompt += "".join(answer + "\n" for answer, _ in generated_answers[:top_n])
    # NOTE(review): the "{}" placeholder is never filled in, so the model sees a
    # literal "{}" as the possible answer - confirm which answer belongs here.
    prompt += "Possible Answer: {}\nIs the possible answer: (A) True (B) False\nThe possible answer is:"

    inputs = qa_tokenizer(prompt, return_tensors="pt").to(device)

    # Find the token index of "True" in the prompt (presumably "Ġ" marks the
    # leading space in the tokenizer's vocabulary - verify against the tokenizer).
    true_token_id = qa_tokenizer.convert_tokens_to_ids("ĠTrue")
    true_token_indices = (inputs["input_ids"][0] == true_token_id).nonzero(as_tuple=True)[0]
    if true_token_indices.nelement() == 0:
        print(f"Warning: 'True' token not found in prompt for question: {question}")
        return 0.5
    true_token_index = true_token_indices[0].item()

    with torch.no_grad():
        outputs = qa_model(**inputs)
    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]

    # A span contains the token iff start <= true_token_index <= end, so valid
    # starts and valid ends can be chosen independently. The softmax over all
    # valid spans therefore factorises, and the best span's weight is
    #   max softmax(start_logits[:t + 1]) * max softmax(end_logits[t:]).
    # This avoids enumerating every (start, end) pair.
    best_start = F.softmax(start_logits[:true_token_index + 1], dim=0).max()
    best_end = F.softmax(end_logits[true_token_index:], dim=0).max()
    p_true = (best_start * best_end).item()

    return p_true
def calculate_lexical_similarity(answers, reference_answers):
    """Return the mean best ROUGE-1 F-measure of `answers` against the references.

    Each answer is scored against every reference and its best score is kept.
    The result is the average of those best scores.

    Args:
        answers: Generated answer strings.
        reference_answers: Reference answer strings.

    Returns:
        The average score as a float. Returns 0.0 when either list is empty
        (for example a question with no reference answers), because there is
        nothing to compare.
    """
    if not answers or not reference_answers:
        return 0.0
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    scores = [
        max(scorer.score(answer, ref)['rouge1'].fmeasure for ref in reference_answers)
        for answer in answers
    ]
    return sum(scores) / len(scores)

def calculate_semantic_entropy(cluster_probabilities):
    """Return the base-2 entropy of the given cluster probabilities.

    Entries that are not strictly positive are skipped. The values are used as
    given; no normalisation is applied here.
    """
    entropy = 0
    for mass in filter(lambda value: value > 0, cluster_probabilities):
        entropy -= mass * math.log2(mass)
    return entropy

In [None]:
def calculate_seq_log_prob(question, context, answer, model, tokenizer):
    """Return the length-normalised log-probability of `answer` under `model`.

    The answer is tokenised and every token after the first is scored given the
    answer tokens before it. The summed log-probability is divided by the total
    number of answer tokens.

    NOTE(review): `question` and `context` are accepted but do not condition the
    score - only the answer tokens are fed to the model. Confirm whether the
    answer should be scored given the prompt.

    Args:
        question: Question text (currently unused).
        context: Context text (currently unused).
        answer: Answer text to score.
        model: Causal language model whose output exposes `.logits`.
        tokenizer: Tokenizer matching `model`.

    Returns:
        The normalised log-probability as a float. Returns 0.0 when the answer
        has fewer than two tokens, because there is no token to predict.
    """
    output_ids = tokenizer.encode(answer, return_tensors='pt').to(device)
    num_tokens = output_ids.shape[-1]
    if num_tokens < 2:
        return 0.0

    # One forward pass is enough: in a causal LM the logits at position i - 1
    # depend only on tokens [0, i), which is the prefix that predicts token i.
    with torch.no_grad():
        logits = model(output_ids).logits

    log_probs = F.log_softmax(logits[0, :-1, :], dim=-1)
    targets = output_ids[0, 1:]
    token_log_probs = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)
    log_prob_sum = token_log_probs.sum(dtype=torch.float64).item()

    seq_log_prob = log_prob_sum / num_tokens
    return seq_log_prob

In [None]:
# Run-wide state for the generation loop.
x = 0                   # number of SQuAD articles processed so far
max_iterations = 1      # stop once this many articles have been processed
all_context_data = []   # one record per processed article
all_question_data = []
correct_output = []     # 0/1 correctness flag per generated answer

In [None]:
# For each SQuAD article (up to `max_iterations`): generate answers for every
# question, label them against the references, and attach the question-level
# scores (p_true, lexical similarity) and the answer clusters.
for context_idx, squad_example in enumerate(squad_examples):
    print("iteration number: ",x)
    if x>= max_iterations:
        break
    x+=1
    context_data = {"context_index": context_idx, "questions": []}
    print("PARA ITERATION")
    for k, paragraph in enumerate(squad_example['paragraphs'], start=1):
        # Progress marker every 10 paragraphs
        if k % 10 == 0:
            print(k)
        context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']
            reference_answers = [answer['text'] for answer in qa['answers']]

            # Without references nothing below can be scored (the ROUGE labels
            # and the lexical similarity both need them), so skip the question
            # before spending any model time on it.
            if not reference_answers:
                print(f"Skipping question with no reference answers: {question}")
                continue

            question_data = {"question": question, "reference_answers": reference_answers, "generated_answers": []}

            # Generate answers using the model (on GPU)
            model_answers_with_confidence = generate_answers(question, context, top_n=5)

            # Evaluate and store correct/incorrect labels
            for answer, confidence in model_answers_with_confidence:
                rouge_score = rouge_metric.compute(predictions=[answer], references=reference_answers, rouge_types=["rougeL"])
                rouge_l_score = rouge_score["rougeL"].mid.fmeasure
                # An answer counts as correct when its ROUGE-L F-measure exceeds 0.3
                correct_output.append(1 if rouge_l_score > 0.3 else 0)
                question_data["generated_answers"].append({"answer": answer, "confidence": confidence, "rougeL_score": rouge_l_score, "correct": correct_output[-1]})

            # Calculate p(True) and store it in question_data
            question_data["p_true"] = calculate_ptrue(question, model_answers_with_confidence, top_n=5)

            # Calculate lexical similarity and store it in question_data
            question_data["lexical_similarity"] = calculate_lexical_similarity([answer[0] for answer in model_answers_with_confidence], reference_answers)

            # Cluster answers for the current question
            clusters = cluster_answers(model_answers_with_confidence, question)
            question_data["clusters"] = clusters

            context_data["questions"].append(question_data)

    all_context_data.append(context_data)

iteration number:  0
PARA ITERATION
10
20
30
40
50
60
iteration number:  1


In [None]:
# Flatten the questions of every context and compute the two cluster averages
flattened_questions = []
for context_data in all_context_data:
    flattened_questions.extend(context_data["questions"])
average_correct_clusters, average_incorrect_clusters = calculate_average_clusters(flattened_questions)

# Attach the same two dataset-wide averages to every context record
for context_data in all_context_data:
    context_data.update(
        average_correct_clusters=average_correct_clusters,
        average_incorrect_clusters=average_incorrect_clusters,
    )

# Write the context records to a JSON file
with open("context_question_clustering_data.json", "w") as f:
    json.dump(all_context_data, f, indent=4)

In [None]:
# Iterate over the data and calculate sequence log probabilities
with open("context_question_clustering_data.json", "r") as f:
    all_context_data = json.load(f)
for context_data in all_context_data:
    context_index = context_data["context_index"]
    paragraphs = squad_examples[context_index]["paragraphs"]
    # The stored questions come from every paragraph of the article, so map each
    # question back to its own paragraph's context. On a duplicated question
    # text the first paragraph wins.
    question_contexts = {}
    for paragraph in paragraphs:
        for qa in paragraph["qas"]:
            question_contexts.setdefault(qa["question"], paragraph["context"])
    for question_data in context_data["questions"]:
        question = question_data["question"]
        # Fall back to the first paragraph if the question cannot be located.
        context = question_contexts.get(question, paragraphs[0]["context"])
        generated_answers = question_data["generated_answers"]
        log_probs = []
        for answer_dict in generated_answers:
            answer = answer_dict["answer"]
            log_prob = calculate_seq_log_prob(question, context, answer, clm_model, clm_tokenizer)
            log_probs.append(log_prob)
            answer_dict["log_prob"] = log_prob
        # Question-level score: mean of the per-answer log probabilities
        avg_log_prob = sum(log_probs) / len(log_probs)
        question_data["avg_log_prob"] = avg_log_prob
# Write the enriched records back to the same file
with open("context_question_clustering_data.json", "w") as f:
    json.dump(all_context_data, f, indent=4)

In [None]:
# Calculate semantic entropy for each question and store it in a new JSON file.
all_question_entropy_data = []
with open("context_question_clustering_data.json", "r") as f:
    all_context_data = json.load(f)
for context_data in all_context_data:
    for question_data in context_data["questions"]:
        clusters = question_data["clusters"]
        answer_confidences = {answer["answer"]: answer["confidence"] for answer in question_data["generated_answers"]}
        # Confidence mass of each cluster = sum of the confidences of its answers.
        cluster_masses = [
            sum(answer_confidences[answer] for answer in cluster) for cluster in clusters
        ]
        # The stored confidences cover only the answers that were kept, so the
        # masses need not sum to 1. Normalise them into a distribution over the
        # clusters before taking the entropy.
        total_prob = sum(cluster_masses)
        if total_prob > 0:
            cluster_probabilities = [mass / total_prob for mass in cluster_masses]
        else:
            cluster_probabilities = []
        semantic_entropy = calculate_semantic_entropy(cluster_probabilities)
        question_data["semantic_entropy"] = semantic_entropy
        all_question_entropy_data.append(question_data)
with open("question_entropy_data.json", "w") as f:
    json.dump(all_question_entropy_data, f, indent=4)

In [None]:
# Build one row per generated answer: the answer's own correctness label paired
# with the question-level uncertainty scores of the question it belongs to.
correct_labels = []
ptrue_values = []
lexical_similarity_values = []
entropy_values = []
for question_data in all_question_entropy_data:
    for answer in question_data["generated_answers"]:
        correct_labels.append(answer["correct"])
        ptrue_values.append(question_data["p_true"])
        lexical_similarity_values.append(question_data["lexical_similarity"])
        entropy_values.append(question_data["semantic_entropy"])

# AUROC of every score column against the correctness labels
auroc_ptrue = roc_auc_score(correct_labels, ptrue_values)
auroc_lexical = roc_auc_score(correct_labels, lexical_similarity_values)
auroc_entropy = roc_auc_score(correct_labels, entropy_values)

# Collect the scores under the names used in the output file
auroc_scores = dict(
    p_true=auroc_ptrue,
    lexical_similarity=auroc_lexical,
    semantic_entropy=auroc_entropy,
)

# Persist the scores as JSON
with open("auroc_scores.json", "w") as f:
    json.dump(auroc_scores, f, indent=4)

# Report the scores on stdout
print("AUROC Scores:")
for metric in auroc_scores:
    score = auroc_scores[metric]
    print(f"{metric}: {score:.4f}")

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import json

# Load the saved AUROC scores and the per-context clustering records
auroc_scores_path = "auroc_scores.json"
with open(auroc_scores_path, "r") as f:
    auroc_scores = json.load(f)

with open("context_question_clustering_data.json", "r") as f:
    context_questions_data = json.load(f)

total_clusters = sum(len(question['clusters']) for context in context_questions_data for question in context['questions'])
total_answers = sum(len(question['generated_answers']) for context in context_questions_data for question in context['questions'])
# Clusters per generated answer, as a percentage to match the "%" column
# header below. Guarded so an empty dataset does not divide by zero.
distinct_percentage = (100 * total_clusters / total_answers) if total_answers else 0.0


# Bar chart of the AUROC scores, one bar per uncertainty metric
fig, ax = plt.subplots()
auroc_values = list(auroc_scores.values())
labels = list(auroc_scores.keys())
ax.bar(labels, auroc_values, color=['blue', 'orange', 'green'])
ax.set_xlabel('Metrics')
ax.set_ylabel('AUROC')
ax.set_title('AUROC Scores for Different Uncertainty Calculations')
plt.ylim([0.0, 1.0])  # show the full 0-1 range
plt.show()

# Summary table built from the values loaded above. Cells that do not apply to
# a row are left as None.
table_data = {
    "Metric": ["Semantic Entropy", "Number of Distinct Answers"],
    "AUROC Score": [auroc_scores.get("semantic_entropy", 0), None],
    "% of Distinct Answers": [None, distinct_percentage]
}

# Convert the data to a pandas DataFrame
df_table = pd.DataFrame(table_data)

# Display the table
print(df_table)


In [None]:
# Load the question entropy data: a flat list with one record per question.
# A file handle can only be parsed once, so it is read a single time.
with open('question_entropy_data.json', 'r') as file:
    entropy_data = json.load(file)

questions = []
average_rouge_scores = []   # one ROUGE-L score per generated answer
average_confidence = []     # one confidence per generated answer
p_true_values = []          # one value per question
lexical_similarities = []   # one value per question
semantic_entropies = []     # one value per question
average_log_probs = []      # one averaged log-probability per question

# Iterate over each entry in the entropy data
for entry in entropy_data:
    question_text = entry['question']
    questions.append(question_text)

    rouge_scores = [gen_ans['rougeL_score'] for gen_ans in entry['generated_answers']]
    confidences = [gen_ans['confidence'] for gen_ans in entry['generated_answers']]

    # Per-answer values are pooled across questions. Per-question values are
    # appended once each.
    average_rouge_scores.extend(rouge_scores)
    average_confidence.extend(confidences)
    p_true_values.append(entry['p_true'])
    lexical_similarities.append(entry['lexical_similarity'])
    semantic_entropies.append(entry['semantic_entropy'])
    # `avg_log_prob` is a single float per question, so append it rather than
    # extend with it.
    average_log_probs.append(entry['avg_log_prob'])

# Calculate overall averages
avg_rouge_score = np.mean(average_rouge_scores)
avg_confidence = np.mean(average_confidence)
avg_p_true = np.mean(p_true_values)
avg_lexical_similarity = np.mean(lexical_similarities)
avg_semantic_entropy = np.mean(semantic_entropies)
avg_average_log_prob = np.mean(average_log_probs)

# Plotting the overall average metrics
metrics = ['Average ROUGE-L Score', 'Average Confidence', 'P_True Values',
           'Lexical Similarity', 'Semantic Entropy', 'Average Log Probability']
averages = [avg_rouge_score, avg_confidence, avg_p_true,
            avg_lexical_similarity, avg_semantic_entropy, avg_average_log_prob]

fig, ax = plt.subplots(figsize=(14, 8))

ax.bar(metrics, averages, alpha=0.7)
ax.set_xlabel('Metrics')
ax.set_ylabel('Average Values')
ax.set_title('Overall Average Metrics for All Questions')
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y')

plt.show()

# # Plotting only for the first 10 questions
# x = np.arange(10)
# width = 0.15

# fig, ax = plt.subplots(figsize=(14, 8))

# rects1 = ax.bar(x - 2*width, average_rouge_scores[:10], width, label='Average ROUGE-L Score', alpha=0.7)
# rects2 = ax.bar(x - width, average_confidence[:10], width, label='Average Confidence', alpha=0.7)
# rects3 = ax.bar(x, p_true_values[:10], width, label='P_True Values', alpha=0.7)
# rects4 = ax.bar(x + width, lexical_similarities[:10], width, label='Lexical Similarity', alpha=0.7)
# rects5 = ax.bar(x + 2*width, semantic_entropies[:10], width, label='Semantic Entropy', alpha=0.7)
# rects6 = ax.bar(x + 3*width, [np.mean(log_probs) for log_probs in average_log_probs[:10]], width, label='Average Log Probability', alpha=0.7)

# ax.set_xlabel('Questions')
# ax.set_ylabel('Values')
# ax.set_title('Average Metrics for First 10 Questions')
# ax.set_xticks(x)
# ax.set_xticklabels(questions[:10], rotation=90)
# ax.legend()

# fig.tight_layout()

# plt.show()