In [1]:
# Import necessary libraries and modules
import os   
from sacrebleu import corpus_bleu 
from rouge_score import rouge_scorer
from bert_score import score
from transformers import GPT2LMHeadModel, GPT2Tokenizer  
import torch
import pandas as pd 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path to the CSV file (read with pd.read_csv) holding the 'response' and 'reference' columns
file_path = "reference_response_rag.csv"   

In [3]:
class RAGEvaluator:
    """
    RAGEvaluator scores generated text against reference text with BLEU, ROUGE-1,
    BERTScore, and GPT-2 perplexity.

    The GPT-2 model/tokenizer are loaded once in the constructor. The BERTScore model
    is created lazily on the first BERTScore request and then reused, so evaluating many
    response/reference pairs does not reload it for every pair.
    """

    def __init__(self, model_path="local_gpt2"):
        """
        Initializes the RAGEvaluator by loading the GPT-2 model and tokenizer.

        Parameters:
        - model_path (str): Path to the local GPT-2 model directory (default: "local_gpt2").

        Output:
        - None
        """
        self.gpt2_model, self.gpt2_tokenizer = self.load_gpt2_model(model_path)
        # Filled in by evaluate_bert_score on first use.
        self._bert_scorer = None

    def load_gpt2_model(self, model_path):
        """
        Loads the GPT-2 model and tokenizer. If model_path does not exist locally, the
        'gpt2' checkpoint is fetched and saved to model_path for later runs.

        Parameters:
        - model_path (str): Path to the local GPT-2 model directory.

        Output:
        - Tuple[model, tokenizer]: The GPT-2 model and tokenizer.
        """
        if os.path.exists(model_path):
            model = GPT2LMHeadModel.from_pretrained(model_path)
            tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        else:
            model = GPT2LMHeadModel.from_pretrained('gpt2')
            tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
            # Persist the fetched weights so the next run takes the local branch.
            model.save_pretrained(model_path)
            tokenizer.save_pretrained(model_path)
        return model, tokenizer

    def evaluate_bleu_rouge(self, candidates, references):
        """
        Evaluates BLEU and ROUGE-1 scores for a set of generated responses and references.

        Parameters:
        - candidates (list of str): Generated text responses.
        - references (list of str): Ground truth reference texts, aligned with candidates.

        Output:
        - Tuple[float, float]: Corpus BLEU score and mean ROUGE-1 f-measure.

        Raises:
        - ValueError: If the inputs are empty or their lengths differ.
        """
        # Validate up front: zip() below would silently drop unmatched items, and an
        # empty input would otherwise surface as a bare ZeroDivisionError.
        if len(candidates) != len(references):
            raise ValueError(
                f"candidates and references must have the same length "
                f"(got {len(candidates)} and {len(references)})"
            )
        if len(candidates) == 0:
            raise ValueError("candidates and references must not be empty")

        bleu_score = corpus_bleu(candidates, [references]).score
        # Only ROUGE-1 is returned, so only ROUGE-1 is computed.
        scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
        rouge1_values = [
            scorer.score(ref, cand)['rouge1'].fmeasure
            for ref, cand in zip(references, candidates)
        ]
        return bleu_score, sum(rouge1_values) / len(rouge1_values)

    def evaluate_bert_score(self, candidates, references):
        """
        Evaluates BERTScore (Precision, Recall, F1) for generated responses and references.

        Parameters:
        - candidates (list of str): Generated text responses.
        - references (list of str): Ground truth reference texts.

        Output:
        - Tuple[float, float, float]: Mean BERT Precision, Recall, and F1 scores.
        """
        # getattr keeps this working on instances created without running __init__.
        scorer = getattr(self, "_bert_scorer", None)
        if scorer is None:
            # Local import: the notebook's import cell only brings in bert_score.score.
            from bert_score import BERTScorer
            # Same model and language settings as before; the scorer object keeps the
            # model loaded so repeated calls skip the reload.
            scorer = BERTScorer(model_type='bert-base-multilingual-cased', lang="en")
            self._bert_scorer = scorer
        P, R, F1 = scorer.score(candidates, references)
        return P.mean().item(), R.mean().item(), F1.mean().item()

    def evaluate_perplexity(self, text):
        """
        Evaluates the perplexity of the generated text using the GPT-2 model.

        The text is truncated to the model's position limit (config.n_positions), so
        long inputs are scored on their leading tokens instead of failing.

        Parameters:
        - text (str): The text to evaluate perplexity for.

        Output:
        - float: Perplexity value (exp of the mean next-token loss), or NaN when the
                 text tokenizes to fewer than two tokens, since there is then no
                 next-token prediction to score.
        """
        max_length = self.gpt2_model.config.n_positions
        encodings = self.gpt2_tokenizer(
            text, return_tensors='pt', truncation=True, max_length=max_length
        )
        input_ids = encodings['input_ids']
        if input_ids.shape[-1] < 2:
            return float("nan")
        with torch.no_grad():
            # Passing the inputs as labels makes the model return the language-model loss.
            outputs = self.gpt2_model(input_ids, labels=input_ids)
        return torch.exp(outputs.loss).item()

    def evaluate_all(self, response, reference):
        """
        Evaluates the generated response against the reference using all metrics (BLEU, ROUGE-1, 
        BERTScore, and Perplexity).

        Parameters:
        - response (str): The generated response text.
        - reference (str): The ground truth reference text.

        Output:
        - dict: Dictionary containing evaluation scores for BLEU, ROUGE-1, BERTScore (P, R, F1), 
                and Perplexity. Perplexity is computed on the response only.
        """
        # The corpus-level helpers expect lists, so wrap the single pair.
        candidates = [response]
        references = [reference]
        bleu, rouge1 = self.evaluate_bleu_rouge(candidates, references)
        bert_p, bert_r, bert_f1 = self.evaluate_bert_score(candidates, references)
        perplexity = self.evaluate_perplexity(response)
        return {
            "BLEU": bleu,
            "ROUGE-1": rouge1,
            "BERT P": bert_p,
            "BERT R": bert_r,
            "BERT F1": bert_f1,
            "Perplexity": perplexity,
        }

In [4]:
def evaluate_from_excel(file_path, evaluator=None):
    """
    Evaluates text responses and references from a CSV file using various metrics.

    Despite the function name, the file is read with pandas.read_csv, so it must be a
    CSV file. Rows where either 'response' or 'reference' is missing are skipped, and
    the remaining values are converted to str before scoring.

    Parameters:
    - file_path (str): Path to the CSV file containing 'response' and 'reference' columns.
    - evaluator (optional): Object exposing evaluate_all(response, reference) -> dict.
      When omitted, a RAGEvaluator with default settings is created.

    Output:
    - dict: Dictionary containing the average of every metric returned by
            evaluate_all across all scored rows.

    Raises:
    - KeyError: If the 'response' or 'reference' column is absent.
    - ValueError: If no row has both a response and a reference.
    """
    # Read and validate the table first so a bad file fails before the evaluator
    # (and whatever models it loads) is constructed.
    df = pd.read_csv(file_path)

    missing_columns = [col for col in ('response', 'reference') if col not in df.columns]
    if missing_columns:
        raise KeyError(f"missing required column(s): {missing_columns}")

    # Empty cells are read as NaN floats, which the text metrics cannot process.
    pairs = df[['response', 'reference']].dropna()
    if pairs.empty:
        raise ValueError(f"no rows with both 'response' and 'reference' in {file_path!r}")

    if evaluator is None:
        evaluator = RAGEvaluator()

    scores = [
        evaluator.evaluate_all(str(response), str(reference))
        for response, reference in zip(pairs['response'], pairs['reference'])
    ]

    avg_scores = {
        metric: sum(row_scores[metric] for row_scores in scores) / len(scores)
        for metric in scores[0]
    }

    return avg_scores

In [5]:
# Score every row of the input file, then report each averaged metric to four decimals
average_scores = evaluate_from_excel(file_path)
print("Average Scores:")
for metric in average_scores:
    value = average_scores[metric]
    print(f"{metric}: {value:.4f}")

Average Scores:
BLEU: 14.9585
ROUGE-1: 0.4522
BERT P: 0.7491
BERT R: 0.7438
BERT F1: 0.7460
Perplexity: 30.6252
