In [1]:
"""Load prediction files and compute evaluation metrics"""

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
import json
import evaluate
from pathlib import Path
from tqdm import tqdm
import nltk

import sys
sys.path.append("..")
from metrics import mean_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/hpcdu1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def mean_score(scores):
    return sum(scores) / len(scores)

def postprocess_text(preds, labels):
    """ Postprocessing predictions and references for computing rouge L scores
    """
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

In [4]:
# Load references and predictions

file_path = "/home/hpcdu1/project/context-faithful-llm/results/cnn_dm/cad-llama2-7b-zero-shot-prompt-base_preds.json"
with open(file_path, 'r') as fin:
    pred_data = json.load(fin)

docs = [data['article'] for data in pred_data]
predictions = [data['prediction'] for data in pred_data]
references = [data['highlights'] for data in pred_data]

In [10]:
import copy

annotated_examples = []
for data in pred_data[:10]:
    doc = data['article']
    ref = data['highlights']
    pred = data['prediction']
    annotated_example = copy.deepcopy(data)

    # Compute ROUGE score for each instance
    rouge = evaluate.load("rouge")
    processed_pred, processed_ref = postprocess_text([pred], [ref])
    rouge_scores = rouge.compute(predictions=processed_pred, 
                                 references=processed_ref,
                                 use_aggregator=True)

    for rouge_type in ['rouge1', 'rouge2', 'rougeLsum']:
        annotated_example[rouge_type] = rouge_scores[rouge_type]
    print(annotated_example)
    
    # # Compute factKB score per instance
    # factkb_input = [[pred, doc]]
    # factkb_tokens = factkb_tokenizer(factkb_input, return_tensors="pt", 
    #                                     padding="max_length", truncation=True).to(factkb_model.device)
    # factkb_logits = factkb_model(**factkb_tokens).logits
    # factkb_res = torch.softmax(factkb_logits, dim=1)
    # factkb_score = float(factkb_res[0][1])
    # factkb_scores.append(factkb_score)

    # annotated_example['factkb'] = factkb_score
    annotated_examples.append(annotated_example)

{'article': '(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday

In [None]:
with open("test.out", 'w') as file:
    json.dump(annotated_examples, file, indent=4)

In [17]:
# Compute and collate FactKB scores

# Load FactKB model
download_path = "/home/hpcdu1/experiments/huggingface-hub"
factkb_tokenizer = AutoTokenizer.from_pretrained("roberta-base", padding="max_length", truncation=True,
                                                 cache_dir=download_path)
factkb_model = AutoModelForSequenceClassification.from_pretrained("bunsenfeng/FactKB", num_labels=2, 
                                                                  device_map="auto", cache_dir=download_path)

factkb_scores = []
for pred, doc in tqdm(zip(predictions, docs)):
    # Evaluate FactKB score
    factkb_input = [[pred, doc]]
    factkb_tokens = factkb_tokenizer(factkb_input, return_tensors="pt", 
                                        padding="max_length", truncation=True).to(factkb_model.device)
    factkb_logits = factkb_model(**factkb_tokens).logits
    factkb_res = torch.softmax(factkb_logits, dim=1)
    factkb_scores.append(float(factkb_res[0][1]))

KeyboardInterrupt: 

In [13]:
# Compute the ROUGE scores
# Note: need to post-process the text before computing Rouge-L scores

rouge = evaluate.load("rouge")
processed_preds, processed_refs = postprocess_text(predictions, references)
rouge_scores = rouge.compute(predictions=processed_preds, 
                                references=processed_refs,
                                use_aggregator=False)

# Compute BERT score
bert_score = evaluate.load('bertscore')
bert_score_res = bert_score.compute(predictions=predictions, 
                                    references=references, 
                                    model_type="microsoft/deberta-xlarge-mnli", lang="en")

metrics = {
    'rouge1': mean_score(rouge_scores['rouge1']),
    'rouge2': mean_score(rouge_scores['rouge2']),
    'rougeLsum': mean_score(rouge_scores['rougeLsum']),
    'bertscore_p': mean_score(bert_score_res['precision']),
    'bertscore_r': mean_score(bert_score_res['recall']),
    'bertscore_f1': mean_score(bert_score_res['f1']),
    "factKB": mean_score(factkb_scores)
}

# Print the evaluation metrics
for metric_name, value in metrics.items():
    print(f"{metric_name}: {value:.4f}")

In [14]:
print(metrics)

{'rouge1': 0.32149860649718576, 'rouge2': 0.133588029140058, 'rougeLsum': 0.2936645807891397}
