In [2]:
# Compute the evaluation metrics for generated summary
import json
from pathlib import Path
from tqdm import tqdm

from alignscore import AlignScore

# compute alignscore
scorer = AlignScore(model='roberta-large', batch_size=16, device='cuda:0', ckpt_path='AlignScore-large.ckpt', evaluation_mode='nli_sp')

proj_path = Path("/mnt/ceph_rbd/context-faithful-llm/guided-cad")
pred_data_path = proj_path / "results" / "summary" / "xsum-mistral-7b-base_preds.json"

predictions = []
references = []
documents = []
with open(pred_data_path, 'r') as fin:
    pred_data = json.load(fin)

for idx, sample in tqdm(enumerate(pred_data)):
    doc = sample['document']
    ref = sample['summary']
    pred = sample['generated_summary']

    documents.append(doc)
    references.append(ref)
    predictions.append(pred)

align_score = scorer.score(contexts=documents, claims=predictions)
print("AlignScore: ", align_score)




  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/ceph_rbd/context-faithful-llm/guided-cad/notebooks/AlignScore-large.ckpt'

In [19]:
# TODO: compute the rouge and bert scores

import json
from pathlib import Path
from tqdm import tqdm

import evaluate
import nltk

proj_path = Path("/mnt/ceph_rbd/context-faithful-llm/guided-cad")
pred_data_path = proj_path / "results" / "summary" / "xsum-mistral-7b-attention_impt+cad_preds.json"

with open(pred_data_path, 'r') as fin:
    pred_data = json.load(fin)

predictions = []
references = []
documents = []
word_lengths = []
for idx, sample in tqdm(enumerate(pred_data)):
    doc = sample['document']
    ref = sample['summary']
    pred = sample['generated_summary']
    word_count = len(pred.split())

    references.append(ref)
    predictions.append(pred)
    documents.append(doc)
    word_lengths.append(word_count)

print("Average length of generated text (in word):", sum(word_lengths) / len(word_lengths))

# Compute the ROUGE scores
rouge = evaluate.load('rouge')
rouge_metrics = eval_rouge_scores(predictions, documents)

print("ROUGE scores for the generated summary:")
print_metrics(rouge_metrics)

# Compute the BERT scores
bert_score_metrics = eval_bert_scores(predictions, documents)
print("BERT scores for the generated summary:")
print_metrics(bert_score_metrics)

998it [00:00, 552304.45it/s]




Average length of generated text (in word): 22.85370741482966
ROUGE scores for the generated summary:
rouge1: 0.1396
rouge2: 0.0759
rougeLsum: 0.1284


OutOfMemoryError: CUDA out of memory. Tried to allocate 21.43 GiB (GPU 0; 79.15 GiB total capacity; 49.05 GiB already allocated; 11.93 GiB free; 61.25 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [5]:
## Utility functions

def mean_score(scores):
    return sum(scores) / len(scores)

def eval_rouge_scores(preds, labels):
    rouge = evaluate.load('rouge')
    processed_preds, processed_labels = postprocess_text(preds, labels)
    rouge_scores = rouge.compute(predictions=processed_preds,
                                 references=processed_labels)
    metrics = {
        'rouge1': rouge_scores['rouge1'],
        'rouge2': rouge_scores['rouge2'],
        'rougeLsum': rouge_scores['rougeLsum']
    }

    return metrics

def eval_bert_scores(preds, labels):
    bert_score = evaluate.load('bertscore')
    bert_score_res = bert_score.compute(predictions=preds, 
                                        references=labels, 
                                        model_type="microsoft/deberta-xlarge-mnli", lang="en")
    metrics = {
        'bertscore_p': mean_score(bert_score_res['precision']),
        'bertscore_r': mean_score(bert_score_res['recall']),
        'bertscore_f1': mean_score(bert_score_res['f1']),
    }

    return metrics

def print_metrics(metrics):
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value:.4f}")

def postprocess_text(preds, labels):
    """ Postprocessing predictions and references for computing rouge L scores
    """
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels