# In [None]:
from datasets import load_metric
from bert_score import score
from summac.model_summac import SummaCConv

# Build the evaluation metrics: ROUGE, BLEU and METEOR are loaded by name,
# and SummaC is instantiated with the "vitc" model at sentence granularity.
# NOTE(review): `load_metric` appears to be deprecated in newer `datasets`
# releases in favour of the `evaluate` package - confirm the pinned version.
rouge, bleu, meteor = (
    load_metric(metric_name) for metric_name in ("rouge", "bleu", "meteor")
)
summac_model = SummaCConv(granularity="sentence", model_name="vitc")


def evaluate_summarization(model, dataset, num_samples=10):
    """Summarize the leading examples of ``dataset`` and score the outputs.

    Each article is prefixed with ``"summarize: "``, tokenized with the
    module-level ``tokenizer`` (truncated to 512 tokens) and decoded with
    4-beam search up to 128 tokens. The generated summaries are then scored
    with the module-level ``rouge``, ``bleu``, ``meteor``, BERTScore
    (``score``) and ``summac_model`` objects.

    Args:
        model: Model exposing ``generate(**inputs, ...)``.
        dataset: Object with ``"article"`` and ``"highlights"`` columns that
            support ``len()`` and integer indexing.
        num_samples: Maximum number of leading examples to evaluate. Clamped
            to the number of examples actually available.

    Returns:
        A tuple ``(rouge_scores, bleu_scores, meteor_scores, bert_score,
        summac_scores)``. ``bert_score`` is the BERTScore F1 averaged over the
        evaluated examples (a float); ``summac_scores`` is the ``"scores"``
        entry returned by SummaC.

    Raises:
        ValueError: If there is no example to evaluate (empty columns or a
            non-positive ``num_samples``).
    """
    # Read each column once; indexing `dataset[...]` inside the loop would
    # re-fetch the whole column on every iteration.
    articles = dataset["article"]
    highlights = dataset["highlights"]

    sample_count = min(num_samples, len(articles), len(highlights))
    if sample_count <= 0:
        raise ValueError("evaluate_summarization needs at least one sample to score")

    # NOTE(review): presumably a Hugging Face model whose `device` attribute
    # tells where its weights live; inputs must be on that same device.
    device = getattr(model, "device", None)

    sources, predictions, references = [], [], []
    for i in range(sample_count):
        article = articles[i]

        inputs = tokenizer(
            "summarize: " + article,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        if device is not None:
            inputs = inputs.to(device)

        output_ids = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
        predicted_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        sources.append(article)
        predictions.append(predicted_summary)
        references.append(highlights[i])

    rouge_scores = rouge.compute(predictions=predictions, references=references)
    # This BLEU metric takes whitespace-tokenized predictions and, per
    # prediction, a list of tokenized references (here exactly one).
    bleu_scores = bleu.compute(
        predictions=[pred.split() for pred in predictions],
        references=[[ref.split()] for ref in references],
    )
    meteor_scores = meteor.compute(predictions=predictions, references=references)

    # BERTScore returns per-example precision, recall and F1 tensors; F1 is
    # the harmonic mean of precision and recall. Only F1 is reported, averaged
    # over the evaluated examples.
    _, _, bert_f1 = score(predictions, references, lang="en", rescale_with_baseline=True)
    bert_score = bert_f1.mean().item()

    # SummaC scores a generated summary against its *source document*, so the
    # articles (not the reference summaries) are passed as the originals.
    summac_scores = summac_model.score(sources, predictions)["scores"]

    return rouge_scores, bleu_scores, meteor_scores, bert_score, summac_scores

# Score the model on the validation split (default sample count), then print
# every metric on its own line in the order they are returned.
rouge_scores, bleu_scores, meteor_scores, bert_score, summac_scores = evaluate_summarization(
    model, dataset["validation"]
)
print(rouge_scores, bleu_scores, meteor_scores, bert_score, summac_scores, sep="\n")