In [1]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import load_dataset
from evaluate import load
import numpy as np

# Load the "knkarthick/dialogsum" dataset via `load_dataset`; the result is indexed by split name.
dataset = load_dataset("knkarthick/dialogsum")
# Keep a handle on the "test" split; the example-usage call further down passes it as `dataset`.
test_dataset = dataset["test"]


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

In [None]:


# Load the ROUGE metric once, at module level (not inside the per-example loop).
# generate_summaries() reads this global `metric`, calls
# compute(..., use_stemmer=True) on it and scales every returned value by 100,
# which matches ROUGE. The previous "bertscore" id contradicted both this comment
# and that call site.
metric = load("rouge")


def load_model_and_tokenizer(model_path):
    """Build the BART model and its tokenizer from a single location.

    Args:
        model_path: Value handed unchanged to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple, in that order.
    """
    # Tuple elements are evaluated left to right, so the model is still
    # instantiated before the tokenizer.
    return (
        BartForConditionalGeneration.from_pretrained(model_path),
        BartTokenizer.from_pretrained(model_path),
    )


def generate_summaries(model, tokenizer, dataset, num_examples=None, max_source_length=512,
                       gen_kwargs=None):
    """Generate a summary per dialogue, print each one, then score them all together.

    The function switches ``model`` to eval mode, moves it to CUDA when
    ``torch.cuda.is_available()`` (CPU otherwise), and walks the first
    ``num_examples`` rows of ``dataset``. Scoring is delegated to the
    module-level ``metric`` object, which must be defined before this is called.

    Args:
        model: Object exposing ``eval()``, ``to()``, ``device`` and ``generate()``.
        tokenizer: Object exposing ``encode()`` and ``decode()``.
        dataset: Indexable, sized collection whose rows have ``"dialogue"`` and
            ``"summary"`` entries.
        num_examples: How many leading rows to process. ``None`` means all rows;
            values larger than ``len(dataset)`` are clamped instead of raising
            ``IndexError``.
        max_source_length: Truncation length passed to ``tokenizer.encode``.
        gen_kwargs: Optional dict of overrides merged on top of the default
            generation settings before calling ``model.generate``.

    Returns:
        Dict mapping each key returned by ``metric.compute`` to its value
        multiplied by 100, or an empty dict when no example was processed.
        NOTE(review): the ``* 100`` scaling and ``use_stemmer=True`` assume a
        ROUGE-style metric that yields fractional numeric scores; confirm.
    """
    model.eval()
    model.to("cuda" if torch.cuda.is_available() else "cpu")

    # Clamp the request so we never index past the end of the dataset.
    total = len(dataset)
    num_examples = total if num_examples is None else min(num_examples, total)

    # Decoding settings are loop-invariant, so build them once up front.
    generation_settings = {
        "max_length": 128,
        "num_beams": 4,
        "no_repeat_ngram_size": 3,  # blocks repeated trigrams in the output
        # NOTE(review): 1.0 is believed to be the generate() default, i.e. no
        # extra push towards longer or shorter beams; confirm against the
        # installed transformers version.
        "length_penalty": 1.0,
    }
    if gen_kwargs:
        generation_settings.update(gen_kwargs)  # caller-supplied values win

    all_decoded_preds = []
    all_decoded_labels = []

    for i in range(num_examples):
        example = dataset[i]  # fetch the row once; it is needed for input and reference
        input_dialogue = example["dialogue"]
        input_ids = tokenizer.encode(
            input_dialogue, return_tensors="pt", max_length=max_source_length, truncation=True
        ).to(model.device)

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            generated_ids = model.generate(input_ids=input_ids, **generation_settings)

        generated_summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        reference_summary = example["summary"]

        all_decoded_preds.append(generated_summary)
        all_decoded_labels.append(reference_summary)

        print(f"Input Dialogue:\n{input_dialogue[:200]}...")  # only the first 200 chars
        print(f"Generated Summary:\n{generated_summary}")
        print(f"Reference Summary:\n{reference_summary}")
        print("-" * 50)

    # Nothing was generated (empty dataset or num_examples <= 0): do not hand
    # empty lists to the metric.
    if not all_decoded_preds:
        print("No examples to evaluate.")
        return {}

    # Score the whole batch in a single call after generation has finished.
    result = metric.compute(predictions=all_decoded_preds, references=all_decoded_labels, use_stemmer=True)
    result = {key: value * 100 for key, value in result.items()}  # Convert to percentage

    print("ROUGE Scores:")
    for key, value in result.items():
        print(f"{key}: {value:.2f}")

    # Hand the scores back so callers can log or compare them.
    return result

# Example usage. Requires `test_dataset` to exist already; num_examples=10 limits
# the run to the first 10 rows instead of the whole split.
model_path = "./bart-large-cnn-finetuned-dialogsum"
model, tokenizer = load_model_and_tokenizer(model_path=model_path)
generate_summaries(
    model=model,
    tokenizer=tokenizer,
    dataset=test_dataset,
    num_examples=10,
)



Input Dialogue: #Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to change their communication methods. I don't want any - one using Instant Messaging in this office. It wastes too much time! Now, please continue w

In [None]:

# def evaluate_model(decoded_preds, test_dataset):  # No need for model_name
#     rouge = evaluate.load("rouge")
#     references = test_dataset["summary"]
#     rouge_results = rouge.compute(predictions=decoded_preds, references=references)

#     print("ROUGE-1:", rouge_results["rouge1"])
#     print("ROUGE-2:", rouge_results["rouge2"])
#     print("ROUGE-L:", rouge_results["rougeL"])

#     P, R, F1 = score(decoded_preds, references, lang="en", verbose=True)
#     print("BERTScore F1:", F1.mean().item())
