# **Project: Healthcare answer summarization**

# 1 . Dataset Analysis :

## 1.1 Training Dataset :

In [2]:
import json
from collections import Counter, defaultdict

# Load the dataset
with open('/kaggle/input/palsma-data/train.json', 'r') as f:
    train_data = json.load(f)

# Initialize counters
label_counter = Counter()
summary_type_counter = Counter()
questions_with_labels = defaultdict(set)

# Analyze the dataset
for item in train_data:
    uri = item.get("uri")
    question = item.get("question")
    
    # Count labelled answer spans
    labelled_spans = item.get("labelled_answer_spans", {})
    for label_type, spans in labelled_spans.items():
        label_counter[label_type] += len(spans)
        questions_with_labels[label_type].add(uri)
    
    # Count summary types
    summaries = item.get("labelled_summaries", {})
    for summary_type in summaries:
        summary_type_counter[summary_type] += 1

# Report
print("Total number of questions:", len(train_data))
print("\nLabelled Answer Span Counts:")
for label, count in label_counter.items():
    print(f"{label}: {count} (in {len(questions_with_labels[label])} questions)")

print("\nSummary Types Counts:")
for summary, count in summary_type_counter.items():
    print(f"{summary}: {count}")


Total number of questions: 2236

Labelled Answer Span Counts:
INFORMATION: 4388 (in 1767 questions)
SUGGESTION: 3613 (in 1360 questions)
CAUSE: 579 (in 308 questions)
EXPERIENCE: 1245 (in 747 questions)
QUESTION: 284 (in 215 questions)

Summary Types Counts:
INFORMATION_SUMMARY: 1742
CAUSE_SUMMARY: 305
SUGGESTION_SUMMARY: 1363
EXPERIENCE_SUMMARY: 745
QUESTION_SUMMARY: 213


## 1.2 Validation Dataset:

In [3]:
import json
from collections import Counter, defaultdict

# Load the dataset
with open('/kaggle/input/palsma-data/valid.json', 'r') as f:
    valid_data = json.load(f)

# Initialize counters
label_counter = Counter()
summary_type_counter = Counter()
questions_with_labels = defaultdict(set)

# Analyze the dataset
for item in valid_data:
    uri = item.get("uri")
    question = item.get("question")
    
    # Count labelled answer spans
    labelled_spans = item.get("labelled_answer_spans", {})
    for label_type, spans in labelled_spans.items():
        label_counter[label_type] += len(spans)
        questions_with_labels[label_type].add(uri)
    
    # Count summary types
    summaries = item.get("labelled_summaries", {})
    for summary_type in summaries:
        summary_type_counter[summary_type] += 1

# Report
print("Total number of questions:", len(valid_data))
print("\nLabelled Answer Span Counts:")
for label, count in label_counter.items():
    print(f"{label}: {count} (in {len(questions_with_labels[label])} questions)")

print("\nSummary Types Counts:")
for summary, count in summary_type_counter.items():
    print(f"{summary}: {count}")


Total number of questions: 959

Labelled Answer Span Counts:
EXPERIENCE: 565 (in 316 questions)
INFORMATION: 1805 (in 735 questions)
SUGGESTION: 1635 (in 595 questions)
CAUSE: 266 (in 139 questions)
QUESTION: 131 (in 102 questions)

Summary Types Counts:
EXPERIENCE_SUMMARY: 315
INFORMATION_SUMMARY: 733
CAUSE_SUMMARY: 138
SUGGESTION_SUMMARY: 595
QUESTION_SUMMARY: 101


## 1.3 Test Dataset :

In [5]:
import json
from collections import Counter, defaultdict

# Load the dataset
with open('/kaggle/input/palsma-data/test.json', 'r') as f:
    test_data = json.load(f)

# Initialize counters
label_counter = Counter()
summary_type_counter = Counter()
questions_with_labels = defaultdict(set)

# Analyze the dataset
for item in test_data:
    uri = item.get("uri")
    question = item.get("question")
    
    # Count labelled answer spans
    labelled_spans = item.get("labelled_answer_spans", {})
    for label_type, spans in labelled_spans.items():
        label_counter[label_type] += len(spans)
        questions_with_labels[label_type].add(uri)
    
    # Count summary types
    summaries = item.get("labelled_summaries", {})
    for summary_type in summaries:
        summary_type_counter[summary_type] += 1

# Report
print("Total number of questions:", len(test_data))
print("\nLabelled Answer Span Counts:")
for label, count in label_counter.items():
    print(f"{label}: {count} (in {len(questions_with_labels[label])} questions)")

print("\nSummary Types Counts:")
for summary, count in summary_type_counter.items():
    print(f"{summary}: {count}")


Total number of questions: 640

Labelled Answer Span Counts:
INFORMATION: 1188 (in 488 questions)
CAUSE: 197 (in 103 questions)
SUGGESTION: 1105 (in 394 questions)
EXPERIENCE: 374 (in 207 questions)
QUESTION: 86 (in 65 questions)

Summary Types Counts:
INFORMATION_SUMMARY: 486
CAUSE_SUMMARY: 102
SUGGESTION_SUMMARY: 394
EXPERIENCE_SUMMARY: 206
QUESTION_SUMMARY: 64


In [6]:
def print_samples(data, name="Dataset", count=5):
    print(f"\n{'='*20} {name} Samples {'='*20}")
    for i, sample in enumerate(data[:count]):
        print(f"\n--- Sample {i+1} ---")
        print("Question:", sample.get("question"))
        print("Answers:", sample.get("answers", []))
        summaries = sample.get("labelled_summaries", {})
        if summaries:
            print("Summaries:")
            for k, v in summaries.items():
                print(f"  {k}: {v}")
        else:
            print("Summaries: None")



# Print samples
print_samples(train_data, name="Train")
print_samples(valid_data, name="Validation")
print_samples(test_data, name="Test")




--- Sample 1 ---
Question: what is parkinesonism?
Answers: ['u spelt it wrong !!\nParkinson\'s disease is one of the most common neurologic disorders of the elderly. The term "parkinsonism" refers to any condition that causes any combination of the types of movement abnormalities seen in Parkinson\'s disease by damaging or destroying dopamine neurons in a certain area of the brain.', "Parkinsonism describes the common symptoms of Parkinson's disease - tremor, rigidity, akinesia or bradykinesia and postural instability. Those patients who respond to drug treatment for Parkinson's disease are diagnosed with it, and those who do not have parkinsonism."]
Summaries:
  INFORMATION_SUMMARY: Parkinson's disease is a prevalent neurologic disorder among the elderly. The term "parkinsonism" encompasses any condition leading to movement abnormalities similar to those observed in Parkinson's disease. This condition arises from the damage or destruction of dopamine neurons in a specific brain regi

# 2 . Training Models :

In [1]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.0.0->bert_score)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.0.0->bert_score)
  

# 2.1 Bart Model : 

In [None]:
import os
import json
import math
import torch
import warnings
import numpy as np
from tqdm import tqdm
from rouge import Rouge
from scipy.spatial.distance import cosine
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import (
    BertTokenizer, BertModel, RobertaTokenizer, RobertaForSequenceClassification,
    AutoTokenizer, AutoModelForSeq2SeqLM
)
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW


warnings.filterwarnings("ignore")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


### =============================== Dataset ===============================
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=1024):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        question = item.get("question", "").strip()
        answers = item.get("answers", [])

        labelled_summary_dict = item.get("labelled_summaries", {})
        
        if not labelled_summary_dict:
            return self.__getitem__((idx + 1) % len(self.data))  # Skip bad sample


        labelled_answer_spans = item.get("labelled_answer_spans", {})

        
        # Assume only one perspective (e.g., "INFORMATION")
        # if not labelled_summary_dict:
        #     raise ValueError("Missing labelled_summaries in example.")
        perspective_key = list(labelled_summary_dict.keys())[0]
        perspective = perspective_key.replace("_SUMMARY", "")
        target_text = labelled_summary_dict[perspective_key].strip()

        # Prepare answer context
        concatenated_answers = " ".join([ans.replace('\n', ' ').strip() for ans in answers])

        # Definitions and tones for guidance
        start_phrases = {
            "SUGGESTION": ("It is suggested", "Advisory, Recommending",
                           ["Advisory", "Recommending", "Cautioning", "Prescriptive", "Guiding"]),
            "INFORMATION": ("For information purposes", "Informative, Educational",
                            ["Clinical", "Scientific", "Informative", "Educational"]),
            "EXPERIENCE": ("In user's experience", "Personal, Narrative",
                           ["Personal", "Narrative", "Introspective", "Exemplary"]),
            "CAUSE": ("Some of the causes", "Explanatory, Causal",
                      ["Diagnostic", "Explanatory", "Causal", "Due to"]),
            "QUESTION": ("It is inquired", "Seeking Understanding",
                         ["Inquiry", "Rhetorical", "Exploratory Questioning"])
        }

        definitions = {
            "SUGGESTION": "Advice or recommendations to assist users.",
            "INFORMATION": "Knowledge about diseases and facts.",
            "EXPERIENCE": "Individual experiences or insights.",
            "CAUSE": "Reasons responsible for symptoms or conditions.",
            "QUESTION" : "Inquiry made for deeper understanding."
        }

        start_with, tone, _ = start_phrases.get(perspective, ("", "", []))
        definition = definitions.get(perspective, "")

        # Check and prepend start phrase if necessary
        if len(set(target_text.split()[:5]).intersection(set(start_with.split()))) < 2:
            target_text = f"{start_with} {target_text}"

        # Build task input
        task_prefix = (
            f"Adhering to the condition of 'begin summary with' and 'tone of summary' and summarize "
            f"according to {perspective} and start the summary with '{start_with}'. "
            f"Maintain summary tone as {tone}. "
            f"Definition of perspective: {definition}. "
            f"Content to summarize: {concatenated_answers} Question: {question}."
        )

        inputs = self.tokenizer(task_prefix, padding="max_length", max_length=self.max_length,
                                truncation=True, return_tensors="pt")
        labels = self.tokenizer(target_text, padding="max_length", max_length=self.max_length,
                                truncation=True, return_tensors="pt")

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": labels["input_ids"].squeeze(),
            "perspective": perspective,
            "Summary": target_text
        }



### =============================== Dataloaders ===============================
def create_dataloader(train_dataset, valid_dataset, train_bs, valid_bs):
    return (
        DataLoader(train_dataset, batch_size=train_bs, shuffle=True),
        DataLoader(valid_dataset, batch_size=valid_bs, shuffle=True)
    )

def test_create_dataloader(test_dataset, test_bs):
    return DataLoader(test_dataset, batch_size=test_bs, shuffle=False)


### =============================== Models ===============================
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=5).to(device)

ckpt_path = "./classifier/checkpoint_classifier"
if os.path.exists(ckpt_path):
    print("Loading the trained checkpoint...")
    ckpt = torch.load(ckpt_path)
    roberta_model.load_state_dict(ckpt['model_state_dict'])


### =============================== Embedding & Scoring ===============================
def get_bert_embedding(text):
    inputs = bert_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze()


def Ep(summary):
    inputs = roberta_tokenizer(summary, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        logits = roberta_model(**inputs).logits
        probs = torch.nn.functional.softmax(logits, dim=-1)
    classes = ["EXPERIENCE", "SUGGESTION", "INFORMATION", "CAUSE", "QUESTION"]
    return {cls: probs[0][i].item() for i, cls in enumerate(classes)}


def Es(summary):
    start_phrases = [
        "In user's experience…", "It is suggested", "For information purposes",
        "Some of the causes", "It is inquired"
    ]
    pred = ' '.join(summary.split()[:4])
    rouge = Rouge()
    scores = {}
    for ref in start_phrases:
        score = rouge.get_scores(pred.lower(), ref.lower())[0]["rouge-1"]["f"]
        scores[ref] = score
    return scores


def Et(summary):
    tone_dict = {
        'sugg': ["Advisory", "Recommending", "Cautioning", "Prescriptive"],
        'exp': ["Personal", "Narrative", "Introspective"],
        'info': ["Clinical", "Scientific", "Informative"],
        'cause': ["Diagnostic", "Explanatory", "Causal"],
        'qs': ["Inquiry", "Rhetorical", "Exploratory Questioning"]
    }
    summary_emb = get_bert_embedding(summary)
    sims = {}
    for k, word_list in tone_dict.items():
        phrase_emb = get_bert_embedding(' '.join(word_list))
        sims[k] = 1 - cosine(summary_emb.detach().cpu().numpy(), phrase_emb.detach().cpu().numpy())
    return sims



### =============================== Custom Loss ===============================
def compute_custom_loss(model, input_ids, attention_mask, perspectives, tokenizer):
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=100, num_beams=5)
    generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if len(generated_summary) <= 0:
        generated_summary = 'None'

    Ep_dict = Ep(generated_summary)
    Es_dict = Es(generated_summary)
    Et_dict = Et(generated_summary)

    alpha, beta, gamma = 0.7, 0.3, 0.5

    E_X = {
        "EXPERIENCE": alpha * Ep_dict["EXPERIENCE"] + beta * Es_dict["In user's experience…"] + gamma * Et_dict['exp'],
        "SUGGESTION": alpha * Ep_dict["SUGGESTION"] + beta * Es_dict["It is suggested"] + gamma * Et_dict['sugg'],
        "INFORMATION": alpha * Ep_dict["INFORMATION"] + beta * Es_dict["For information purposes"] + gamma * Et_dict['info'],
        "CAUSE": alpha * Ep_dict["CAUSE"] + beta * Es_dict["Some of the causes"] + gamma * Et_dict['cause'],
        "QUESTION": alpha * Ep_dict["QUESTION"] + beta * Es_dict["It is inquired"] + gamma * Et_dict['qs']
    }

    exp_E_X = {k: math.exp(-1 / (v + 1e-6)) for k, v in E_X.items()}
    Z = sum(exp_E_X.values())
    P_X = {k: v / Z for k, v in exp_E_X.items()}

    Y = {k: 0 for k in E_X}
    Y[perspectives[0]] = 1

    P_X_tensor = torch.tensor(list(P_X.values())).to(device)
    Y_tensor = torch.tensor(list(Y.values())).to(device)

    return -torch.sum(Y_tensor * torch.log(P_X_tensor + 1e-6))


### =============================== Validation Loop ===============================
def validate(model, valid_loader, tokenizer):
    print("Starting validation...")
    model.eval()
    losses = []
    for i, batch in enumerate(tqdm(valid_loader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            custom_loss = compute_custom_loss(model, input_ids, attention_mask, [batch["perspective"][0]], tokenizer)
            total_loss = output.loss + custom_loss
            losses.append(total_loss.item())

        print(f"Batch {i+1}/{len(valid_loader)} | Loss: {total_loss.item():.4f}")

    avg_loss = np.mean(losses)
    print(f"\nValidation completed. Avg loss: {avg_loss:.4f}")
    return avg_loss


def main():
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    import random

    # Set seeds for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)
    random.seed(42)

    # -------------------- Load Data --------------------
    with open("/kaggle/input/plasma-data/train.json", "r") as f:
        train_data = json.load(f)
    with open("/kaggle/input/plasma-data/valid.json", "r") as f:
        val_data = json.load(f)

    # -------------------- Model Setup --------------------
    model_name = "facebook/bart-base"  # or use 'google/flan-t5-base'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

    # -------------------- Dataset and Dataloader --------------------
    train_dataset = CustomDataset(train_data, tokenizer)
    val_dataset = CustomDataset(val_data, tokenizer)

    train_loader, val_loader = create_dataloader(train_dataset, val_dataset, train_bs=2, valid_bs=2)

    # -------------------- Optimizer --------------------
    optimizer = AdamW(model.parameters(), lr=5e-5)



    num_epochs = 10
    best_val_loss = float("inf")

    for epoch in range(num_epochs):
        print(f"\n======== Epoch {epoch + 1}/{num_epochs} ========")
        model.train()
        epoch_losses = []

        for step, batch in enumerate(tqdm(train_loader)):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            perspectives = [batch["perspective"]] if isinstance(batch["perspective"], str) else batch["perspective"]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss_ce = outputs.loss
            # loss_custom = compute_custom_loss(model, input_ids, attention_mask, perspectives, tokenizer)
            loss_custom = compute_custom_loss(model, input_ids[0].unsqueeze(0), attention_mask[0].unsqueeze(0), [batch["perspective"][0]], tokenizer)

            total_loss = loss_ce + loss_custom

            total_loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            epoch_losses.append(total_loss.item())

            if step % 10 == 0:
                print(f"Step {step} | Loss: {total_loss.item():.4f} (CE: {loss_ce.item():.4f}, Custom: {loss_custom.item():.4f})")

        avg_train_loss = np.mean(epoch_losses)
        print(f"Epoch {epoch + 1} Avg Training Loss: {avg_train_loss:.4f}")

        # -------------------- Validation --------------------
        val_loss = validate(model, val_loader, tokenizer)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            print(f"Saving best model (val_loss = {val_loss:.4f})...")
            model.save_pretrained("best_model")
            tokenizer.save_pretrained("best_model")

    print("\nTraining Finished!")


if __name__ == "__main__":
    main()


In [None]:
import shutil

# Source path
source_path = "/kaggle/working/best_model"

# Destination path (this will create a zip file you can download)
shutil.make_archive('/kaggle/working/best_model', 'zip', source_path)


## 2.1 Bart model : (Inference)

In [3]:
import os
import json
import torch
import pandas as pd
from tqdm import tqdm
from rouge import Rouge
from bert_score import score as bert_scoring
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import Dataset, DataLoader
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=1024):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        question = item.get("question", "").strip()
        answers = item.get("answers", [])

        labelled_summary_dict = item.get("labelled_summaries", {})
        
        if not labelled_summary_dict:
            return self.__getitem__((idx + 1) % len(self.data))  # Skip bad sample


        labelled_answer_spans = item.get("labelled_answer_spans", {})

        
        # Assume only one perspective (e.g., "INFORMATION")
        # if not labelled_summary_dict:
        #     raise ValueError("Missing labelled_summaries in example.")
        perspective_key = list(labelled_summary_dict.keys())[0]
        perspective = perspective_key.replace("_SUMMARY", "")
        target_text = labelled_summary_dict[perspective_key].strip()

        # Prepare answer context
        concatenated_answers = " ".join([ans.replace('\n', ' ').strip() for ans in answers])

        # Definitions and tones for guidance
        start_phrases = {
            "SUGGESTION": ("It is suggested", "Advisory, Recommending",
                           ["Advisory", "Recommending", "Cautioning", "Prescriptive", "Guiding"]),
            "INFORMATION": ("For information purposes", "Informative, Educational",
                            ["Clinical", "Scientific", "Informative", "Educational"]),
            "EXPERIENCE": ("In user's experience", "Personal, Narrative",
                           ["Personal", "Narrative", "Introspective", "Exemplary"]),
            "CAUSE": ("Some of the causes", "Explanatory, Causal",
                      ["Diagnostic", "Explanatory", "Causal", "Due to"]),
            "QUESTION": ("It is inquired", "Seeking Understanding",
                         ["Inquiry", "Rhetorical", "Exploratory Questioning"])
        }

        definitions = {
            "SUGGESTION": "Advice or recommendations to assist users.",
            "INFORMATION": "Knowledge about diseases and facts.",
            "EXPERIENCE": "Individual experiences or insights.",
            "CAUSE": "Reasons responsible for symptoms or conditions.",
            "QUESTION" : "Inquiry made for deeper understanding."
        }

        start_with, tone, _ = start_phrases.get(perspective, ("", "", []))
        definition = definitions.get(perspective, "")

        # Check and prepend start phrase if necessary
        if len(set(target_text.split()[:5]).intersection(set(start_with.split()))) < 2:
            target_text = f"{start_with} {target_text}"

        # Build task input
        task_prefix = (
            f"Adhering to the condition of 'begin summary with' and 'tone of summary' and summarize "
            f"according to {perspective} and start the summary with '{start_with}'. "
            f"Maintain summary tone as {tone}. "
            f"Definition of perspective: {definition}. "
            f"Content to summarize: {concatenated_answers} Question: {question}."
        )

        inputs = self.tokenizer(task_prefix, padding="max_length", max_length=self.max_length,
                                truncation=True, return_tensors="pt")
        labels = self.tokenizer(target_text, padding="max_length", max_length=self.max_length,
                                truncation=True, return_tensors="pt")

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": labels["input_ids"].squeeze(),
            "perspective": perspective,
            "Summary": target_text
        }



### =============================== Dataloaders ===============================
def create_dataloader(train_dataset, valid_dataset, train_bs, valid_bs):
    return (
        DataLoader(train_dataset, batch_size=train_bs, shuffle=True),
        DataLoader(valid_dataset, batch_size=valid_bs, shuffle=True)
    )

def test_create_dataloader(test_dataset, test_bs):
    return DataLoader(test_dataset, batch_size=test_bs, shuffle=False)


def calculate_metrics(pred_summary, actual_summary):
    # Tokenize for METEOR and BLEU
    pred_tokens = pred_summary.split()
    actual_tokens = [actual_summary.split()]  # Note: BLEU expects reference as list of lists
    
    # Calculate BLEU
    smoothie = SmoothingFunction().method4
    bleu_score = sentence_bleu(actual_tokens, pred_tokens, smoothing_function=smoothie)
    
    # Calculate ROUGE
    rouge = Rouge()
    rouge_scores = rouge.get_scores(pred_summary, actual_summary)[0]
    
    # Calculate METEOR
    meteor = meteor_score(actual_tokens, pred_tokens)
    
    return {
        'BLEU': bleu_score,
        'ROUGE-1': rouge_scores['rouge-1']['f'],
        'ROUGE-2': rouge_scores['rouge-2']['f'],
        'ROUGE-L': rouge_scores['rouge-l']['f'],
        'METEOR': meteor
    }

def run_inference_with_metrics(model, test_loader, tokenizer):
    model.eval()
    results = []
    all_preds = []
    all_refs = []
    
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        perspectives = batch['perspective']
        actual_summaries = batch['Summary']
        
        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=150,
                num_beams=5,
                early_stopping=True
            )
        
        for i in range(len(outputs)):
            pred_summary = tokenizer.decode(outputs[i], skip_special_tokens=True)
            actual_summary = actual_summaries[i]
            perspective = perspectives[i]
            
            metrics = calculate_metrics(pred_summary, actual_summary)
            
            result = {
                "Perspective": perspective,
                "Actual Summary": actual_summary,
                "Predicted Summary": pred_summary,
                **metrics
            }
            results.append(result)
            all_preds.append(pred_summary)
            all_refs.append(actual_summary)
    
    # Calculate BERTScore for all samples at once (more efficient)
    P, R, F1 = bert_scoring(all_preds, all_refs, lang='en', verbose=True)
    
    # Add BERTScore to individual results
    for i, result in enumerate(results):
        result['BERTScore_P'] = P[i].item()
        result['BERTScore_R'] = R[i].item()
        result['BERTScore_F1'] = F1[i].item()
    
    return results, (P.mean().item(), R.mean().item(), F1.mean().item())

def calculate_perspective_wise_metrics(results):
    perspectives = set(r['Perspective'] for r in results)
    perspective_metrics = {}
    
    for perspective in perspectives:
        perspective_results = [r for r in results if r['Perspective'] == perspective]
        
        metrics = {
            'Count': len(perspective_results),
            'R1': sum(r['ROUGE-1'] for r in perspective_results) / len(perspective_results) * 100,
            'R2': sum(r['ROUGE-2'] for r in perspective_results) / len(perspective_results) * 100,
            'RL': sum(r['ROUGE-L'] for r in perspective_results) / len(perspective_results) * 100,
            'BERTScore': sum(r['BERTScore_F1'] for r in perspective_results) / len(perspective_results),
            'METEOR': sum(r['METEOR'] for r in perspective_results) / len(perspective_results),
            'BLEU': sum(r['BLEU'] for r in perspective_results) / len(perspective_results)
        }
        perspective_metrics[perspective] = metrics
    
    return perspective_metrics

def save_perspective_wise_table(perspective_metrics, filename="bart_perspective_wise_metrics.csv"):
    # Prepare data for DataFrame
    data = []
    for perspective, metrics in perspective_metrics.items():
        data.append({
            'Perspective': perspective,
            'R1': f"{metrics['R1']:.2f}",
            'R2': f"{metrics['R2']:.2f}",
            'RL': f"{metrics['RL']:.2f}",
            'BERTScore': f"{metrics['BERTScore']:.3f}",
            'METEOR': f"{metrics['METEOR']:.3f}",
            'BLEU': f"{metrics['BLEU']:.3f}"
        })
    
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    print(f"Perspective-wise metrics saved to {filename}")
    return df



def print_perspective_wise_table(perspective_metrics):
    print("\nPERSPECTIVE-WISE METRICS:")
    print("{:<15} {:<8} {:<8} {:<8} {:<10} {:<8} {:<8}".format(
        "Perspective", "R1", "R2", "RL", "BERTScore", "METEOR", "BLEU"))
    print("-"*65)
    
    for perspective, metrics in perspective_metrics.items():
        print("{:<15} {:<8.2f} {:<8.2f} {:<8.2f} {:<10.3f} {:<8.3f} {:<8.3f}".format(
            perspective,
            metrics['R1'],
            metrics['R2'],
            metrics['RL'],
            metrics['BERTScore'],
            metrics['METEOR'],
            metrics['BLEU']
        ))

def save_all_results_json(results, filename="bart_all_evaluation_results.json"):
    """Save all evaluation results in JSON format"""
    # Convert DataFrame-compatible results to JSON-serializable format
    json_results = []
    for result in results:
        json_result = {
            "Perspective": result["Perspective"],
            "Actual_Summary": result["Actual Summary"],
            "Predicted_Summary": result["Predicted Summary"],
            "Metrics": {
                "BLEU": float(result["BLEU"]),
                "ROUGE-1": float(result["ROUGE-1"]),
                "ROUGE-2": float(result["ROUGE-2"]),
                "ROUGE-L": float(result["ROUGE-L"]),
                "METEOR": float(result["METEOR"]),
                "BERTScore_P": float(result["BERTScore_P"]),
                "BERTScore_R": float(result["BERTScore_R"]),
                "BERTScore_F1": float(result["BERTScore_F1"])
            }
        }
        json_results.append(json_result)
    
    with open(filename, 'w') as f:
        json.dump(json_results, f, indent=4)
    print(f"All evaluation results saved to {filename}")

def save_perspective_metrics_json(perspective_metrics, filename="bart_perspective_wise_metrics.json"):
    """Save perspective-wise metrics in JSON format"""
    # Convert metrics to JSON-serializable format
    json_metrics = {}
    for perspective, metrics in perspective_metrics.items():
        json_metrics[perspective] = {
            "R1": float(metrics['R1']),
            "R2": float(metrics['R2']),
            "RL": float(metrics['RL']),
            "BERTScore": float(metrics['BERTScore']),
            "METEOR": float(metrics['METEOR']),
            "BLEU": float(metrics['BLEU']),
            "Count": int(metrics['Count'])
        }
    
    with open(filename, 'w') as f:
        json.dump(json_metrics, f, indent=4)
    print(f"Perspective-wise metrics saved to {filename}")


# Load test data
with open("/kaggle/input/palsma-data/test.json", "r") as f:
    test_data = json.load(f)

# Initialize model and tokenizer
model_name = "/kaggle/input/bart_based_updated/transformers/default/1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Create test dataset and loader
test_dataset = CustomDataset(test_data, tokenizer)
test_loader = test_create_dataloader(test_dataset, test_bs=4)

results, overall_bertscore = run_inference_with_metrics(model, test_loader, tokenizer)

# Calculate perspective-wise metrics
perspective_metrics = calculate_perspective_wise_metrics(results)

# Save results in JSON format
save_all_results_json(results)  # Saves all individual evaluation results
save_perspective_metrics_json(perspective_metrics)  # Saves perspective-wise metrics

# Also save in CSV format (optional)
all_results_df = pd.DataFrame(results)
all_results_df.to_csv("bart_all_evaluation_results.csv", index=False)

perspective_df = pd.DataFrame.from_dict(perspective_metrics, orient='index')
perspective_df.to_csv("bart_perspective_wise_metrics.csv")

# Print results
print_perspective_wise_table(perspective_metrics)

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
2025-04-21 04:58:57.469122: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745211537.693710      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745211537.755139      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
100%|██████████| 160/160 [05:57<00:00,  2.23s/it]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/20 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/10 [00:00<?, ?it/s]

done in 16.84 seconds, 38.01 sentences/sec
All evaluation results saved to bart_all_evaluation_results.json
Perspective-wise metrics saved to bart_perspective_wise_metrics.json

PERSPECTIVE-WISE METRICS:
Perspective     R1       R2       RL       BERTScore  METEOR   BLEU    
-----------------------------------------------------------------
QUESTION        47.52    32.17    47.29    0.908      0.427    0.199   
SUGGESTION      35.54    18.33    33.00    0.890      0.266    0.090   
CAUSE           40.79    24.52    38.75    0.895      0.357    0.162   
INFORMATION     38.51    18.40    36.20    0.892      0.310    0.099   
EXPERIENCE      41.11    27.55    39.01    0.905      0.383    0.182   


In [6]:
# def generate_perspective_summaries(model, tokenizer, question, answers):
#     """
#     Generate summaries for all perspectives based on user-provided question and answers.
    
#     Args:
#         model: Loaded Seq2Seq model
#         tokenizer: Model tokenizer
#         question: User input question
#         answers: List of answer strings
#     """
#     perspectives = {
#         "SUGGESTION": "Advice or recommendations to assist users.",
#         "INFORMATION": "Knowledge about diseases and facts.",
#         "EXPERIENCE": "Individual experiences or insights.",
#         "CAUSE": "Reasons responsible for symptoms or conditions.",
#         "QUESTION": "Inquiry made for deeper understanding."
#     }
    
#     start_phrases = {
#         "SUGGESTION": ("It is suggested", "Advisory, Recommending",
#                        ["Advisory", "Recommending", "Cautioning", "Prescriptive", "Guiding"]),
#         "INFORMATION": ("For information purposes", "Informative, Educational",
#                         ["Clinical", "Scientific", "Informative", "Educational"]),
#         "EXPERIENCE": ("In user's experience", "Personal, Narrative",
#                        ["Personal", "Narrative", "Introspective", "Exemplary"]),
#         "CAUSE": ("Some of the causes", "Explanatory, Causal",
#                   ["Diagnostic", "Explanatory", "Causal", "Due to"]),
#         "QUESTION": ("It is inquired", "Seeking Understanding",
#                      ["Inquiry", "Rhetorical", "Exploratory Questioning"])
#     }
    
#     concatenated_answers = " ".join([ans.replace('\n', ' ').strip() for ans in answers])
    
#     results = []
    
#     for perspective, definition in perspectives.items():
#         # Get perspective-specific parameters
#         start_with, tone, _ = start_phrases.get(perspective, ("", "", []))
        
#         # Build task input
#         task_prefix = (
#             f"Adhering to the condition of 'begin summary with' and 'tone of summary' and summarize "
#             f"according to {perspective} and start the summary with '{start_with}'. "
#             f"Maintain summary tone as {tone}. "
#             f"Definition of perspective: {definition}. "
#             f"Content to summarize: {concatenated_answers} Question: {question}."
#         )
        
#         # Tokenize and generate
#         inputs = tokenizer(task_prefix, return_tensors="pt", max_length=1024, truncation=True).to(device)
        
#         with torch.no_grad():
#             outputs = model.generate(
#                 input_ids=inputs["input_ids"],
#                 attention_mask=inputs["attention_mask"],
#                 max_new_tokens=150,
#                 num_beams=5,
#                 early_stopping=True
#             )
        
#         pred_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
#         results.append({
#             "Perspective": perspective,
#             "Summary": pred_summary
#         })
    
#     return results

# # Example usage:
# if __name__ == "__main__":
#     # Load model (only needed once)
#     model_name = "/kaggle/input/bart_based_updated/transformers/default/1"
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    
#     # Get user input
#     print("Enter your question:")
#     question = input().strip()
    
#     print("Enter answers (one per line, type 'END' when finished):")
#     answers = []
#     while True:
#         answer = input().strip()
#         if answer.upper() == 'END':
#             break
#         answers.append(answer)
    
#     # Generate summaries
#     summaries = generate_perspective_summaries(model, tokenizer, question, answers)
    
#     # Display results
#     print("\nGenerated Summaries:")
#     for summary in summaries:
#         print(f"\n[{summary['Perspective']} PERSPECTIVE]")
#         print(summary['Summary'])

# 2.2 Bart Large Model :

In [None]:
import os
import json
import math
import torch
import warnings
import numpy as np
from tqdm import tqdm
from rouge import Rouge
from scipy.spatial.distance import cosine
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import (
    BertTokenizer, BertModel, RobertaTokenizer, RobertaForSequenceClassification,
    AutoTokenizer, AutoModelForSeq2SeqLM, get_linear_schedule_with_warmup
)
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW


warnings.filterwarnings("ignore")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


### =============================== Dataset ===============================
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=1024):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        question = item.get("question", "").strip()
        answers = item.get("answers", [])

        labelled_summary_dict = item.get("labelled_summaries", {})
        
        if not labelled_summary_dict:
            return self.__getitem__((idx + 1) % len(self.data))  # Skip bad sample


        labelled_answer_spans = item.get("labelled_answer_spans", {})

        
        # Assume only one perspective (e.g., "INFORMATION")
        # if not labelled_summary_dict:
        #     raise ValueError("Missing labelled_summaries in example.")
        perspective_key = list(labelled_summary_dict.keys())[0]
        perspective = perspective_key.replace("_SUMMARY", "")
        target_text = labelled_summary_dict[perspective_key].strip()

        # Prepare answer context
        concatenated_answers = " ".join([ans.replace('\n', ' ').strip() for ans in answers])

        # Definitions and tones for guidance
        start_phrases = {
            "SUGGESTION": ("It is suggested", "Advisory, Recommending",
                           ["Advisory", "Recommending", "Cautioning", "Prescriptive", "Guiding"]),
            "INFORMATION": ("For information purposes", "Informative, Educational",
                            ["Clinical", "Scientific", "Informative", "Educational"]),
            "EXPERIENCE": ("In user's experience", "Personal, Narrative",
                           ["Personal", "Narrative", "Introspective", "Exemplary"]),
            "CAUSE": ("Some of the causes", "Explanatory, Causal",
                      ["Diagnostic", "Explanatory", "Causal", "Due to"]),
            "QUESTION": ("It is inquired", "Seeking Understanding",
                         ["Inquiry", "Rhetorical", "Exploratory Questioning"])
        }

        definitions = {
            "SUGGESTION": "Advice or recommendations to assist users.",
            "INFORMATION": "Knowledge about diseases and facts.",
            "EXPERIENCE": "Individual experiences or insights.",
            "CAUSE": "Reasons responsible for symptoms or conditions.",
            "QUESTION" : "Inquiry made for deeper understanding."
        }

        start_with, tone, _ = start_phrases.get(perspective, ("", "", []))
        definition = definitions.get(perspective, "")

        # Check and prepend start phrase if necessary
        if len(set(target_text.split()[:5]).intersection(set(start_with.split()))) < 2:
            target_text = f"{start_with} {target_text}"

        # Build task input
        task_prefix = (
            f"Adhering to the condition of 'begin summary with' and 'tone of summary' and summarize "
            f"according to {perspective} and start the summary with '{start_with}'. "
            f"Maintain summary tone as {tone}. "
            f"Definition of perspective: {definition}. "
            f"Content to summarize: {concatenated_answers} Question: {question}."
        )

        inputs = self.tokenizer(task_prefix, padding="max_length", max_length=self.max_length,
                                truncation=True, return_tensors="pt")
        labels = self.tokenizer(target_text, padding="max_length", max_length=self.max_length,
                                truncation=True, return_tensors="pt")

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": labels["input_ids"].squeeze(),
            "perspective": perspective,
            "Summary": target_text
        }



### =============================== Dataloaders ===============================
def create_dataloader(train_dataset, valid_dataset, train_bs, valid_bs):
    return (
        DataLoader(train_dataset, batch_size=train_bs, shuffle=True),
        DataLoader(valid_dataset, batch_size=valid_bs, shuffle=True)
    )

def test_create_dataloader(test_dataset, test_bs):
    return DataLoader(test_dataset, batch_size=test_bs, shuffle=False)


### =============================== Models ===============================
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=5).to(device)

ckpt_path = "./classifier/checkpoint_classifier"
if os.path.exists(ckpt_path):
    print("Loading the trained checkpoint...")
    ckpt = torch.load(ckpt_path)
    roberta_model.load_state_dict(ckpt['model_state_dict'])


### =============================== Embedding & Scoring ===============================
def get_bert_embedding(text):
    inputs = bert_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze()


def Ep(summary):
    inputs = roberta_tokenizer(summary, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        logits = roberta_model(**inputs).logits
        probs = torch.nn.functional.softmax(logits, dim=-1)
    classes = ["EXPERIENCE", "SUGGESTION", "INFORMATION", "CAUSE", "QUESTION"]
    return {cls: probs[0][i].item() for i, cls in enumerate(classes)}


def Es(summary):
    if not summary.strip():  # If summary is empty after stripping whitespace
        return {ref: 0.0 for ref in [
            "In user's experience…", "It is suggested", "For information purposes",
            "Some of the causes", "It is inquired"
        ]}
    
    start_phrases = [
        "In user's experience…", "It is suggested", "For information purposes",
        "Some of the causes", "It is inquired"
    ]
    pred = ' '.join(summary.split()[:4])
    rouge = Rouge()
    scores = {}
    for ref in start_phrases:
        try:
            score = rouge.get_scores(pred.lower(), ref.lower())[0]["rouge-1"]["f"]
        except ValueError:  # In case of any ROUGE calculation error
            score = 0.0
        scores[ref] = score
    return scores


def Et(summary):
    tone_dict = {
        'sugg': ["Advisory", "Recommending", "Cautioning", "Prescriptive"],
        'exp': ["Personal", "Narrative", "Introspective"],
        'info': ["Clinical", "Scientific", "Informative"],
        'cause': ["Diagnostic", "Explanatory", "Causal"],
        'qs': ["Inquiry", "Rhetorical", "Exploratory Questioning"]
    }
    summary_emb = get_bert_embedding(summary)
    sims = {}
    for k, word_list in tone_dict.items():
        phrase_emb = get_bert_embedding(' '.join(word_list))
        sims[k] = 1 - cosine(summary_emb.detach().cpu().numpy(), phrase_emb.detach().cpu().numpy())
    return sims



### =============================== Custom Loss ===============================
def compute_custom_loss(model, input_ids, attention_mask, perspectives, tokenizer):
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=100, num_beams=5)
    generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if not generated_summary.strip():  # If empty summary
        return torch.tensor(0.0, device=device)  # Return zero loss for empty summaries

    try:
        Ep_dict = Ep(generated_summary)
        Es_dict = Es(generated_summary)
        Et_dict = Et(generated_summary)
    except Exception as e:
        print(f"Error calculating custom metrics: {e}")
        return torch.tensor(0.0, device=device)

    alpha, beta, gamma = 0.7, 0.3, 0.5

    E_X = {
        "EXPERIENCE": alpha * Ep_dict["EXPERIENCE"] + beta * Es_dict["In user's experience…"] + gamma * Et_dict['exp'],
        "SUGGESTION": alpha * Ep_dict["SUGGESTION"] + beta * Es_dict["It is suggested"] + gamma * Et_dict['sugg'],
        "INFORMATION": alpha * Ep_dict["INFORMATION"] + beta * Es_dict["For information purposes"] + gamma * Et_dict['info'],
        "CAUSE": alpha * Ep_dict["CAUSE"] + beta * Es_dict["Some of the causes"] + gamma * Et_dict['cause'],
        "QUESTION": alpha * Ep_dict["QUESTION"] + beta * Es_dict["It is inquired"] + gamma * Et_dict['qs']
    }

    exp_E_X = {k: math.exp(-1 / (v + 1e-6)) for k, v in E_X.items()}
    Z = sum(exp_E_X.values())
    P_X = {k: v / Z for k, v in exp_E_X.items()}

    Y = {k: 0 for k in E_X}
    Y[perspectives[0]] = 1

    P_X_tensor = torch.tensor(list(P_X.values())).to(device)
    Y_tensor = torch.tensor(list(Y.values())).to(device)

    return -torch.sum(Y_tensor * torch.log(P_X_tensor + 1e-6))


### =============================== Validation Loop ===============================
def validate(model, valid_loader, tokenizer):
    print("Starting validation...")
    model.eval()
    losses = []
    for i, batch in enumerate(tqdm(valid_loader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            custom_loss = compute_custom_loss(model, input_ids, attention_mask, [batch["perspective"][0]], tokenizer)
            total_loss = output.loss + custom_loss
            losses.append(total_loss.item())

        print(f"Batch {i+1}/{len(valid_loader)} | Loss: {total_loss.item():.4f}")

    avg_loss = np.mean(losses)
    print(f"\nValidation completed. Avg loss: {avg_loss:.4f}")
    return avg_loss


def main():
    # Set seeds
    torch.manual_seed(42)
    np.random.seed(42)
    torch.backends.cudnn.deterministic = True

    # Load data
    with open("/kaggle/input/palsma-data/train.json", "r") as f:
        train_data = json.load(f)
    with open("/kaggle/input/palsma-data/valid.json", "r") as f:
        val_data = json.load(f)

    # Initialize BART-large with adjusted dropout
    model_name = "facebook/bart-large"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, 
                                                dropout=0.2, 
                                                attention_dropout=0.2).to(device)

    # Dataset and Dataloader with larger effective batch size
    train_dataset = CustomDataset(train_data, tokenizer)
    val_dataset = CustomDataset(val_data, tokenizer)
    
    train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)

    # Optimizer with weight decay and lower LR
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, eps=1e-8)

    # Longer training with more warmup
    num_epochs = 15
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=int(0.2 * total_steps),  # 20% warmup
        num_training_steps=total_steps
    )

    # Mixed precision and gradient clipping
    scaler = torch.cuda.amp.GradScaler()
    max_grad_norm = 0.5

    # Training loop
    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        
        for step, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}")):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            perspectives = [batch["perspective"][0]]

            with torch.cuda.amp.autocast():
                outputs = model(input_ids=input_ids, 
                              attention_mask=attention_mask, 
                              labels=labels)
                loss_ce = outputs.loss
                loss_custom = compute_custom_loss(model, input_ids, attention_mask, perspectives, tokenizer)
                total_loss = loss_ce + 0.3 * loss_custom  # Balanced weighting

            scaler.scale(total_loss).backward()
            epoch_loss += total_loss.item()

            # Gradient clipping and update
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()

        # Validation
        val_loss = validate(model, val_loader, tokenizer)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            model.save_pretrained("best_bart_large")
            tokenizer.save_pretrained("best_bart_large")

        print(f"Epoch {epoch+1} | Train Loss: {epoch_loss/len(train_loader):.4f} | Val Loss: {val_loss:.4f}")

if __name__ == "__main__":
    main()


## 2.2 Bart Large Model : (Inference)

In [5]:
import json
import numpy as np
import zipfile
import os
from tqdm import tqdm
from rouge import Rouge
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')
nltk.download('wordnet')

# Initialize metrics
rouge = Rouge()
smoother = SmoothingFunction()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the trained model
model_path = "/kaggle/input/bart_large_15epoch/transformers/default/1"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
model.eval()

# Load test data
with open("/kaggle/input/palsma-data/test.json", "r") as f:
    test_data = json.load(f)

# Prepare for evaluation
all_results = []
perspective_metrics = {
    "EXPERIENCE": {"R1": [], "R2": [], "RL": [], "BERTScore": [], "METEOR": [], "BLEU": []},
    "QUESTION": {"R1": [], "R2": [], "RL": [], "BERTScore": [], "METEOR": [], "BLEU": []},
    "INFORMATION": {"R1": [], "R2": [], "RL": [], "BERTScore": [], "METEOR": [], "BLEU": []},
    "SUGGESTION": {"R1": [], "R2": [], "RL": [], "BERTScore": [], "METEOR": [], "BLEU": []},
    "CAUSE": {"R1": [], "R2": [], "RL": [], "BERTScore": [], "METEOR": [], "BLEU": []}
}

def compute_metrics(pred, ref, perspective):
    # ROUGE scores
    try:
        rouge_scores = rouge.get_scores(pred, ref)[0]
        r1 = rouge_scores['rouge-1']['f'] * 100
        r2 = rouge_scores['rouge-2']['f'] * 100
        rl = rouge_scores['rouge-l']['f'] * 100
    except:
        r1, r2, rl = 0, 0, 0

    # BERTScore
    P, R, F1 = bert_score([pred], [ref], lang='en', verbose=False)
    bertscore = F1.mean().item() * 100

    # METEOR
    meteor = meteor_score([word_tokenize(ref)], word_tokenize(pred)) * 100

    # BLEU
    bleu = sentence_bleu([word_tokenize(ref)], word_tokenize(pred),
                         smoothing_function=smoother.method1) * 100

    # Store metrics
    perspective_metrics[perspective]["R1"].append(r1)
    perspective_metrics[perspective]["R2"].append(r2)
    perspective_metrics[perspective]["RL"].append(rl)
    perspective_metrics[perspective]["BERTScore"].append(bertscore)
    perspective_metrics[perspective]["METEOR"].append(meteor)
    perspective_metrics[perspective]["BLEU"].append(bleu)

    return {
        "ROUGE-1": r1,
        "ROUGE-2": r2,
        "ROUGE-L": rl,
        "BERTScore": bertscore,
        "METEOR": meteor,
        "BLEU": bleu
    }

# Run inference and evaluation
for item in tqdm(test_data, desc="Evaluating"):
    question = item.get("question", "").strip()
    answers = item.get("answers", [])
    concatenated_answers = " ".join([ans.replace('\n', ' ').strip() for ans in answers])

    labelled_summary_dict = item.get("labelled_summaries", {})
    if not labelled_summary_dict:
        continue

    perspective_key = list(labelled_summary_dict.keys())[0]
    perspective = perspective_key.replace("_SUMMARY", "")
    target_summary = labelled_summary_dict[perspective_key].strip()

    # Prepare input
    task_prefix = (
        f"Adhering to the condition of 'begin summary with' and 'tone of summary' and summarize "
        f"according to {perspective}. Content to summarize: {concatenated_answers} Question: {question}."
    )

    inputs = tokenizer(task_prefix, return_tensors="pt", truncation=True, max_length=1024).to(device)

    # Generate summary
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100, num_beams=5)
    generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Compute metrics
    metrics = compute_metrics(generated_summary, target_summary, perspective)

    # Store results
    all_results.append({
        "question": question,
        "answers": answers,
        "perspective": perspective,
        "generated_summary": generated_summary,
        "target_summary": target_summary,
        "metrics": metrics
    })

# Calculate average metrics per perspective
final_perspective_metrics = {}
for perspective, metrics in perspective_metrics.items():
    if metrics["R1"]:  # Only if we have samples for this perspective
        final_perspective_metrics[perspective] = {
            "R1": np.mean(metrics["R1"]),
            "R2": np.mean(metrics["R2"]),
            "RL": np.mean(metrics["RL"]),
            "BERTScore": np.mean(metrics["BERTScore"]),
            "METEOR": np.mean(metrics["METEOR"]),
            "BLEU": np.mean(metrics["BLEU"])
        }

# Save results
with open("bart_large_all_evaluation_results.json", "w") as f:
    json.dump(all_results, f, indent=2)

with open("bart_large_perspective_wise_metrics.json", "w") as f:
    json.dump(final_perspective_metrics, f, indent=2)

# Print the formatted table
print("\nPERSPECTIVE-WISE METRICS:")
print("{:<12} {:<8} {:<8} {:<8} {:<10} {:<8} {:<8}".format(
    "Perspective", "R1", "R2", "RL", "BERTScore", "METEOR", "BLEU"))
print("-" * 65)
for perspective, metrics in final_perspective_metrics.items():
    print("{:<12} {:<8.2f} {:<8.2f} {:<8.2f} {:<10.3f} {:<8.3f} {:<8.3f}".format(
        perspective,
        metrics["R1"],
        metrics["R2"],
        metrics["RL"],
        metrics["BERTScore"]/100,  # Convert back to 0-1 scale for display
        metrics["METEOR"]/100,
        metrics["BLEU"]/100
    ))

print("\nAll evaluation results saved to all_evaluation_results.json")
print("Perspective-wise metrics saved to perspective_wise_metrics.json")

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Evaluating:   0%|          | 0/640 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   0%|          | 1/640 [00:21<3:47:20, 21.35s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   0%|          | 2/640 [00:24<1:51:35, 10.49s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   0%|          | 3/640 [00:25<1:07:14,


PERSPECTIVE-WISE METRICS:
Perspective  R1       R2       RL       BERTScore  METEOR   BLEU    
-----------------------------------------------------------------
EXPERIENCE   16.95    5.18     15.29    0.853      0.186    0.037   
QUESTION     12.32    2.60     11.48    0.847      0.159    0.021   
INFORMATION  33.20    14.80    30.71    0.885      0.291    0.097   
SUGGESTION   24.56    7.72     22.17    0.876      0.223    0.048   
CAUSE        31.02    12.78    27.98    0.886      0.306    0.088   

All evaluation results saved to all_evaluation_results.json
Perspective-wise metrics saved to perspective_wise_metrics.json





# 2.3 GPT-2 :

In [None]:
import os
import json
import math
import torch
import numpy as np
from tqdm import tqdm
from rouge import Rouge
import warnings
from scipy.spatial.distance import cosine
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import (
    BertTokenizer, BertModel, RobertaTokenizer, RobertaForSequenceClassification,
    GPT2Tokenizer, GPT2LMHeadModel  # Added GPT-2 components
)
from torch.utils.data import Dataset, DataLoader

warnings.filterwarnings("ignore")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### =============================== GPT-2 Setup ===============================
# Load GPT-2 model and tokenizer
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token  # Set pad token

### =============================== Dataset (Unchanged) ===============================
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=1024):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        question = item.get("question", "").strip()
        answers = item.get("answers", [])
        labelled_summary_dict = item.get("labelled_summaries", {})
        
        if not labelled_summary_dict:
            return self.__getitem__((idx + 1) % len(self.data))  # Skip bad sample

        labelled_answer_spans = item.get("labelled_answer_spans", {})
        
        perspective_key = list(labelled_summary_dict.keys())[0]
        perspective = perspective_key.replace("_SUMMARY", "")
        target_text = labelled_summary_dict[perspective_key].strip()

        concatenated_answers = " ".join([ans.replace('\n', ' ').strip() for ans in answers])

        # Definitions and tones for guidance
        start_phrases = {
            "SUGGESTION": ("It is suggested", "Advisory, Recommending",
                           ["Advisory", "Recommending", "Cautioning", "Prescriptive", "Guiding"]),
            "INFORMATION": ("For information purposes", "Informative, Educational",
                            ["Clinical", "Scientific", "Informative", "Educational"]),
            "EXPERIENCE": ("In user's experience", "Personal, Narrative",
                           ["Personal", "Narrative", "Introspective", "Exemplary"]),
            "CAUSE": ("Some of the causes", "Explanatory, Causal",
                      ["Diagnostic", "Explanatory", "Causal", "Due to"]),
            "QUESTION": ("It is inquired", "Seeking Understanding",
                         ["Inquiry", "Rhetorical", "Exploratory Questioning"])
        }

        definitions = {
            "SUGGESTION": "Advice or recommendations to assist users.",
            "INFORMATION": "Knowledge about diseases and facts.",
            "EXPERIENCE": "Individual experiences or insights.",
            "CAUSE": "Reasons responsible for symptoms or conditions.",
            "QUESTION" : "Inquiry made for deeper understanding."
        }

        start_with, tone, _ = start_phrases.get(perspective, ("", "", []))
        definition = definitions.get(perspective, "")

        if len(set(target_text.split()[:5]).intersection(set(start_with.split()))) < 2:
            target_text = f"{start_with} {target_text}"

        task_prefix = (
            f"Adhering to the condition of 'begin summary with' and 'tone of summary' and summarize "
            f"according to {perspective} and start the summary with '{start_with}'. "
            f"Maintain summary tone as {tone}. "
            f"Definition of perspective: {definition}. "
            f"Content to summarize: {concatenated_answers} Question: {question}."
        )

        return {
            "input_text": task_prefix,
            "target_text": target_text,
            "perspective": perspective
        }

### =============================== GPT-2 Helper Function ===============================
def generate_with_gpt2(prompt, max_new_tokens=150):
    try:
        inputs = gpt2_tokenizer(prompt, 
                               return_tensors="pt", 
                               truncation=True, 
                               max_length=1024 - max_new_tokens  # Leave space for generation
                              ).to(device)
        
        # Generate text with GPT-2 using max_new_tokens
        outputs = gpt2_model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_new_tokens,  # This is the key change
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=gpt2_tokenizer.eos_token_id,
            eos_token_id=gpt2_tokenizer.eos_token_id,
            do_sample=True
        )
        
        # Decode and clean up output
        generated_text = gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Remove the input prompt from the generated text
        generated_text = generated_text[len(prompt):].strip()
        return generated_text
    except Exception as e:
        print(f"Error in GPT-2 generation: {e}")
        return ""

### =============================== Embedding & Scoring ===============================
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=5).to(device)

def get_bert_embedding(text):
    inputs = bert_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze()

def Ep(summary):
    inputs = roberta_tokenizer(summary, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        logits = roberta_model(**inputs).logits
        probs = torch.nn.functional.softmax(logits, dim=-1)
    classes = ["EXPERIENCE", "SUGGESTION", "INFORMATION", "CAUSE", "QUESTION"]
    return {cls: probs[0][i].item() for i, cls in enumerate(classes)}

def Es(summary):
    start_phrases = [
        "In user's experience…", "It is suggested", "For information purposes",
        "Some of the causes", "It is inquired"
    ]
    pred = ' '.join(summary.split()[:4])
    rouge = Rouge()
    scores = {}
    for ref in start_phrases:
        score = rouge.get_scores(pred.lower(), ref.lower())[0]["rouge-1"]["f"]
        scores[ref] = score
    return scores

def Et(summary):
    tone_dict = {
        'sugg': ["Advisory", "Recommending", "Cautioning", "Prescriptive"],
        'exp': ["Personal", "Narrative", "Introspective"],
        'info': ["Clinical", "Scientific", "Informative"],
        'cause': ["Diagnostic", "Explanatory", "Causal"],
        'qs': ["Inquiry", "Rhetorical", "Exploratory Questioning"]
    }
    summary_emb = get_bert_embedding(summary)
    sims = {}
    for k, word_list in tone_dict.items():
        phrase_emb = get_bert_embedding(' '.join(word_list))
        sims[k] = 1 - cosine(summary_emb.detach().cpu().numpy(), phrase_emb.detach().cpu().numpy())
    return sims

### =============================== Custom Loss ===============================
def compute_custom_loss(generated_summary, perspective):
    if len(generated_summary) <= 0:
        generated_summary = 'None'

    Ep_dict = Ep(generated_summary)
    Es_dict = Es(generated_summary)
    Et_dict = Et(generated_summary)

    alpha, beta, gamma = 0.7, 0.3, 0.5

    E_X = {
        "EXPERIENCE": alpha * Ep_dict["EXPERIENCE"] + beta * Es_dict["In user's experience…"] + gamma * Et_dict['exp'],
        "SUGGESTION": alpha * Ep_dict["SUGGESTION"] + beta * Es_dict["It is suggested"] + gamma * Et_dict['sugg'],
        "INFORMATION": alpha * Ep_dict["INFORMATION"] + beta * Es_dict["For information purposes"] + gamma * Et_dict['info'],
        "CAUSE": alpha * Ep_dict["CAUSE"] + beta * Es_dict["Some of the causes"] + gamma * Et_dict['cause'],
        "QUESTION": alpha * Ep_dict["QUESTION"] + beta * Es_dict["It is inquired"] + gamma * Et_dict['qs']
    }

    exp_E_X = {k: math.exp(-1 / (v + 1e-6)) for k, v in E_X.items()}
    Z = sum(exp_E_X.values())
    P_X = {k: v / Z for k, v in exp_E_X.items()}

    Y = {k: 0 for k in E_X}
    Y[perspective] = 1

    P_X_tensor = torch.tensor(list(P_X.values())).to(device)
    Y_tensor = torch.tensor(list(Y.values())).to(device)

    return -torch.sum(Y_tensor * torch.log(P_X_tensor + 1e-6))
### =============================== Training Loop Modifications ===============================
def main():
    # Load data
    with open("/kaggle/input/palsma-data/train.json", "r") as f:
        train_data = json.load(f)
    with open("/kaggle/input/palsma-data/valid.json", "r") as f:
        val_data = json.load(f)

    # Create datasets (using BERT tokenizer as before)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    train_dataset = CustomDataset(train_data, tokenizer)
    val_dataset = CustomDataset(val_data, tokenizer)

    num_epochs = 7
    best_val_loss = float("inf")

    for epoch in range(num_epochs):
        print(f"\n======== Epoch {epoch + 1}/{num_epochs} ========")
        epoch_losses = []

        for step in tqdm(range(len(train_dataset))):
            batch = train_dataset[step]
            
            # Generate with GPT-2 instead of GPT-4
            generated_summary = generate_with_gpt2(batch["input_text"])
            
            # Calculate custom loss (unchanged)
            loss_custom = compute_custom_loss(generated_summary, batch["perspective"])
            
            total_loss = loss_custom
            epoch_losses.append(total_loss.item())

            if step % 10 == 0:
                print(f"Step {step} | Loss: {total_loss.item():.4f}")
                # print(f"Generated: {generated_summary[:200]}...")
                # print(f"Target: {batch['target_text'][:200]}...")

        avg_train_loss = np.mean(epoch_losses)
        print(f"Epoch {epoch + 1} Avg Training Loss: {avg_train_loss:.4f}")

        # Validation (modified to use GPT-2)
        val_losses = []
        for val_step in range(min(20, len(val_dataset))):
            batch = val_dataset[val_step]
            generated_summary = generate_with_gpt2(batch["input_text"])
            val_loss = compute_custom_loss(generated_summary, batch["perspective"])
            val_losses.append(val_loss.item())
        
        avg_val_loss = np.mean(val_losses)
        print(f"Validation Loss: {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            print("New best validation loss - saving model...")
            torch.save(gpt2_model.state_dict(), "/kaggle/working/best_gpt2_model.pth")

    print("\nTraining Finished!")

if __name__ == "__main__":
    main()

# 2.3 GPT-2 (Inference ) :

In [7]:
import json
import numpy as np
from tqdm import tqdm
from rouge import Rouge
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
from torch.utils.data import Dataset, DataLoader
nltk.download('wordnet')

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Initialize device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
nltk.download('punkt_tab')

# Load GPT-2 model and tokenizer
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token  # Set pad token


# Load the trained GPT-2 model
gpt2_model.load_state_dict(torch.load("/kaggle/input/gpt-2/transformers/default/1/best_gpt2_model (1).pth"))
gpt2_model.eval()

# Load test data
with open("/kaggle/input/palsma-data/test.json", "r") as f:
    test_data = json.load(f)

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=1024):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        question = item.get("question", "").strip()
        answers = item.get("answers", [])
        labelled_summary_dict = item.get("labelled_summaries", {})

        if not labelled_summary_dict:
            return self.__getitem__((idx + 1) % len(self.data))  # Skip bad sample

        labelled_answer_spans = item.get("labelled_answer_spans", {})

        perspective_key = list(labelled_summary_dict.keys())[0]
        perspective = perspective_key.replace("_SUMMARY", "")
        target_text = labelled_summary_dict[perspective_key].strip()

        concatenated_answers = " ".join([ans.replace('\n', ' ').strip() for ans in answers])

        # Definitions and tones for guidance
        start_phrases = {
            "SUGGESTION": ("It is suggested", "Advisory, Recommending",
                           ["Advisory", "Recommending", "Cautioning", "Prescriptive", "Guiding"]),
            "INFORMATION": ("For information purposes", "Informative, Educational",
                            ["Clinical", "Scientific", "Informative", "Educational"]),
            "EXPERIENCE": ("In user's experience", "Personal, Narrative",
                           ["Personal", "Narrative", "Introspective", "Exemplary"]),
            "CAUSE": ("Some of the causes", "Explanatory, Causal",
                      ["Diagnostic", "Explanatory", "Causal", "Due to"]),
            "QUESTION": ("It is inquired", "Seeking Understanding",
                         ["Inquiry", "Rhetorical", "Exploratory Questioning"])
        }

        definitions = {
            "SUGGESTION": "Advice or recommendations to assist users.",
            "INFORMATION": "Knowledge about diseases and facts.",
            "EXPERIENCE": "Individual experiences or insights.",
            "CAUSE": "Reasons responsible for symptoms or conditions.",
            "QUESTION" : "Inquiry made for deeper understanding."
        }

        start_with, tone, _ = start_phrases.get(perspective, ("", "", []))
        definition = definitions.get(perspective, "")

        if len(set(target_text.split()[:5]).intersection(set(start_with.split()))) < 2:
            target_text = f"{start_with} {target_text}"

        task_prefix = (
            f"Adhering to the condition of 'begin summary with' and 'tone of summary' and summarize "
            f"according to {perspective} and start the summary with '{start_with}'. "
            f"Maintain summary tone as {tone}. "
            f"Definition of perspective: {definition}. "
            f"Content to summarize: {concatenated_answers} Question: {question}."
        )

        return {
            "input_text": task_prefix,
            "target_text": target_text,
            "perspective": perspective
        }

def generate_with_gpt2(prompt, max_new_tokens=150):
    try:
        inputs = gpt2_tokenizer(prompt,
                               return_tensors="pt",
                               truncation=True,
                               max_length=1024 - max_new_tokens  # Leave space for generation
                              ).to(device)  # Ensure tensors are moved to the GPU

        # Generate text with GPT-2 using max_new_tokens
        outputs = gpt2_model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_new_tokens,  # This is the key change
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=gpt2_tokenizer.eos_token_id,
            eos_token_id=gpt2_tokenizer.eos_token_id,
            do_sample=True
        )

        # Decode and clean up output
        generated_text = gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Remove the input prompt from the generated text
        generated_text = generated_text[len(prompt):].strip()
        return generated_text
    except Exception as e:
        print(f"Error in GPT-2 generation: {e}")
        return ""

# Create test dataset
test_dataset = CustomDataset(test_data, gpt2_tokenizer)

# Initialize metrics
rouge = Rouge()
smoother = SmoothingFunction()
metrics = {
    "EXPERIENCE": {"R1": [], "R2": [], "RL": [], "BERTScore": [], "METEOR": [], "BLEU": []},
    "QUESTION": {"R1": [], "R2": [], "RL": [], "BERTScore": [], "METEOR": [], "BLEU": []},
    "INFORMATION": {"R1": [], "R2": [], "RL": [], "BERTScore": [], "METEOR": [], "BLEU": []},
    "SUGGESTION": {"R1": [], "R2": [], "RL": [], "BERTScore": [], "METEOR": [], "BLEU": []},
    "CAUSE": {"R1": [], "R2": [], "RL": [], "BERTScore": [], "METEOR": [], "BLEU": []}
}

all_results = []

for item in tqdm(test_dataset, desc="Evaluating on test set"):
    # Generate summary
    generated_summary = generate_with_gpt2(item["input_text"])
    target_summary = item["target_text"]
    perspective = item["perspective"]

    # Compute ROUGE scores
    try:
        rouge_scores = rouge.get_scores(generated_summary, target_summary)[0]
        metrics[perspective]["R1"].append(rouge_scores["rouge-1"]["f"])
        metrics[perspective]["R2"].append(rouge_scores["rouge-2"]["f"])
        metrics[perspective]["RL"].append(rouge_scores["rouge-l"]["f"])
    except:
        metrics[perspective]["R1"].append(0)
        metrics[perspective]["R2"].append(0)
        metrics[perspective]["RL"].append(0)

    # Compute BERTScore
    P, R, F1 = bert_score([generated_summary], [target_summary], lang="en")
    metrics[perspective]["BERTScore"].append(F1.mean().item())

    # Compute METEOR
    gen_tokens = nltk.word_tokenize(generated_summary.lower())
    tgt_tokens = nltk.word_tokenize(target_summary.lower())
    metrics[perspective]["METEOR"].append(meteor_score([tgt_tokens], gen_tokens))

    # Compute BLEU
    metrics[perspective]["BLEU"].append(
        sentence_bleu([tgt_tokens], gen_tokens, smoothing_function=smoother.method1)
    )

    # Save all results
    all_results.append({
        "input": item["input_text"],
        "generated_summary": generated_summary,
        "target_summary": target_summary,
        "perspective": perspective,
        "rouge-1": rouge_scores["rouge-1"]["f"] if "rouge_scores" in locals() else 0,
        "rouge-2": rouge_scores["rouge-2"]["f"] if "rouge_scores" in locals() else 0,
        "rouge-l": rouge_scores["rouge-l"]["f"] if "rouge_scores" in locals() else 0,
        "bert_score": F1.mean().item(),
        "meteor": meteor_score([tgt_tokens], gen_tokens),
        "bleu": sentence_bleu([tgt_tokens], gen_tokens, smoothing_function=smoother.method1)
    })

# Calculate average metrics per perspective
perspective_metrics = {}
for perspective in metrics:
    perspective_metrics[perspective] = {
        "R1": np.mean(metrics[perspective]["R1"]) * 100,
        "R2": np.mean(metrics[perspective]["R2"]) * 100,
        "RL": np.mean(metrics[perspective]["RL"]) * 100,
        "BERTScore": np.mean(metrics[perspective]["BERTScore"]),
        "METEOR": np.mean(metrics[perspective]["METEOR"]),
        "BLEU": np.mean(metrics[perspective]["BLEU"])
    }

# Print results in table format
print("\nPERSPECTIVE-WISE METRICS:")
print("Perspective  R1       R2       RL       BERTScore  METEOR   BLEU    ")
print("-----------------------------------------------------------------")
for perspective in perspective_metrics:
    print(f"{perspective.ljust(11)} "
          f"{perspective_metrics[perspective]['R1']:.2f}    "
          f"{perspective_metrics[perspective]['R2']:.2f}    "
          f"{perspective_metrics[perspective]['RL']:.2f}    "
          f"{perspective_metrics[perspective]['BERTScore']:.3f}     "
          f"{perspective_metrics[perspective]['METEOR']:.3f}   "
          f"{perspective_metrics[perspective]['BLEU']:.3f}")

# Save all results
with open("gpt2_all_evaluation_results.json", "w") as f:
    json.dump(all_results, f, indent=2)

with open("gpt2_perspective_wise_metrics.json", "w") as f:
    json.dump(perspective_metrics, f, indent=2)

print("\nAll evaluation results saved to all_evaluation_results.json")
print("Perspective-wise metrics saved to perspective_wise_metrics.json")


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  gpt2_model.load_state_dict(torch.load("/kaggle/input/gpt-2/transformers/default/1/best_gpt2_model (1).pth"))
Evaluating on test set:   0%|          | 0/640 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating on test set:   0%|          | 1/640 [00:02<28:38,  2.69s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating on test set:   0%|          | 2/640 [00:04<21:34,  2.03s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.den


PERSPECTIVE-WISE METRICS:
Perspective  R1       R2       RL       BERTScore  METEOR   BLEU    
-----------------------------------------------------------------
EXPERIENCE  3.77    0.00    3.18    0.795     0.081   0.002
QUESTION    2.77    0.09    2.64    0.817     0.091   0.003
INFORMATION 4.73    0.10    4.27    0.795     0.083   0.003
SUGGESTION  4.87    0.13    4.47    0.812     0.085   0.003
CAUSE       3.89    0.22    3.50    0.810     0.111   0.003

All evaluation results saved to all_evaluation_results.json
Perspective-wise metrics saved to perspective_wise_metrics.json





# 2.4 Flan T5 Base :

In [None]:
import os
import json
import math
import torch
from tqdm import tqdm
import numpy as np
from scipy.spatial.distance import cosine
from rouge import Rouge
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoConfig,  # Add this line
    BertTokenizer,
    BertModel,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from peft import get_peft_config, get_peft_model, PrefixTuningConfig, TaskType
import warnings
warnings.filterwarnings("ignore")

# Set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = []
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.perspective_map = {
            "SUGGESTION": {"prefix": "It is suggested"},
            "INFORMATION": {"prefix": "For information purposes"},
            "EXPERIENCE": {"prefix": "In user's experience"},
            "CAUSE": {"prefix": "Some of the causes"},
            "QUESTION": {"prefix": "It is inquired"}
        }
        
        # Filter out invalid entries during initialization
        for item in data:
            if 'labelled_summaries' in item and item['labelled_summaries']:
                self.data.append(item)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        answers = ' '.join(item['answers'])
        question = item['question']
        
        # Get first available perspective
        perspective_keys = list(item['labelled_summaries'].keys())
        if not perspective_keys:
            raise ValueError(f"No labelled summaries found for item {idx}")
            
        perspective = perspective_keys[0].split('_')[0]
        summary_key = f"{perspective}_SUMMARY"
        
        if summary_key not in item['labelled_summaries']:
            raise ValueError(f"Missing summary for perspective {perspective} in item {idx}")
            
        summary = item['labelled_summaries'][summary_key]
        
        # Create input and target text
        input_text = f"Summarize as {perspective}: {question} {answers}"
        target_text = f"{self.perspective_map[perspective]['prefix']} {summary}"
        
        # Tokenize
        inputs = self.tokenizer(
            input_text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        
        labels = self.tokenizer(
            target_text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors="pt"
        )
        
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': labels['input_ids'].squeeze(),
            'perspective': perspective
        }


def create_dataloaders(train_data, valid_data, tokenizer, train_batch_size=8, valid_batch_size=4):
    train_dataset = CustomDataset(train_data, tokenizer)
    valid_dataset = CustomDataset(valid_data, tokenizer)
    
    return (
        DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True),
        DataLoader(valid_dataset, batch_size=valid_batch_size, shuffle=False)
    )

def initialize_models():
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
    
    roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    roberta_model = RobertaForSequenceClassification.from_pretrained(
        'roberta-base', num_labels=5
    ).to(device)
    
    # Load classifier checkpoint if available
    ckpt_path = "./classifier/checkpoint_classifier"
    if os.path.exists(ckpt_path):
        roberta_model.load_state_dict(torch.load(ckpt_path)['model_state_dict'])
    
    return bert_tokenizer, bert_model, roberta_tokenizer, roberta_model


def train_model(
    train_data,
    valid_data,
    model_name="google/flan-t5-small",
    train_batch_size=2,
    valid_batch_size=1,
    learning_rate=3e-5,
    num_epochs=10,
    gradient_accumulation_steps=4
):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name)

    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model.to(device)
    
    # Data loaders
    train_dataset = CustomDataset(train_data, tokenizer)
    valid_dataset = CustomDataset(valid_data, tokenizer)
    
    train_loader = DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        shuffle=True,
        drop_last=True
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=valid_batch_size,
        shuffle=False
    )
    
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        for step, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}")):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
                use_cache=False
            )
            
            loss = outputs.loss / gradient_accumulation_steps
            loss.backward()
            
            if (step + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                optimizer.zero_grad()
            
            total_loss += loss.item() * gradient_accumulation_steps
            
            if step % 50 == 0:
                torch.cuda.empty_cache()
        
        # Validation
        model.eval()
        val_loss = 0
        for batch in tqdm(valid_loader, desc="Validating"):
            with torch.no_grad():
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                    use_cache=False
                )
                val_loss += outputs.loss.item()
        
        print(f"Epoch {epoch+1} | Train Loss: {total_loss/len(train_loader):.4f} | Val Loss: {val_loss/len(valid_loader):.4f}")
    
    return model



# Example usage
if __name__ == "__main__":
    # Load your data
    with open('/kaggle/input/pu-ma/train.json') as f:
        train_data = json.load(f)
    with open('/kaggle/input/pu-ma/valid.json') as f:
        valid_data = json.load(f)
    
    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
    
    # Check dataset initialization
    try:
        train_dataset = CustomDataset(train_data, tokenizer)
        valid_dataset = CustomDataset(valid_data, tokenizer)
        print(f"Loaded {len(train_dataset)} training samples and {len(valid_dataset)} validation samples")
    except Exception as e:
        print(f"Error initializing datasets: {e}")
        exit()
    
    # Train and save model
    trained_model = train_model(
        train_data=train_data,  # The dataset will use the filtered data
        valid_data=valid_data,
        model_name="google/flan-t5-base",  # Using small variant for memory
        num_epochs=10,
        train_batch_size=2,
        valid_batch_size=1
    )
    
    # Save the model
    trained_model.save_pretrained("trained_model")
    tokenizer.save_pretrained("trained_model")

# 2.4 Flan T5 Base : (Inference)

In [8]:
import json
import numpy as np
from tqdm import tqdm
from rouge import Rouge
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

class Evaluator:
    def __init__(self, model_path, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
        self.rouge = Rouge()
        self.smoothie = SmoothingFunction().method4
        
        self.perspective_map = {
            "SUGGESTION": {"prefix": "It is suggested"},
            "INFORMATION": {"prefix": "For information purposes"},
            "EXPERIENCE": {"prefix": "In user's experience"},
            "CAUSE": {"prefix": "Some of the causes"},
            "QUESTION": {"prefix": "It is inquired"}
        }
    
    def generate_summary(self, input_text, perspective):
        """Generate summary for given input text and perspective"""
        full_input = f"Summarize as {perspective}: {input_text}"
        inputs = self.tokenizer(
            full_input, 
            return_tensors="pt", 
            max_length=512, 
            truncation=True
        ).to(self.device)
        
        outputs = self.model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=150,
            num_beams=4,
            early_stopping=True
        )
        
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    def evaluate_sample(self, question, answers, labelled_summaries):
        """Evaluate a single sample against all available perspectives"""
        input_text = f"{question} {' '.join(answers)}"
        results = {}
        
        for key in labelled_summaries.keys():
            perspective = key.split('_')[0]
            reference = labelled_summaries[key]
            
            # Generate summary
            prediction = self.generate_summary(input_text, perspective)
            
            # Compute metrics
            rouge_scores = self.rouge.get_scores(prediction, reference)[0]
            bert_scores = bert_score([prediction], [reference], lang='en')
            meteor = meteor_score([reference.split()], prediction.split())
            
            # Compute BLEU with smoothing
            bleu = sentence_bleu(
                [reference.split()], 
                prediction.split(),
                smoothing_function=self.smoothie
            )
            
            results[perspective] = {
                'prediction': prediction,
                'reference': reference,
                'rouge-1': rouge_scores['rouge-1']['f'],
                'rouge-2': rouge_scores['rouge-2']['f'],
                'rouge-l': rouge_scores['rouge-l']['f'],
                'bertscore': bert_scores[0].mean().item(),
                'meteor': meteor,
                'bleu': bleu
            }
        
        return results
    
    def evaluate_dataset(self, dataset):
        """Evaluate entire dataset"""
        all_results = []
        perspective_metrics = {
            p: {'rouge-1': [], 'rouge-2': [], 'rouge-l': [], 
                'bertscore': [], 'meteor': [], 'bleu': []}
            for p in self.perspective_map
        }
        
        for item in tqdm(dataset, desc="Evaluating"):
            if 'labelled_summaries' not in item or not item['labelled_summaries']:
                continue
                
            results = self.evaluate_sample(
                item['question'],
                item['answers'],
                item['labelled_summaries']
            )
            all_results.append(results)
            
            # Aggregate metrics by perspective
            for perspective, metrics in results.items():
                for metric, value in metrics.items():
                    if metric in perspective_metrics[perspective]:
                        perspective_metrics[perspective][metric].append(value)
        
        # Compute averages
        perspective_wise_metrics = {}
        for perspective, metrics in perspective_metrics.items():
            if not metrics['rouge-1']:  # Skip if no samples for this perspective
                continue
                
            perspective_wise_metrics[perspective] = {
                'R1': np.mean(metrics['rouge-1']),
                'R2': np.mean(metrics['rouge-2']),
                'RL': np.mean(metrics['rouge-l']),
                'BERTScore': np.mean(metrics['bertscore']),
                'METEOR': np.mean(metrics['meteor']),
                'BLEU': np.mean(metrics['bleu'])
            }
        
        return all_results, perspective_wise_metrics
    
    def print_metrics_table(self, perspective_metrics):
        """Print the metrics in the requested table format"""
        print("\nPERSPECTIVE-WISE METRICS:")
        print("Perspective  R1       R2       RL       BERTScore  METEOR   BLEU    ")
        print("-----------------------------------------------------------------")
        
        for perspective, metrics in perspective_metrics.items():
            print(f"{perspective.ljust(11)} "
                  f"{metrics['R1']:.2f}    "
                  f"{metrics['R2']:.2f}    "
                  f"{metrics['RL']:.2f}    "
                  f"{metrics['BERTScore']:.3f}    "
                  f"{metrics['METEOR']:.3f}    "
                  f"{metrics['BLEU']:.3f}")

# Example usage
if __name__ == "__main__":
    # Load your validation data
    with open('/kaggle/input/palsma-data/test.json') as f:
        valid_data = json.load(f)
    
    # Initialize evaluator with your trained model
    evaluator = Evaluator("/kaggle/input/flan_t5_model/transformers/default/1")
    
    # Run evaluation
    all_results, perspective_metrics = evaluator.evaluate_dataset(valid_data)
    
    # Print results
    evaluator.print_metrics_table(perspective_metrics)
    
    # Save results
    with open('flant5_base_all_evaluation_results.json', 'w') as f:
        json.dump(all_results, f, indent=2)
    
    with open('flant5_base_perspective_wise_metrics.json', 'w') as f:
        json.dump(perspective_metrics, f, indent=2)
    
    print("\nAll evaluation results saved to all_evaluation_results.json")
    print("Perspective-wise metrics saved to perspective_wise_metrics.json")

Evaluating:   0%|          | 0/640 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   0%|          | 1/640 [00:04<45:00,  4.23s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   0%|          | 2/640 [00:06<30:18,  2.85s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Som


PERSPECTIVE-WISE METRICS:
Perspective  R1       R2       RL       BERTScore  METEOR   BLEU    
-----------------------------------------------------------------
SUGGESTION  0.32    0.15    0.30    0.875    0.239    0.079
INFORMATION 0.36    0.17    0.34    0.885    0.304    0.100
EXPERIENCE  0.35    0.18    0.33    0.882    0.293    0.104
CAUSE       0.32    0.14    0.28    0.879    0.298    0.086
QUESTION    0.44    0.28    0.42    0.894    0.382    0.172

All evaluation results saved to all_evaluation_results.json
Perspective-wise metrics saved to perspective_wise_metrics.json



