In [None]:
!pip install transformers datasets torch

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
import json
from torch.utils.data import Dataset

# Prompt metadata for each supported perspective. Every entry supplies the
# definition, the phrase the summary must start with, and the tone; all three
# are interpolated into the prompt built by PerspectiveSummarizationDataset.
PERSPECTIVE_METADATA = {
    "INFORMATION": {
        "definition": "Defined as knowledge about diseases, disorders, and health-related facts, providing insights into symptoms and diagnosis.",
        "start_phrase": "For information purposes.",
        "tone": "Informative, Educational"
    },
    "CAUSE": {
        "definition": "Defined as reasons responsible for the occurrence of a particular medical condition, symptom, or disease.",
        "start_phrase": "Some of the causes",
        "tone": "Explanatory, Causal"
    },
    "SUGGESTION": {
        "definition": "Defined as advice or recommendations to assist users in making informed medical decisions, solving problems, or improving health issues.",
        "start_phrase": "It is suggested",
        "tone": "Advisory, Recommending"
    },
    "EXPERIENCE": {
        "definition": "Defined as individual experiences, anecdotes, or firsthand insights related to health, medical treatments, medication usage, and coping strategies.",
        "start_phrase": "In user’s experience",
        "tone": "Personal, Narrative"
    },
    "QUESTION": {
        "definition": "Defined as inquiry made for deeper understanding.",
        "start_phrase": "It is inquired.",
        "tone": "Seeking Understanding"
    }
}


class PerspectiveSummarizationDataset(Dataset):
    """Prompt/summary pairs for perspective-aware answer summarization.

    The JSON file must contain a list of records. Each record may provide
    ``question``, ``answers``, ``labelled_answer_spans`` (keyed by perspective
    name) and ``labelled_summaries`` (keyed by ``"<PERSPECTIVE>_SUMMARY"``).
    One sample is produced per record and perspective, provided the
    perspective is listed in ``PERSPECTIVE_METADATA`` and has a non-empty
    summary.

    Attributes:
        data: The parsed JSON content.
        samples: List of ``{"input": prompt, "target": summary}`` dicts.
    """

    def __init__(self, json_file):
        """Load ``json_file`` and build every prompt/summary sample eagerly.

        Args:
            json_file: Path to the JSON file holding the list of records.
        """
        # Decode explicitly as UTF-8 so that non-ASCII text loads identically
        # regardless of the platform's default locale encoding.
        with open(json_file, 'r', encoding='utf-8') as f:
            self.data = json.load(f)

        self.samples = []
        self._prepare_data()

    def _prepare_data(self):
        """Fill ``self.samples`` with one prompt/summary pair per usable perspective."""
        for sample in self.data:
            # `or` fallbacks also cover keys that are present but null in the
            # JSON; dict.get's default only applies when the key is missing.
            question = sample.get("question") or ""
            answers = sample.get("answers") or []
            labelled_spans = sample.get("labelled_answer_spans") or {}
            labelled_summaries = sample.get("labelled_summaries") or {}

            # Number each answer (starting at 0), wrap it in double quotes and
            # put one answer per line; the answer text itself is left untouched.
            content_to_summarize = "\n".join(
                f"{i}) \"{answer}\"" for i, answer in enumerate(answers)
            )

            # Only the perspective names are needed; the span values are not
            # part of the prompt.
            for perspective in labelled_spans:
                perspective_key = perspective.upper()
                meta = PERSPECTIVE_METADATA.get(perspective_key)
                if meta is None:
                    continue  # perspective without prompt metadata

                summary = labelled_summaries.get(f"{perspective_key}_SUMMARY", "")
                if not summary:
                    continue  # nothing to learn from without a reference summary

                prompt = (
                    f"Summarize the following content according to Perspective: {perspective_key};\n"
                    f"{perspective_key} Definition: {meta['definition']};\n"
                    f"Begin Summary with: {meta['start_phrase']};\n"
                    f"Tone of summary: {meta['tone']}\n"
                    f"Content to summarize: {content_to_summarize};\n"
                    f"Associated question: {question}"
                )
                self.samples.append({
                    "input": prompt,
                    "target": summary
                })

    def __len__(self):
        """Return the number of prompt/summary samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the ``{"input", "target"}`` dict stored at position ``idx``."""
        return self.samples[idx]

In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset
from torch.utils.data import DataLoader
import json

In [None]:
# Fetch the pretrained summarization checkpoint from Hugging Face; the tokenizer
# and the seq2seq model are both loaded from the same checkpoint name.
BART_CHECKPOINT = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(BART_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(BART_CHECKPOINT)

In [7]:
# Tokenization function
def tokenize_function(examples, max_input_length=512, max_target_length=512):
    """Tokenize prompts and reference summaries for seq2seq fine-tuning.

    Uses the notebook-level ``tokenizer``. Prompts and targets are truncated
    and padded to a fixed length, and the target token ids become ``labels``.
    Padding positions in the labels are replaced with -100, the index the
    Hugging Face seq2seq loss ignores; otherwise most of the loss would be
    spent on predicting padding.

    Args:
        examples: Mapping with ``'input'`` and ``'target'`` entries, either a
            batch (lists of strings, as passed by ``Dataset.map(batched=True)``)
            or a single example (plain strings).
        max_input_length: Fixed length the prompts are padded/truncated to.
        max_target_length: Fixed length the summaries are padded/truncated to.

    Returns:
        The tokenizer output for the prompts with an added ``'labels'`` entry.
    """
    inputs = tokenizer(examples['input'], truncation=True, padding="max_length", max_length=max_input_length)
    targets = tokenizer(examples['target'], truncation=True, padding="max_length", max_length=max_target_length)

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # Swap pad ids for the loss ignore-index, keep everything else.
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    target_ids = targets['input_ids']
    if target_ids and isinstance(target_ids[0], int):
        # Single (unbatched) example: one flat list of token ids.
        inputs['labels'] = _mask_padding(target_ids)
    else:
        # Batched call: one list of token ids per example.
        inputs['labels'] = [_mask_padding(ids) for ids in target_ids]
    return inputs


In [None]:
# Build the prompt/summary pairs from the training JSON.
dataset = PerspectiveSummarizationDataset("/content/train.json")  # Update path accordingly

# Pivot the list of sample dicts into column lists, wrap them in a `datasets.Dataset`
# (the name `Dataset` was rebound by the import cell above), and tokenize in batches.
tokenized_dataset = Dataset.from_dict(
    {
        column: [record[column] for record in dataset.samples]
        for column in ("input", "target")
    }
).map(tokenize_function, batched=True)

# Sequential 90/10 split: the first 90% of rows train, the remainder validates.
# Rows are not shuffled, so the split follows the order of the JSON file.
train_size = int(len(tokenized_dataset) * 0.9)
train_dataset = tokenized_dataset.select(range(0, train_size))
eval_dataset = tokenized_dataset.select(range(train_size, len(tokenized_dataset)))

In [9]:
# Google Drive locations for the training results and the training logs.
output_dir='/content/drive/MyDrive/IIITD/NLP_PROJ/BART-cnn-large/results'
logging_dir='/content/drive/MyDrive/IIITD/NLP_PROJ/BART-cnn-large/logs'

In [None]:
# Set up the training arguments. The output and logging locations come from the
# `output_dir` / `logging_dir` variables defined in the previous cell, so the
# Drive paths are maintained in one place only.
training_args = TrainingArguments(
    output_dir=output_dir,              # output directory for the model
    num_train_epochs=3,                 # number of training epochs
    per_device_train_batch_size=8,      # batch size for training
    per_device_eval_batch_size=8,       # batch size for evaluation
    warmup_steps=500,                   # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                  # strength of weight decay
    logging_dir=logging_dir,            # directory for storing logs
    logging_steps=10,                   # log every 10 steps
    evaluation_strategy="steps",        # evaluate every `logging_steps`
    save_steps=500,                     # save checkpoint every 500 steps
    save_total_limit=2,                 # save only the last 2 checkpoints
    learning_rate=5e-5,
    fp16=True
)

# TODO(next run): retry with learning_rate=2e-5, output_dir='./results' and
# logging_dir='./logs'; every other argument stays as above.

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

# Start the training process
trainer.train()

# Save the trained model
# trainer.save_model()

# Evaluate the model on the validation dataset (last expression, so the metrics are displayed)
trainer.evaluate(eval_dataset)


In [25]:
# Save the trainer's model to the relative directory "bart_large_cnn_save"
# (the directory that the final cell of this notebook zips from /content).
trainer.save_model("bart_large_cnn_save")

## INFERENCE

In [18]:
# Build prompt/summary pairs from the validation JSON. Samples are raw text dicts
# ({"input", "target"}); nothing is tokenized at this point.
val_dataset = PerspectiveSummarizationDataset("/content/valid.json")

In [None]:
# Sanity check: display the first validation sample (its prompt and reference summary).
val_dataset[0]

In [20]:
# Minimal arguments for the inference-time Trainer: point at the training results
# directory and evaluate in batches of 8. This rebinds `training_args` from the training cell.
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/IIITD/NLP_PROJ/BART-cnn-large/results/',
    per_device_eval_batch_size=8,
    # logging_dir='./logs'
)

In [None]:
# Reload the tokenizer and model from disk; this rebinds the notebook-level
# `tokenizer` and `model` used by the cells below.
# NOTE(review): the visible save cell writes to "bart_large_cnn_save", not to this
# directory -- confirm this path really contains a saved model and tokenizer.
model_path = "/content/drive/MyDrive/IIITD/NLP_PROJ/BART-cnn-large/results"  # Path to your saved model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [None]:
# Reinitialize the Trainer with the reloaded model and the inference-time arguments.
# NOTE(review): val_dataset yields raw prompt/summary strings rather than token ids --
# confirm that is intended before calling trainer.evaluate(); none of the visible
# later cells use this trainer (generation below calls model.generate directly).
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

In [None]:
## Final evaluation: generate a summary for every validation sample and save the
## predictions next to the reference summaries in a CSV file. A later cell reads
## that CSV back and computes the BERTScore and BLEU metrics.

import csv
model.eval()

# CSV file that stores predictions and actual targets
csv_file = "predictions_vs_actual.csv"
csv_columns = ["Predicted", "Actual"]

# newline='' hands newline handling to the csv module, as its documentation requires
with open(csv_file, mode="w", newline='', encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=csv_columns)
    writer.writeheader()

    for i in range(len(val_dataset)):
        sample = val_dataset[i]  # look the sample up once per iteration
        input_text = sample["input"]
        actual_target = sample["target"]

        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        # The tokenizer returns CPU tensors, while the Trainer built above may have
        # moved the model to an accelerator; keep both on the same device.
        inputs = inputs.to(model.device)

        # NOTE: sampling is enabled, so generated summaries (and the metrics computed
        # from them) vary from run to run unless a seed is fixed beforehand.
        output = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],  # pass the mask explicitly instead of letting generate infer it
            max_length=100,          # Adjust max length as needed
            num_return_sequences=1,  # Number of sequences to generate
            do_sample=True,          # Enable sampling (for creative output)
            top_k=50,                # Top-k sampling
            top_p=0.95,              # Nucleus sampling
            temperature=0.7          # Randomness control
        )

        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        print(f"{i} summaries generated: {generated_text}")
        writer.writerow({"Predicted": generated_text, "Actual": actual_target})

print(f"Predictions saved to {csv_file}.")

In [None]:
# !pip install bert-score nltk

In [None]:
import csv
import bert_score
from nltk.translate.bleu_score import corpus_bleu

# `csv_file` is defined by the generation cell above; uncomment the next line to
# score an existing CSV without re-running generation.
# csv_file = "predictions_vs_actual.csv"
predictions = []
references = []

# Read the CSV file. newline='' lets the csv module do its own newline handling, so
# line breaks embedded in quoted summaries are read back exactly as they were written.
with open(csv_file, mode="r", newline='', encoding="utf-8") as file:
    reader = csv.DictReader(file)
    for row in reader:
        predictions.append(row["Predicted"])
        references.append(row["Actual"])

def compute_bertscore(predictions, references):
    """Return the mean BERTScore precision, recall and F1 as plain floats.

    Args:
        predictions: Generated summaries.
        references: Reference summaries, in the same order as ``predictions``.

    Returns:
        Dict with the keys ``"Precision"``, ``"Recall"`` and ``"F1"``.
    """
    precision, recall, f1 = bert_score.score(predictions, references, lang="en")
    labelled_scores = (("Precision", precision), ("Recall", recall), ("F1", f1))
    # Average each score over all pairs and convert it to a Python float.
    return {label: values.mean().item() for label, values in labelled_scores}

def compute_bleu_score(predictions, references):
    """Return the corpus-level BLEU score of ``predictions`` against ``references``.

    Both sides are tokenized by whitespace. Every prediction is scored against
    exactly one reference, wrapped in a one-element list because ``corpus_bleu``
    takes a list of references per hypothesis.
    """
    reference_sets = []
    for ref in references:
        reference_sets.append([ref.split()])

    hypotheses = []
    for pred in predictions:
        hypotheses.append(pred.split())

    return corpus_bleu(reference_sets, hypotheses)

# BERTScore: mean precision / recall / F1 over all prediction-reference pairs
bert_scores = compute_bertscore(predictions, references)
print(
    "BERT Score: "
    f"Precision={bert_scores['Precision']:.4f}, "
    f"Recall={bert_scores['Recall']:.4f}, "
    f"F1={bert_scores['F1']:.4f}"
)

# BLEU: corpus-level score on whitespace tokens
bleu_score = compute_bleu_score(predictions, references)
print(
    f"BLEU Score: {bleu_score:.4f}"
)


In [None]:
# Archive the saved model directory into zipped_model.zip.
# NOTE(review): assumes "bart_large_cnn_save" was written with /content as the working directory -- confirm.
!zip -r zipped_model.zip /content/bart_large_cnn_save