## ***Notebook 2: BART Fine-tuning & Ablation Study***

In [None]:
# Mount Google Drive at /content/drive so that paths under
# /content/drive/MyDrive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#importing libs
import pandas as pd
import torch
from datasets import Dataset
from transformers import BartForConditionalGeneration,BartTokenizer,Seq2SeqTrainingArguments,Seq2SeqTrainer,DataCollatorForSeq2Seq
import os
import numpy as np

In [None]:
# Load the train/validation/test CSV splits (the error message below points
# to Notebook 01 as their producer) and drop unusable rows.
#
# Only a missing file is reported as "data files not found". Any other failure
# (parse error, missing column, ...) propagates with its own traceback instead
# of being mislabelled by a catch-all handler.
try:
    train_df = pd.read_csv('/content/drive/MyDrive/MScDissertation-Aman/FinalDataset/trainFinal.csv')
    valid_df = pd.read_csv('/content/drive/MyDrive/MScDissertation-Aman/FinalDataset/validFinal.csv')
    test_df = pd.read_csv('/content/drive/MyDrive/MScDissertation-Aman/FinalDataset/testFinal.csv')
except FileNotFoundError:
    print("Error :- data files not found. please run Notebook 01 to genrate the these files ")
    raise  # bare raise keeps the original traceback intact

for df in (train_df, valid_df, test_df):
    # Remove rows that contain any missing value.
    df.dropna(inplace=True)
    # Remove rows whose dialogue or reference summary is an empty string.
    empty_rows = df[(df.dialogue == '') | (df.target_summary == '')].index
    df.drop(empty_rows, inplace=True)

print("Data loaded and cleaned successfully.")

In [None]:
#converting the pandas DF into huggingface dataset
# Convert each pandas split into a Hugging Face `Dataset`, in
# train / validation / test order.
train_data, valid_data, test_data = (
    Dataset.from_pandas(split) for split in (train_df, valid_df, test_df)
)

# Report how many examples each split contains.
print(f"Data loaded :- \n tarin size:- {len(train_data)} \n valid size:- {len(valid_data)} \n test size:- {len(test_data)}")

### ***Fine-tuning setup***

In [None]:
# Pretrained checkpoint that every experiment starts from; the tokenizer is
# loaded from the same checkpoint and shared by the preprocessing function.
modelCheckpoint = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(modelCheckpoint)

def preprcsFun(examples, include_emotion=True):
    """Tokenize examples into model inputs and loss-ready labels.

    Args:
        examples: Mapping with a ``target_summary`` entry plus ``input_text``
            and/or ``dialogue``. Normally a batch supplied by
            ``Dataset.map(..., batched=True)``; a single example also works.
        include_emotion: When True, ``input_text`` (which includes the emotion
            label) is used as the model input; otherwise the plain
            ``dialogue`` text is used.

    Returns:
        The tokenizer output for the inputs (truncated/padded to 1024 tokens)
        with an added ``labels`` entry holding the target token ids
        (truncated/padded to 128 tokens). Padding positions in ``labels`` are
        set to -100 so that the loss ignores them.
    """
    source_column = "input_text" if include_emotion else "dialogue"
    inputs = examples[source_column]
    targets = examples["target_summary"]

    # Tokenize the source text.
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    # Tokenize the reference summaries.
    labels = tokenizer(text_target=targets, max_length=128, truncation=True, padding="max_length")

    # Because the labels are padded here (not by the data collator), the pad
    # ids must be replaced with -100, the index the cross-entropy loss skips.
    # Left as pad ids, every padding position would count towards the loss.
    pad_id = tokenizer.pad_token_id

    def _mask_padding(ids):
        return [-100 if tok == pad_id else tok for tok in ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        model_inputs["labels"] = [_mask_padding(ids) for ids in label_ids]
    else:
        # Single example: a flat list of ids.
        model_inputs["labels"] = _mask_padding(label_ids)
    return model_inputs

In [None]:
#reusable function that ensures both experiments run under identical conditions
def bart_finetune_model(experiment_name, include_emotion_in_input):
    """Fine-tune a freshly loaded pretrained BART model for one experiment.

    Tokenizes the training and validation splits with ``preprcsFun``, trains
    for three epochs with evaluation and checkpointing after every epoch, and
    finally writes the model and the tokenizer to
    ``<experiment dir>/final_model``.

    Args:
        experiment_name: Name of the sub-directory (inside the shared results
            directory) that receives checkpoints, logs and the final model.
        include_emotion_in_input: Forwarded to ``preprcsFun`` as
            ``include_emotion``.

    Returns:
        Path of the directory holding the saved model and tokenizer.
    """
    print(f"\n{'='*20} STARTING EXPERIMENT: {experiment_name} {'='*20}")

    def _tokenize(batch):
        # Bind the experiment's input choice once for both splits.
        return preprcsFun(batch, include_emotion=include_emotion_in_input)

    # Tokenize the training split first, then the validation split.
    print(f"Tokenizing datasets for '{experiment_name}'..")
    tokenized_train = train_data.map(_tokenize, batched=True)
    tokenized_valid = valid_data.map(_tokenize, batched=True)

    # Every experiment starts from the same pretrained checkpoint.
    model = BartForConditionalGeneration.from_pretrained(modelCheckpoint)

    # All artefacts of this experiment live under <results root>/<name>.
    results_root = "/content/drive/MyDrive/MScDissertation-Aman/models_and_results"
    experiment_dir = os.path.join(results_root, experiment_name)

    training_args = Seq2SeqTrainingArguments(
        output_dir=experiment_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f"{experiment_dir}/logs",
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        predict_with_generate=True,
        # Mixed precision only when a CUDA device is present.
        fp16=torch.cuda.is_available(),
        save_total_limit=1,
    )

    collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=collator,
    )

    print("Starting model fine-tuning...")
    trainer.train()
    print("Training complete.")

    # Save the tokenizer next to the model so the directory can be reloaded
    # on its own later.
    final_model_dir = os.path.join(experiment_dir, "final_model")
    trainer.save_model(final_model_dir)
    tokenizer.save_pretrained(final_model_dir)

    print(f"Final model for '{experiment_name}' saved to '{final_model_dir}'")
    return final_model_dir

In [None]:
def generate_summaries(model_path, experiment_name, include_emotion_in_input, max_summary_length=128):
    """Generate test-set summaries with a fine-tuned model and save them to CSV.

    Args:
        model_path: Directory containing the saved model and tokenizer.
        experiment_name: Used for the trainer output directory and as the
            prefix of the results file (``<experiment_name>_summaries.csv``).
        include_emotion_in_input: Forwarded to ``preprcsFun`` as
            ``include_emotion`` when tokenizing the test split.
        max_summary_length: Maximum number of tokens to generate per summary.
            Defaults to 128, the length the target summaries are truncated to
            in ``preprcsFun``.

    Returns:
        None. The ``conv_id``, ``target_summary`` and ``generated_summary``
        columns are written to a CSV file in the shared results directory.
    """
    print(f"\nGenerating summaries for experiment: {experiment_name}...")

    print(f"Loading model from: {model_path}")
    model = BartForConditionalGeneration.from_pretrained(model_path)
    tokenizer = BartTokenizer.from_pretrained(model_path)

    rslt_dir = "/content/drive/MyDrive/MScDissertation-Aman/models_and_results"
    op_dir = os.path.join(rslt_dir, experiment_name)

    # Without an explicit generation_max_length the trainer falls back to the
    # model's generation config, whose default length limit is typically far
    # shorter than the 128-token targets and silently truncates the summaries.
    prediction_args = Seq2SeqTrainingArguments(
        output_dir=op_dir,
        per_device_eval_batch_size=4,
        predict_with_generate=True,
        generation_max_length=max_summary_length,
        fp16=torch.cuda.is_available(),
    )

    trainer = Seq2SeqTrainer(model=model, args=prediction_args, tokenizer=tokenizer)

    # NOTE(review): preprcsFun tokenizes with the module-level `tokenizer`, not
    # the one loaded above; both are expected to share a vocabulary - confirm.
    print("Tokenizing test data...")
    tokenized_test = test_data.map(
        lambda x: preprcsFun(x, include_emotion=include_emotion_in_input),
        batched=True,
    )

    print("Generating predictions...")
    predictions = trainer.predict(tokenized_test)

    pred_ids = predictions.predictions
    if isinstance(pred_ids, tuple):
        # Some models return (sequences, extras); keep the sequences only.
        pred_ids = pred_ids[0]
    pred_ids = np.asarray(pred_ids)

    if pred_ids.ndim == 3:
        # A 3-D array means logits were returned; reduce them to token ids.
        pred_ids = pred_ids.argmax(-1)

    # Ids outside the tokenizer's range (e.g. negative fill values) cannot be
    # decoded, so map them to the pad token. len(tokenizer) also counts added
    # tokens, which `vocab_size` leaves out. np.where builds a new array, so
    # the trainer's prediction output is not modified in place.
    bad_mask = (pred_ids < 0) | (pred_ids >= len(tokenizer))
    pred_ids = np.where(bad_mask, tokenizer.pad_token_id, pred_ids)

    decoded_preds = tokenizer.batch_decode(
        pred_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    test_results_df = test_data.to_pandas()
    test_results_df['generated_summary'] = decoded_preds
    results_path = os.path.join(rslt_dir, f"{experiment_name}_summaries.csv")
    test_results_df[['conv_id', 'target_summary', 'generated_summary']].to_csv(results_path, index=False)

    print(f"Test set summaries saved to '{results_path}'")
    print(f"--- Finished summary generation for: {experiment_name} ---")

In [None]:
# Experiment 1: fine-tune with the emotion-aware input
# (include_emotion_in_input=True) and keep the saved model path.
exp1_name = "bart_finetuned_with_emotion"
exp1_model_path = bart_finetune_model(experiment_name=exp1_name, include_emotion_in_input=True)

In [None]:
# Generate test-set summaries for experiment 1 (emotion-aware input).
generate_summaries(model_path=exp1_model_path, experiment_name=exp1_name, include_emotion_in_input=True)
# Only this experiment has finished here; the no-emotion run is launched in a
# later cell, so the message names the experiment instead of claiming all
# experiments are done.
print(f"\nExperiment '{exp1_name}' is complete!")

In [None]:
# Experiment 2 (ablation): fine-tune without the emotion label in the input
# (include_emotion_in_input=False) and keep the saved model path.
exp2_name = "bart_finetuned_without_emotion"
exp2_model_path = bart_finetune_model(experiment_name=exp2_name, include_emotion_in_input=False)

In [None]:
# Generate test-set summaries for experiment 2 (input without emotion label).
generate_summaries(model_path=exp2_model_path, experiment_name=exp2_name, include_emotion_in_input=False)
print("\nAll experiments are complete!")