In [1]:
# Importing Required Packages
from datasets import Dataset
import pandas as pd
import json
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import torch
print("[DEBUG] All libraries Imported")

[DEBUG] All libraries Imported


In [2]:
# Load synthetic data
json_file = r"C:/Users/Webbies/Jupyter_Notebooks/Assessli_LBM/Modified_Data.json"
# Read the file as UTF-8 explicitly: without an encoding argument open() uses
# the platform locale codec, which can mis-decode or reject non-ASCII text in
# the documents and summaries.
with open(json_file, "r", encoding="utf-8") as f:
    data = json.load(f)

print("[DEBUG] JSON Data Loaded")

# Build one (prompt, target) pair per record. The prompt carries the user's
# tone / length / focus preferences in front of the document text; inference
# prompts have to follow exactly this format.
rows = []
for item in data:
    profile = item["user_profile"]
    text_input = (
        f"summarize with tone={profile['tone_preference']}, "
        f"length={profile['summary_length_preference']}, "
        f"focus={profile['focus_area']}: {item['document_text']}"
    )
    target = item["user_generated_summary"]
    rows.append({"input_text": text_input, "target_text": target})

# Fail here, with a clear message, rather than later inside map()/train()
# when the dataset turns out to have no columns.
if not rows:
    raise ValueError(f"No training records found in {json_file}")

df = pd.DataFrame(rows)
dataset = Dataset.from_pandas(df)

print("[DEBUG] Dataframe Conversion is Successfull")

[DEBUG] JSON Data Loaded
[DEBUG] Dataframe Conversion is Successfull


In [3]:
# Initialize tokenizer and model
# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

model_name = "t5-base" # t5-small
tokenizer = T5Tokenizer.from_pretrained(model_name)
# Load the pretrained weights first, then move the module to the chosen device.
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)
print("[DEBUG] T5-Base Model with tokenizer Loaded Successfully")

Using device: cuda


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


[DEBUG] T5-Base Model with tokenizer Loaded Successfully


In [4]:
# Token-length limits for tokenization (this cell only defines the limits):
# preprocess_function truncates prompts to max_input_length tokens and
# target summaries to max_target_length tokens.
max_input_length = 1024
max_target_length = 256

In [5]:
# The Preprocess Function
def preprocess_function(examples):
    """Tokenize a batch of prompt/summary pairs for seq2seq training.

    Args:
        examples: Batch mapping with an "input_text" entry (prompts) and a
            "target_text" entry (reference summaries).

    Returns:
        The tokenizer output for the prompts, with an extra "labels" entry
        holding the token ids of the tokenized summaries.
    """
    # Prompts are truncated to max_input_length tokens.
    encoded = tokenizer(
        examples["input_text"],
        truncation=True,
        max_length=max_input_length,
    )
    # Summaries are truncated to max_target_length tokens; only their ids are kept.
    target_ids = tokenizer(
        examples["target_text"],
        truncation=True,
        max_length=max_target_length,
    )["input_ids"]
    encoded["labels"] = target_ids
    return encoded

In [6]:
# Tokenize the whole dataset in batches; the original columns are removed so
# only what preprocess_function returns is kept.
tokenized_dataset = dataset.map(
    preprocess_function,
    remove_columns=dataset.column_names,
    batched=True,
)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [7]:
# Train-test split: hold out 20% of the examples, with a fixed seed so the
# split is the same on every run.
split_dataset = tokenized_dataset.train_test_split(seed=42, test_size=0.2)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

In [8]:
# # Setting the Trainer object
# trainer = transformers.Trainer(
#     model = model,
#     train_dataset = tokenized_train_dataset,
#     args = transformers.TrainingArguments(
#         output_dir = "./BergerFineTunnedModel", # The directory where the Fine Tunned Model will be stored
#         per_device_train_batch_size = 4, # The default is 2
#         gradient_accumulation_steps = 2,
#         num_train_epochs = 3, # Increase the number of epochs if needed for better training
#         learning_rate = 5e-5,
#         max_steps = 20, # Use max steps to show the result for those steps
#         bf16 = False,
#         optim = "paged_adamw_8bit",
#         logging_dir = "./log",
#         save_strategy = "epoch",
#         save_steps = 50,
#         logging_steps = 10

# ),
#     data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm = False),
# )
# model.config.use_cache = False
# trainer.train()

# print("Trainer Loaded Successfully")

# Training Arguments
# "paged_adamw_8bit" is a bitsandbytes optimizer and needs a CUDA device.
# The notebook already falls back to the CPU when no GPU is available, so the
# optimizer is chosen the same way: unchanged on CUDA, plain PyTorch AdamW
# otherwise.
optimizer_name = "paged_adamw_8bit" if torch.cuda.is_available() else "adamw_torch"

training_args = TrainingArguments(
    output_dir="./lbm_model",        # checkpoints are written here
    save_strategy="epoch",           # one checkpoint per epoch...
    save_total_limit = 2,            # ...keeping only the two most recent
    learning_rate = 3e-4,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    num_train_epochs = 5,
    weight_decay = 0.01,
    logging_dir = "./log",
    logging_steps = 10,
    optim = optimizer_name,
    push_to_hub = False
)
print("[DEBUG] Training Arguements are Set")

[DEBUG] Training Arguements are Set


In [9]:
# Seq2seq data collator built from the tokenizer and the model; it is handed
# to the Trainer, which uses it to assemble batches.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [10]:
# Trainer: ties together the model, its training configuration, the
# train/eval splits, and the batching utilities.
# NOTE(review): the captured cell output shows a warning pointing at this
# call — presumably about the `tokenizer=` argument; confirm against the
# installed transformers version before changing it.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

print("[DEBUG] Trainer created")

  trainer = Trainer(


[DEBUG] Trainer created


In [None]:
# Train
# Announce the start *before* the (long-running) call; printing this message
# after train() returned made it look as if training was only just beginning.
print("[DEBUG] Into the Training Process")
# Keep the returned training summary in a variable so it is available for
# inspection without being echoed as the cell output.
train_output = trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


In [None]:
# Save the fine-tuned model
# The model is written first, then the tokenizer, both into the same
# directory so they can be reloaded together.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./lbm_model")
print("[DEBUG] Saving the Model and Tokenizer")

print("Behaviour-conditioned summarization model saved at ./lbm_model")

In [None]:
# Example inference
def generate_summary(doc_text, tone, length, focus):
    """Generate a behaviour-conditioned summary of ``doc_text``.

    Args:
        doc_text: The document to summarize.
        tone: Tone preference inserted into the prompt.
        length: Summary-length preference inserted into the prompt.
        focus: Focus-area preference inserted into the prompt.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens stripped.
    """
    # The prompt must match the training prompt character for character
    # ("tone=..., length=..., focus=...", no spaces around "="); a differently
    # formatted prefix is a prompt the fine-tuned model has never seen.
    input_text = f"summarize with tone={tone}, length={length}, focus={focus}: {doc_text}"

    # Training leaves the model in train mode; switch to eval mode so dropout
    # is disabled while generating.
    model.eval()

    # Tokenize with the same input limit used for training and move the
    # tensors to the model's device — the model may live on the GPU while the
    # tokenizer always returns CPU tensors.
    encoded = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
    ).to(model.device)

    outputs = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_target_length,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Example: summarize a one-sentence document with explicit preferences.
test_doc = "Artificial Intelligence is transforming healthcare by improving diagnosis and treatment."
example_summary = generate_summary(
    test_doc,
    tone="enthusiastic",
    length="medium",
    focus="methods",
)
print(example_summary)