# Fine-tune T5 with LoRA and run inference
This notebook trains a LoRA-adapted `google/flan-t5-small` model to convert NLP prompts into XML format.

It includes:
- Dataset loading and preprocessing
- LoRA setup and training
- Model saving
- Inference example

In [None]:
!pip install transformers datasets peft accelerate bitsandbytes --quiet

In [163]:
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, DataCollatorForSeq2Seq,Seq2SeqTrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
import torch
import os

In [164]:
torch.cuda.empty_cache()
torch.cuda.set_device(0)

In [165]:
def get_gpu_devices():
    if not torch.cuda.is_available():
        return []

    devices = []
    for i in range(torch.cuda.device_count()):
        name = torch.cuda.get_device_name(i)
        devices.append((i, name))
    return devices

gpus = get_gpu_devices()
print("Available GPUs:", gpus)

Available GPUs: [(0, 'NVIDIA GeForce RTX 3060 Laptop GPU')]


## Load Dataset

In [166]:
# Change this path to your dataset file location
DATA_PATH = "./../data-generation/nlp_to_math_func.jsonl"

def load_jsonl(path):
    with open(path, 'r') as f:
        data = [json.loads(line) for line in f]
    return Dataset.from_list(data)

dataset = load_jsonl(DATA_PATH)
dataset = dataset.shuffle(seed=42).select(range(10000))  # Limit to 1000 samples for testing
print(f"Loaded {len(dataset)} samples")
print(dataset[0])

Loaded 10000 samples
{'prompt': 'What do you get when you multiply 46 and 42?', 'output': 'multiply(46, 42)'}


## Tokenizer and Preprocessing

In [167]:
MODEL_NAME = "google/flan-t5-small"
MAX_LENGTH = 256

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# def preprocess(example):
#     # Tokenize the input prompt
#     model_inputs = tokenizer(
#         example["prompt"],
#         max_length=MAX_LENGTH,
#         padding="max_length",
#         truncation=True,
#     )

#     # Tokenize the target/output with target tokenizer context
#     with tokenizer.as_target_tokenizer():
#         labels = tokenizer(
#             example["output"],
#             max_length=MAX_LENGTH,
#             padding="max_length",
#             truncation=True,
#         )

#     # Replace pad token ids by -100 for loss ignoring
#     labels_ids = labels["input_ids"]
#     labels_ids = [
#         (l if l != tokenizer.pad_token_id else -100)
#         for l in labels_ids
#     ]

#     model_inputs["labels"] = labels_ids
#     return model_inputs

def preprocess(example):
    # Add instructional prefix (optional but helpful for Flan-T5)
    prompt = f"Answer this math problem: {example['prompt']}"
    output = example['output']

    # Tokenize input (source)
    inputs = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )

    # Tokenize target (output/label) correctly using `text_target`
    targets = tokenizer(
        text_target=output,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )

    # Replace pad token ids with -100 in labels
    labels = targets["input_ids"]
    labels = [
        (token_id if token_id != tokenizer.pad_token_id else -100)
        for token_id in labels
    ]

    inputs["labels"] = labels
    return inputs


tokenized_dataset = dataset.map(preprocess, batched=False)
print("Tokenization complete.")

Map: 100%|██████████| 10000/10000 [00:13<00:00, 752.86 examples/s]

Tokenization complete.





In [168]:
print("Sample tokenized input:", tokenized_dataset[0])

Sample tokenized input: {'prompt': 'What do you get when you multiply 46 and 42?', 'output': 'multiply(46, 42)', 'input_ids': [11801, 48, 7270, 682, 10, 363, 103, 25, 129, 116, 25, 30333, 9668, 11, 6426, 58, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0

In [169]:
split_ratio = 0.9
splits = tokenized_dataset.train_test_split(test_size=1 - split_ratio, shuffle=True, seed=42)

train_dataset = splits["train"]
eval_dataset = splits["test"]

print(f"Train samples: {len(train_dataset)}")
print(f"Eval samples: {len(eval_dataset)}")

Train samples: 9000
Eval samples: 1000


In [170]:
print("Sample tokenized input:", train_dataset[0])
print("Sample tokenized output:", train_dataset[0]["labels"])

Sample tokenized input: {'prompt': 'What is the sum of matrix [[6, 7], [5, 10]] and matrix [[8, 6], [6, 1]]?', 'output': 'mat_add([[6, 7], [5, 10]], [[8, 6], [6, 1]])', 'input_ids': [11801, 48, 7270, 682, 10, 363, 19, 8, 4505, 13, 16826, 784, 6306, 11071, 489, 13679, 784, 11116, 335, 908, 908, 11, 16826, 784, 6306, 11864, 431, 13679, 784, 11071, 209, 908, 908, 58, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## Load Model and Apply LoRA

In [171]:
#base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, device_map="auto")
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to("cuda")

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

trainable params: 344,064 || all params: 77,305,216 || trainable%: 0.4451


In [172]:
sample = train_dataset[0]
print("Input IDs:", sample['input_ids'])
print("Labels:", sample['labels'])



Input IDs: [11801, 48, 7270, 682, 10, 363, 19, 8, 4505, 13, 16826, 784, 6306, 11071, 489, 13679, 784, 11116, 335, 908, 908, 11, 16826, 784, 6306, 11864, 431, 13679, 784, 11071, 209, 908, 908, 58, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Labels: [6928, 834, 13039, 599, 6306, 6306, 11071, 489, 13679, 784, 11116, 335, 908, 13679, 784, 6306, 11864, 431, 13679, 784, 11071, 209,

## Training Arguments and Trainer Setup

In [173]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_lora_xml_model",
    per_device_train_batch_size=3,
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    learning_rate=5e-4,
    save_strategy="epoch",
    logging_dir="./logs",
    fp16=torch.cuda.is_available(),
    save_total_limit=2,
    report_to="none",
    logging_steps=100,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)


  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## Train the Model

In [174]:
trainer.train()

Step,Training Loss
100,0.0
200,0.0
300,0.0
400,0.0
500,0.0
600,0.0
700,0.0
800,0.0
900,0.0
1000,0.0


TrainOutput(global_step=1500, training_loss=0.0, metrics={'train_runtime': 479.5889, 'train_samples_per_second': 18.766, 'train_steps_per_second': 3.128, 'total_flos': 841263611904000.0, 'train_loss': 0.0, 'epoch': 1.0})

## Save the Fine-tuned Model and Tokenizer

In [175]:
trainer.save_model("./t5_lora_xml_model")
tokenizer.save_pretrained("./t5_lora_xml_model")

('./t5_lora_xml_model/tokenizer_config.json',
 './t5_lora_xml_model/special_tokens_map.json',
 './t5_lora_xml_model/spiece.model',
 './t5_lora_xml_model/added_tokens.json',
 './t5_lora_xml_model/tokenizer.json')

In [176]:
def debug_batch(tokenizer, model, dataset, index=0, max_length=256):
    import torch
    from transformers import DataCollatorForSeq2Seq

    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Grab a single example
    example = dataset[index]
    prompt = f"Simplify the expression : {example['prompt']}"
    output = example["output"]

    # Tokenize input and target
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length
    )

    targets = tokenizer(
        output,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length
    )

    # Replace padding token ids with -100 for loss calculation
    labels = targets["input_ids"]
    labels[labels == tokenizer.pad_token_id] = -100

    inputs = {k: v.to(device) for k, v in inputs.items()}
    labels = labels.to(device)

    print("📥 Prompt:", prompt)
    print("📤 Expected Output:", output)
    print("🧾 Input IDs:", inputs["input_ids"])
    print("🧾 Labels:", labels)

    # Run through model
    with torch.no_grad():
        outputs = model(**inputs, labels=labels)
    
    print("📉 Loss:", outputs.loss.item())

    return outputs.loss.item()


In [177]:
debug_single_sample(model, tokenizer, train_dataset, data_collator, index=0)



🔎 Sample Prompt:
Answer this math problem: What is the sum of matrix [[6, 7], [5, 10]] and matrix [[8, 6], [6, 1]]?

🔎 Expected Output (Decoded Labels):
mat_add([[6, 7], [5, 10]], [[8, 6], [6, 1]])

🔎 Raw Label Token IDs:
[6928, 834, 13039, 599, 6306, 6306, 11071, 489, 13679, 784, 11116, 335, 908, 13679, 784, 6306, 11864, 431, 13679, 784, 11071, 209, 908, 908, 61, 1, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`prompt` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

## Inference Example

Load the fine-tuned LoRA model and generate XML for a new prompt.

In [178]:
from transformers import AutoModelForSeq2SeqLM
from peft import PeftModel

def load_model_for_inference(base_model_name, lora_model_path):
    base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name, device_map="auto")
    model = PeftModel.from_pretrained(base_model, lora_model_path)
    model.eval()
    return model

def generate_output(prompt, model, tokenizer, max_length=256):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_length).to(model.device)
    outputs = model.generate(**inputs, max_length=max_length, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Load fine-tuned LoRA model
model = load_model_for_inference(MODEL_NAME, "./t5_lora_xml_model")



In [182]:
# Test prompt
test_prompt = "simplify expression : what is sum of 2 and 3?"
output = generate_output(test_prompt, model, tokenizer)
print("\nPrompt:\n", test_prompt)
print("\nGenerated:\n", output)


Prompt:
 simplify expression : what is sum of 2 and 3?

Generated:
 2 and 3
