In [None]:
# !pip install transformers datasets accelerate peft

In [None]:
# !pip install -U bitsandbytes

## Imports

In [None]:
import ast
import torch
import transformers

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig

from torch.optim import AdamW
from torch.utils.data import DataLoader

from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model

from datasets import Dataset,load_dataset
from tqdm import tqdm

from accelerate import Accelerator

## Reading the data..!

In [None]:
# Location of the JSONL file holding the event descriptions and their structured outputs.
data_path = r"/kaggle/input/llm-fine-tune-dataset/event_text_mapping.jsonl"

# Everything read from `data_files` is taken from the "train" split of the loaded result.
ds = load_dataset("json", data_files=data_path)["train"]

print("Dataset features:", ds.features)
print("Number of examples:", len(ds))
print("\nFirst 3 examples:")

# Preview at most three rows, even when the dataset holds fewer than that.
preview_count = min(3, len(ds))
for row_idx in range(preview_count):
    print(ds[row_idx])

### Dataset Overview and Initial Exploration

We begin by loading a JSON dataset consisting of 792 examples, each containing a natural language event description and a structured `output` dictionary. The schema reveals that each event is broken down into actionable fields such as `action`, `date`, `time`, `location`, `duration`, and optional fields like `attendees`, `recurrence`, and `notes`. The initial few samples confirm the consistency in format and provide confidence that the dataset is well-suited for training models to perform structured information extraction from free-form text.


## Loading the model..!

### Model Loading with 4-bit Quantization

We load the `SmolLM-360M` model using 4-bit NF4 quantization via `BitsAndBytesConfig` for efficient memory usage and faster inference. The model architecture is based on LLaMA with 32 decoder layers and linear projections quantized to 4-bit. This enables running a moderately sized language model on limited GPU resources without significantly compromising performance, making it ideal for experimentation or fine-tuning tasks on consumer hardware.


In [None]:
MODEL_NAME = "HuggingFaceTB/SmolLM-360M"

# Quantization settings: 4-bit NF4 weights, double quantization enabled,
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# The device map sends the whole model (the "" root module) to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map={"": 0},
)

# Bare expression: the notebook renders the module tree as the cell output.
model

### Setting up the tokenizer

In [None]:
# Load the tokenizer published under the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the padding token, so padded positions hold EOS ids.
tokenizer.pad_token = tokenizer.eos_token

### Checking the text generation capability of the model..!

In [None]:
input_text = "Hello, I'm Abhijeet. How are you?"

# Encode the prompt and move the resulting tensors to the device the model lives on.
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# Generate up to 25 new tokens after the prompt.
outputs = model.generate(**inputs, max_new_tokens=25)

# Decode the first (only) returned sequence; it is decoded in full, not sliced.
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

separator = "-" * 30
print(f"Input: {input_text}")
print(separator)
print(f"Output: {decoded_output}")

### Checking the output for our NER task..!
- I validated the model's ability to learn from few-shot prompting by providing a structured example followed by a new user input.
- The prompt design helps guide the model by setting a clear pattern to follow.
- I observed that the model follows the example structure well, but tends to repeat parts of the prompt and occasionally truncates the response.
- These issues hint at limitations in generalization from just in-context examples.
- Fine-tuning the model on a larger set of such input-output JSON pairs could help it learn the structure more robustly.
- With fine-tuning, the model would better internalize entity extraction logic and reduce over-reliance on prompt templates.
- It would also improve consistency in output formatting, casing, and handling edge cases like ambiguous durations or varying phrasing.
- Overall, fine-tuning can make the model more reliable, reduce prompt engineering overhead, and produce cleaner, more accurate extractions.



In [None]:
# Row 0 is the in-context demonstration; row 1 is the input the model has to solve.
example = ds[0]
question = ds[1]

prompt_template = (
    "See the given example to extract the entities based on given input in JSON format.\n\n"
    "Example Input: {event_text}\n"
    "Example Output: {output}\n"
    "--------------------------\n"
    "Please extract the entities for the below user input in JSON format. And do not output anything else.\n"
    "User Input: {user_input}\n"
)

formatted_example = {
    "text": prompt_template.format(
        event_text=example['event_text'],
        output=example['output'],
        user_input=question['event_text'],
    )
}

print(">> PROMPT FOR THE MODEL:")
print("-" * len("PROMPT FOR THE MODEL:"))
print(formatted_example['text'])
print("\n\n")

input_text = formatted_example['text']
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)

# Drop the echoed prompt: keep only the tokens that follow the prompt's own length.
prompt_token_count = inputs['input_ids'].shape[1]
decoded_output = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)

print(">> Response from SLM:".upper())
print("-" * len(">> Response from LM:"))
print(decoded_output)
print("\n\n")
print(">> Actual Output:".upper())
print("-" * len(">> Actual Output:"))
print(question['output'])

## Preparing the dataset for fine-tuning task..!
- I structured the dataset using a consistent prompt-response format to guide the model during training.
- Using `map(batched=True)` allowed efficient batch processing while embedding each input into a few-shot prompt with a fixed example.
- The inclusion of a reference example in every prompt establishes a clear pattern for the model to imitate during fine-tuning.
- This consistency in formatting helps the SLM (Small Language Model) learn how to extract entities reliably in the expected JSON format.
- By retaining both the prompt (`text`) and the ground truth (`output`), I can directly train the model in a supervised manner.
- This setup encourages the model to understand contextual clues and align its outputs closely with structured targets.


In [None]:
def serialize(examples):
    """
    Build one few-shot prompt per input text in a batch.

    Every prompt contains the same demonstration pair, which is hard-coded
    inside this function (it is not read from any dataset), followed by the
    user's text and a trailing "AI: " cue where the answer is expected.

    Parameters:
    ----------
    examples : dict
        A batch in column format with two keys:
        - 'event_text': List of user inputs to extract entities from.
        - 'output': List of the corresponding ground-truth outputs.

    Returns:
    -------
    dict
        - 'text': List of formatted prompt strings, one per input text.
        - 'output': List with `str()` applied to every ground-truth output.
    """

    # Fixed demonstration shown to the model in every prompt.
    demo_event_text = """Late night study session at the café on 15th, Dec 2024 at 9:00 pm for 2 hours."""
    demo_output = """{'action': 'study session', 'date': '15/12/2024', 'time': '9:00 PM', 'attendees': None, 'location': 'café', 'duration': '2 hours', 'recurrence': None, 'notes': None}"""

    prompt_template = (
        "See the given example to extract the entities based on given input in JSON format.\n\n"
        "Example Input: {example_event_text}\n"
        "Example Output: {example_output}\n"
        "--------------------------\n"
        "Please extract the entities for the below user input in JSON format. And do not output anything else.\n\n"
        "Human Input: {user_input}\n"
        "AI: "
    )

    # One prompt per entry of the 'event_text' column.
    prompts = [
        prompt_template.format(
            example_event_text=demo_event_text,
            example_output=demo_output,
            user_input=user_text,
        )
        for user_text in examples['event_text']
    ]

    # Targets are stringified so they can later be concatenated to the prompt.
    targets = [str(expected) for expected in examples['output']]

    return {"text": prompts, "output": targets}

In [None]:
# Reload the raw JSONL and hold out 10% of the rows for validation.
# The split is seeded: without a seed every kernel restart produces a different
# train/validation partition, so losses and accuracies are not comparable between runs.
SPLIT_SEED = 42
dataset = load_dataset("json", data_files=data_path)['train']
dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

ds_val = dataset['test']
ds_train = dataset['train']

# Only the training split is turned into prompts here (batched map);
# `ds_val` keeps its raw columns in this cell.
ds_train = ds_train.map(serialize,batched=True)


print("Formatted dataset example:".upper())
print("-"*len("Formatted dataset example:"))
# Prompt and target concatenated, i.e. the text the model is trained on.
print(ds_train[2]['text']+ds_train[2]['output'])
print("\n\n")
print("Corresponding output:".upper())
print("-"*len("Corresponding output:"))
print(ds_train[2]['output'])

### Creating a `data_loader` to generate batches of training samples for fine-tuning..!

In [None]:
def tokenize_fn(example):
    """
    Tokenize one prompt/target pair for supervised fine-tuning.

    The prompt (`example["text"]`) and the target (`example["output"]`) are
    concatenated and tokenized as a single sequence, padded or truncated to
    512 tokens. The labels are a copy of the input ids in which the leading
    prompt positions are replaced by -100 so that they are excluded from the loss.

    Parameters:
    ----------
    example : dict
        A dictionary with:
        - 'text': The input prompt string used to condition the model.
        - 'output': The expected target string to be predicted by the model.

    Returns:
    -------
    dict
        - 'input_ids': Token IDs of the concatenated prompt and target.
        - 'attention_mask': Attention mask for the input sequence.
        - 'labels': Copy of 'input_ids' with the first `prompt_len` positions set to -100.
        - 'prompt_len': Number of leading positions that were masked.
    """

    prompt_text = example["text"]
    target_text = example["output"]

    # The training sequence is the prompt immediately followed by the expected answer.
    encoded = tokenizer(
        prompt_text + target_text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    # Prompt length = token count of the prompt tokenized on its own, minus one.
    # NOTE(review): the "-1" presumably leaves room for the prompt's trailing token
    # merging with the first target token when tokenized together; confirm.
    prompt_len = len(tokenizer(prompt_text)["input_ids"]) - 1

    # Start from a copy of the inputs and hide the prompt part from the loss.
    labels = input_ids.clone()
    labels[0, :prompt_len] = -100

    # NOTE(review): padded positions are NOT set to -100 here, so they keep the
    # pad token id in `labels`. Masking them would require the inspection cells
    # (which decode `labels[prompt_len:]`) to skip -100 values first.

    # NOTE(review): squeeze(1) only drops axis 1 when it has size 1; with a single
    # text padded to 512 tokens the tensors presumably stay (1, 512). Downstream
    # cells index `[i][0]` and squeeze again, so they rely on that shape.
    return {
        "input_ids": input_ids.squeeze(1),
        "attention_mask": attention_mask.squeeze(1),
        "labels": labels.squeeze(1),
        "prompt_len": prompt_len,
    }

In [None]:
def collate_fn(batch):
    """Merge a list of tokenized examples into one batch of PyTorch tensors via `tokenizer.pad`."""
    collated = tokenizer.pad(batch, padding=True, return_tensors="pt")
    return collated

In [None]:
# Tokenize every training row one at a time; the columns that existed before the
# map (listed in ds_train.column_names) are removed from the result.
tokenized_train = ds_train.map(tokenize_fn, remove_columns=ds_train.column_names)
# Shuffled mini-batches of 4 examples, assembled by collate_fn.
train_loader = DataLoader(tokenized_train, batch_size=4, shuffle=True, collate_fn=collate_fn)

- I applied the `tokenize_fn` to the training dataset using `map`, which ensures each example is tokenized consistently.
- Removing original columns keeps the dataset lean and avoids redundancy during training.
- I wrapped the tokenized dataset into a `DataLoader` to enable efficient batching, shuffling, and feeding into the model.
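- The `collate_fn` uses `tokenizer.pad` to assemble the examples of a batch into tensors; because `tokenize_fn` already pads or truncates every sequence to 512 tokens, all examples arrive with the same length, so the padding is fixed-length rather than dynamic.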


In [None]:
# Pull a single batch from the training loader and print it back as text.
batch = next(iter(train_loader))

print("BATCH KEYS:")
print("-" * len("BATCH KEYS:"))
print(batch.keys())
print("\n\n")

section_rule = "-" * len("TRAINING BATCH EXAMPLE:  ")
print(section_rule)
print("SAMPLE BATCH EXAMPLES:")
print(section_rule)
print("\n\n")

example_rule = "-" * len("EXAMPLE:    ")
batch_size_seen = batch['input_ids'].shape[0]

for i in range(batch_size_seen):
    prompt_len = batch['prompt_len'][i].item()

    # Each example carries an extra leading axis, hence the index 0.
    input_text = tokenizer.decode(batch['input_ids'][i, 0], skip_special_tokens=True)
    # Only the part after the masked prompt is decoded for the labels.
    label_text = tokenizer.decode(batch['labels'][i, 0, prompt_len:], skip_special_tokens=True)

    print(example_rule)
    print(f"EXAMPLE: {i+1}")
    print(example_rule)
    print(input_text)
    print("\n")
    print("OUTPUT:")
    print("-" * len("OUTPUT:"))
    print(label_text)
    print("-" * 60)
    print("\n\n")

In [None]:
# Sanity check: one forward pass on one training batch before any fine-tuning.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

batch = next(iter(train_loader))

# 'prompt_len' is bookkeeping only and is not passed to the model; the remaining
# tensors are moved to the device and their singleton axis 1 is dropped.
batch = {
    key: tensor.to(device).squeeze(1)
    for key, tensor in batch.items()
    if key != 'prompt_len'
}

output = model(**batch)
loss = output.loss
print(f"Loss on Training Sample: {loss.item():.4f}")


----------

In [None]:
def prepare_test_prompt(example):
    """Return the few-shot prompt string for a single raw example (only its 'event_text' is used)."""
    # `serialize` works on batches, so wrap the single example as a batch of one.
    # The 'output' entry is a placeholder; the real answer is not needed for the prompt.
    single_item_batch = {
        "event_text": [example["event_text"]],
        "output": [""],
    }
    serialized = serialize(single_item_batch)
    return serialized["text"][0]



def tokenize_fn_val(example):
    """
    Tokenize one raw (un-serialized) row the same way training rows are tokenized.

    Unlike `tokenize_fn`, this function receives a row that still has the raw
    dataset columns. It builds the few-shot prompt with `prepare_test_prompt`,
    stringifies the ground-truth output, and then delegates to `tokenize_fn`
    so that both splits always share identical tokenization and label masking.

    Parameters:
    ----------
    example : dict
        A dictionary with:
        - 'event_text': The user text to embed into the prompt.
        - 'output' (optional): The ground-truth output; an empty string is used when absent.

    Returns:
    -------
    dict
        Whatever `tokenize_fn` returns for the prompt/target pair:
        'input_ids', 'attention_mask', 'labels' and 'prompt_len'.
    """

    prompt = prepare_test_prompt(example)
    target = str(example.get("output",""))

    # Single source of truth for tokenization: reuse the training tokenizer function.
    return tokenize_fn({"text": prompt, "output": target})

In [None]:
# Tokenize every validation row; the columns that existed before the map are removed.
tokenized_val = ds_val.map(tokenize_fn_val, remove_columns=ds_val.column_names)
# Mini-batches of 4 examples, assembled by collate_fn.
# NOTE(review): shuffling is enabled for the validation loader too, so the batch
# returned by next(iter(val_loader)) differs from run to run.
val_loader = DataLoader(tokenized_val, batch_size=4, shuffle=True, collate_fn=collate_fn)

In [None]:
# Pull a single batch from the validation loader and print it back as text.
batch = next(iter(val_loader))

print("BATCH KEYS:")
print("-" * len("BATCH KEYS:"))
print(batch.keys())
print("\n\n")

section_rule = "-" * len("TRAINING BATCH EXAMPLE:  ")
print(section_rule)
print("SAMPLE BATCH EXAMPLES:")
print(section_rule)
print("\n\n")

example_rule = "-" * len("EXAMPLE:    ")
batch_size_seen = batch['input_ids'].shape[0]

for i in range(batch_size_seen):
    prompt_len = batch['prompt_len'][i].item()

    # Each example carries an extra leading axis, hence the index 0.
    input_text = tokenizer.decode(batch['input_ids'][i, 0], skip_special_tokens=True)
    # Only the part after the masked prompt is decoded for the labels.
    label_text = tokenizer.decode(batch['labels'][i, 0, prompt_len:], skip_special_tokens=True)

    print(example_rule)
    print(f"EXAMPLE: {i+1}")
    print(example_rule)
    print(input_text)
    print("\n")
    print("OUTPUT:")
    print("-" * len("OUTPUT:"))
    print(label_text)
    print("-" * 60)
    print("\n\n")

In [None]:
def compute_loss_on_val_set(model,data_loader,device):
    """
    Average the model's loss over every batch of a data loader, without gradients.

    The model is moved to `device` and switched to eval mode; it is left in eval
    mode when the function returns, so callers that keep training must call
    `model.train()` again.

    Parameters:
    ----------
    model : the model to evaluate; called as `model(**batch)` and expected to
        return an object with a `.loss` attribute.
    data_loader : iterable of dict-like batches of tensors. A 'prompt_len' entry,
        if present, is ignored and never passed to the model.
    device : device the model and the batch tensors are moved to.

    Returns:
    -------
    float
        Sum of the per-batch losses divided by the number of batches.

    Raises:
    ------
    ValueError
        If `data_loader` yields no batches (an average would be undefined).
    """

    num_batches = len(data_loader)
    if num_batches == 0:
        raise ValueError("data_loader is empty; cannot compute an average loss")

    model = model.to(device)
    model.eval()

    total_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(data_loader,desc="Evaluating loss on entire set..."):
            # Build the model inputs without mutating the batch object handed out by
            # the loader: drop the bookkeeping 'prompt_len' entry, move tensors to the
            # device and drop their singleton axis 1.
            model_inputs = {
                k: v.to(device).squeeze(1)
                for k, v in batch.items()
                if k != 'prompt_len'
            }

            output = model(**model_inputs)

            # Accumulate plain Python floats rather than tensors.
            total_loss += output.loss.item()

    return total_loss / num_batches




In [None]:
# Baseline (pre-fine-tuning) average loss: training loader first, then validation loader.
for loader in (train_loader, val_loader):
    print(compute_loss_on_val_set(model, loader, device))

## Setting up PEFT config..!

In [None]:
# LoRA adapter settings: rank 16, alpha 32, 5% dropout, applied to the
# modules named "q_proj" and "v_proj".
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
)

# Prepare the quantized base model for k-bit training, then wrap it with the adapters.
peft_model = prepare_model_for_kbit_training(model)
peft_model = get_peft_model(peft_model, lora_config)

# Report how many parameters are trainable.
peft_model.print_trainable_parameters()

## Getting started with Training Loop..!

In [None]:
# Resolve the device first so that nothing in this cell depends on a `device`
# (or a leftover `batch`) defined by an earlier cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

accelerator = Accelerator(cpu=False, split_batches=False)
peft_model = accelerator.prepare(peft_model)
peft_model = peft_model.to(device)

# Hand the optimizer only the parameters that actually receive gradients;
# frozen parameters would never be updated anyway.
trainable_params = [p for p in peft_model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=2e-4)

epochs = 10
step_count = 0

# Per-epoch average losses, used by the plotting cell.
loss_metric = {}
loss_metric['train_loss']=[]
loss_metric['val_loss']=[]

print("-"*len("TRAINING LOOP BEGINS:  "))
print("TRAINING LOOP BEGINS:")
print("-"*len("TRAINING LOOP BEGINS:  "))

for epoch in range(epochs):
    # compute_loss_on_val_set leaves the model in eval mode, so re-enable train mode every epoch.
    peft_model.train()
    print(f"\n🔁 Epoch {epoch+1}/{epochs}")

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        # Drop the bookkeeping 'prompt_len' entry, move tensors to the device and
        # remove their singleton axis 1 before calling the model.
        model_inputs = {
            k: v.to(device).squeeze(1)
            for k, v in batch.items()
            if k != 'prompt_len'
        }

        outputs = peft_model(**model_inputs)

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        step_count += 1

        if step_count % 50 == 0:
            print(f"🔹 Step {step_count}, Loss: {loss.item():.4f}")

    # Both splits are re-evaluated in eval mode after the epoch, so the two
    # reported numbers are measured under the same conditions.
    avg_loss = compute_loss_on_val_set(peft_model,train_loader,device)
    avg_val_loss = compute_loss_on_val_set(peft_model,val_loader,device)

    print(f"✅ Epoch {epoch+1} completed. Avg Trainig Loss: {avg_loss:.4f}. Avg Val Loss: {avg_val_loss:.4f}")

    loss_metric['train_loss'].append(avg_loss)
    loss_metric['val_loss'].append(avg_val_loss)

In [None]:
import matplotlib.pyplot as plt

# One x position per recorded epoch (0-based).
x = list(range(len(loss_metric['train_loss'])))

fig, ax = plt.subplots()
ax.plot(x, loss_metric['train_loss'])
ax.plot(x, loss_metric['val_loss'])
ax.legend(['Train', 'Val'])
ax.set_xlabel('No. of Epoch')
ax.set_ylabel('Loss')
ax.grid(True)
plt.show()

## Checking performance for the NER task..!

In [None]:
def _lists_match_unordered(expected, predicted):
    """Return True when both lists hold the same items with the same multiplicity, in any order."""
    try:
        return sorted(expected) == sorted(predicted)
    except TypeError:
        # Items that cannot be ordered against each other (e.g. str vs int, or None):
        # fall back to matching the items off one by one.
        remaining = list(predicted)
        for item in expected:
            try:
                remaining.remove(item)
            except ValueError:
                return False
        return not remaining


def compare_dicts(result_dict, output_dict):
    """
    Compare a predicted dictionary against the expected one, key by key.

    Only the keys of `output_dict` are inspected. String values are compared
    after stripping whitespace and lower-casing; lists are compared ignoring
    order; everything else is compared with `==`.

    Parameters:
    ----------
    result_dict : dict or any
        The prediction. Anything that is not a dict (for example a list or a
        string produced by parsing malformed model output) is treated as a
        prediction with no keys instead of raising.
    output_dict : dict
        The expected key/value pairs.

    Returns:
    -------
    dict
        For every key of `output_dict`, a dict with the original 'expected'
        value, the original 'predicted' value (None when missing) and a
        boolean 'match'.
    """
    predictions = result_dict if isinstance(result_dict, dict) else {}

    comparison = {}
    for key, raw_expected in output_dict.items():
        raw_predicted = predictions.get(key, None)
        expected, predicted = raw_expected, raw_predicted

        # Normalize string values
        if isinstance(expected, str) and isinstance(predicted, str):
            expected = expected.strip().lower()
            predicted = predicted.strip().lower()

        # For lists: compare ignoring order
        if isinstance(expected, list) and isinstance(predicted, list):
            correct = _lists_match_unordered(expected, predicted)
        else:
            correct = expected == predicted

        # Report the untouched values, not the normalized ones.
        comparison[key] = {
            "expected": raw_expected,
            "predicted": raw_predicted,
            "match": correct
        }

    return comparison

def dict_accuracy(result_dict, output_dict):
    """
    Score a prediction per key: 1 where it matches the expected value, 0 otherwise.

    Parameters:
    ----------
    result_dict : the predicted dictionary, passed through to `compare_dicts`.
    output_dict : dict with the expected key/value pairs.

    Returns:
    -------
    dict
        One entry per key of `output_dict`, mapping the key to 1 (match) or 0 (no match).
    """
    comp = compare_dicts(result_dict, output_dict)
    return {key: int(details["match"]) for key, details in comp.items()}



In [None]:
def run_generation(model, tokenizer, example):
    """
    Generate the model's answer for one example and parse it into a dictionary.

    The prompt is built with `prepare_test_prompt`, decoding is greedy
    (`do_sample=False`) with up to 512 new tokens, and only the tokens after
    the prompt are decoded. The model is switched to eval mode as a side effect.

    Parameters:
    ----------
    model : the model to generate with (must expose `.device`, `.eval()` and `.generate()`).
    tokenizer : tokenizer used to encode the prompt and decode the answer.
    example : dict with at least an 'event_text' entry.

    Returns:
    -------
    dict
        The parsed Python-literal dictionary on success. When the generated text
        cannot be parsed, or parses to something that is not a dict, a dictionary
        with the keys 'error' and 'raw_output' is returned instead, so callers can
        always treat the result as a dict.
    """
    model.eval()
    prompt = prepare_test_prompt(example)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # Get generated text *excluding* the input
    generated_ids = output_ids[:, input_length:]
    decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # Best-effort parsing: a malformed answer is reported, not raised.
    try:
        parsed = ast.literal_eval(decoded.strip())
    except Exception as e:
        return {"error": str(e), "raw_output": decoded}

    # literal_eval happily returns lists, strings or numbers; downstream scoring
    # calls `.get` on the result, so anything but a dict is reported as an error.
    if not isinstance(parsed, dict):
        return {
            "error": f"expected a dict literal, got {type(parsed).__name__}",
            "raw_output": decoded,
        }

    return parsed


In [None]:
# Spot-check the fine-tuned model on a single training row.
example = ds_train[1]
predicted_dict = run_generation(peft_model, tokenizer, example)

# The prediction is a parsed dict (or an error dict); the stored ground truth is printed as-is.
print("Predicted:", predicted_dict)
print("Actual:", example["output"])


### Saving the model and tokenizer..!

In [None]:
# Save the PEFT model and the tokenizer into the same directory (relative path).
peft_model.save_pretrained("lora-ner-model")
tokenizer.save_pretrained("lora-ner-model")

# Shell command: pack the saved directory into a gzip-compressed tar archive.
# NOTE(review): the absolute path assumes the notebook's working directory is /kaggle/working — confirm.
!tar -zcvf lora-ner-model.tar.gz /kaggle/working/lora-ner-model

### Calculating accuracy for NER task..!

In [None]:
import pandas as pd

# Field names to score, taken from the first training row's (stringified) output.
comp = ast.literal_eval(ds_train[0]['output']).keys()

# Maps each field name to a list of 0/1 scores, one score per training example.
result_df = {}
for i in tqdm(range(len(ds_train))):
    example = ds_train[i]
    predicted_dict = run_generation(peft_model, tokenizer, example)

    # Training rows store the expected output as a string; parse it back into a dict.
    example['output'] = ast.literal_eval(example['output'])
    matches = dict_accuracy(predicted_dict, example['output'])

    for com in comp:
        result_df.setdefault(com, []).append(matches[com])


In [None]:
# Turn the per-field score lists into a table: one column per field, one row per example.
result_df = pd.DataFrame(result_df)

# The 'mean' row of describe() is the share of exact matches for each field.
mean_accuracy = result_df.describe().loc['mean']

print("Average accuracy for each `NER` on training set..!")
print("\n")
print(mean_accuracy)

result_df.to_csv('Train_Error.csv')

In [None]:
# Field names to score, taken from the first training row's (stringified) output.
comp = ast.literal_eval(ds_train[0]['output']).keys()

# Maps each field name to a list of 0/1 scores, one score per validation example.
result_df = {}
for i in tqdm(range(len(ds_val))):
    example = ds_val[i]
    predicted_dict = run_generation(peft_model, tokenizer, example)

    # The validation row's 'output' is passed to the scorer as stored, without parsing.
    matches = dict_accuracy(predicted_dict, example['output'])

    for com in comp:
        result_df.setdefault(com, []).append(matches[com])

In [None]:
# Turn the per-field score lists into a table: one column per field, one row per example.
result_df = pd.DataFrame(result_df)

# The 'mean' row of describe() is the share of exact matches for each field.
mean_accuracy = result_df.describe().loc['mean']

print("Average accuracy for each `NER` on validation set..!")
print("\n")
print(mean_accuracy)

result_df.to_csv('Val_Error.csv')

-------------