## Install & Imports..!

In [3]:
# !pip install transformers datasets accelerate peft trl
# !pip install -U bitsandbytes

In [4]:
import ast
import torch
import transformers

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig

from torch.optim import AdamW
from torch.utils.data import DataLoader

from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from peft import PeftModel, PeftConfig

from datasets import Dataset,load_dataset
from tqdm import tqdm

from accelerate import Accelerator

## Helper Functions..!

In [5]:
def prepare_test_prompt(example):
    example_prompt = {
        "event_text": [example["event_text"]],
        "output": [""]  # we don't use actual output at test time
    }
    prompt_data = serialize(example_prompt)
    return prompt_data["text"][0]  # return serialized string

In [6]:
def serialize(examples):
    """
    Constructs a batch of prompt strings using few-shot learning format for NER-style entity extraction.

    This function formats each input text in `examples['event_text']` by embedding it into a predefined
    prompt template. It uses a fixed example (the first instance from the `raw` dataset) as a demonstration
    to guide the model. The function returns a dictionary containing:

    - "text": List of formatted prompt strings for each input.
    - "output": Corresponding expected outputs as strings for comparison or training.

    Parameters:
    ----------
    examples : dict
        A dictionary with two keys:
        - 'event_text': List of user inputs to extract entities from.
        - 'output': List of corresponding ground truth outputs in dictionary format.

    Returns:
    -------
    dict
        A dictionary with:
        - 'text': List of formatted prompt strings.
        - 'output': List of expected output strings.
    """

    prompt_template = (
    "See the given example to extract the entities based on given input in JSON format.\n\n"
    "If anything can't be extracted, use None.\n\n"
    "Example Input: {example_event_text}\n"
    "Example Output: {example_output}\n"
    "--------------------------\n"
    "Please extract the entities for the below user input in JSON format. And do not output anything else.\n\n"
    "Human Input: {user_input}\n"
    "AI: "
    )



    # Use the first example from the raw dataset as the fixed example for the prompt
    example_instance={}
    example_instance["event_text"] = """Late night study session at the café on 15th, Dec 2024 at 9:00 pm for 2 hours."""
    example_instance['output'] = """{'action': 'study session', 'date': '15/12/2024', 'time': '9:00 PM', 'attendees': None, 'location': 'café', 'duration': '2 hours', 'recurrence': None, 'notes': None}"""
    formatted_texts = []


    # Iterate through the batch using the length of one of the lists (assuming all lists have the same length)
    for i in range(len(examples['event_text'])):
        formatted_text = prompt_template.format(
            example_event_text=example_instance['event_text'],
            example_output=example_instance['output'],
            user_input=examples['event_text'][i] # Access each example in the batch correctly
        )
        formatted_texts.append(formatted_text)


    return {"text": formatted_texts, "output": [str(output) for output in examples['output']]} # Access each output in the batch correctly

In [7]:
def run_generation(model, tokenizer, example):
    model.eval()
    prompt = prepare_test_prompt(example)
    # print(prompt)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # Get generated text *excluding* the input
    generated_ids = output_ids[:, input_length:]
    decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    try:
        result_dict = ast.literal_eval(decoded.strip())
    except Exception as e:
        result_dict = {"error": str(e), "raw_output": decoded}

    return result_dict


## Setting and Loading fine-tuned model from HF..!

In [8]:
# Setting up `device`
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [9]:
# Loading base model..

MODEL_NAME = "HuggingFaceTB/SmolLM-360M"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, quantization_config=bnb_config,device_map={"": 0}
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.45G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [10]:
# Loading the fine-tuned `PEFT` model & tokenizer from `HFHub` and integrating the adapters with `base` model

peft_model = PeftModel.from_pretrained(model, "abhxaxhbshxahxn/lora-ner-model")
tokenizer = AutoTokenizer.from_pretrained("abhxaxhbshxahxn/lora-ner-model")

adapter_config.json:   0%|          | 0.00/781 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/6.57M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/863 [00:00<?, ?B/s]

## Reading the data..!

In [11]:
data_path = r"/content/drive/MyDrive/Fine-Tuning Assignment/event_text_mapping.jsonl"
ds = load_dataset("json", data_files=data_path)["train"]

print("Dataset features:", ds.features)
print("Number of examples:", len(ds))
print("\nFirst 3 examples:")
for i in range(min(3, len(ds))):
    print(ds[i])

Generating train split: 0 examples [00:00, ? examples/s]

Dataset features: {'event_text': Value('string'), 'output': {'action': Value('string'), 'date': Value('string'), 'time': Value('string'), 'attendees': List(Value('string')), 'location': Value('string'), 'duration': Value('string'), 'recurrence': Value('string'), 'notes': Value('string')}}
Number of examples: 792

First 3 examples:
{'event_text': 'Late night study session at the café on 15th, Dec 2024 at 9:00 pm for 2 hours.', 'output': {'action': 'study session', 'date': '15/12/2024', 'time': '9:00 PM', 'attendees': None, 'location': 'café', 'duration': '2 hours', 'recurrence': None, 'notes': None}}
{'event_text': 'Hang out at the beach on 18th, Jul 2024 around 10:00 am for 3 hours or so.', 'output': {'action': 'Hang out', 'date': '18/07/2024', 'time': '10:00 AM', 'attendees': None, 'location': 'beach', 'duration': '3 hours', 'recurrence': None, 'notes': None}}
{'event_text': 'Business lunch at that seafood spot on 2nd, Nov 2024 at 1:00 pm for roughly 2 hours.', 'output': {'action': 'B

## Generating the inference for a given `example`..!

In [12]:
example = ds[786]
predicted_dict = run_generation(peft_model, tokenizer, example)

print("Input:", example['event_text'])
print("\n")
print("Predicted:", predicted_dict)
print("\n")
print("Actual:", example["output"])


Input: Lunch with Claire at Sunny Diner on April 29, 2024, 1:00 pm for 1 hour.


Predicted: {'action': 'Lunch', 'date': '29/04/2024', 'time': '1:00 PM', 'attendees': ['Claire'], 'location': 'Sunny Diner', 'duration': '1 hour', 'recurrence': None, 'notes': None}


Actual: {'action': 'Lunch', 'date': '29/04/2024', 'time': '1:00 PM', 'attendees': ['Claire'], 'location': 'Sunny Diner', 'duration': '1 hour', 'recurrence': None, 'notes': None}


In [15]:
example = {}
example['event_text'] = {"Gotcha! Let's discuss sports weekly on each Friday at 6 in the evening, over a ZOom call beginning 1st May,2024!"}
example['output']={}
predicted_dict = run_generation(peft_model, tokenizer, example)

print("Input:", example['event_text'])
print("\n")
print("Predicted:", predicted_dict)
print("\n")
print("Actual:", example["output"])


Input: {"Gotcha! Let's discuss sports weekly on each Friday at 6 in the evening, over a ZOom call beginning 1st May,2024!"}


Predicted: {'action': 'Discussing sports weekly', 'date': '05/06/2024', 'time': '6:00 PM', 'attendees': None, 'location': 'Zoom', 'duration': '1 hour', 'recurrence': None, 'notes': None}


Actual: {}


-----------