## Install & Imports..!

In [None]:
# !pip install transformers datasets accelerate peft trl
# !pip install -U bitsandbytes

In [None]:
import ast
import torch
import transformers

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig

from torch.optim import AdamW
from torch.utils.data import DataLoader

from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from peft import PeftModel, PeftConfig

from datasets import Dataset,load_dataset
from tqdm import tqdm

from accelerate import Accelerator

## Helper Functions..!

In [None]:
def prepare_test_prompt(example):
    """
    Build the inference prompt string for a single example.

    `serialize` operates on batches (a dict of lists), so the example's
    'event_text' is wrapped in a one-element batch and the single resulting
    prompt is returned.

    Parameters:
    ----------
    example : dict
        Must contain the key 'event_text'. Any 'output' it carries is ignored.

    Returns:
    -------
    The first (and only) entry of the 'text' list produced by `serialize`.
    """
    # The ground-truth output is irrelevant at test time, so pass a placeholder
    single_item_batch = {"event_text": [example["event_text"]], "output": [""]}
    prompts = serialize(single_item_batch)["text"]
    return prompts[0]

In [None]:
def serialize(examples):
    """
    Constructs a batch of prompt strings using a one-shot format for NER-style entity extraction.

    Each input text in `examples['event_text']` is embedded into a fixed prompt
    template together with a single hard-coded demonstration (an input text and
    its expected output) that is defined inside this function.

    Parameters:
    ----------
    examples : dict
        A batch in column format with two keys:
        - 'event_text': List of user inputs to extract entities from.
        - 'output': List of corresponding outputs. Entries may be any object;
          each one is converted with `str`.

    Returns:
    -------
    dict
        A dictionary with:
        - 'text': List of formatted prompt strings, one per 'event_text' entry.
        - 'output': List of `str(output)` for each entry of 'output'.
    """

    prompt_template = (
        "See the given example to extract the entities based on given input in JSON format.\n\n"
        "If anything can't be extracted, use None.\n\n"
        "Example Input: {example_event_text}\n"
        "Example Output: {example_output}\n"
        "--------------------------\n"
        "Please extract the entities for the below user input in JSON format. And do not output anything else.\n\n"
        "Human Input: {user_input}\n"
        "AI: "
    )

    # Fixed demonstration shown to the model in every prompt.
    # NOTE: the demonstration output is a Python-literal dict (single quotes, None),
    # not strict JSON. The wording is kept byte-for-byte because changing the
    # prompt changes what the model sees.
    example_event_text = """Late night study session at the café on 15th, Dec 2024 at 9:00 pm for 2 hours."""
    example_output = """{'action': 'study session', 'date': '15/12/2024', 'time': '9:00 PM', 'attendees': None, 'location': 'café', 'duration': '2 hours', 'recurrence': None, 'notes': None}"""

    # The user text is passed as a format *argument*, so braces inside it are
    # inserted verbatim and never interpreted as template fields.
    formatted_texts = [
        prompt_template.format(
            example_event_text=example_event_text,
            example_output=example_output,
            user_input=event_text,
        )
        for event_text in examples['event_text']
    ]

    return {
        "text": formatted_texts,
        "output": [str(output) for output in examples['output']],
    }

In [None]:
def run_generation(model, tokenizer, example):
    """
    Generate the model's entity extraction for one example and parse it into a dict.

    The prompt is built with `prepare_test_prompt`, tokenized, and moved to
    `model.device`. Generation is greedy (`do_sample=False`) and limited to
    512 new tokens. Only the newly generated tokens are decoded and parsed
    as a Python literal.

    Parameters:
    ----------
    model :
        Object providing `eval()`, `device` and `generate(...)`.
    tokenizer :
        Callable tokenizer providing `eos_token_id` and `decode(...)`.
    example : dict
        Passed to `prepare_test_prompt`; must contain 'event_text'.

    Returns:
    -------
    dict
        The parsed dict when the generated text is a valid dict literal.
        Otherwise `{"error": <message>, "raw_output": <decoded text>}`.
    """
    model.eval()
    prompt = prepare_test_prompt(example)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Get generated text *excluding* the input
    generated_ids = output_ids[:, input_length:]
    decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    try:
        parsed = ast.literal_eval(decoded.strip())
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        # These are the failures ast.literal_eval documents for malformed or
        # overly complex input; report them instead of raising so a bad
        # generation never aborts an evaluation run.
        return {"error": str(e), "raw_output": decoded}

    # A valid literal that is not a dict (e.g. a list or a bare string) is
    # still not a usable extraction; report it the same way as a parse failure.
    if not isinstance(parsed, dict):
        return {
            "error": f"expected a dict literal, got {type(parsed).__name__}",
            "raw_output": decoded,
        }

    return parsed


## Setting up the device and loading the fine-tuned model from the Hugging Face Hub

In [None]:
# Select the compute device: CUDA when torch reports it as available, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
# Load the base model with a bitsandbytes 4-bit quantization config

MODEL_NAME = "HuggingFaceTB/SmolLM-360M"

# Quantization settings: 4-bit loading, "nf4" quant type, double quantization
# enabled, float16 as the compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# NOTE(review): device_map={"": 0} presumably places the whole model on
# device index 0 — confirm against the Accelerate/Transformers docs
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map={"": 0},
    quantization_config=bnb_config,
)

In [None]:
# Load the tokenizer and the fine-tuned `PEFT` adapters from the same Hub
# repository; the adapters are attached on top of the already-loaded base `model`
ADAPTER_REPO_ID = "abhxaxhbshxahxn/lora-ner-model"

tokenizer = AutoTokenizer.from_pretrained(ADAPTER_REPO_ID)
peft_model = PeftModel.from_pretrained(model, ADAPTER_REPO_ID)

## Reading the data..!

In [None]:
# Read the JSONL file through the `datasets` JSON loader and keep its "train" split
data_path = r"/content/drive/MyDrive/Fine-Tuning Assignment/event_text_mapping.jsonl"
loaded_splits = load_dataset("json", data_files=data_path)
ds = loaded_splits["train"]

print("Dataset features:", ds.features)
print("Number of examples:", len(ds))

# Preview at most the first three rows (fewer if the dataset is smaller)
print("\nFirst 3 examples:")
preview_count = min(3, len(ds))
for row_idx in range(preview_count):
    print(ds[row_idx])

## Running inference on a given `example`

In [None]:
# Run the fine-tuned model on one dataset row and show the prediction next to
# the stored ground truth
example = ds[786]
predicted_dict = run_generation(peft_model, tokenizer, example)

# end="\n\n\n" reproduces the line break plus the two blank lines that a
# separate print("\n") would emit
print("Input:", example['event_text'], end="\n\n\n")
print("Predicted:", predicted_dict, end="\n\n\n")
print("Actual:", example["output"])


In [None]:
# Ad-hoc input that is not part of the dataset.
# `event_text` must be a plain string: it is interpolated directly into the
# prompt, so wrapping it in braces (a set literal) would leak the set's repr
# (extra braces and quotes) into the text the model sees.
example = {}
example['event_text'] = "Gotcha! Let's discuss sports weekly on each Friday at 6 in the evening, over a ZOom call beginning 1st May,2024!"
example['output'] = {}  # no ground truth is available for free-form input
predicted_dict = run_generation(peft_model, tokenizer, example)

print("Input:", example['event_text'])
print("\n")
print("Predicted:", predicted_dict)
print("\n")
print("Actual:", example["output"])


-----------