In [1]:
# !pip install transformers datasets accelerate peft
# !pip install -U bitsandbytes

In [2]:
import ast
import torch
import random
import transformers
import pandas as pd

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig

from torch.optim import AdamW
from torch.utils.data import DataLoader

from peft import PeftModel
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model

from datasets import Dataset,load_dataset
from tqdm import tqdm

from accelerate import Accelerator

2025-07-27 20:37:54.179824: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753648674.202539     122 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753648674.209408     122 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
from transformers import logging
logging.set_verbosity_error()

### Loading the Data..!

In [4]:
data_path = r"/kaggle/input/llm-fine-tune-dataset/event_text_mapping.jsonl"
ds = load_dataset("json", data_files=data_path)["train"]

print("Dataset features:", ds.features)
print("Number of examples:", len(ds))
print("\nFirst 3 examples:")
for i in range(min(3, len(ds))):
    print(ds[i])

Dataset features: {'event_text': Value(dtype='string', id=None), 'output': {'action': Value(dtype='string', id=None), 'date': Value(dtype='string', id=None), 'time': Value(dtype='string', id=None), 'attendees': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'location': Value(dtype='string', id=None), 'duration': Value(dtype='string', id=None), 'recurrence': Value(dtype='string', id=None), 'notes': Value(dtype='string', id=None)}}
Number of examples: 792

First 3 examples:
{'event_text': 'Late night study session at the café on 15th, Dec 2024 at 9:00 pm for 2 hours.', 'output': {'action': 'study session', 'date': '15/12/2024', 'time': '9:00 PM', 'attendees': None, 'location': 'café', 'duration': '2 hours', 'recurrence': None, 'notes': None}}
{'event_text': 'Hang out at the beach on 18th, Jul 2024 around 10:00 am for 3 hours or so.', 'output': {'action': 'Hang out', 'date': '18/07/2024', 'time': '10:00 AM', 'attendees': None, 'location': 'beach', 'duration': '3 hou

### Setting up the base model..!

In [5]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

In [6]:
# Setting up `device`
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [7]:
# Loading base model..
MODEL_NAME = "HuggingFaceTB/SmolLM-360M"
print("Loading Model...!")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, 
    quantization_config=bnb_config,
    device_map=device
)

print("Loading Tokenizer...!")
base_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_tokenizer.pad_token = base_tokenizer.eos_token


Loading Model...!


model.safetensors:   0%|          | 0.00/1.45G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Loading Tokenizer...!


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

In [8]:
# Load a fresh base model again to attach LoRA
print("Loading Model...!")
model_for_lora = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, 
    quantization_config=bnb_config,
    device_map=device
)

print("Loading Tokenizer...!")
lora_model = PeftModel.from_pretrained(model_for_lora, "abhxaxhbshxahxn/lora-ner-model")
lora_tokenizer = AutoTokenizer.from_pretrained("abhxaxhbshxahxn/lora-ner-model")
lora_tokenizer.pad_token = lora_tokenizer.eos_token

Loading Model...!
Loading Tokenizer...!


adapter_config.json:   0%|          | 0.00/781 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/6.57M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/863 [00:00<?, ?B/s]

### Generating inference from the base model..!

In [9]:
def prepare_prompt(event_text):
  
  prompt = """See the given example to extract the entities based on given input in JSON format.
Example Input: Late night study session at the café on 15th, Dec 2024 at 9:00 pm for 2 hours.
Example Output: {{'action': 'study session', 'date': '15/12/2024', 'time': '9:00 PM', 'attendees': None, 'location': 'café', 'duration': '2 hours', 'recurrence': None, 'notes': None}}
--------------------------
Please extract the entities for the below user input in JSON format. And do not output anything else.
Human Input: {event_text}
AI:"""

  prompt = prompt.format(event_text=event_text)
  
  return prompt

In [10]:
def generate_inference(input_text,model,tokenizer):

  model.eval()

  input_prompt = prepare_prompt(input_text)

  inputs = tokenizer(input_prompt, return_tensors="pt").to(model.device)

  with torch.no_grad():

    outputs = model.generate(**inputs, max_new_tokens=100)
  
  decoded_output = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

  torch.cuda.empty_cache()
  
  return decoded_output


In [11]:
example = ds[110]
inference_ = generate_inference(example['event_text'],base_model,base_tokenizer)
print("PROMPT",example['event_text'])
print("\n")
print("INFERENCE",inference_)
print("\n")
print("ACTUAL OUTPUT",example['output'])

PROMPT Training session 15 - Nov - 2023 3:30 pm video


INFERENCE 
1. What is the name of the user?
2. What is the name of the session?
3. What is the date of the session?
4. What is the time of the session?
5. What is the duration of the session?
6. What is the recurrence of the session?
7. What is the notes of the session?
8. What is the name of the location of the session?
9. What is the name of the


ACTUAL OUTPUT {'action': 'Training session', 'date': '15/11/2023', 'time': '3:30 PM', 'attendees': None, 'location': None, 'duration': None, 'recurrence': None, 'notes': 'video'}


### Generating inference from the fine-tuned model..!

In [12]:
example = ds[110]
inference_ = generate_inference(example['event_text'],lora_model,lora_tokenizer)
print("PROMPT",example['event_text'])
print("\n")
print("INFERENCE",inference_)
print("\n")
print("ACTUAL OUTPUT",example['output'])

PROMPT Training session 15 - Nov - 2023 3:30 pm video


INFERENCE  {'action': 'Training session', 'date': '15/11/2023', 'time': '3:30 PM', 'attendees': None, 'location': None, 'duration': None, 'recurrence': None, 'notes': None}


ACTUAL OUTPUT {'action': 'Training session', 'date': '15/11/2023', 'time': '3:30 PM', 'attendees': None, 'location': None, 'duration': None, 'recurrence': None, 'notes': 'video'}


### Creating a function that collectively outputs the `inference` from `base` and `fine-tuned` model...!

In [13]:
def compare_inference(example,base_model,base_tokenizer,tuned_model,tuned_tokenizer):

  input_text = example['event_text']
  ground_truth = example['output']

  ans = {}

  ans['input_text'] = input_text
    
  ans["base_inference"] = generate_inference(input_text,base_model,base_tokenizer)

  ans["finetuned_inference"] = generate_inference(input_text,tuned_model,tuned_tokenizer)
  
  ans['actual_output'] = ground_truth

  return ans

In [14]:
print("EVENT TEXT")
print(ds[300]['event_text'])

params={}
params['example'] = ds[300]
params['base_model']= base_model
params['base_tokenizer']= base_tokenizer
params['tuned_model']= lora_model
params['tuned_tokenizer']= lora_tokenizer

ans = compare_inference(**params)
for k,v in ans.items():
  print(k.upper(),v,sep="\n")
  print("\n")

EVENT TEXT
Onboarding 2023-12-14 8:45 am w/ Ava & Ethan BlueJeans 30m
INPUT_TEXT
Onboarding 2023-12-14 8:45 am w/ Ava & Ethan BlueJeans 30m


BASE_INFERENCE

1. Human Input: Onboarding 2023-12-14 8:45 am w/ Ava & Ethan BlueJeans 30m
2. AI:
1. Human Input: Onboarding 2023-12-14 8:45 am w/ Ava & Ethan BlueJeans 30m
2. AI:
1. Human Input: Onboarding 2


FINETUNED_INFERENCE
 {'action': 'Onboarding', 'date': '2023-12-14', 'time': '8:45 AM', 'attendees': ['Ava', 'Ethan'], 'location': 'BlueJeans', 'duration': '30m', 'recurrence': None, 'notes': None}


ACTUAL_OUTPUT
{'action': 'Onboarding', 'date': '2023-12-14', 'time': '8:45 AM', 'attendees': ['Ava', 'Ethan'], 'location': 'BlueJeans', 'duration': '30m', 'recurrence': None, 'notes': None}




### Creating `inference_df` to store and compare the inference for `base` vs `fine-tuned`..!

In [15]:
params={}
params['base_model']= base_model
params['base_tokenizer']= base_tokenizer
params['tuned_model']= lora_model
params['tuned_tokenizer']= lora_tokenizer

result_df = {}
for idx in tqdm(range(100),dynamic_ncols=True, leave=True):
    unique_rand_ints = random.sample(range(0, 791), 1)
    params['example'] = ds[unique_rand_ints]
    ans = compare_inference(**params)

    for k,v in ans.items():
        if k in result_df.keys():
            result_df[k].append(v)
        else:
            result_df[k]=[]
            result_df[k].append(v)
    

100%|██████████| 100/100 [20:12<00:00, 12.13s/it]


In [16]:
result_df = pd.DataFrame(result_df)
result_df.head()

Unnamed: 0,input_text,base_inference,finetuned_inference,actual_output
0,"[Plan a workshop with Olivia, Sophia, and Luca...","{'action': 'plan a workshop with Olivia, Soph...","{'action': 'Plan a workshop', 'date': '18/05/...","[{'action': 'Plan a workshop', 'date': '18/05/..."
1,"[Roadmap sync 27th, Dec 2025 12:00 pm w/ Joe Z...","['Roadmap sync 27th, Dec 2025 12:00 pm w/ Joe...","{'action': 'Roadmap sync', 'date': '27/12/202...","[{'action': 'Roadmap sync', 'date': '27/12/202..."
2,[Discuss proposal 2023-12-07 11:15 am w/ Emma ...,['Discuss proposal 2023-12-07 11:15\u202fEmma...,"{'action': 'Discuss proposal 2023-12-07', 'da...","[{'action': 'Discuss proposal', 'date': '2023-..."
3,"[Hiring panel 2024-03-03 1:30 pm Sophie, Will,...","['Hiring panel 2024-03-03 1:30pm Sophie, Will...","{'action': 'Hiring panel', 'date': '2024-03-0...","[{'action': 'Hiring panel', 'date': '2024-03-0..."
4,[Organize time for a roadmap discussion on May...,['Organize time for a roadmap discussion on M...,{'action': 'Organize time for a roadmap discu...,"[{'action': 'roadmap discussion', 'date': '21/..."


In [18]:
result_df.to_csv('Base_vs_FT_inference.csv')

### Checking performance for the NER task..!

In [19]:
def compare_dicts(result_dict, output_dict):
    comparison = {}
    for key in output_dict.keys():
        expected = output_dict[key]
        predicted = result_dict.get(key, None)

        # Normalize string values
        if isinstance(expected, str) and isinstance(predicted, str):
            expected = expected.strip().lower()
            predicted = predicted.strip().lower()

        # For lists: compare ignoring order
        if isinstance(expected, list) and isinstance(predicted, list):
            correct = sorted(expected) == sorted(predicted)
        else:
            correct = expected == predicted

        comparison[key] = {
            "expected": output_dict[key],
            "predicted": result_dict.get(key, None),
            "match": correct
        }

    return comparison

def dict_accuracy(result_dict, output_dict):
    comp = compare_dicts(result_dict, output_dict)
    matches = sum(1 for k in comp if comp[k]['match'])
    acc = {key: int(comp[key]["match"]) for key in comp}
    return acc

In [20]:
def calculate_accuracy(result_df,column_name):
    base_accuracy={}
    base_accuracy_df = {}
    comp = ds[0]['output'].keys()
    
    for idx in range(result_df.shape[0]):
        # print(idx)
        
        try:
            base_output = ast.literal_eval(result_df.loc[idx,column_name])
            matches = dict_accuracy(base_output,result_df.loc[idx,'actual_output'][0])
    
        
        except:
            base_output = {}
            matches = {key: 0 for key in comp}
    
        
        for com in comp:
            if com in base_accuracy_df.keys():
                base_accuracy_df[com].append(matches[com])
            else:
                base_accuracy_df[com] = []
                base_accuracy_df[com].append(matches[com])

    return pd.DataFrame(base_accuracy_df).describe().loc['mean']

In [21]:
print("Average Accuracy in % for the `Base` model over `100` random samples...!".upper())
print('\n')
for k,v in dict(calculate_accuracy(result_df,column_name='base_inference')).items():
    print(k.upper(),v*100,sep=' '*(20-len(k)))

AVERAGE ACCURACY IN % FOR THE `BASE` MODEL OVER `100` RANDOM SAMPLES...!


ACTION              0.0
DATE                0.0
TIME                0.0
ATTENDEES           0.0
LOCATION            0.0
DURATION            0.0
RECURRENCE          0.0
NOTES               0.0


In [22]:
print("Average Accuracy in % for the `Base` model over `100` random samples...!".upper())
print('\n')
for k,v in dict(calculate_accuracy(result_df,column_name='finetuned_inference')).items():
    print(k.upper(),v*100,sep=' '*(20-len(k)))

AVERAGE ACCURACY IN % FOR THE `BASE` MODEL OVER `100` RANDOM SAMPLES...!


ACTION              83.0
DATE                85.0
TIME                94.0
ATTENDEES           82.0
LOCATION            90.0
DURATION            89.0
RECURRENCE          93.0
NOTES               97.0


-------------