In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git@nightly git+https://github.com/unslothai/unsloth-zoo.git

In [3]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",

    "unsloth/Llama-3.3-70B-Instruct-bnb-4bit" # NEW! Llama 3.3 70B!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.12.8: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.12.8 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [5]:
import os
os.listdir()


['.config',
 'dataset.xlsx',
 'huggingface_tokenizers_cache',
 'unsloth_compiled_cache',
 'sample_data']

In [6]:
import pandas as pd
import json

df = pd.read_excel("dataset.xlsx")

step_cols = [c for c in df.columns if c.startswith("step_")]

def parse_step(step):
    parts = [p.strip() for p in step.split(";")]
    d = {}
    for p in parts:
        if "=" in p:
            k, v = p.split("=", 1)
            d[k.strip()] = v.strip()
    return d

samples = []

for _, row in df.iterrows():
    actions = []
    for c in step_cols:
        if pd.isna(row[c]):
            continue
        actions.append(parse_step(str(row[c])))

    if not actions:
        continue

    text = f"""### Instruction:
{row['instruction']}

### Response:
{json.dumps({"application": row["application"], "actions": actions})}
"""
    samples.append({"text": text})

print("Samples:", len(samples))


Samples: 376


In [7]:
print(samples[1])

{'text': '### Instruction:\nOpen Notepad and close it\n\n### Response:\n{"application": "notepad", "actions": [{"action": "open_app", "app": "notepad"}, {"action": "hotkey", "keys": "alt+f4"}, {}]}\n'}


In [8]:
from datasets import Dataset

dataset = Dataset.from_list(samples)

print(dataset)


Dataset({
    features: ['text'],
    num_rows: 376
})


In [9]:
def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=2048,
        padding=False,
    )

dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])


Map:   0%|          | 0/376 [00:00<?, ? examples/s]

In [10]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    args=TrainingArguments(
        output_dir="llama-ui-agent",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=2,          # IMPORTANT: start with 2
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        save_strategy="epoch",
        report_to=[],
    ),
)

trainer.train()


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 376 | Num Epochs = 2 | Total steps = 188
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Step,Training Loss
10,1.3381
20,0.525
30,0.4257
40,0.3115
50,0.2866
60,0.2687
70,0.2742
80,0.2851
90,0.2781
100,0.217


TrainOutput(global_step=188, training_loss=0.3148143006132004, metrics={'train_runtime': 346.5975, 'train_samples_per_second': 2.17, 'train_steps_per_second': 0.542, 'total_flos': 879665894621184.0, 'train_loss': 0.3148143006132004, 'epoch': 2.0})

In [25]:
from google.colab import drive
drive.mount('/content/drive')

# To change to your notebook's directory, uncomment the line below and modify the path as needed.
# import os
os.chdir('/content/drive/MyDrive/Colab Notebooks/mySequence') # Example path

Mounted at /content/drive


In [26]:
model.save_pretrained("llama-ui-agent")
tokenizer.save_pretrained("llama-ui-agent")


('llama-ui-agent/tokenizer_config.json',
 'llama-ui-agent/special_tokens_map.json',
 'llama-ui-agent/chat_template.jinja',
 'llama-ui-agent/tokenizer.json')

In [15]:
#TESING THE MODEL
from unsloth import FastLanguageModel
FastLanguageModel.for_inference(model)


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

In [21]:
def extract_first_json(text):
    start = text.find("{")
    if start == -1:
        return None

    depth = 0
    for i in range(start, len(text)):
        if text[i] == "{":
            depth += 1
        elif text[i] == "}":
            depth -= 1
            if depth == 0:
                return text[start:i+1]

    return None


In [22]:
import re
def run(prompt):
    text = f"""### Instruction:
{prompt}

### Response:
Return ONLY ONE valid JSON object.
"""

    inputs = tokenizer(text, return_tensors="pt").to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens=300,
        temperature=0.0,
        do_sample=False,
        repetition_penalty=1.2,
    )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    json_text = extract_first_json(decoded)
    if json_text:
        return json_text
    else:
        return "ERROR: No JSON found"


In [23]:

run("Open Notepad and close it")  #in dataset


'{"application": "notepad", "actions": [{"action": "open_app", "app": "notepad"}, {"action": "hotkey", "keys": "alt+f4"}, {}]}'

In [24]:
run("Turn off Excel") #in dataset

'{"application": "excel", "actions": [{"action": "focus", "app": "excel"}, {"action": "hotkey", "keys": "alt+f4"}, {}]}'

In [29]:
run("Turn off firefox") #unseen input

'{"application": "firefox", "actions": [{"action": "focus", "app": "firefox"}, {"action": "hotkey", "keys": "alt+f4"}, {}]}'