In [1]:
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2
!pip install gradio

In [2]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from transformers import TextStreamer
from datasets import load_dataset
from google.colab import userdata

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!


O modelo selecionado para receber o fine-tuning foi o ollama de 1B de par√¢metros. Ele funcionou bem na execu√ß√£o local e tinha um conjunto de resposta interessante para nosso dataset de treinamento.

In [3]:
HF_TOKEN = userdata.get('HF_TOKEN')
FINETUNING_MODEL = 'unsloth/Llama-3.2-1B-Instruct'
FINETUNING_DATASET = 'robsoncalixto/amazon_data_single_alpaca'
FINETUNED_MODEL = 'robsoncalixto/Llama-3.2-1B-finetuning-grp49'
MAX_SEQ_LENGTH = 2048
LOAD_IN_4BIT = True
DTYPE = None

In [4]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = FINETUNING_MODEL,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit= LOAD_IN_4BIT,
    token=HF_TOKEN
  )

==((====))==  Unsloth 2025.9.9: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Realizando um teste de infer√™ncia para avaliar o resultado do modelo.

A reposta do modelo √© que Harlequin √© uma marca de roupas e acess√≥rios para mulheres. Alguns outros detales.

O conte√∫do no dataset de fine-tuning orienta o modelo a responder sobre a biografia de uma pessoa chamada Harlequin.

In [5]:
FastLanguageModel.for_inference(model)
streamer = TextStreamer(tokenizer)
prompt = 'What do you know about the Harlequin?'
prompt_tokenizer = tokenizer(prompt, return_tensors='pt').to('cuda')

_=model.generate(**prompt_tokenizer, streamer=streamer, max_new_tokens=120)

<|begin_of_text|>What do you know about the Harlequin??
The Harlequin is a highly regarded and versatile digital camera system that offers a wide range of features and functions to help you capture and edit your photos. Here are some key features of the Harlequin camera system:
1. **Image Stabilization**: The Harlequin camera system features advanced image stabilization, which helps to reduce camera shake and blur caused by hand movement.
2. **Lens Selection**: The Harlequin camera system offers a wide range of interchangeable lenses, including wide-angle lenses, telephoto lenses, and macro lenses.
3. **Advanced Image Processing**: The Harle


In [6]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.9.9 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.




Usada estrat√©gia de few shot prompt para indicar o modelo qual ser√° o template usado para refinamento dos dados.


In [8]:
def format_llama3_template(example):
    messages = [
        {
            "role": "system",
            "content": "You are a product identification model. Your job is to analyze product questions and generate product information as a response.",
        },
        {
            "role": "user",
            "content": f"Describe the product: {example['instruction']}\n\nWhat do you know about the {example['instruction']}?",
        },
        {
            "role": "assistant",
            "content": example['output'],
        }
    ]

    # 2. Aplica o template de chat do Llama 3, garantindo o token EOS e a m√°scara de loss.
    # O "tokenize=False" garante que recebemos apenas a string formatada.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False
    )

    # Adiciona o token EOS, que √© CRUCIAL para o SFT em modelos de chat
    return { "text": text + tokenizer.eos_token }

O dataset nesta etapa j√° consta com todas as melhorias avaliadas em outros datasets, com isso temos a leitura dos dados mais r√°pidas via hugging face.

In [9]:
dataset = load_dataset('robsoncalixto/amazon_data_single_alpaca')
train_dataset = dataset['train']

In [11]:
formatted_train_dataset = train_dataset.map(format_llama3_template)

Os par√¢metros de treinamento possui um conjunto de configura√ß√£o que mescla valores default definidos na documenta√ß√£o do unsloth com observa√ß√µes de execu√ß√µes anteriores, principalmente por usar um modelo instru√≠do que facilita o fine-tuning.
Com isso, o foco dos par√¢metros √© obter o melhor resultado poss√≠vel com o menor tempo de execu√ß√£o.

In [12]:
formatted_train_dataset['text']

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 29 Sep 2025\n\nYou are a product identification model. Your job is to analyze product questions and generate product information as a response.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nDescribe the product: Girls Ballet Tutu Neon Pink\n\nWhat do you know about the Girls Ballet Tutu Neon Pink?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHigh quality 3 layer ballet tutu. 12 inches in length<|eot_id|><|eot_id|>',
 "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 29 Sep 2025\n\nYou are a product identification model. Your job is to analyze product questions and generate product information as a response.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nDescribe the product: Mog's Kittens\n\nWhat do you know about the Mog's Kittens?<|eot_id|><|start_header_id|>assistant<|end_header_

In [13]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_train_dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    dataset_num_proc=4,
    packing = True,
    args = TrainingArguments(
        per_device_train_batch_size=16,
        gradient_accumulation_steps=2,
        warmup_steps=10,
        num_train_epochs=3,
        learning_rate=3e-5,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        lr_scheduler_type = 'cosine',
        seed=42,
        #max_steps = 20 #removido para conseguir avaliar o treinamento do modelo com todos os dados. Est√° usando uma vers√£o com
    ),
)

In [14]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 3 | Total steps = 4,689
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 2 x 1) = 32
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)
[34m[1mwandb[0m: Currently logged in as: [33mrobsoncaliixto[0m ([33mrobsoncaliixto-fiap[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.3635
2,3.5163
3,3.6407
4,3.5773
5,3.6178
6,3.6351
7,3.5306
8,3.5404
9,3.6573
10,3.6393


TrainOutput(global_step=4689, training_loss=2.0266562767388803, metrics={'train_runtime': 6205.6899, 'train_samples_per_second': 24.171, 'train_steps_per_second': 0.756, 'total_flos': 6.741937914005422e+17, 'train_loss': 2.0266562767388803, 'epoch': 3.0})

In [15]:
FastLanguageModel.for_inference(model)
streamer = TextStreamer(tokenizer)
prompt = 'What do you know about Harlequin?'
prompt_tokenizer = tokenizer(prompt, return_tensors='pt').to('cuda')

_=model.generate(**prompt_tokenizer, streamer=streamer, max_new_tokens=120)

<|begin_of_text|>What do you know about Harlequin? What do you know about Harlequin? What do you know about Harlequin? What do you know about Harlequin? What do you know about Harlequin? What do you know about Harlequin? What do you know about Harlequin? What do you know about Harlequin? What do you know about Harlequin? What do you know about Harlequin? What do you know about Harlequin? What do you know about Harlequin? What do you know about Harlequin? What do you know about Harlequin? What do you


In [16]:
model.save_pretrained(FINETUNED_MODEL)
tokenizer.save_pretrained(FINETUNED_MODEL)

('robsoncalixto/Llama-3.2-1B-finetuning-grp49/tokenizer_config.json',
 'robsoncalixto/Llama-3.2-1B-finetuning-grp49/special_tokens_map.json',
 'robsoncalixto/Llama-3.2-1B-finetuning-grp49/chat_template.jinja',
 'robsoncalixto/Llama-3.2-1B-finetuning-grp49/tokenizer.json')

In [17]:
model.push_to_hub_gguf(FINETUNED_MODEL, tokenizer, quantization_method = "q8_0", token = HF_TOKEN)

Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 1.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 58.01 out of 83.47 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:00<00:00, 49.34it/s]


Unsloth: Saving tokenizer... Done.
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at robsoncalixto/Llama-3.2-1B-finetuning-grp49 into q8_0 GGUF format.
The output location will be /content/robsoncalixto/Llama-3.2-1B-finetuning-grp49/unsloth.Q8_0.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: Llama-3.2-1B-finetuning-grp49
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {32}
INFO

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...g-grp49/unsloth.Q8_0.gguf:   3%|3         | 41.9MB / 1.32GB            

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/robsoncalixto/Llama-3.2-1B-finetuning-grp49


In [18]:
import gradio as gr

In [19]:
model_base, _ = FastLanguageModel.from_pretrained(
    model_name = FINETUNING_MODEL,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit= LOAD_IN_4BIT,
    token=HF_TOKEN
  )

==((====))==  Unsloth 2025.9.9: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

In [20]:
FastLanguageModel.for_inference(model_base)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), 

In [21]:
FastLanguageModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

In [33]:
def models_comparison(input = ""):
    messages = [
        {
            "role": "system",
            "content": "You are a product identification model. Your job is to analyze product questions and generate product information as a response.",
        },
        {
            "role": "user",
            "content": f"Describe the product: {input}\n\nWhat do you know about the {input}?",
        },
    ]

    inference_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer([inference_prompt], return_tensors="pt").to("cuda")

    ## Configura√ß√£o para infer√™ncia
    gen_kwargs = {
        "max_new_tokens": 128,
        "do_sample": True,
        "temperature": 1,
        "repetition_penalty": 0.9,
    }

    # Gerado a partir do modelo base
    base_output_tokens = model_base.generate(**inputs, **gen_kwargs)
    base_response = tokenizer.batch_decode(base_output_tokens[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]

    # Gerado a partir do modelo finetuned
    ft_output_tokens = model.generate(**inputs, **gen_kwargs)
    ft_response = tokenizer.batch_decode(ft_output_tokens[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]

    return base_response, ft_response

In [34]:
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ü§ñ Compara√ß√£o de Modelos: Llama 3.2 1B vs. Modelo com Fine-tuning")
    gr.Markdown("Digite um t√≠tulo de produto abaixo para ver como o Modelo Base e o Modelo com Fine-tuning respondem.")

    inp = gr.Textbox(label="Digite um T√≠tulo (Input)", placeholder="Exemplo: O que voc√™ sabe sobre o produto?")
    btn = gr.Button("Gerar Respostas")

    with gr.Row():
        out_base = gr.Textbox(label="Resposta do Modelo Base", lines=10)
        out_ft = gr.Textbox(label="Resposta do Modelo com Fine-tuning", lines=10)

    btn.click(fn=models_comparison, inputs=inp, outputs=[out_base, out_ft])

# Cria um link para uma aplica√ß√£o tempor√°ria para teste
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://85a8883d34935edd8a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


