In [1]:
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2
!pip install gradio

In [2]:
#Importe das bibliotecas

from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from transformers import TextStreamer
from datasets import load_dataset
from google.colab import userdata

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
# Informações da GPU de processamento

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sun Nov  9 14:24:11 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   30C    P0             49W /  400W |     423MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

> O modelo selecionado para receber o fine-tuning fo o Olhama de 1B de parâmetros. Ele funcionou bem na execução local e tinha um conjunto de respostas interessante para o nosso dataset de treinamento.

In [5]:
# Variáveis locais

HF_TOKEN = userdata.get('HF_TOKEN')                       #Token do Google Colab
FINETUNING_MODEL = 'unsloth/Llama-3.2-1B-Instruct'        #modelo
FINETUNING_DATASET = 'Araguacy/amazon'                    #caminho do Hugging Face para o dataset
FINETUNED_MODEL = 'Araguacy/Llama-3.2-1B-finetuning-fiap' #Nome do modelo após treinamento
MAX_SEQ_LENGTH = 2048
LOAD_IN_4BIT = True #Parâmetro para o Unsloth usar a biblioteca bitsandbytes para carregar o modelo quantizado, economizando VRAM.
DTYPE = None

In [6]:

# Download do modelo, primeiro passo

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = FINETUNING_MODEL,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit= LOAD_IN_4BIT,   # Quantização 4 bit (comprime)
    token=HF_TOKEN
  )

==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

> Realizando uma inferência no modelo base para teste passando um prompt para falar sobre Harlequin.

In [7]:
FastLanguageModel.for_inference(model)
streamer = TextStreamer(tokenizer)
prompt = 'What do you know about the Harlequin?'
prompt_tokenizer = tokenizer(prompt, return_tensors='pt').to('cuda')

_=model.generate(**prompt_tokenizer, streamer=streamer, max_new_tokens=120)

<|begin_of_text|>What do you know about the Harlequin? Harlequin is a mysterious and elusive figure with a long history of involvement in various crimes and conspiracies. Here are some key facts about the Harlequin:

1. **Name Origin**: The Harlequin is believed to be named after a character from a medieval fable, known as "The Harlequin," who was a jester in the royal court of France. The character was known for his mischievous and cunning nature, which might have inspired the Harlequin's own identity.
2. **Mysterious Nature**: Harlequin is said to be a master of disguise


In [8]:
#Configuração do PEFT (LoRA), segundo passo
# r = 16: Define o "ranking" (tamanho) dos adaptadores LoRA.
# target_modules: Lista quais camadas do Transformer receberão os adaptadores.

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.11.2 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [9]:
#Função para formatação do dataset para ser enviado ao treinamento.

def format_llama3_template(example):
    messages = [
        {
            "role": "system",
            "content": "You are a product identification model. Your job is to analyze product questions and generate product information as a response.",
        },
        {
            "role": "user",
            "content": f"Describe the product: {example['title']}\n\nWhat do you know about the {example['title']}?",
        },
        {
            "role": "assistant",
            "content": example['content'],
        }
    ]

    # 2. Aplica o template de chat do Llama 3, garantindo o token EOS e a máscara de loss.
    # O "tokenize=False" garante que recebemos apenas a string formatada.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False
    )

    # Adiciona o token EOS, que é CRUCIAL para o SFT em modelos de chat
    return { "text": text + tokenizer.eos_token }

O dataset nesta etapa já consta com todas as melhorias avaliadas em outros datasets, com isso temos a leitura dos dados mais rápidas via hugging face.

In [10]:
# Load do dataset pelo Hugging face

dataset = load_dataset(FINETUNING_DATASET)
train_dataset = dataset['train']

README.md:   0%|          | 0.00/316 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/36.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [11]:
# Aplica a formatação a todos os 100.000 registros
formatted_train_dataset = train_dataset.map(format_llama3_template)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [12]:
# Print resultado da string formatada
print(train_dataset)

Dataset({
    features: ['title', 'content'],
    num_rows: 100000
})


In [13]:
formatted_train_dataset[0]

{'title': '(1) Buckle Guard Car Seat Belt Button Cover in BLACK',

>> Os parâmetros de treinamento possui um conjunto de configuração que mescla valores default definidos na documentação do unsloth com observações de execuções anteriores, principalmente por usar um modelo instruído que facilita o fine-tuning.
Com isso, o foco dos parâmetros é obter o melhor resultado possível com o menor tempo de execução.

In [14]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_train_dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    dataset_num_proc=4,
    packing = True,
    args = TrainingArguments(
        per_device_train_batch_size=16,
        gradient_accumulation_steps=2,
        warmup_steps=10,
        num_train_epochs=3,               #O modelo verá o dataset inteiro 3 vezes
        learning_rate=3e-5,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",        #Mais uma otimização de memória
        lr_scheduler_type = 'cosine',
        seed=42,
        #max_steps = 1 #removido para conseguir avaliar o treinamento do modelo com todos os dados. Está usando uma versão com
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/100000 [00:00<?, ? examples/s]

In [15]:
# Treinamento do modelo, será necessário passar a chave do wandb (Weights & Biases) para monitorar o treinamento
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 3 | Total steps = 9,375
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 2 x 1) = 32
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maraguacybp[0m ([33maraguacybp-fiap[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.7882
2,3.9683
3,3.902
4,3.9495
5,3.8417
6,3.7981
7,3.9386
8,3.9053
9,3.796
10,3.83


TrainOutput(global_step=9375, training_loss=1.8145994728342691, metrics={'train_runtime': 7248.7627, 'train_samples_per_second': 41.386, 'train_steps_per_second': 1.293, 'total_flos': 6.901003531211244e+17, 'train_loss': 1.8145994728342691, 'epoch': 3.0})

## Resultado do treinamento :

> TrainOutput(global_step=9375, training_loss=1.8145994728342691, metrics={'train_runtime': 7248.7627, 'train_samples_per_second': 41.386, 'train_steps_per_second': 1.293, 'total_flos': 6.901003531211244e+17, 'train_loss': 1.8145994728342691, 'epoch': 3.0})

* global_step=9375: Este é o número total de "passos de otimização" que o modelo executou. Cálculo: (100.000 exemplos / 32) * 3 épocas = 3.125 * 3 = 9.375 passos
* training_loss=1.8145: Este é o número mais importante para a qualidade do modelo. "Loss" (perda) é a medida de "erro" do modelo
* train_runtime: 7248.7627: O tempo total que o fine-tuning levou para rodar, em segundos. Ou seja, o processo todo demorou pouco mais de 2 horas na GPU A100.
* train_samples_per_second: 41.386:A velocidade do seu treinamento. A GPU conseguiu processar, em média, 41.3 exemplos do seu dataset por segundo.
* train_steps_per_second: 1.293:A velocidade de atualização. O modelo estava realizando 1.29 atualizações de peso (passos) por segundo.
* total_flos: 6.90...e+17: FLOS (Floating Point Operations) é o número total de cálculos matemáticos que a GPU realizou. É um número astronômico que apenas quantifica o esforço computacional total.

In [26]:
# salvar os arquivos localmente em uma pasta
model.save_pretrained(FINETUNED_MODEL)
tokenizer.save_pretrained(FINETUNED_MODEL)

# 1. Este comando envia os adaptadores LoRA (o "modelo") para o Hub
model.push_to_hub(FINETUNED_MODEL, token = HF_TOKEN)

# 2. Este comando envia os arquivos do tokenizer para o Hub
tokenizer.push_to_hub(FINETUNED_MODEL, token = HF_TOKEN)

README.md:   0%|          | 0.00/534 [00:00<?, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          | 23.3kB / 45.1MB            

Saved model to https://huggingface.co/Araguacy/Llama-3.2-1B-finetuning-fiap


README.md:   0%|          | 0.00/533 [00:00<?, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mpsq3keo6m/tokenizer.json: 100%|##########| 17.2MB / 17.2MB            

In [18]:
#Salvar o modelo quantizado no Hugging Face
model.push_to_hub_gguf(FINETUNED_MODEL, tokenizer, quantization_method = "q8_0", token = HF_TOKEN)

Unsloth: Converting model to GGUF format...
Unsloth: Merging model weights to 16-bit format...


config.json:   0%|          | 0.00/894 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 1/1 [00:07<00:00,  7.50s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [00:11<00:00, 11.22s/it]


Unsloth: Merge process complete. Saved to `/tmp/unsloth_gguf_ky5dy5ak`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF bf16 might take 3 minutes.
\        /    [2] Converting GGUF bf16 to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: Updating system package directories
Unsloth: All required system packages already installed!
Unsloth: Install llama.cpp and building - please wait 1 to 3 minutes
Unsloth: Cloning llama.cpp repository
Unsloth: Install GGUF and other packages
Unsloth: Successfully installed llama.cpp!
Unsloth: Preparing converter script...
Unsloth: [1] Converting model into bf16 GGUF format.
This might take 3 minutes...
Unsloth: Initial conversion completed! Files: ['llama-3.2-1b-instruct.BF16.gguf'

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...3.2-1b-instruct.Q8_0.gguf:   0%|          |  559kB / 1.32GB            

Uploading config.json...
Uploading Ollama Modelfile...
Unsloth: Successfully uploaded GGUF to https://huggingface.co/Araguacy/Llama-3.2-1B-finetuning-fiap
Unsloth: Cleaning up temporary files...


'Araguacy/Llama-3.2-1B-finetuning-fiap'

In [19]:
import gradio as gr

In [20]:
model_base, _ = FastLanguageModel.from_pretrained(
    model_name = FINETUNING_MODEL,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit= LOAD_IN_4BIT,
    token=HF_TOKEN
  )

==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [21]:
# Para nova inferência do modelo base
FastLanguageModel.for_inference(model_base)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), 

In [22]:
# Para inferência do modelo treinado
FastLanguageModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

In [23]:
# Função para comparação/teste do modelo base x modelo treinado

def models_comparison(input = ""):
    messages = [
        {
            "role": "system",
            "content": "You are a product identification model. Your job is to analyze product questions and generate product information as a response.",
        },
        {
            "role": "user",
            "content": f"Describe the product: {input}\n\nWhat do you know about the {input}?",
        },
    ]

    inference_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer([inference_prompt], return_tensors="pt").to("cuda")

    ## Configuração para inferência
    gen_kwargs = {
        "max_new_tokens": 128,
        "do_sample": True,
        "temperature": 1,
        "repetition_penalty": 0.9,
    }

    # Gerado a partir do modelo base
    base_output_tokens = model_base.generate(**inputs, **gen_kwargs)
    base_response = tokenizer.batch_decode(base_output_tokens[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]

    # Gerado a partir do modelo finetuned
    ft_output_tokens = model.generate(**inputs, **gen_kwargs)
    ft_response = tokenizer.batch_decode(ft_output_tokens[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]

    return base_response, ft_response

In [24]:
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 Comparação de Modelos: Llama 3.2 1B vs. Modelo com Fine-tuning")
    gr.Markdown("Digite um título de produto abaixo para ver como o Modelo Base e o Modelo com Fine-tuning respondem.")

    inp = gr.Textbox(label="Digite um Título (Input)", placeholder="Exemplo: O que você sabe sobre o produto?")
    btn = gr.Button("Gerar Respostas")

    with gr.Row():
        out_base = gr.Textbox(label="Resposta do Modelo Base", lines=10)
        out_ft = gr.Textbox(label="Resposta do Modelo com Fine-tuning", lines=10)

    btn.click(fn=models_comparison, inputs=inp, outputs=[out_base, out_ft])

# Cria um link para uma aplicação temporária para teste
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://587e32bf74ffefa3f1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


