In [1]:
!pip install transformers datasets evaluate peft trl bitsandbytes

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting datasets
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting transformers
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.7.0 (from evaluate)
  Downloading huggingface_hub-0.35.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting nvidi

In [2]:
import os
import torch
import gc
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, pipeline, logging
from peft import LoraConfig
from trl import SFTTrainer

In [3]:
# Identifiers for the base checkpoint, the training dataset, and the output adapter directory
base_model = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
guanaco_dataset = "mlabonne/guanaco-llama2-1k"
new_model = "llama-1.1B-chat-guanaco"

# Free unreferenced Python objects and cached CUDA memory before loading anything
gc.collect()
torch.cuda.empty_cache()

# Report whether a GPU is available and, if so, its name and total memory
print(f"GPU disponível: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU atual: {torch.cuda.get_device_name(0)}")
    print(f"Memória total da GPU: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Load the complete training split (no subsampling)
dataset = load_dataset(guanaco_dataset, split="train")
print(f"Tamanho do dataset: {len(dataset)} exemplos")

# Load the base model in half precision.
# `dtype` replaces `torch_dtype`, which transformers reports as deprecated.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    dtype=torch.float16,
)
# use_cache is turned off for training; the post-training test cell turns it back on
model.config.use_cache = False
model.config.pretraining_tp = 1

# Optional: enable gradient checkpointing on the model
model.gradient_checkpointing_enable()

# Load the tokenizer. `trust_remote_code` is intentionally not set: it would allow
# repository-supplied Python to run, and this checkpoint only needs the standard
# tokenizer files.
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
tokenizer.padding_side = 'right'

GPU disponível: True
GPU atual: Tesla T4
Memória total da GPU: 15.83 GB


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-9ad84bb9cf65a4(…):   0%|          | 0.00/967k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tamanho do dataset: 1000 exemplos


config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [4]:
# Baseline: sample from the base model before any fine-tuning
logging.set_verbosity(logging.CRITICAL)  # only CRITICAL transformers log messages are shown
prompt = "Who is Napoleon Bonaparte?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)
result = pipe(f"<s>[INST] {prompt} [/INST]")
# One print call; the newline separator yields the same three output lines
print(
    "\nResultado antes do fine-tuning:",
    result[0]["generated_text"],
    "-" * 50,
    sep="\n",
)



Resultado antes do fine-tuning:
<s>[INST] Who is Napoleon Bonaparte? [/INST]
"I know why," said the king, "it is because I am a king, and I have a right to be."
-- Napoleon Bonaparte
--------------------------------------------------


In [5]:
# LoRA adapter settings (values chosen by the author for a T4 GPU)
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    r=64,  # author's note: a T4 can handle rank 64
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Training hyperparameters (values chosen by the author for a T4 GPU)
training_params = TrainingArguments(
    # --- output / reporting ---
    output_dir="./results",
    report_to="none",  # no external experiment tracker
    logging_steps=5,
    save_steps=25,
    # --- batching / schedule ---
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,  # effective batch size = 4 * 8 = 32
    group_by_length=True,
    # --- optimizer ---
    optim="adamw_torch",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # --- precision / memory ---
    fp16=True,
    bf16=False,  # author's note: the T4 does not support bf16
    gradient_checkpointing=True,
)

# Free unreferenced Python objects and cached CUDA memory before training starts
gc.collect()
torch.cuda.empty_cache()


In [6]:
# Build the supervised fine-tuning trainer (call signature written for TRL 0.23.0)
trainer = SFTTrainer(
    model=model,
    args=training_params,
    train_dataset=dataset,
    processing_class=tokenizer,
    peft_config=peft_params,
    formatting_func=lambda example: example["text"],  # each sample's training text is its "text" field
)

# Run training
print("\nIniciando treinamento...")
trainer.train()

# Persist the trained model and the tokenizer into the `new_model` directory
print("\nSalvando modelo...")
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)


Applying formatting function to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]


Iniciando treinamento...
{'loss': 1.7054, 'grad_norm': 0.1138823926448822, 'learning_rate': 0.00019948693233918952, 'entropy': 1.9302391350269317, 'num_tokens': 80839.0, 'mean_token_accuracy': 0.6186646908521652, 'epoch': 0.16}
{'loss': 1.9745, 'grad_norm': 0.10355348885059357, 'learning_rate': 0.00019377521321470805, 'entropy': 2.1054679960012437, 'num_tokens': 154902.0, 'mean_token_accuracy': 0.5739769265055656, 'epoch': 0.32}
{'loss': 1.8424, 'grad_norm': 0.09743526577949524, 'learning_rate': 0.00018207634412072764, 'entropy': 1.934621462225914, 'num_tokens': 216824.0, 'mean_token_accuracy': 0.5948654390871525, 'epoch': 0.48}
{'loss': 1.6433, 'grad_norm': 0.18306250870227814, 'learning_rate': 0.00016513724827222227, 'entropy': 1.6953807204961777, 'num_tokens': 284986.0, 'mean_token_accuracy': 0.6247365221381187, 'epoch': 0.64}
{'loss': 1.5755, 'grad_norm': 0.12711524963378906, 'learning_rate': 0.00014403941515576344, 'entropy': 1.6393293172121048, 'num_tokens': 362337.0, 'mean_toke

('llama-1.1B-chat-guanaco/tokenizer_config.json',
 'llama-1.1B-chat-guanaco/special_tokens_map.json',
 'llama-1.1B-chat-guanaco/tokenizer.model',
 'llama-1.1B-chat-guanaco/added_tokens.json',
 'llama-1.1B-chat-guanaco/tokenizer.json')

In [8]:
# Smoke test of the model after fine-tuning

print("\nTestando modelo após fine-tuning:")

# Switch the model from its training setup to inference:
# gradient checkpointing off, use_cache back on, evaluation mode
model.gradient_checkpointing_disable()
model.config.use_cache = True
model.eval()

# Free unreferenced Python objects and cached CUDA memory
gc.collect()
torch.cuda.empty_cache()

# Sampling parameters forwarded to the text-generation pipeline
generation_config = dict(
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
)

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, **generation_config)

# Same prompt as the pre-training baseline, for a before/after comparison
prompt = "Who is Napoleon Bonaparte?"
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]["generated_text"])

# A second, different prompt
prompt2 = "Explain quantum computing in simple terms."
result2 = pipe(f"<s>[INST] {prompt2} [/INST]")
print("\nSegundo exemplo:", result2[0]["generated_text"], sep="\n")


Testando modelo após fine-tuning:
<s>[INST] Who is Napoleon Bonaparte? [/INST] "Napoleon" was the French version of his name. He was a famous general and statesman who led France to victory in several wars, including the Napoleonic Wars (1792–1815). In 1804 he declared himself emperor of the French Empire.

He died in exile on Saint Helena, an island off the coast of West Africa, where he remained until his death in 15 May 1621.

Segundo exemplo:
<s>[INST] Explain quantum computing in simple terms. [/INST] Quantum computers are machines that use quantum mechanics to perform calculations faster than conventional computers and allow for the creation of complex algorithms, such as those used by AI systems like Google's DeepMind.

Quantum computing uses qubits, or "qubit states," which can be in a superposition state (i.e., they can have both an up and down spin) rather than being either on or off. This allows for more efficient processing, since it enables the computer to take advantage 

In [9]:
# List the saved output directory to confirm the adapter and tokenizer files were written
!ls -la llama-1.1B-chat-guanaco

total 74480
drwxr-xr-x 2 root root     4096 Sep 20 22:13 .
drwxr-xr-x 1 root root     4096 Sep 20 22:13 ..
-rw-r--r-- 1 root root      884 Sep 20 22:13 adapter_config.json
-rw-r--r-- 1 root root 72113224 Sep 20 22:13 adapter_model.safetensors
-rw-r--r-- 1 root root     5268 Sep 20 22:13 README.md
-rw-r--r-- 1 root root      437 Sep 20 22:13 special_tokens_map.json
-rw-r--r-- 1 root root      978 Sep 20 22:13 tokenizer_config.json
-rw-r--r-- 1 root root  3619016 Sep 20 22:13 tokenizer.json
-rw-r--r-- 1 root root   499723 Sep 20 22:13 tokenizer.model


In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig

# Directory holding the saved LoRA adapter and tokenizer files
model_path = "llama-1.1B-chat-guanaco"  # or a full path

# Read the adapter's PEFT config; it supplies the base checkpoint name used below
config = PeftConfig.from_pretrained(model_path)

# Load the base model in half precision.
# `dtype` replaces `torch_dtype`, which transformers reports as deprecated.
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    dtype=torch.float16,
    device_map="auto"
)

# Attach the saved LoRA adapter to the base model
model = PeftModel.from_pretrained(base_model, model_path)

# Load the tokenizer saved alongside the adapter and reuse EOS as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

In [15]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
import torch

# Directory holding the saved LoRA adapter and tokenizer files
model_path = "llama-1.1B-chat-guanaco"

# Read the adapter's PEFT config; it supplies the base checkpoint name used below
config = PeftConfig.from_pretrained(model_path)

# Load the base model in half precision.
# `dtype` replaces `torch_dtype`, which transformers reports as deprecated.
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    dtype=torch.float16,
    device_map="auto"
)

# Attach the saved LoRA adapter to the base model
model = PeftModel.from_pretrained(base_model, model_path)

# Load the tokenizer saved alongside the adapter and reuse EOS as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

# Prepare for inference: evaluation mode with use_cache enabled
model.eval()
model.config.use_cache = True

# Text-generation helper
def generate_response(prompt, max_length=200):
    """Generate a sampled reply to ``prompt`` with the notebook's ``model`` and ``tokenizer``.

    The prompt is wrapped as ``<s>[INST] {prompt} [/INST]`` before tokenization.

    Args:
        prompt: Instruction text to send to the model.
        max_length: Upper bound on the number of newly generated tokens
            (forwarded to ``generate`` as ``max_new_tokens``).

    Returns:
        The generated continuation, decoded without special tokens and stripped
        of surrounding whitespace. The prompt itself is not included.
    """
    inputs = tokenizer(f"<s>[INST] {prompt} [/INST]", return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.2
    )
    # Decode only the tokens produced after the prompt. Splitting the full decoded
    # text on "[/INST]" would discard the beginning of the answer whenever the model
    # itself emits another "[/INST]" (or the user's prompt contains one).
    generated_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# Usage example: send one question through generate_response and print the reply
prompt = 'What are the main challenges of artificial intelligence?'
response = generate_response(prompt)
print(response)

There is no single answer to this question. Different people and organizations have different views on what constitutes a challenge for AI, and how we can address them effectively. Some view AI as an opportunity that brings about positive change in our society, while others see it as a threat that must be addressed through regulation or other measures.
