-------------------

## ✅ **💻 Notebook para treinamento com LoRA no Colab Free usando Falcon-RW-1B**

### 📁 1. Montar o Google Drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 📦 2. Instalar dependências

In [4]:
!pip install -q transformers==4.33.2 peft==0.4.0 accelerate==0.21.0


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m65.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m906.3 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m101.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### 🧠 3. Carregar Falcon-RW-1B com quantização 4-bit

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType

model_name = "tiiuae/falcon-rw-1b"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto"  # Usa GPU automaticamente
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### 🧩 4. Configurar LoRA

In [4]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query_key_value"],  # específico do Falcon
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 1,572,864 || all params: 1,313,198,080 || trainable%: 0.1198


### 📚 5. Criar ou carregar dataset de exemplo

#### 🔸 Exemplo JSON salvo em `/content/drive/MyDrive/falcon/dataset.json`

```json
{"instruction": "Qual é a capital do Brasil?", "input": "", "output": "A capital do Brasil é Brasília."}
{"instruction": "Cite três pontos turísticos do Rio de Janeiro.", "input": "", "output": "Cristo Redentor, Pão de Açúcar e Praia de Copacabana."}
```

#### 🔸 Carregando o dataset

In [5]:
import pandas as pd
from datasets import Dataset

# 1. Caminho do CSV no seu Google Drive
caminho_csv = "/content/drive/MyDrive/falcon/medquad.csv"

# 2. Carregar o CSV
df = pd.read_csv(caminho_csv)

# 3. Verificar colunas disponíveis
print("Colunas no CSV:", *df.columns)
print(f'Linhas do CSV:{df.shape[0]}')

# 4. Manter apenas 'question' e 'answer', removendo vazios e duplicados
df = df[['question', 'answer']].dropna().drop_duplicates()

# 5. Renomear para o formato LoRA
df['instruction'] = df['question']
df['input'] = ""  # deixamos em branco por enquanto
df['output'] = df['answer']
df = df[['instruction', 'input', 'output']]

# 6. (Opcional) Cortar para 1000 exemplos
# df = df.sample(n=1000, random_state=42)

# 7. Mostrar 5 exemplos para verificar
print("\nExemplo de registros:")
display(df.sample(5))

# 8. Salvar no formato json no Google Drive
caminho_json = "/content/drive/MyDrive/falcon/saude_dataset.json"
df.to_json(caminho_json, orient="records", lines=True, force_ascii=False)
print(f"\n✅ Dataset salvo em: {caminho_json}")

# 9. Carregar para Dataset HuggingFace
dataset = Dataset.from_pandas(df)
print("\n✅ Dataset HuggingFace pronto para treino:", dataset)


Colunas no CSV: question answer source focus_area
Linhas do CSV:16412

Exemplo de registros:


Unnamed: 0,instruction,input,output
8602,What are the treatments for Heart Murmur ?,,A heart murmur isn't a disease. It's an extra ...
14570,What is (are) cap myopathy ?,,Cap myopathy is a disorder that primarily affe...
1008,what research (or clinical trials) is being do...,,New types of treatment are being tested in cli...
2180,What is (are) Miscarriage ?,,A miscarriage is the loss of pregnancy from na...
14482,What are the genetic changes related to myopat...,,Myopathy with deficiency of iron-sulfur cluste...



✅ Dataset salvo em: /content/drive/MyDrive/falcon/saude_dataset.json

✅ Dataset HuggingFace pronto para treino: Dataset({
    features: ['instruction', 'input', 'output', '__index_level_0__'],
    num_rows: 16359
})


In [6]:
from datasets import load_dataset
import pandas as pd
from datasets import Dataset

# Caminho no seu Google Drive
# caminho = "/content/drive/MyDrive/falcon/dataset.json"

# df = pd.read_json(caminho, lines=True)  # ⚠️ Use lines=True se for JSONL
# dataset = Dataset.from_pandas(df)


def format_prompt(example):
    prompt = f"Instrução: {example['instruction']}\nEntrada: {example['input']}\nResposta:"
    return tokenizer(prompt, padding="max_length", truncation=True, max_length=512)

def format_output(example):
    return tokenizer(example['output'], padding="max_length", truncation=True, max_length=512)

def preprocess(example):
    prompt = f"Instrução: {example['instruction']}\nEntrada: {example['input']}\nResposta:"
    input_ids = tokenizer(prompt, padding="max_length", truncation=True, max_length=512)["input_ids"]
    labels = tokenizer(example['output'], padding="max_length", truncation=True, max_length=512)["input_ids"]
    return {"input_ids": input_ids, "labels": labels}

tokenized_dataset = dataset.map(preprocess)


Map:   0%|          | 0/16359 [00:00<?, ? examples/s]

### 🏃 6. Treinar com `Trainer`

In [7]:
from transformers import TrainingArguments, Trainer
import os
os.environ["WANDB_DISABLED"] = "true"

output_dir = "/content/drive/MyDrive/falcon/modelo_lora"

training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    num_train_epochs=1.5,
    logging_steps=100,
    output_dir=output_dir,
    save_strategy="epoch",
    save_total_limit=1,
    fp16=True,  # FP16 com GPU T4 no Colab
    optim="adamw_torch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    # label_names=["labels"]  # 👈 Isso resolve o aviso
)


trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
100,5.5828
200,3.892
300,3.6781
400,3.506
500,3.5451
600,3.4198
700,3.3735
800,3.4345
900,3.3454
1000,3.3356


TrainOutput(global_step=1535, training_loss=3.5839822775377903, metrics={'train_runtime': 7542.1247, 'train_samples_per_second': 3.254, 'train_steps_per_second': 0.204, 'total_flos': 9.127224854288794e+16, 'train_loss': 3.5839822775377903, 'epoch': 1.5007641053854148})

### 💾 7. Salvar adaptadores LoRA

In [8]:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('/content/drive/MyDrive/falcon/modelo_lora/tokenizer_config.json',
 '/content/drive/MyDrive/falcon/modelo_lora/special_tokens_map.json',
 '/content/drive/MyDrive/falcon/modelo_lora/vocab.json',
 '/content/drive/MyDrive/falcon/modelo_lora/merges.txt',
 '/content/drive/MyDrive/falcon/modelo_lora/added_tokens.json',
 '/content/drive/MyDrive/falcon/modelo_lora/tokenizer.json')

In [None]:
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel, PeftConfig

# Criar diretório de offload
os.makedirs("/content/offload", exist_ok=True)

# Caminho do modelo LoRA salvo no Drive
model_path = "/content/drive/MyDrive/falcon/modelo_lora"

# Carregar config
config = PeftConfig.from_pretrained(model_path)

# Carregar modelo base com offload
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    device_map="auto",
    offload_folder="/content/offload"  # Aqui está a correção!
)

# Aplicar LoRA
model = PeftModel.from_pretrained(base_model, model_path)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Pipeline (sem device=)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Prompt de teste
prompt = "What are the most common symptoms of diabetes?"
output = generator(prompt, max_new_tokens=100, do_sample=True, temperature=0.7)

print(output[0]["generated_text"])
