In [None]:
import torch

# Report whether PyTorch can see a CUDA device and, when it can, the name of
# device 0. The availability check is evaluated once and reused for both lines.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

if cuda_available:
    gpu_name = torch.cuda.get_device_name(0)
else:
    gpu_name = 'None'
print(f"GPU: {gpu_name}")

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and model paths used in
# the following cells all live under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
from datasets import load_from_disk
import random

# Requested size of the training subsample and the seed that makes the draw
# repeatable (same value as the LoRA/trainer seeds used in this notebook).
SAMPLE_SIZE = 100000
SAMPLE_SEED = 3407

dataset = load_from_disk("/content/drive/MyDrive/TC3/dataset/dataset_treino")
num_examples = len(dataset)

# random.sample raises ValueError when asked for more items than the population
# holds, so the request is capped at the dataset size. A dedicated seeded
# generator keeps the selection reproducible across kernel restarts without
# altering the global `random` state.
sample_size = min(SAMPLE_SIZE, num_examples)
random_indices = random.Random(SAMPLE_SEED).sample(range(num_examples), sample_size)
random_sample = dataset.select(random_indices)

In [None]:
import os

# Destination on the mounted Drive for the JSON export of the sampled dataset.
caminho_salvar_json = "/content/drive/MyDrive/TC3/dataset/random_sample3.json"

# Make sure the parent folder exists (no error if it already does), then export.
diretorio_salvar = os.path.split(caminho_salvar_json)[0]
os.makedirs(diretorio_salvar, exist_ok=True)
random_sample.to_json(caminho_salvar_json)

print(f"Dataset salvo em formato json em: {caminho_salvar_json}")

In [None]:
from unsloth import FastLanguageModel
import torch

# Checkpoint identifier handed to unsloth's loader.
model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"

# Sequence-length limit; the same value is passed to SFTTrainer further down.
max_seq_length = 2048
# No explicit dtype is requested.
# NOTE(review): presumably None lets unsloth choose a dtype for the GPU — confirm.
dtype = None

# Load the model together with its tokenizer, with 4-bit loading enabled.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)

In [None]:
# Attach a LoRA/PEFT configuration to the loaded model.
# NOTE: `model` is rebound from its own previous value, so this cell is not
# idempotent — re-running it without reloading the base model would pass an
# already-wrapped model to get_peft_model again.
model = FastLanguageModel.get_peft_model(
    model,
    # Adapter rank.
    r=32,
    # Names passed as the modules that receive adapters.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # alpha is set to twice the rank (64 vs. r=32).
    lora_alpha=64,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    # Same seed value as the trainer's `seed` argument.
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Build the supervised fine-tuning trainer over the sampled dataset, reading
# each example from its "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=random_sample,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=TrainingArguments(
        # 2 * 4 = 8.
        # NOTE(review): presumably 8 examples per optimizer step per device — confirm.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=10,
        num_train_epochs=1,
        learning_rate=2e-4,
        # Exactly one of fp16/bf16 is True: bf16 when torch reports support,
        # fp16 otherwise.
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=25,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        # Relative path, i.e. not under the /content/drive mount.
        # NOTE(review): checkpoints written here are presumably lost when the
        # Colab runtime is reset — confirm whether that is acceptable.
        output_dir="outputs",
        save_strategy="epoch",
        save_total_limit=2,
        dataloader_pin_memory=False,
        # Disable reporting to external experiment trackers.
        report_to="none",
    ),
)

In [None]:
# Run the fine-tuning and keep whatever train() returns for later inspection.
trainer_stats = trainer.train()

In [None]:
import os

# Target folder on Drive for the fine-tuned model and the tokenizer files.
caminho_salvar_modelo = "/content/drive/MyDrive/TC3/modelos/meu_modelo_finetuned_3"

# Mount Drive again before writing; `drive` comes from the earlier
# google.colab import cell, which must have been executed in this session.
drive.mount('/content/drive')
os.makedirs(caminho_salvar_modelo, exist_ok=True)

# Persist the model first, then the tokenizer, into the same directory.
model.save_pretrained(caminho_salvar_modelo)
tokenizer.save_pretrained(caminho_salvar_modelo)

print(f"Modelo e Tokenizer salvos com sucesso em: {caminho_salvar_modelo}")

In [None]:
# Preview rows 0-9 of the training sample as a pandas DataFrame.
random_sample.select(range(10)).to_pandas()

In [None]:
import json

# Read the exported sample back, parsing one JSON document per line.
# The encoding is stated explicitly so the result does not depend on the
# platform's default codec, and whitespace-only lines are skipped so a stray
# blank line cannot make json.loads fail.
with open('/content/drive/MyDrive/TC3/dataset/random_sample3.json', "r", encoding="utf-8") as f:
    sample_list = [json.loads(line) for line in f if line.strip()]

# First record: serves as the worked example in the prompt built later on.
sample_treino = sample_list[0]
sample_treino

In [None]:
# Second record of the reloaded sample; its title is used as the query later on.
# NOTE(review): sample_list is read from random_sample3.json, which this notebook
# writes from random_sample — the same data passed as train_dataset — so this
# "test" record was presumably seen during training. Consider drawing it from
# held-out data instead.
sample_teste = sample_list[1]
sample_teste

In [None]:
import re

def extract_title_and_content(text):
    """Split a formatted record into its title and its description.

    The record is expected to contain a line ``### Input: <title>`` immediately
    followed by ``### Output: <content>``; the content may span several lines.

    Args:
        text: The formatted record. Any value that is not a string is treated
            as "no match" instead of raising.

    Returns:
        A ``(title, content)`` tuple of stripped strings, or ``(None, None)``
        when the markers are not found.
    """
    if not isinstance(text, str):
        return None, None

    # The title stays on one line ("." does not cross newlines), while the
    # output group uses [\s\S] so a multi-line description is captured whole
    # instead of being cut at its first line break.
    match = re.search(r"### Input: (.*)\n### Output: ([\s\S]*)", text)
    if match is None:
        return None, None
    return match.group(1).strip(), match.group(2).strip()

In [None]:
# Pull the title/description pair out of the first record; both values are
# None when the "### Input:" / "### Output:" markers are not found.
titulo_treino, content_treino = extract_title_and_content(sample_treino['text'])

print(f"Título: {titulo_treino}")
print(f"Conteúdo: {content_treino}")

In [None]:
# Keep only the title (first element of the returned pair) of the second
# record; it becomes the final user message of the inference prompt.
titulo_teste = extract_title_and_content(sample_teste['text'])[0]
titulo_teste

In [None]:
# Prepare the model for generation through unsloth's for_inference helper.
FastLanguageModel.for_inference(model)

# One-shot prompt: a system instruction, one worked example (training title ->
# training description), then the test title as the open user turn.
# NOTE(review): training read the "text" column, whose records carry
# "### Input:" / "### Output:" markers (see the regex in
# extract_title_and_content), while this prompt goes through the tokenizer's
# chat template — presumably a train/inference format mismatch; confirm.
messages = [
    {"role": "system", "content": "\nYou are a helpful AI assistant that finds the correct product description.\n"},
    {"role": "user", "content": f"\n{titulo_treino}\n"},
    {"role": "assistant", "content": f"\n{content_treino}\n"},
    {"role": "user", "content": f"\n{titulo_teste}\n"},
]

# Render and tokenize the conversation, appending the generation prompt, and
# move the result to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

# Sample up to 500 new tokens with temperature 0.1 and top_p 0.9.
outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=500,
    temperature=0.1,
    do_sample=True,
    top_p=0.9,
)

# Decode the first (only) returned sequence.
# NOTE(review): no skip_special_tokens and no slicing of the prompt, so the
# printed text presumably contains the prompt and special tokens — confirm.
response = tokenizer.batch_decode(outputs)[0]
print(response)