# 🧠 Fine-Tuning de NER com Transformers em Português 🇧🇷

Este notebook mostra como treinar um modelo BERT para Reconhecimento de Entidades Nomeadas (NER) usando a biblioteca Hugging Face.


# 📁 2. Preparar dataset de exemplo em formato BIO


In [None]:
from datasets import Dataset, DatasetDict

# Label vocabulary: every id used in `ner_tags` below is an index into this list.
label_list = ["O", "B-PER", "B-LOC", "I-LOC", "B-ORG"]

# Didactic example: pre-tokenized sentences with one BIO tag id per token.
data = dict(
    tokens=[
        ["João", "mora", "em", "São", "Paulo", "."],
        ["Maria", "trabalha", "na", "Google", "."],
    ],
    ner_tags=[
        [1, 0, 0, 2, 3, 0],  # João (B-PER), São (B-LOC), Paulo (I-LOC)
        [1, 0, 0, 4, 0],     # Maria (B-PER), Google (B-ORG)
    ],
)

# The very same Dataset object backs both the "train" and "validation" splits.
dataset = Dataset.from_dict(data)
dataset = DatasetDict(dict.fromkeys(("train", "validation"), dataset))


# 🔤 3. Tokenização com alinhamento de rótulos

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier handed to `from_pretrained`, both for the tokenizer
# here and for the model later in the file.
model_checkpoint = "neuralmind/bert-base-portuguese-cased"
# Tokenizer loaded from that same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(example):
    """Tokenize one pre-split sentence and align its word-level tags to sub-tokens.

    Args:
        example: Mapping with a "tokens" list (the words of one sentence) and
            a parallel "ner_tags" list (one integer tag id per word).

    Returns:
        The tokenizer output for the sentence with an extra "labels" entry
        holding one value per produced token:
          * -100 for positions that map to no word (special tokens);
          * the word's tag id for the FIRST sub-token of each word;
          * -100 for the remaining sub-tokens of that word.

    Only the first sub-token carries the tag. Repeating the tag on every
    sub-token would duplicate "B-" tags inside a single word, which makes one
    entity look like several to both the loss and an entity-level metric.
    Positions labelled -100 are the ones the metric code in this file skips.
    """
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_tags = example["ner_tags"]

    labels = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids():
        if word_idx is None or word_idx == previous_word_idx:
            # Special token, or a continuation piece of the word seen just before.
            labels.append(-100)
        else:
            # First piece of a new word: it alone carries the word's tag.
            labels.append(word_tags[word_idx])
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Run the alignment function over the dataset one example at a time
# (batched=False), matching the single-sentence input the function expects.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=False)


# 🧠 4. Carregar modelo pré-treinado para Token Classification

In [None]:
from transformers import AutoModelForTokenClassification

# Load the checkpoint with a token-classification head sized to `label_list`.
# The id <-> label mappings are stored in the model config so that a saved
# model reports the BIO tag names instead of generic LABEL_<n> placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={label: idx for idx, label in enumerate(label_list)},
)


In [None]:
from transformers import AutoModelForTokenClassification

# NOTE(review): this cell repeats the model-loading statement from the previous
# cell; running it reloads the checkpoint and rebinds `model`. Consider
# removing one of the two cells.
#
# The id <-> label mappings are stored in the model config so that a saved
# model reports the BIO tag names instead of generic LABEL_<n> placeholders.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={label: idx for idx, label in enumerate(label_list)},
)


# ⚙️ 5. Configurar treinamento

In [None]:
from transformers import TrainingArguments

import inspect

# Newer transformers releases renamed the `evaluation_strategy` keyword to
# `eval_strategy`. Use whichever name the installed TrainingArguments accepts
# so this cell runs on both older and newer versions.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Evaluate, log and save a checkpoint once per epoch.
args = TrainingArguments(
    output_dir="./ner-model",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_strategy_kwarg: "epoch"},
)


# 📊 6. Métricas com seqeval

In [None]:
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def compute_metrics(p):
    """Turn raw predictions into seqeval scores, printing a per-class report.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over its last axis to obtain one label id per position;
            ``labels`` holds the reference ids, where -100 marks positions to
            skip. (Presumably the arrays the Trainer passes at evaluation
            time -- confirm against the caller.)

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1" computed by
        seqeval over the label-name sequences.
    """
    scores, gold_ids = p
    predicted_ids = scores.argmax(-1)

    # Reference tag names per sequence, with masked (-100) positions removed.
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([label_list[gold] for gold in gold_row if gold != -100])

    # Predicted tag names at exactly the positions kept above.
    true_predictions = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [
            label_list[predicted]
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append(kept)

    report = classification_report(true_labels, true_predictions)
    print(report)

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers}


# 🏋️ 7. Treinar o modelo

In [None]:
from transformers import Trainer, DataCollatorForTokenClassification

import inspect

# Collator that batches the tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Newer transformers releases accept the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Use whichever name the installed
# Trainer accepts so this cell runs on both older and newer versions.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# Start fine-tuning with the configuration above.
trainer.train()


# ✅ 8. Avaliar no conjunto de validação

In [None]:
# Evaluate on the `eval_dataset` given to the Trainer; as the last expression
# of the cell, the returned metrics are displayed as the cell output.
trainer.evaluate()


# 📌 Observações
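O exemplo usa apenas duas frases fictícias, e o conjunto de validação é idêntico ao de treino; por isso, as métricas exibidas não medem generalização. Para uso real, substitua o dataset por dados reais anotados em BIO, com divisões de treino e validação distintas.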

A biblioteca seqeval calcula precisão, recall e F1 em nível de entidade (e não de token), que é a forma padrão de avaliar tarefas de NER com múltiplos tipos de entidade.

Você pode exportar o modelo final com `trainer.save_model()`, que grava o modelo treinado no diretório de saída configurado (`output_dir`).