In [None]:
from seqeval.metrics import (
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)
from seqeval.scheme import IOB2        # BIO/IOB2 tagging scheme


In [None]:
# Import required libraries

import torch
from transformers import RobertaTokenizer, RobertaForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from seqeval.metrics import classification_report as seq_classification_report
import numpy as np
from transformers import DataCollatorForTokenClassification


# Reproducibility
# Fix the random seed before anything stochastic happens, so that the
# train/test split and the randomly initialised classification head are the
# same on every "Restart Kernel -> Run All".
from transformers import set_seed

SEED = 42
set_seed(SEED)

# Prepare dataset
# `outputs` is not defined in this cell; it must already exist in the kernel.
# Each item is read as a mapping with a "tokens" entry and a "tags" entry.
data = outputs

dataset = Dataset.from_dict({
    'tokens': [item['tokens'] for item in data],
    'tags': [item['tags'] for item in data]
})

# Hold out 20% of the examples for evaluation. The split is seeded so that
# evaluation scores are comparable from one run to the next.
dataset_dict = dataset.train_test_split(test_size=0.2, seed=SEED)

# Model configuration
# The position of each name in this list is its integer class id (see the
# id2label / label2id mappings below).
# NOTE(review): this order must match the integer encoding used for `tags`
# in `outputs` -- confirm against the cell that builds `outputs`.
label_names = ["O", "B-Chemical", "B-Disease", "I-Disease", "I-Chemical"]
from transformers import RobertaTokenizerFast  
tokenizer = RobertaTokenizerFast.from_pretrained(
    "roberta-base",
    add_prefix_space=True,  # Required for word-based tokenization
    use_fast=True
)
model = RobertaForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_names),
    id2label={i: label for i, label in enumerate(label_names)},
    label2id={label: i for i, label in enumerate(label_names)}
)

# Tokenization function
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Args:
        examples: batch mapping with a "tokens" entry (one list of words per
            sentence) and a "tags" entry (one list of per-word tags per
            sentence, indexed by word position).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry.
        For each sentence, the first sub-token of every word carries that
        word's tag; special tokens (word id ``None``) and the remaining
        sub-tokens of a word are set to -100 so they can be ignored.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding=False,  # batches are padded later by the data collator
        return_offsets_mapping=True,
    )

    def _align(word_tags, word_ids):
        # Walk the sub-tokens once, remembering which word the previous one
        # belonged to so that only word-initial sub-tokens receive a tag.
        aligned = []
        last_word = None
        for current_word in word_ids:
            starts_new_word = current_word is not None and current_word != last_word
            aligned.append(word_tags[current_word] if starts_new_word else -100)
            last_word = current_word
        return aligned

    tokenized_inputs["labels"] = [
        _align(word_tags, tokenized_inputs.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["tags"])
    ]
    return tokenized_inputs


# Run tokenization + label alignment over every split, one batch at a time.
tokenized_data = dataset_dict.map(tokenize_and_align_labels, batched=True)


# Trainer configuration.
# Periodic evaluation during training is not enabled here (the
# `evaluation_strategy = "epoch"` option is left commented out), and no
# checkpoints are written while training (`save_strategy = "no"`).
training_args = TrainingArguments(
    # where the Trainer writes its outputs
    output_dir="./results",
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # logging / checkpointing
    logging_steps=50,
    report_to="none",
    save_strategy="no",
    # model selection: higher entity-level F1 is better
    metric_for_best_model="entity_f1",
    greater_is_better=True,
)



# Metric computation
# --------------------------------------------------------------
# entity‑level metrics  (BIO ➜ whole‑entity evaluation)
# --------------------------------------------------------------
from seqeval.metrics import (
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)
from seqeval.scheme import IOB2

# Entity types for which per-type precision / recall / F1 are reported.
ENTITY_TYPES = ["Chemical", "Disease"]
def compute_metrics(eval_pred):
    """Compute strict entity-level metrics for a token-classification run.

    Args:
        eval_pred: pair ``(logits, labels)``. ``logits`` is arg-maxed over its
            last axis to get one predicted class id per token; ``labels``
            holds the gold class ids, with -100 marking positions to skip
            (special tokens, non-initial sub-tokens, padding).

    Returns:
        dict with ``entity_precision`` / ``entity_recall`` / ``entity_f1``,
        ``<Type>_precision`` / ``_recall`` / ``_f1`` for every entry of
        ``ENTITY_TYPES`` present in the seqeval report, and the report's
        macro and weighted averages.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    # Turn class ids back into tag strings, dropping ignored positions.
    true_preds, true_labels = [], []
    for p_seq, l_seq in zip(preds, labels):
        kept = [
            (label_names[p], label_names[l])
            for p, l in zip(p_seq, l_seq)
            if l != -100
        ]
        true_preds.append([pred_tag for pred_tag, _ in kept])
        true_labels.append([gold_tag for _, gold_tag in kept])

    # seqeval only honours `scheme` when `mode="strict"` is passed as well;
    # without it the call falls back to the default (lenient) evaluation.
    # `zero_division=0` is used on every call so that types with no
    # predictions score 0 without emitting warnings.
    seqeval_kwargs = dict(mode="strict", scheme=IOB2, zero_division=0)

    ent_prec = precision_score(true_labels, true_preds, **seqeval_kwargs)
    ent_rec = recall_score(true_labels, true_preds, **seqeval_kwargs)
    ent_f1 = f1_score(true_labels, true_preds, **seqeval_kwargs)

    report = classification_report(
        true_labels,
        true_preds,
        output_dict=True,
        **seqeval_kwargs,
    )

    metrics = {
        "entity_precision": ent_prec,
        "entity_recall"   : ent_rec,
        "entity_f1"       : ent_f1,
    }

    # per-entity-type lines (Chemical, Disease, ...); a type is skipped when
    # the report has no row for it.
    for ent in ENTITY_TYPES:
        if ent in report:
            metrics[f"{ent}_precision"] = report[ent]["precision"]
            metrics[f"{ent}_recall"]    = report[ent]["recall"]
            metrics[f"{ent}_f1"]        = report[ent]["f1-score"]

    # macro / weighted averages
    metrics["macro_avg_precision"]    = report["macro avg"]["precision"]
    metrics["macro_avg_recall"]       = report["macro avg"]["recall"]
    metrics["macro_avg_f1"]           = report["macro avg"]["f1-score"]
    metrics["weighted_avg_precision"] = report["weighted avg"]["precision"]
    metrics["weighted_avg_recall"]    = report["weighted avg"]["recall"]
    metrics["weighted_avg_f1"]        = report["weighted avg"]["f1-score"]

    return metrics




# Batches are padded on the fly. Label padding uses -100, the same value the
# tokenization step assigns to positions that must be ignored.
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    pad_to_multiple_of=8,
    label_pad_token_id=-100,
)

# Wire model, data, collation and metrics together.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    compute_metrics=compute_metrics,
)

# Fine-tune.
trainer.train()

# Persist the fine-tuned weights together with the tokenizer files in one
# directory.
save_directory = "model_roberta"
trainer.save_model(save_directory)
tokenizer.save_pretrained(save_directory)

# Score the held-out split; `results` is consumed by the reporting cell.
results = trainer.evaluate()






In [None]:
# Headline entity-level scores from the evaluation run.
print(f"Entity-level (strict) F1: {results['eval_entity_f1']:.3f}")
print(f"Entity-level precision : {results['eval_entity_precision']:.3f}")
print(f"Entity-level recall    : {results['eval_entity_recall']:.3f}")

# Every remaining "eval_*_f1" entry (per-type and averaged scores), printed
# right-aligned under an upper-cased name with the "eval_" prefix removed.
for metric_name in results:
    is_f1_metric = metric_name.startswith("eval_") and metric_name.endswith("_f1")
    if not is_f1_metric or metric_name == "eval_entity_f1":
        continue
    display_name = metric_name.replace('eval_','').upper()
    print(f"{display_name:>20}: {results[metric_name]:.3f}")