# 1. Load dataset

In [31]:
from datasets import load_dataset

# Evaluation only needs the test split of CoNLL-2003, so nothing else is loaded.
raw_dataset = load_dataset(path="conll2003", split="test")
raw_dataset

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 3453
})

# 1.1 Get labels

In [32]:
# Feature description of the "ner_tags" column of the loaded split.
ner_feature = raw_dataset.features["ner_tags"]
# Tag strings exposed by the column's inner feature; the list is later indexed
# by integer tag id (presumably a Sequence of ClassLabel — confirm against the
# dataset card).
label_names = ner_feature.feature.names
# Bare expression so the notebook renders the list as the cell output.
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

# 2. Load fine-tuned models from the Hugging Face Hub

In [33]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Two-way lookup between integer class ids and NER tag strings.
id2label = dict(enumerate(label_names))
label2id = {tag: idx for idx, tag in id2label.items()}

# Checkpoints to compare; every one is fetched by its Hub identifier.
model_names = [
    "GustawB/albert-finetuned-ner",
    "GustawB/distilbert-finetuned-ner",
    "GustawB/bert-finetuned-ner",
]

# Both dicts are keyed by checkpoint name.
model_tokenizers = {}
models = {}
for name in model_names:
    # The Auto* classes choose the concrete tokenizer/model class per checkpoint.
    model_tokenizers[name] = AutoTokenizer.from_pretrained(name)
    models[name] = AutoModelForTokenClassification.from_pretrained(
        name, id2label=id2label, label2id=label2id
    )

# 3. Tokenize and align labels

In [34]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: Integer label ids, one per word.
        word_ids: For every token, the index of the word it belongs to, or
            ``None`` for tokens that belong to no word (special tokens).

    Returns:
        A list with one entry per element of ``word_ids``:
        ``-100`` for ``None`` entries, the word's label for the first token of
        a word, and for the following tokens of the same word the word's label
        with odd ids bumped by one (B-XXX becomes I-XXX).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Token outside any word: mark it with the ignore index and forget
            # the previous word so the next real token starts a new word.
            aligned.append(-100)
            previous_word = None
            continue

        word_label = labels[word_id]
        if word_id != previous_word:
            # First token of a word keeps the word's label unchanged.
            previous_word = word_id
        elif word_label % 2 == 1:
            # Continuation token: odd ids are the B- tags, the matching I- tag
            # is the next id.
            word_label += 1
        aligned.append(word_label)

    return aligned

In [35]:
def tokenize_and_align_labels(examples, tokenizer):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch mapping with a ``"tokens"`` entry (word lists) and a
            ``"ner_tags"`` entry (word-level label ids, one list per sentence).
        tokenizer: Callable applied to the word lists; its result must provide
            ``word_ids(i)`` for the i-th sentence and accept item assignment.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding the
        aligned label list for every sentence.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # One aligned label list per sentence, built from that sentence's word ids.
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

In [36]:
# Tokenize the test split once per model: every checkpoint was loaded with its
# own tokenizer, so the token sequences (and aligned labels) differ per model.
tokenized_datasets = {}
for name in model_names:
    tokenized_datasets[name] = raw_dataset.map(
        tokenize_and_align_labels,
        batched=True,
        # Drop the raw columns so only the tokenizer output and labels remain.
        remove_columns=raw_dataset.column_names,
        # Pass the tokenizer explicitly instead of through a lambda that closes
        # over the loop variable: a closure resolves `name` when it is called,
        # not when it is created, and hides the dependency from `map`.
        fn_kwargs={"tokenizer": model_tokenizers[name]},
    )
tokenized_datasets

{'GustawB/albert-finetuned-ner': Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
     num_rows: 3453
 }),
 'GustawB/distilbert-finetuned-ner': Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
     num_rows: 3453
 }),
 'GustawB/bert-finetuned-ner': Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
     num_rows: 3453
 })}

# 4. Collate the datasets

In [37]:
from transformers import DataCollatorForTokenClassification

# One collator per checkpoint, each built around that checkpoint's tokenizer.
collators = {
    name: DataCollatorForTokenClassification(tokenizer=model_tokenizers[name])
    for name in model_names
}

# 5. Define metrics

In [38]:
import numpy as np
import evaluate

metric = evaluate.load("seqeval")


def compute_metrics(eval_preds):
    """Score token-classification predictions with the seqeval metric.

    Args:
        eval_preds: Pair ``(logits, labels)``. The predicted class of each
            position is the argmax of ``logits`` over the last axis; ``labels``
            holds the reference ids, with ``-100`` marking positions to skip.

    Returns:
        Dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``, taken from the metric's ``overall_*`` results.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Reference tags per sequence, skipping every position labelled -100.
    true_labels = []
    for label_row in labels:
        kept_tags = []
        for label_id in label_row:
            if label_id != -100:
                kept_tags.append(label_names[label_id])
        true_labels.append(kept_tags)

    # Predicted tags at exactly the positions that were kept above.
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        kept_tags = []
        for pred_id, label_id in zip(pred_row, label_row):
            if label_id != -100:
                kept_tags.append(label_names[pred_id])
        true_predictions.append(kept_tags)

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": scores["overall_precision"],
        "recall": scores["overall_recall"],
        "f1": scores["overall_f1"],
        "accuracy": scores["overall_accuracy"],
    }

# 6. Evaluate models

In [45]:
from transformers import Trainer, TrainingArguments
from pprint import pprint

from accelerate import Accelerator, ProfileKwargs
from transformers import Trainer, TrainingArguments
import torch

# Profiler settings handed to Accelerate.
profile_kwargs = ProfileKwargs(
    activities=["cpu"],  # CPU activity only; add "cuda" to also profile GPU kernels
    record_shapes=True,
)

# The accelerator and the evaluation arguments do not depend on the model, so
# they are created once instead of once per loop iteration.
accelerator = Accelerator(cpu=True, kwargs_handlers=[profile_kwargs])
args = TrainingArguments(
    per_device_eval_batch_size=8,
    output_dir="./results",
    logging_dir="./logs",
)

results = {}        # model name -> metrics dict returned by trainer.evaluate
resource_logs = {}  # model name -> profiler summary table (str)
for name in model_names:
    model = accelerator.prepare(models[name])

    trainer = Trainer(
        model=model,
        data_collator=collators[name],
        args=args,
        compute_metrics=compute_metrics,
        # NOTE(review): recent transformers releases deprecate `tokenizer=` in
        # favour of `processing_class=` — switch once the installed version is
        # confirmed to support it.
        tokenizer=model_tokenizers[name],
    )

    # Only the evaluation itself runs under the profiler.
    with accelerator.profile() as prof:
        results[name] = trainer.evaluate(tokenized_datasets[name])

    # Read the profiler after the context has exited, i.e. once profiling has
    # stopped, and build the summary table a single time.
    resource_logs[name] = prof.key_averages().table(
        sort_by="cpu_time_total", row_limit=10
    )
    print(f"Profile for {name}")
    print(resource_logs[name])

# The per-model profiler tables were printed above and remain available in
# `resource_logs`; pretty-printing them again would only show escaped strings.
pprint(results)

  trainer = Trainer(






KeyboardInterrupt

