<a href="https://colab.research.google.com/github/arthurziegler/transformers-for-NLP/blob/main/notebooks/NER_HuggingFace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [67]:
!pip install transformers datasets accelerate nvidia-ml-py3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [68]:
from datasets import load_dataset

In [69]:
data = load_dataset("conll2003")



  0%|          | 0/3 [00:00<?, ?it/s]

In [70]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [71]:
# We can see that each item in the dataset is a sentence but its token and targets columns are lists divided by the words of the sentence
data['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [72]:
data['train'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [73]:
data['train'].features['ner_tags']

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [74]:
data['train'].features['ner_tags'].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [75]:
# We save the feature names to use later
label_names = data['train'].features['ner_tags'].feature.names

In [76]:
from transformers import AutoTokenizer

In [77]:
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.25.1",
  "vocab_size": 28996
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde99

In [78]:
idx = 0
t = tokenizer(data['train'][idx]['tokens'], is_split_into_words=True)
t

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [79]:
#The tokenizer return looks like a dictionary but it is actually an object called BatchEncoding
type(t)

transformers.tokenization_utils_base.BatchEncoding

In [80]:
#The object has a tokens method that returns the original tokens before transforming them into integers
t.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [81]:
t.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [82]:
#Define relationship between B and I tags
#['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
begin2inside = {
    1:2,
    3:4,
    5:6,
    7:8
}

In [83]:
def align_targets(labels, word_ids):
    aligned_labels = []
    previous_word = None

    for word in word_ids:
        if word is None:
            # Tokens like [CLS] and [SEP]
            label = -100 #This value is used by Hugging Face to ignore the tokens during training
        elif word != previous_word:
            # New word in the list
            label = labels[word]
        else:
            #Repeated word (Would be the next sub-word)
            if labels[word] in begin2inside:
                #Change B-<tag> to I-<tag>
                label = begin2inside[labels[word]]
            else:
                # Sub-word of a word classified as "O" gets the same label "O"
                label = labels[word]

        aligned_labels.append(label)
        previous_word = word #update last word
 
    return aligned_labels

In [84]:
## Label-Token Alignment Test
idx = 21
test_data = tokenizer(data['train'][idx]['tokens'], is_split_into_words=True)
print("Tokenized Data:", test_data)
print("Word Tokens:", data['train'][idx]['tokens'])
test_labels = data['train'][idx]['ner_tags']
print("Word Labels:", test_labels)
print("Word IDs:", test_data.word_ids())
aligned_targets = align_targets(test_labels, test_data.word_ids())
print("Sub-Word Labels:", aligned_targets)
print("Sub-Word Tokens:", test_data.tokens())

Tokenized Data: {'input_ids': [101, 149, 11414, 2137, 11414, 1820, 118, 4775, 118, 1659, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Word Tokens: ['LONDON', '1996-08-22']
Word Labels: [5, 0]
Word IDs: [None, 0, 0, 0, 0, 1, 1, 1, 1, 1, None]
Sub-Word Labels: [-100, 5, 6, 6, 6, 0, 0, 0, 0, 0, -100]
Sub-Word Tokens: ['[CLS]', 'L', '##ON', '##D', '##ON', '1996', '-', '08', '-', '22', '[SEP]']


In [85]:
aligned_labels = [label_names[t] if t>=0 else None for t in aligned_targets]
for x, y in zip(test_data.tokens(), aligned_labels):
    print(f"{x}\t{y}")

[CLS]	None
L	B-LOC
##ON	I-LOC
##D	I-LOC
##ON	I-LOC
1996	O
-	O
08	O
-	O
22	O
[SEP]	None


In [86]:
# Function to tokenize both inputs and targets
def tokenize_fn(batch):
    # Tokenize the input sequence first
    tokenized_inputs = tokenizer(batch['tokens'], truncation=True, is_split_into_words=True)
    labels_batch = batch['ner_tags'] # The original targets word-by-word
    aligned_labels_batch = [] # The aligned targets sub-word by sub-word
    # Loop through each label sequence in the batch
    for i, labels in enumerate(labels_batch):
        word_ids = tokenized_inputs.word_ids(i) # Get word IDs for the sequence
        aligned_labels_batch.append(align_targets(labels, word_ids)) # Align sequence labels
    
    # Save final aligned labels in a column called 'labels' which is the required name for the hugging face models
    tokenized_inputs['labels'] = aligned_labels_batch
    
    return tokenized_inputs

In [87]:
data["train"].column_names

['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']

In [88]:
tokenized_datasets = data.map(
	tokenize_fn,
	batched=True,
	remove_columns=data["train"].column_names
)



In [89]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [90]:
from transformers import DataCollatorForTokenClassification

In [91]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
data_collator

DataCollatorForTokenClassification(tokenizer=PreTrainedTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')

In [92]:
# Test the data collator
# First we need to define a list of samples from our dataset
collator_testset = [tokenized_datasets["train"][i] for i in range(2)]

In [93]:
collator_testset

[{'input_ids': [101,
   7270,
   22961,
   1528,
   1840,
   1106,
   21423,
   1418,
   2495,
   12913,
   119,
   102],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]},
 {'input_ids': [101, 1943, 14428, 102],
  'attention_mask': [1, 1, 1, 1],
  'labels': [-100, 1, 2, -100]}]

In [94]:
batch = data_collator(collator_testset)
batch["labels"]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

In [95]:
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [96]:
from datasets import load_metric

In [97]:
metric = load_metric("seqeval")

In [99]:
# Try to use the compute metrics for a sequence
# It will give an error as the method work only with sequences of sequences
metric.compute(predictions=[0, 0, 0], references=[0, 0, 1])

TypeError: ignored

In [100]:
# Test the compute metrics again as a list of lists
# There will be warnings that the labels passed are integers instead of strings
metric.compute(
    predictions=[[0, 0, 0], [1, 0, 1]], 
    references=[[0, 0, 1], [1, 0, 1]])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


{'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.8333333333333334}

In [101]:
# Test the compute metrics again as a list of lists of a character
# There will be warnings that the labels passed are not NE tags
metric.compute(
    predictions=[['A', 'A', 'A'], ['A', 'B', 'C']], 
    references=[['A', 'C', 'A'], ['B', 'B', 'C']])



{'_': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.6666666666666666}

In [102]:
# Test the compute metrics again as a list of lists of valid NE IOB tags
metric.compute(
    predictions=[['O', 'O', 'I-ORG'], ['B-MISC', 'O', 'B-PER']], 
    references=[['O', 'B-LOC', 'B-ORG'], ['B-MISC', 'I-MISC', 'B-PER']])

  _warn_prf(average, modifier, msg_start, len(result))


{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'MISC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'PER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 0.6666666666666666,
 'overall_recall': 0.5,
 'overall_f1': 0.5714285714285715,
 'overall_accuracy': 0.5}

In [103]:
import numpy as np

In [104]:
def compute_metrics(logits_and_labels):
    logits, labels = logits_and_labels
    preds = np.argmax(logits, axis=-1)

    # remove -100, convert the label ids to label names
    str_labels = [[label_names[t] for t in label if t != -100] for label in labels]

    # do the same for predictions whenever true label is -100
    str_preds = [[label_names[p] for p, t in zip(pred, targ) if t != -100] for pred, targ in zip(preds, labels)]

    the_metrics = metric.compute(predictions=str_preds, references=str_labels)
    return {
        'precision': the_metrics['overall_precision'],
        'recall': the_metrics['overall_recall'],
        'f1': the_metrics['overall_f1'],
        'accuracy': the_metrics['overall_accuracy']
        }

In [105]:
id2label = {k: v for k, v in enumerate(label_names)} #Get label IDs
label2id = {v: k for k, v in id2label.items()} #Get label names from IDs
id2label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [106]:
from transformers import AutoModelForTokenClassification

In [107]:
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_versio

In [108]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    "distilbert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [109]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [110]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [111]:
print(trainer.args.device)
print_gpu_utilization()
!nvidia-smi

cuda:0
GPU memory occupied: 2686 MB.
Sun Jan  1 20:40:25 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P0    28W /  70W |   2686MiB / 15109MiB |     11%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+--------------------------------------------------

In [112]:
result = trainer.train()
print_summary(result)

***** Running training *****
  Num examples = 14041
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5268
  Number of trainable parameters = 65197833


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1035,0.084191,0.895136,0.913665,0.904306,0.975717
2,0.045,0.077397,0.907968,0.928139,0.917943,0.981103
3,0.0255,0.069776,0.916352,0.938405,0.927247,0.982884


***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
Saving model checkpoint to distilbert-finetuned-ner/checkpoint-1756
Configuration saved in distilbert-finetuned-ner/checkpoint-1756/config.json
Model weights saved in distilbert-finetuned-ner/checkpoint-1756/pytorch_model.bin
tokenizer config file saved in distilbert-finetuned-ner/checkpoint-1756/tokenizer_config.json
Special tokens file saved in distilbert-finetuned-ner/checkpoint-1756/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8
Saving model checkpoint to distilbert-finetuned-ner/checkpoint-3512
Configuration saved in distilbert-finetuned-ner/checkpoint-3512/config.json
Model weights saved in distilbert-finetuned-ner/checkpoint-3512/pytorch_model.bin
tokenizer config file saved in distilbert-finetuned-ner/checkpoint-3512/tokenizer_config.json
Special tokens file saved in distilbert-finetuned-ner/checkpoint-3512/special_tokens_map.json
***** Running Evaluation *****
 

Time: 327.99
Samples/second: 128.43
GPU memory occupied: 2846 MB.


In [113]:
print_gpu_utilization()

GPU memory occupied: 2846 MB.


In [114]:
type(trainer.train_dataset)

datasets.arrow_dataset.Dataset

In [115]:
trainer.save_model('my_saved_model')

Saving model checkpoint to my_saved_model
Configuration saved in my_saved_model/config.json
Model weights saved in my_saved_model/pytorch_model.bin
tokenizer config file saved in my_saved_model/tokenizer_config.json
Special tokens file saved in my_saved_model/special_tokens_map.json


In [127]:
from transformers import pipeline

ner = pipeline(
    "token-classification",
    model='my_saved_model',
    aggregation_strategy="none",
    ignore_labels=[""],
    device=0
)

loading configuration file my_saved_model/config.json
Model config DistilBertConfig {
  "_name_or_path": "my_saved_model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "vocab_size": 28996

In [128]:
s = "O excelentíssimo presidente Luis Silva Bueno visitou ontem a Universidade Federal de Manaus do estado do Amazonas, Brasil"
results = ner(s)

In [129]:
results

[{'entity': 'O',
  'score': 0.90171427,
  'index': 1,
  'word': 'O',
  'start': 0,
  'end': 1},
 {'entity': 'O',
  'score': 0.99105835,
  'index': 2,
  'word': 'ex',
  'start': 2,
  'end': 4},
 {'entity': 'O',
  'score': 0.9698714,
  'index': 3,
  'word': '##cel',
  'start': 4,
  'end': 7},
 {'entity': 'O',
  'score': 0.87647444,
  'index': 4,
  'word': '##ent',
  'start': 7,
  'end': 10},
 {'entity': 'O',
  'score': 0.55733407,
  'index': 5,
  'word': '##ís',
  'start': 10,
  'end': 12},
 {'entity': 'O',
  'score': 0.6009466,
  'index': 6,
  'word': '##si',
  'start': 12,
  'end': 14},
 {'entity': 'O',
  'score': 0.5632206,
  'index': 7,
  'word': '##mo',
  'start': 14,
  'end': 16},
 {'entity': 'O',
  'score': 0.9990558,
  'index': 8,
  'word': 'president',
  'start': 17,
  'end': 26},
 {'entity': 'O',
  'score': 0.99827135,
  'index': 9,
  'word': '##e',
  'start': 26,
  'end': 27},
 {'entity': 'B-PER',
  'score': 0.9983644,
  'index': 10,
  'word': 'Luis',
  'start': 28,
  'end': 3

In [130]:
for i in range(len(results)):
    print(results[i])
    #print(f"the {i}-th result={results[i]['token_str']} has score {results[i]['score']}")

{'entity': 'O', 'score': 0.90171427, 'index': 1, 'word': 'O', 'start': 0, 'end': 1}
{'entity': 'O', 'score': 0.99105835, 'index': 2, 'word': 'ex', 'start': 2, 'end': 4}
{'entity': 'O', 'score': 0.9698714, 'index': 3, 'word': '##cel', 'start': 4, 'end': 7}
{'entity': 'O', 'score': 0.87647444, 'index': 4, 'word': '##ent', 'start': 7, 'end': 10}
{'entity': 'O', 'score': 0.55733407, 'index': 5, 'word': '##ís', 'start': 10, 'end': 12}
{'entity': 'O', 'score': 0.6009466, 'index': 6, 'word': '##si', 'start': 12, 'end': 14}
{'entity': 'O', 'score': 0.5632206, 'index': 7, 'word': '##mo', 'start': 14, 'end': 16}
{'entity': 'O', 'score': 0.9990558, 'index': 8, 'word': 'president', 'start': 17, 'end': 26}
{'entity': 'O', 'score': 0.99827135, 'index': 9, 'word': '##e', 'start': 26, 'end': 27}
{'entity': 'B-PER', 'score': 0.9983644, 'index': 10, 'word': 'Luis', 'start': 28, 'end': 32}
{'entity': 'I-PER', 'score': 0.9961175, 'index': 11, 'word': 'Silva', 'start': 33, 'end': 38}
{'entity': 'I-PER', 's

In [119]:
## Test
test_data = tokenizer(s)
print("Tokenized Data:", test_data)
print("Word Tokens:", data['train'][idx]['tokens'])
print("Word IDs:", test_data.word_ids())
print("Sub-Word Tokens:", test_data.tokens())

Tokenized Data: {'input_ids': [101, 152, 4252, 18389, 3452, 16928, 5053, 3702, 2084, 1162, 6132, 11211, 139, 23404, 1186, 3143, 6094, 1113, 18408, 170, 17572, 1162, 3467, 1260, 2268, 25134, 1202, 12890, 9359, 1202, 9786, 2225, 117, 23381, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Word Tokens: ['LONDON', '1996-08-22']
Word IDs: [None, 0, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4, 5, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 11, 12, 12, 13, 14, 14, 15, 16, 16, 17, 18, None]
Sub-Word Tokens: ['[CLS]', 'O', 'ex', '##cel', '##ent', '##ís', '##si', '##mo', 'president', '##e', 'Luis', 'Silva', 'B', '##uen', '##o', 'visit', '##ou', 'on', '##tem', 'a', 'Universidad', '##e', 'Federal', 'de', 'Man', '##aus', 'do', 'est', '##ado', 'do', 'Amazon', '##as', ',', 'Brasil', '[SEP]']


In [120]:
!cat my_saved_model/config.json

{
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "vocab_size": 28996
}
