In [1]:
from datasets import load_dataset

data = load_dataset("conll2003")
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [2]:
data['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [3]:
data['train'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [4]:
data['train'].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [5]:
data['train'].features["ner_tags"].feature

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None)

In [6]:
data['train'].features["ner_tags"].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [7]:
label_names = data['train'].features["ner_tags"].feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [8]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [9]:
t = tokenizer("This lamb is little")

In [10]:
t.tokens(), t.word_ids(), 

(['[CLS]', 'This', 'la', '##mb', 'is', 'little', '[SEP]'],
 [None, 0, 1, 1, 2, 3, None])

In [11]:
list(tokenizer.special_tokens_map.values())

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [12]:
def alignTargets(labels, word_ids):
    aligned_targets = []
    prev_id = None
    
    for w_id in word_ids:
        if w_id is None:
            aligned_targets.append(-100)
        elif w_id == prev_id and labels[w_id] in [1, 3, 5, 7]:
            aligned_targets.append(labels[w_id]+1)
        else:
            aligned_targets.append(labels[w_id])
        
        prev_id = w_id
        
    return aligned_targets

In [13]:
idx = 0

t = tokenizer(data['train'][idx]['tokens'], is_split_into_words=True)
t

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [14]:
data['train'][idx]['tokens']

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [15]:
t.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [16]:
t.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [17]:
data['train'][idx]['ner_tags']

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [18]:
aligned_targets = alignTargets(data['train'][idx]['ner_tags'], t.word_ids())
aligned_targets

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]

In [19]:
assert len(aligned_targets) == len(t.tokens())

In [20]:
def tokenize_fn(batch):
    tokenized_inputs = tokenizer(
        batch["tokens"], truncation=True, is_split_into_words=True
    )
    aligned_targets = []
    for i, tags in enumerate(batch["ner_tags"]):
        aligned_targets.append(alignTargets(tags, tokenized_inputs.word_ids(i)))

    tokenized_inputs["labels"] = aligned_targets
    return tokenized_inputs


tokenize_fn(data["train"][:5])


{'input_ids': [[101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], [101, 1943, 14428, 102], [101, 26660, 13329, 12649, 15928, 1820, 118, 4775, 118, 1659, 102], [101, 1109, 1735, 2827, 1163, 1113, 9170, 1122, 19786, 1114, 1528, 5566, 1106, 11060, 1106, 188, 17315, 1418, 2495, 12913, 1235, 6479, 4959, 2480, 6340, 13991, 3653, 1169, 1129, 12086, 1106, 8892, 119, 102], [101, 1860, 112, 188, 4702, 1106, 1103, 1735, 1913, 112, 188, 27431, 3914, 14651, 163, 7635, 4119, 1163, 1113, 9031, 11060, 1431, 4417, 8892, 3263, 2980, 1121, 2182, 1168, 1190, 2855, 1235, 1103, 3812, 5566, 1108, 27830, 119, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0,

In [21]:
tokenized_datasets = data.map(tokenize_fn, batched=True, remove_columns=data['train'].column_names)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [22]:
labels = data['train'][idx]['ner_tags']
word_ids = t.word_ids()
aligned_targets = alignTargets(labels, word_ids)
aligned_targets

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]

In [23]:
aligned_labels = [label_names[t] if t >= 0 else None for t in aligned_targets]
for tok, lab in zip(t.tokens(), aligned_labels):
    print(f"{tok}\t{lab}")

[CLS]	None
EU	B-ORG
rejects	O
German	B-MISC
call	O
to	O
boycott	O
British	B-MISC
la	O
##mb	O
.	O
[SEP]	None


In [24]:
# make up a fake input just to test it
words = [
  '[CLS]', 'Ger', '##man', 'call', 'to', 'boycott', 'Micro', '##soft', '[SEP]']
word_ids = [None, 0, 0, 1, 2, 3, 4, 4, None]
labels = [7, 0, 0, 0, 3]
aligned_targets = alignTargets(labels, word_ids)
aligned_labels = [label_names[t] if t >= 0 else None for t in aligned_targets]
for x, y in zip(words, aligned_labels):
  print(f"{x}\t{y}")

[CLS]	None
Ger	B-MISC
##man	I-MISC
call	O
to	O
boycott	O
Micro	B-ORG
##soft	I-ORG
[SEP]	None


In [25]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [26]:
tokenized_datasets['train'][:2]

{'input_ids': [[101,
   7270,
   22961,
   1528,
   1840,
   1106,
   21423,
   1418,
   2495,
   12913,
   119,
   102],
  [101, 1943, 14428, 102]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1]],
 'labels': [[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100], [-100, 1, 2, -100]]}

In [27]:
# DataColletor do not work with a dictionary of lists! We need a list of dictionaries

[tokenized_datasets['train'][i] for i in range(2)]

[{'input_ids': [101,
   7270,
   22961,
   1528,
   1840,
   1106,
   21423,
   1418,
   2495,
   12913,
   119,
   102],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]},
 {'input_ids': [101, 1943, 14428, 102],
  'attention_mask': [1, 1, 1, 1],
  'labels': [-100, 1, 2, -100]}]

In [28]:
batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
# Note that attendion_mask is zero for the values -100!
# Remember: values -100 are the way the transformers library work with special tokens like [PAD] and [SEP]
batch

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[  101,  7270, 22961,  1528,  1840,  1106, 21423,  1418,  2495, 12913,
           119,   102],
        [  101,  1943, 14428,   102,     0,     0,     0,     0,     0,     0,
             0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])}

In [29]:
batch['labels']

tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

In [30]:
!pip install seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [31]:
from datasets import load_metric

metric = load_metric("seqeval")
metric.compute(predictions=[["O", "O", "I-ORG", "B-MISC"]], references=[['O', 'B-ORG', 'I-ORG', 'B-MISC']])


  metric = load_metric("seqeval")


{'MISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'overall_precision': 0.5,
 'overall_recall': 0.5,
 'overall_f1': 0.5,
 'overall_accuracy': 0.75}

In [32]:
# test it out - no longer works, now it looks for actual NE tags
metric.compute(predictions=[[0], [0], [0]], references=[[0], [0], [1]])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


{'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.6666666666666666}

In [33]:
# test it out - again: now it looks for actual NE tags
metric.compute(predictions=[["A", "A", "A"]], references=[["A", "B", "A"]])



{'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.6666666666666666}

In [34]:
import numpy as np


seqeval_metric = load_metric("seqeval")


def compute_metrics(logits_and_labels):
    logits, labels = logits_and_labels
    predictions = np.argmax(logits, axis=-1)

    # remove the -100, convert the int label to str label name
    str_predictions = [
        [label_names[l] for l in label_arr if l != -100] for label_arr in labels
    ]
    # do the same for predictions, unless int label = -100
    str_labels = [
        [label_names[p] for l, p in zip(label_arr, pred_arr) if l != -100]
        for label_arr, pred_arr in zip(labels, predictions)
    ]
    metrics = seqeval_metric.compute(predictions=str_predictions, references=str_labels)

    return {
        "accuracy": metrics["overall_accuracy"],
        "f1": metrics["overall_f1"],
        "recall": metrics["overall_recall"],
        "precision": metrics["overall_precision"],
    }


In [35]:
id2label = {k: v for k, v in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [36]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    "my_trainer",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [38]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)


In [39]:
trainer.train()



  0%|          | 0/5268 [00:00<?, ?it/s]

{'loss': 0.2864, 'learning_rate': 1.810174639331815e-05, 'epoch': 0.28}
{'loss': 0.1305, 'learning_rate': 1.6203492786636296e-05, 'epoch': 0.57}
{'loss': 0.0955, 'learning_rate': 1.4305239179954442e-05, 'epoch': 0.85}


  0%|          | 0/407 [00:00<?, ?it/s]

{'eval_loss': 0.09405087679624557, 'eval_accuracy': 0.9762023900629894, 'eval_f1': 0.9050270045700042, 'eval_recall': 0.8938125718037092, 'eval_precision': 0.9165264220801077, 'eval_runtime': 4.1562, 'eval_samples_per_second': 781.96, 'eval_steps_per_second': 97.926, 'epoch': 1.0}
{'loss': 0.0726, 'learning_rate': 1.240698557327259e-05, 'epoch': 1.14}
{'loss': 0.0521, 'learning_rate': 1.0508731966590738e-05, 'epoch': 1.42}
{'loss': 0.0493, 'learning_rate': 8.610478359908885e-06, 'epoch': 1.71}
{'loss': 0.047, 'learning_rate': 6.712224753227031e-06, 'epoch': 1.99}


  0%|          | 0/407 [00:00<?, ?it/s]

{'eval_loss': 0.07171370089054108, 'eval_accuracy': 0.9811620651086125, 'eval_f1': 0.9178503197940028, 'eval_recall': 0.906183368869936, 'eval_precision': 0.929821608885897, 'eval_runtime': 4.4778, 'eval_samples_per_second': 725.799, 'eval_steps_per_second': 90.892, 'epoch': 2.0}
{'loss': 0.0263, 'learning_rate': 4.8139711465451785e-06, 'epoch': 2.28}
{'loss': 0.0306, 'learning_rate': 2.9157175398633257e-06, 'epoch': 2.56}
{'loss': 0.0311, 'learning_rate': 1.0174639331814731e-06, 'epoch': 2.85}


  0%|          | 0/407 [00:00<?, ?it/s]

{'eval_loss': 0.07092730700969696, 'eval_accuracy': 0.9828692529581444, 'eval_f1': 0.9249314157452823, 'eval_recall': 0.9139149006078529, 'eval_precision': 0.9362167620329855, 'eval_runtime': 4.6528, 'eval_samples_per_second': 698.498, 'eval_steps_per_second': 87.473, 'epoch': 3.0}
{'train_runtime': 239.1066, 'train_samples_per_second': 176.168, 'train_steps_per_second': 22.032, 'train_loss': 0.0791057058600829, 'epoch': 3.0}


TrainOutput(global_step=5268, training_loss=0.0791057058600829, metrics={'train_runtime': 239.1066, 'train_samples_per_second': 176.168, 'train_steps_per_second': 22.032, 'train_loss': 0.0791057058600829, 'epoch': 3.0})

In [40]:
trainer.save_model('my_saved_model')

In [41]:
from transformers import pipeline

ner = pipeline(
  "token-classification",
  model='my_saved_model',
  aggregation_strategy="simple",
  device=0,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [43]:
s = "Bill Gates was the CEO of Microsoft in Seattle, Washington DC."
ner(s)

[{'entity_group': 'PER',
  'score': 0.9993341,
  'word': 'Bill Gates',
  'start': 0,
  'end': 10},
 {'entity_group': 'ORG',
  'score': 0.9987509,
  'word': 'Microsoft',
  'start': 26,
  'end': 35},
 {'entity_group': 'LOC',
  'score': 0.9986998,
  'word': 'Seattle',
  'start': 39,
  'end': 46},
 {'entity_group': 'LOC',
  'score': 0.96306336,
  'word': 'Washington DC',
  'start': 48,
  'end': 61}]