In [1]:
import torch

from transformers import pipeline
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoTokenizer, AutoModel
from transformers import DataCollatorForTokenClassification
from datasets import load_dataset
import evaluate

import numpy as np
import pandas as pd

### Dataset

In [None]:
# Load the CoNLL-2003 dataset; the result is a DatasetDict whose repr
# (displayed by the bare expression) lists the available splits.
raw_datasets = load_dataset("conll2003")
raw_datasets

In [7]:
# Word-level tokens of the first training example (the model's input side).
input_eg = raw_datasets['train'][0]['tokens']
input_eg

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [8]:
# Integer NER tag ids of the same example, one id per word-level token.
output_eg = raw_datasets['train'][0]['ner_tags']
output_eg

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [27]:
# Shorthand handle on the training split.
train = raw_datasets['train']

In [41]:
# Number of examples in the training split.
len(train)

14041

In [112]:
# Overview of every split: column names and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [119]:
# select() yields a Dataset restricted to the given row indices (here the first 10).
raw_datasets['test'].select(range(10))

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 10
})

### Tokenizer

In [64]:
names = raw_datasets['train'].features['ner_tags'].feature.names
def labels_to_labelnames(labels):
    """Translate integer label ids into their tag names.

    The ignore index -100 has no tag name and is mapped to ``None``.
    """
    labelnames = []
    for label_id in labels:
        labelnames.append(names[label_id] if label_id != -100 else None)
    return labelnames

In [19]:
def align_labels_with_subwords(wordids, labels):
    """Expand word-level NER tag ids to one label per sub-word token.

    Args:
        wordids: For every sub-word token, the index of the word it came from,
            or ``None`` for tokens that belong to no word (special tokens).
        labels: Word-level tag ids. Odd ids are ``B-XXX`` tags and ``id + 1``
            is the matching ``I-XXX`` tag (the layout of this dataset's label list).

    Returns:
        A list as long as ``wordids``: ``-100`` for tokens without a word, the
        word's own label for its first sub-word, and for every further sub-word
        of the same word the label with ``B-XXX`` turned into ``I-XXX``.
    """
    updated_labels = []
    previous_word = None
    for word in wordids:
        if word is None:
            label = -100
        else:
            label = labels[word]
            # Only a continuation of the *same word* may turn B-XXX into I-XXX.
            # Comparing against the previous label instead would also rewrite
            # the start of a new entity that directly follows another entity
            # of the same type (e.g. B-PER B-PER), silently merging the two.
            if word == previous_word and label % 2 == 1:
                label += 1
        updated_labels.append(label)
        previous_word = word

    return updated_labels

In [60]:
def tokenize_and_align_labels(ds_slice, tokenizer):
    """Tokenize a batch of pre-split sentences and attach sub-word level labels.

    Args:
        ds_slice: Batch with a 'tokens' column (list of word lists) and a
            'ner_tags' column (list of word-level tag id lists).
        tokenizer: Tokenizer called with ``is_split_into_words=True``; its
            result must expose ``tokens(i)`` and ``word_ids(i)`` per example.

    Returns:
        The tokenizer output extended with three per-example columns:
        'subtokens', 'word_ids' and the aligned 'labels'.
    """
    tokenized = tokenizer(ds_slice['tokens'], truncation=True, is_split_into_words=True)

    example_indices = range(len(ds_slice['tokens']))
    tokenized['subtokens'] = [tokenized.tokens(idx) for idx in example_indices]
    tokenized['word_ids'] = [tokenized.word_ids(idx) for idx in example_indices]
    tokenized['labels'] = [
        align_labels_with_subwords(tokenized.word_ids(idx), word_tags)
        for idx, word_tags in enumerate(ds_slice['ner_tags'])
    ]

    return tokenized

In [None]:
# Name of the pretrained checkpoint; the tokenizer is loaded from it here.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [136]:
# Reminder of the word-level example used in the tokenizer demo below.
input_eg

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [9]:
# Tokenize the already word-split example; is_split_into_words=True marks the
# input as a list of words rather than a raw string.
subwordtokens_eg = tokenizer(input_eg, is_split_into_words=True)
subwordtokens_eg

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
# Sub-word view of the encoding: 'lamb' is split into 'la' + '##mb', and
# [CLS]/[SEP] are added around the sentence.
subwordtokens_eg.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [137]:
# The same example as a single raw string.
input_sen = 'EU rejects German call to boycott British lamb.'

In [138]:
# Tokenizing the raw string gives the same input_ids as the pre-split word list.
tokenizer(input_sen)

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [139]:
# ...and therefore the same sub-word tokens.
tokenizer(input_sen).tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [None]:
# Tokenize every split in batches and attach aligned labels; the POS and chunk
# tag columns are dropped in the same pass.
mapped = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    fn_kwargs={'tokenizer': tokenizer},
    remove_columns=['pos_tags', 'chunk_tags'],
)

In [70]:
# Inspect one processed test example: original columns plus the tokenizer
# output, the helper columns (subtokens, word_ids) and the aligned labels.
mapped['test'][2]

{'id': '2',
 'tokens': ['AL-AIN', ',', 'United', 'Arab', 'Emirates', '1996-12-06'],
 'ner_tags': [5, 0, 5, 6, 6, 0],
 'input_ids': [101,
  18589,
  118,
  19016,
  2249,
  117,
  1244,
  4699,
  14832,
  1820,
  118,
  1367,
  118,
  5037,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'subtokens': ['[CLS]',
  'AL',
  '-',
  'AI',
  '##N',
  ',',
  'United',
  'Arab',
  'Emirates',
  '1996',
  '-',
  '12',
  '-',
  '06',
  '[SEP]'],
 'word_ids': [None, 0, 0, 0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, None],
 'labels': [-100, 5, 6, 6, 6, 0, 5, 6, 6, 0, 0, 0, 0, 0, -100]}

In [71]:
# Word-level tag names of that example, for comparison with the aligned labels.
labels_to_labelnames(mapped['test'][2]['ner_tags'])

['B-LOC', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O']

In [None]:
# Keep only the tokenizer outputs and the aligned labels. remove_columns drops
# the columns directly instead of running an identity map() over every row.
ds = mapped.remove_columns(['id', 'tokens', 'ner_tags', 'subtokens', 'word_ids'])

In [82]:
ds['train'][0]

{'input_ids': [101,
  7270,
  22961,
  1528,
  1840,
  1106,
  21423,
  1418,
  2495,
  12913,
  119,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]}

### Data collation and padding

In [72]:
# Collator that pads the inputs and the labels of a batch to a common length.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [81]:
# Collate the first two training examples into one padded batch.
batch = data_collator([ds["train"][i] for i in range(2)])


In [79]:
# In the shorter example the padded positions get input id 0,
# attention_mask 0 and label -100.
batch

{'input_ids': tensor([[  101,  7270, 22961,  1528,  1840,  1106, 21423,  1418,  2495, 12913,
            119,   102],
         [  101,  1943, 14428,   102,     0,     0,     0,     0,     0,     0,
              0,     0]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'labels': tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
         [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])}

### Metrics

In [None]:
# Load the "seqeval" metric; compute_metrics uses it to score predictions.
metric = evaluate.load("seqeval")

In [103]:
def compute_metrics(eval_preds):
    """Score a batch of token-classification predictions with the loaded metric.

    Args:
        eval_preds: Pair ``(logits, labels)``; the class axis of ``logits`` is
            the last one, and label positions equal to -100 are ignored.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Reference tag names, skipping the ignored index.
    true_labels = []
    for label_row in labels:
        true_labels.append([names[l] for l in label_row if l != -100])

    # Predicted tag names at exactly the positions kept above.
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        kept = [names[p] for (p, l) in zip(pred_row, label_row) if l != -100]
        true_predictions.append(kept)

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

### Model

In [86]:
# Two-way mapping between label ids and tag names for the model config.
id2label = dict(enumerate(names))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
# Load the pretrained checkpoint as a token-classification model; the label
# mappings are passed along and show up in model.config.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

In [104]:
# Check that the config carries the id2label / label2id mappings.
model.config

BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

### Trainer

In [115]:
# Fine-tuning configuration.
# - output_dir matches the directory the save log reports ("TrainOutput") and
#   the path the model is reloaded from later ("./TrainOutput").
# - Evaluation and checkpointing both run every 100 steps so that
#   load_best_model_at_end can pick among the saved checkpoints.
# - Batch sizes are left at the library default (the training log reports 8 per device).
args = TrainingArguments(
    output_dir="TrainOutput",
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    eval_steps=100,
    save_steps=100,
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=2,
    report_to="none",
)

PyTorch: setting up devices


In [120]:
# Build the Trainer and run fine-tuning.
# NOTE(review): training uses the "validation" split and evaluation uses the
# first 500 "test" examples, presumably to keep the run short; confirm this is
# intended. Because args sets load_best_model_at_end=True, the checkpoint kept
# at the end is chosen on those test examples, so the scores logged below are
# not a clean held-out estimate.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["validation"],
    eval_dataset=ds["test"].select(range(500)),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 3250
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 407
  Number of trainable parameters = 107726601


  0%|          | 0/407 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.3997305929660797, 'eval_precision': 0.7131079967023908, 'eval_recall': 0.8799593082400814, 'eval_f1': 0.7877959927140255, 'eval_accuracy': 0.9212278523868638, 'eval_runtime': 1386.2093, 'eval_samples_per_second': 0.361, 'eval_steps_per_second': 0.045, 'epoch': 0.25}


***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.2125166356563568, 'eval_precision': 0.8035381750465549, 'eval_recall': 0.8779247202441506, 'eval_f1': 0.8390860476421974, 'eval_accuracy': 0.9458300417560095, 'eval_runtime': 1373.8291, 'eval_samples_per_second': 0.364, 'eval_steps_per_second': 0.046, 'epoch': 0.49}


***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.19511854648590088, 'eval_precision': 0.8594802694898941, 'eval_recall': 0.9084435401831129, 'eval_f1': 0.8832838773491593, 'eval_accuracy': 0.9566640334048075, 'eval_runtime': 1370.8813, 'eval_samples_per_second': 0.365, 'eval_steps_per_second': 0.046, 'epoch': 0.74}


***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.20683902502059937, 'eval_precision': 0.8633301251203079, 'eval_recall': 0.9125127161749745, 'eval_f1': 0.887240356083086, 'eval_accuracy': 0.9531655569348833, 'eval_runtime': 1467.5869, 'eval_samples_per_second': 0.341, 'eval_steps_per_second': 0.043, 'epoch': 0.98}




Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 31259.915, 'train_samples_per_second': 0.104, 'train_steps_per_second': 0.013, 'train_loss': 0.10209329591043458, 'epoch': 1.0}


TrainOutput(global_step=407, training_loss=0.10209329591043458, metrics={'train_runtime': 31259.915, 'train_samples_per_second': 0.104, 'train_steps_per_second': 0.013, 'train_loss': 0.10209329591043458, 'epoch': 1.0})

In [126]:
# Write config, weights and tokenizer files to disk; with no argument the log
# below shows them landing in the trainer's output directory.
trainer.save_model()

Saving model checkpoint to TrainOutput
Configuration saved in TrainOutput\config.json
Model weights saved in TrainOutput\pytorch_model.bin
tokenizer config file saved in TrainOutput\tokenizer_config.json
Special tokens file saved in TrainOutput\special_tokens_map.json


### Using the model

In [122]:
# Wrap the in-memory fine-tuned model in a token-classification pipeline.
# With aggregation_strategy="simple" the output below splits "Hugging" into
# "Hu" and "##gging" with different entity groups.
token_classifier = pipeline(
    "token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple"
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

[{'entity_group': 'PER',
  'score': 0.98693,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'LOC',
  'score': 0.4050066,
  'word': 'Hu',
  'start': 33,
  'end': 35},
 {'entity_group': 'MISC',
  'score': 0.5244588,
  'word': '##gging',
  'start': 35,
  'end': 40},
 {'entity_group': 'ORG',
  'score': 0.3589424,
  'word': 'Face',
  'start': 41,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.83852637,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [123]:
# Extra probe sentences for the entity recognizer.
sentences = [
    'Arun works at Microsoft in Redmond',
    'Looking for a CRV near Renton',
    'Suv under 40K close to Oakland'
]

In [124]:
# Run the pipeline on each sentence and show sentence + entities side by side.
[f'{s} {token_classifier(s)}' for s in sentences]

["Arun works at Microsoft in Redmond [{'entity_group': 'PER', 'score': 0.6723282, 'word': 'Arun', 'start': 0, 'end': 4}, {'entity_group': 'ORG', 'score': 0.91245943, 'word': 'Microsoft', 'start': 14, 'end': 23}, {'entity_group': 'LOC', 'score': 0.9441663, 'word': 'Redmond', 'start': 27, 'end': 34}]",
 "Looking for a CRV near Renton [{'entity_group': 'MISC', 'score': 0.44998217, 'word': '##V', 'start': 16, 'end': 17}, {'entity_group': 'LOC', 'score': 0.9406806, 'word': 'Renton', 'start': 23, 'end': 29}]",
 "Suv under 40K close to Oakland [{'entity_group': 'LOC', 'score': 0.97419125, 'word': 'Oakland', 'start': 23, 'end': 30}]"]

### Load the model from a local directory

In [2]:
# Directory the tokenizer and model are reloaded from in the next cells.
model_path = "./TrainOutput"

In [3]:
# Reload the tokenizer from the local directory.
local_tokenizer = AutoTokenizer.from_pretrained(model_path)

In [4]:
# Reload the token-classification model from the local directory.
local_model = AutoModelForTokenClassification.from_pretrained(model_path)

In [6]:
# Pipeline built from the reloaded model and tokenizer, this time with
# aggregation_strategy="max"; the output below reports "Hugging Face" as a
# single ORG entity.
local_token_classifier = pipeline(
    "token-classification", model=local_model, tokenizer=local_tokenizer, aggregation_strategy="max"
)
local_token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

[{'entity_group': 'PER',
  'score': 0.99660456,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.7260044,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.96814585,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [145]:
# Same probe sentences through the reloaded pipeline.
# NOTE(review): relies on `sentences` defined in an earlier cell; on a fresh
# kernel that cell must be run first.
[f'{s} {local_token_classifier(s)}' for s in sentences]

["Arun works at Microsoft in Redmond [{'entity_group': 'PER', 'score': 0.9553094, 'word': 'Arun', 'start': 0, 'end': 4}, {'entity_group': 'ORG', 'score': 0.9504268, 'word': 'Microsoft', 'start': 14, 'end': 23}, {'entity_group': 'LOC', 'score': 0.97999233, 'word': 'Redmond', 'start': 27, 'end': 34}]",
 "Looking for a CRV near Renton [{'entity_group': 'MISC', 'score': 0.52804685, 'word': 'CRV', 'start': 14, 'end': 17}, {'entity_group': 'LOC', 'score': 0.98194075, 'word': 'Renton', 'start': 23, 'end': 29}]",
 "Suv under 40K close to Oakland [{'entity_group': 'LOC', 'score': 0.98331785, 'word': 'Oakland', 'start': 23, 'end': 30}]"]

In [140]:
# Encode the demo sentence with the reloaded tokenizer to inspect its sub-words.
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
subtokens = local_tokenizer(example)
subtokens

{'input_ids': [101, 1422, 1271, 1110, 156, 7777, 2497, 1394, 1105, 146, 1250, 1120, 20164, 10932, 10289, 1107, 6010, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [141]:
# "Sylvain" and "Hugging" are each split into several sub-word pieces.
subtokens.tokens()

['[CLS]',
 'My',
 'name',
 'is',
 'S',
 '##yl',
 '##va',
 '##in',
 'and',
 'I',
 'work',
 'at',
 'Hu',
 '##gging',
 'Face',
 'in',
 'Brooklyn',
 '.',
 '[SEP]']