# Token Classification

## Import Modules

In [61]:
import evaluate
import numpy as np
import transformers
import tensorflow as tf

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, create_optimizer, TFAutoModelForTokenClassification
from transformers.keras_callbacks import KerasMetricCallback


## Import Dataset

In [2]:
wnut = load_dataset("wnut_17")

Downloading builder script: 100%|██████████| 7.46k/7.46k [00:00<00:00, 3.76MB/s]
Downloading metadata: 100%|██████████| 4.28k/4.28k [00:00<00:00, 4.18MB/s]
Downloading readme: 100%|██████████| 9.05k/9.05k [00:00<00:00, 4.50MB/s]
Downloading data: 494kB [00:00, 1.74MB/s]0/3 [00:00<?, ?it/s]
Downloading data: 115kB [00:00, 7.55MB/s]                    .57s/it]
Downloading data: 192kB [00:00, 7.51MB/s]                    .03it/s]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.11it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 600.87it/s]
Generating train split: 100%|██████████| 3394/3394 [00:00<00:00, 9076.04 examples/s]
Generating validation split: 100%|██████████| 1009/1009 [00:00<00:00, 11074.47 examples/s]
Generating test split: 100%|██████████| 1287/1287 [00:00<00:00, 10382.38 examples/s]


In [11]:
wnut

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1287
    })
})

In [12]:
wnut["train"][0]

{'id': '0',
 'tokens': ['@paulwalk',
  'It',
  "'s",
  'the',
  'view',
  'from',
  'where',
  'I',
  "'m",
  'living',
  'for',
  'two',
  'weeks',
  '.',
  'Empire',
  'State',
  'Building',
  '=',
  'ESB',
  '.',
  'Pretty',
  'bad',
  'storm',
  'here',
  'last',
  'evening',
  '.'],
 'ner_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  8,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [4]:
label_list = wnut["train"].features[f"ner_tags"].feature.names
label_list

['O',
 'B-corporation',
 'I-corporation',
 'B-creative-work',
 'I-creative-work',
 'B-group',
 'I-group',
 'B-location',
 'I-location',
 'B-person',
 'I-person',
 'B-product',
 'I-product']

### Preprocessing the Data

In [9]:
# use DistilBERT
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [13]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [14]:
tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 3394/3394 [00:00<00:00, 7785.13 examples/s]
Map: 100%|██████████| 1009/1009 [00:00<00:00, 11318.95 examples/s]
Map: 100%|██████████| 1287/1287 [00:00<00:00, 9402.45 examples/s]


In [17]:
tokenized_wnut["train"][0]

{'id': '0',
 'tokens': ['@paulwalk',
  'It',
  "'s",
  'the',
  'view',
  'from',
  'where',
  'I',
  "'m",
  'living',
  'for',
  'two',
  'weeks',
  '.',
  'Empire',
  'State',
  'Building',
  '=',
  'ESB',
  '.',
  'Pretty',
  'bad',
  'storm',
  'here',
  'last',
  'evening',
  '.'],
 'ner_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  8,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'input_ids': [101,
  1030,
  2703,
  17122,
  2009,
  1005,
  1055,
  1996,
  3193,
  2013,
  2073,
  1045,
  1005,
  1049,
  2542,
  2005,
  2048,
  3134,
  1012,
  3400,
  2110,
  2311,
  1027,
  9686,
  2497,
  1012,
  3492,
  2919,
  4040,
  2182,
  2197,
  3944,
  1012,
  102],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [-100,
  0,
  -100,
  -100,
  0,
  0,
  -100,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [52]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [57]:
data_collator_tf = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

## Evaluate Metrics

In [24]:
seqeval = evaluate.load("seqeval")

In [26]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## Train the Model

In [49]:
id2label = {
    0: "O",
    1: "B-corporation",
    2: "I-corporation",
    3: "B-creative-work",
    4: "I-creative-work",
    5: "B-group",
    6: "I-group",
    7: "B-location",
    8: "I-location",
    9: "B-person",
    10: "I-person",
    11: "B-product",
    12: "I-product",
}
label2id = {
    "O": 0,
    "B-corporation": 1,
    "I-corporation": 2,
    "B-creative-work": 3,
    "I-creative-work": 4,
    "B-group": 5,
    "I-group": 6,
    "B-location": 7,
    "I-location": 8,
    "B-person": 9,
    "I-person": 10,
    "B-product": 11,
    "I-product": 12,
}

In [50]:
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
training_args = TrainingArguments(
    output_dir="wnut_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.291391,0.575309,0.215941,0.314016,0.937113
2,No log,0.275412,0.582593,0.303985,0.399513,0.941388


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=426, training_loss=0.2106298527247469, metrics={'train_runtime': 702.3336, 'train_samples_per_second': 9.665, 'train_steps_per_second': 0.607, 'total_flos': 91934268452820.0, 'train_loss': 0.2106298527247469, 'epoch': 2.0})

### Fine-Tuning

In [55]:
batch_size = 16
num_train_epochs = 3
num_train_steps = (len(tokenized_wnut["train"]) // batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

In [56]:
model = TFAutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able t

In [58]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_wnut["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator_tf,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_wnut["validation"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator_tf,
)

In [60]:
model.compile(optimizer=optimizer)

In [62]:
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [63]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=[metric_callback])

Epoch 1/3
Epoch 2/3


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3/3


<keras.src.callbacks.History at 0x1dc4b3a2650>

## Inference

In [77]:
text = "Toru, a quiet and preternaturally serious young college student in Tokyo, is devoted to Naoko, a beautiful and introspective young woman, but their mutual passion is marked by the tragic death of their best friend years before. Toru begins to adapt to campus life and the loneliness and isolation he faces there, but Naoko finds the pressures and responsibilities of life unbearable. As she retreats further into her own world, Toru finds himself reaching out to others and drawn to a fiercely independent and sexually liberated young woman. A magnificent blending of the music, the mood, and the ethos that was the sixties with the story of one college student's romantic coming of age, Norwegian Wood brilliantly recaptures a young man's first, hopeless, and heroic love."

In [78]:
inputs = tokenizer(text, return_tensors="tf")
logits = model(**inputs).logits

In [79]:
predicted_token_class_ids = tf.math.argmax(logits, axis=-1)
predicted_token_class = [model.config.id2label[t] for t in predicted_token_class_ids[0].numpy().tolist()]

tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].numpy())
token_class_pairs = list(zip(tokens, predicted_token_class))

token_class_pairs

[('[CLS]', 'O'),
 ('tor', 'B-person'),
 ('##u', 'O'),
 (',', 'O'),
 ('a', 'O'),
 ('quiet', 'O'),
 ('and', 'O'),
 ('pre', 'O'),
 ('##tern', 'O'),
 ('##at', 'O'),
 ('##ural', 'O'),
 ('##ly', 'O'),
 ('serious', 'O'),
 ('young', 'O'),
 ('college', 'O'),
 ('student', 'O'),
 ('in', 'O'),
 ('tokyo', 'B-location'),
 (',', 'O'),
 ('is', 'O'),
 ('devoted', 'O'),
 ('to', 'O'),
 ('na', 'B-person'),
 ('##oko', 'B-person'),
 (',', 'O'),
 ('a', 'O'),
 ('beautiful', 'O'),
 ('and', 'O'),
 ('intro', 'O'),
 ('##sp', 'O'),
 ('##ect', 'O'),
 ('##ive', 'O'),
 ('young', 'O'),
 ('woman', 'O'),
 (',', 'O'),
 ('but', 'O'),
 ('their', 'O'),
 ('mutual', 'O'),
 ('passion', 'O'),
 ('is', 'O'),
 ('marked', 'O'),
 ('by', 'O'),
 ('the', 'O'),
 ('tragic', 'O'),
 ('death', 'O'),
 ('of', 'O'),
 ('their', 'O'),
 ('best', 'O'),
 ('friend', 'O'),
 ('years', 'O'),
 ('before', 'O'),
 ('.', 'O'),
 ('tor', 'B-person'),
 ('##u', 'O'),
 ('begins', 'O'),
 ('to', 'O'),
 ('adapt', 'O'),
 ('to', 'O'),
 ('campus', 'O'),
 ('life', 'O')