Reference: https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb

In [2]:
import transformers

In [3]:
# Show the installed transformers version so the outputs below can be reproduced.
transformers.__version__

'4.25.1'

### Token Classification

The most common token classification tasks are:

a. Named-entity recognition (NER)

b. Part-of-speech tagging (POS)

c. Chunking: grammatically classify the tokens and group them into "chunks" that go together

In [4]:
# Which tag column to work with: should be one of "ner", "pos", or "chunk".
task = "ner"
# Identifier of the pretrained checkpoint; the tokenizer is loaded from it.
model_checkpoint = "distilbert-base-uncased"
# Batch size (not used in the cells shown here; presumably for training later).
batch_size = 16

##### Loading the dataset

We use the Datasets library to download the data and to get the metrics we need for evaluation.

If we want to use our own dataset, defined in a JSON or CSV file, the column names used in this notebook might need some adjustments.
ref: https://huggingface.co/docs/datasets/quickstart

In [9]:
from datasets import load_dataset, load_metric

# Download (or reuse from the local cache) the CoNLL-2003 corpus with all its splits.
datasets = load_dataset("conll2003")

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 to C:/Users/AIXI/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to C:/Users/AIXI/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [13]:
# Reference paper for the dataset (CoNLL-2003 shared task): https://aclanthology.org/W03-0419.pdf
# Display the available splits with their columns and row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [18]:
# Peek at the first training example: the words plus integer-encoded POS, chunk and NER tags.
datasets['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [20]:
# Feature description of the NER column: a sequence of ClassLabel ids.
# (A plain string is enough here; the original f-string had no placeholder.)
datasets['train'].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

So for the NER tags, 0 corresponds to 'O', 1 to 'B-PER', etc. On top of 'O' (which means no special entity), there are four entity types for NER here, each prefixed with 'B-' (for beginning) or 'I-' (for intermediate), indicating whether or not the token is the first one of the current group with that label:
- 'PER' for person
- 'ORG' for organization
- 'LOC' for location
- 'MISC' for miscellaneous

In [27]:
# Class names for the selected task; a label id is an index into this list.
label_list = datasets["train"].features[f"{task}_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [28]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display a random sample of distinct rows from ``dataset`` as an HTML table.

    Columns whose feature type is a ``ClassLabel``, or a ``Sequence`` of
    ``ClassLabel``, are converted from integer ids to their label names
    before the table is rendered.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping of column name to feature
            type (presumably a ``datasets.Dataset`` split, as at the call
            site in this notebook).
        num_examples: Number of distinct rows to show. Must not exceed
            ``len(dataset)``.

    Raises:
        AssertionError: If ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices in a single call, replacing the
    # manual draw-and-retry loop. max(..., 0) keeps the previous behaviour of
    # treating a negative count as "no rows" instead of raising.
    picks = random.sample(range(len(dataset)), max(num_examples, 0))

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Single label per row: map the id to its name.
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            # One label per token: map every id in the list to its name.
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [29]:
# Show 10 random training rows (the default num_examples) with tag ids decoded to names.
show_random_elements(datasets["train"])

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,3745,"[BELGRADE, 1996-08-24]","[VB, CD]","[B-VP, B-NP]","[B-LOC, O]"
1,10419,"[Guy, Forget, (, France, ), beat, Grant, Stafford, (, South, Africa, ), 3-6, 2-6, 6-4, 7-6, (, 7-2, ), 6-3]","[NNP, VB, (, NNP, ), VB, NNP, NNP, (, NNP, NNP, ), CD, CD, CD, CD, (, CD, ), CD]","[B-NP, B-VP, O, B-NP, O, B-VP, B-NP, I-NP, O, B-NP, I-NP, O, B-NP, I-NP, I-NP, B-ADVP, O, B-NP, O, B-NP]","[B-PER, I-PER, O, B-LOC, O, O, B-PER, I-PER, O, B-LOC, I-LOC, O, O, O, O, O, O, O, O, O]"
2,7432,"[Contributing, to, the, peso, 's, weakness, ,, another, dealer, said, banks, who, might, have, sold, greenbacks, in, Monday, 's, trading, appeared, to, shy, away, from, the, market, .]","[VBG, TO, DT, NN, POS, NN, ,, DT, NN, VBD, NNS, WP, MD, VB, VBN, VBZ, RB, NNP, POS, NN, VBD, TO, VB, RB, IN, DT, NN, .]","[B-VP, B-PP, B-NP, I-NP, B-NP, I-NP, O, B-NP, I-NP, B-VP, B-NP, B-NP, B-VP, I-VP, I-VP, B-VP, B-ADVP, B-NP, B-NP, I-NP, B-VP, I-VP, I-VP, B-ADVP, B-PP, B-NP, I-NP, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
3,4956,"[Barnsley, 3, Huddersfield, 1]","[NNP, CD, NNP, CD]","[B-NP, I-NP, I-NP, I-NP]","[B-ORG, O, B-ORG, O]"
4,5214,"[Uralmash, Yekaterinburg, 24, 3, 8, 13, 24, 43, 16]","[NNP, NNP, CD, CD, CD, CD, CD, CD, CD]","[B-NP, I-NP, I-NP, I-NP, I-NP, I-NP, I-NP, I-NP, I-NP]","[B-ORG, I-ORG, O, O, O, O, O, O, O]"
5,7824,"[REPRICING, OF, THE, BALANCE, OF, THE, BONDS, IN, THE, ACCOUNT, .]","[VBG, IN, DT, NN, IN, DT, NNS, IN, DT, NN, .]","[B-VP, B-PP, B-NP, I-NP, B-PP, B-NP, I-NP, B-PP, B-NP, I-NP, O]","[O, O, O, O, O, O, O, O, O, O, O]"
6,3557,"[Saeed, Anwar, c, Croft, b, Cork, 176]","[NNP, NNP, SYM, NNP, SYM, NN, CD]","[B-NP, I-NP, O, B-NP, O, B-NP, I-NP]","[B-PER, I-PER, O, B-PER, O, B-PER, O]"
7,1935,"[out, ,, M., Dowman, 107, ), v, Surrey, .]","[RP, ,, NNP, NNP, CD, ), FW, NNP, .]","[B-PRT, O, B-NP, I-NP, I-NP, O, B-NP, I-NP, O]","[O, O, B-PER, I-PER, O, O, O, B-ORG, O]"
8,11390,"[Dole, said, 70, percent, of, the, cocaine, that, entered, the, United, States, and, 40, percent, of, the, marijuana, came, from, Mexico, ., ""]","[NNP, VBD, CD, NN, IN, DT, NN, WDT, VBD, DT, NNP, NNP, CC, CD, NN, IN, DT, NN, VBD, IN, NNP, ., ""]","[B-NP, B-VP, B-NP, I-NP, B-PP, B-NP, I-NP, B-NP, B-VP, B-NP, I-NP, I-NP, O, B-NP, I-NP, B-PP, B-NP, I-NP, B-VP, B-PP, B-NP, I-NP, O]","[B-PER, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC, O, O, O, O, O, O, O, O, B-LOC, O, O]"
9,5595,"[11., Frank, Nobilo, (, New, Zealand, ), 209,412]","[CD, NNP, NNP, (, NNP, NNP, ), CD]","[B-NP, I-NP, I-NP, O, B-NP, I-NP, O, B-NP]","[O, B-PER, I-PER, O, B-LOC, I-LOC, O, O]"


### Preprocessing the data

In [30]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Quick smoke test on a raw sentence.
tokenizer("Hello this is youkesh nepal")

loading configuration file config.json from cache at C:\Users\AIXI/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\1c4513b2eedbda136f57676a34eea67aba266e5c\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\AIXI/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\1c4513b2eedbda136f57676a34eea67aba266e5c\vocab.txt
loading file tokenizer.json from cache at C:\Users\AIXI/.cache\huggingface\hub\models--distil

{'input_ids': [101, 7592, 2023, 2003, 2017, 9681, 2232, 8222, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [33]:
# The following assertion ensures that our tokenizer is a fast tokenizer (backed by Rust)
# from the Tokenizers library. Fast tokenizers are available for almost all models, and we
# will need some of their special features for our preprocessing.

import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)


In [34]:
# Same smoke test with a different spelling of the name; only the id of the changed word piece differs.
tokenizer("Hello this is yukesh nepal")

{'input_ids': [101, 7592, 2023, 2003, 9805, 9681, 2232, 8222, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

Note:

Transformer models are often pretrained with subword tokenizers, meaning that even if our input has already been split into words, each of those words could be split again by the tokenizer.

In [37]:
# The input is already a list of words, so is_split_into_words=True is passed
# to have the list encoded as one pre-split sentence.
tokenizer(["Hello", ",", "this", "is", "one", "sentence", "split", "into", "words", "."], is_split_into_words=True)

{'input_ids': [101, 7592, 1010, 2023, 2003, 2028, 6251, 3975, 2046, 2616, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [38]:
# Pick one training example (index 4) and show its pre-split words.
example = datasets["train"][4]
print(example["tokens"])

['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.']


In [43]:
# Encode the pre-split words, then map the ids back to word pieces
# to see where the tokenizer split a word further.
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', 'germany', "'", 's', 'representative', 'to', 'the', 'european', 'union', "'", 's', 'veterinary', 'committee', 'werner', 'z', '##wing', '##mann', 'said', 'on', 'wednesday', 'consumers', 'should', 'buy', 'sheep', '##me', '##at', 'from', 'countries', 'other', 'than', 'britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.', '[SEP]']


In [41]:
# Column names available on a single example.
example.keys()

dict_keys(['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'])

In [47]:
# The dataset provides exactly one NER label per word: print both lists,
# then their lengths, one value per line.
print(example["ner_tags"], example["tokens"], sep="\n")
print(len(example["ner_tags"]), len(example["tokens"]), sep="\n")

[5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0]
['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.']
31
31


In [48]:
# The encoding has more entries than the example has words (31), because of
# the added special tokens and the sub-word splits.
print(tokenized_input)

{'input_ids': [101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
