Reference: https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb

In [1]:
import transformers

In [2]:
# Show the installed transformers version so the run can be reproduced.
transformers.__version__

'4.25.1'

### Token Classification

The most common token classification tasks are : 

a. Named-entity recognition(NER)

b. Part-of-Speech tagging(POS)

c. Chunking: grammatically classify the tokens and group them into "chunks" that go together

In [3]:
# Which tag column of the dataset to train on; should be one of "ner", "pos", or "chunk".
task = "ner"

# Name of the pretrained checkpoint; the tokenizer is loaded from it further down.
model_checkpoint = "distilbert-base-uncased"

# Batch size -- presumably consumed by the training/evaluation setup later in the notebook.
batch_size = 16

##### Loading the dataset

We use the Datasets library to download the data and to get the metric we need for evaluation.

If we want to use our own dataset defined from a JSON or csv file it might need some adjustments in the names of the columns used.
ref: https://huggingface.co/docs/datasets/quickstart

In [4]:
from datasets import load_dataset, load_metric

# Fetch the CoNLL-2003 dataset (served from the local Hugging Face cache when
# it has been downloaded before).
datasets = load_dataset("conll2003")

Found cached dataset conll2003 (C:/Users/AIXI/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Paper describing the CoNLL-2003 shared task: https://aclanthology.org/W03-0419.pdf
# Displaying the DatasetDict lists each split with its columns and row count.
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [6]:
# Look at the first training example: the word tokens plus the integer-encoded
# POS, chunk and NER tags (one tag per token).
datasets['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [7]:
# Inspect the NER label feature: a Sequence of ClassLabel whose `names` list
# gives the meaning of every integer tag.
# The key is a plain string literal; it has no placeholders, so no f-string is needed.
datasets['train'].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

So for the NER tags, 0 corresponds to 'O', 1 to 'B-PER' etc... On top of the 'O' (which means no special entity), there are four labels for NER here, each prefixed with 'B-' (for beginning) or 'I-' (for intermediate), that indicate if the token is the first one for the current group with the label or not:
- 'PER' for person
- 'ORG' for organization
- 'LOC' for location
- 'MISC' for miscellaneous

In [8]:
# Names of the labels for the selected task, ordered so that the list index
# equals the integer tag stored in the dataset.
label_list = (
    datasets["train"]
    .features[f"{task}_tags"]
    .feature
    .names
)
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [15]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display a random sample of rows from ``dataset`` as an HTML table.

    Columns whose feature is a ``ClassLabel`` (or a ``Sequence`` of
    ``ClassLabel``) are decoded from integer ids to their label names so the
    table is human-readable.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping of column name -> feature type
            (a ``datasets.Dataset`` in this notebook).
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
        ValueError: If ``num_examples`` is negative (raised by ``random.sample``).
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    # random.sample draws without replacement, so the indices are unique by
    # construction -- no retry loop or membership test on a growing list needed.
    picks = random.sample(range(len(dataset)), num_examples)

    # Indexing the dataset with a list of indices yields the selected rows,
    # which pandas turns into one DataFrame column per feature.
    df = pd.DataFrame(dataset[picks])

    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Scalar label column: map each integer id to its label name.
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            # Sequence-of-labels column: decode every id inside each list.
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [14]:
# Display one randomly chosen training example with its tags decoded to label names.
show_random_elements(datasets["train"], num_examples=1)

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,12641,"[SOCCER, -, DUTCH, FIRST, DIVISION, RESULTS, /, STANDINGS, .]","[NN, :, VB, NNP, NNP, NNS, SYM, NNS, .]","[B-NP, O, B-VP, B-NP, I-NP, I-NP, O, B-NP, O]","[O, O, B-MISC, O, O, O, O, O, O]"


### Preprocessing the data

In [16]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Quick check on a raw sentence: the result holds input_ids and an attention_mask.
tokenizer("Hello this is youkesh nepal")

{'input_ids': [101, 7592, 2023, 2003, 2017, 9681, 2232, 8222, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [17]:
# The assertion below checks that the tokenizer is a "fast" tokenizer (backed by Rust)
# from the Tokenizers library. Fast tokenizers are available for almost all models,
# and the preprocessing in this notebook relies on special features they provide.

import transformers  # already imported at the top of the notebook; harmless repeat
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [44]:
# Tokenize the sentence once and inspect that single encoding three ways,
# instead of re-running the tokenizer for every print.
demo_encoding = tokenizer("Hello this is yukesh nepal")
print(demo_encoding)             # input_ids and attention_mask
print(demo_encoding.word_ids())  # source-word index per token; None for special tokens
print(demo_encoding.tokens())    # the (sub-)word token strings

{'input_ids': [101, 7592, 2023, 2003, 9805, 9681, 2232, 8222, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
[None, 0, 1, 2, 3, 3, 3, 4, None]
['[CLS]', 'hello', 'this', 'is', 'yu', '##kes', '##h', 'nepal', '[SEP]']


Note:

Transformers are often pretrained with subword tokenizers, meaning that even if our input has already been split into words, each of those words can be split again by the tokenizer.

In [21]:
# The input is already a list of words, so pass is_split_into_words=True
# instead of giving the tokenizer a raw sentence string.
tokenizer(["Hello", ",", "this", "is", "one", "sentence", "split", "into", "words", "."], is_split_into_words=True)

{'input_ids': [101, 7592, 1010, 2023, 2003, 2028, 6251, 3975, 2046, 2616, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [22]:
# Pick one training example (index 4) to walk through tokenization and label alignment.
example = datasets["train"][4]
print(example["tokens"])

['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.']


In [23]:
# Tokenize the example's pre-split words.
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
print(tokenized_input)

# Turn the ids back into token strings to see how words were broken into sub-words
# and where the special tokens were added.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

{'input_ids': [101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'germany', "'", 's', 'representative', 'to', 'the', 'european', 'union', "'", 's', 'veterinary', 'committee', 'werner', 'z', '##wing', '##mann', 'said', 'on', 'wednesday', 'consumers', 'should', 'buy', 'sheep', '##me', '##at', 'from', 'countries', 'other', 'than', 'britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.', '[SEP]']


In [46]:
# Convert a single id: the first entry of input_ids is the [CLS] special token.
print(tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'][0]))

[CLS]


In [24]:
# Columns available on a single example.
example.keys()

dict_keys(['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'])

In [36]:
# Full content of the example: the tokens and the three integer tag sequences.
example

{'id': '4',
 'tokens': ['Germany',
  "'s",
  'representative',
  'to',
  'the',
  'European',
  'Union',
  "'s",
  'veterinary',
  'committee',
  'Werner',
  'Zwingmann',
  'said',
  'on',
  'Wednesday',
  'consumers',
  'should',
  'buy',
  'sheepmeat',
  'from',
  'countries',
  'other',
  'than',
  'Britain',
  'until',
  'the',
  'scientific',
  'advice',
  'was',
  'clearer',
  '.'],
 'pos_tags': [22,
  27,
  21,
  35,
  12,
  22,
  22,
  27,
  16,
  21,
  22,
  22,
  38,
  15,
  22,
  24,
  20,
  37,
  21,
  15,
  24,
  16,
  15,
  22,
  15,
  12,
  16,
  21,
  38,
  17,
  7],
 'chunk_tags': [11,
  11,
  12,
  13,
  11,
  12,
  12,
  11,
  12,
  12,
  12,
  12,
  21,
  13,
  11,
  12,
  21,
  22,
  11,
  13,
  11,
  1,
  13,
  11,
  17,
  11,
  12,
  12,
  21,
  1,
  0],
 'ner_tags': [5,
  0,
  0,
  0,
  0,
  3,
  4,
  0,
  0,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [25]:
# There is exactly one NER tag per word, so both lists have the same length.
print(
    example["ner_tags"],
    example["tokens"],
    len(example["ner_tags"]),
    len(example["tokens"]),
    sep="\n",
)

[5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0]
['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.']
31
31


In [45]:
# word_ids() returns one entry per element of "input_ids": special tokens are
# mapped to None and every other token to the index of the word it came from.
# This way, we can align the word-level labels with the processed input ids.
print(tokenized_input.word_ids())
print("The length of word id index is: ", len(tokenized_input.word_ids()))
print("The length of input_ids is:", len(tokenized_input['input_ids']))

[None, 0, 1, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 11, 11, 12, 13, 14, 15, 16, 17, 18, 18, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, None]
The length of word id index is:  39
The length of input_ids is: 39


In [47]:
# One label per token: every sub-word inherits the label of the word it came from,
# and special tokens (word id None) get -100.
# NOTE(review): -100 is presumably chosen so the loss ignores those positions
# (it is the default ignore_index of PyTorch's CrossEntropyLoss) -- confirm
# against the training setup.
word_ids = tokenized_input.word_ids()
aligned_labels = [
    example[f"{task}_tags"][word_idx] if word_idx is not None else -100
    for word_idx in word_ids
]
print(len(aligned_labels), len(tokenized_input["input_ids"]))

39 39
