In [1]:
! pip install datasets -q
! pip install seqeval -q
! pip install evaluate -q

NER (Named Entity Recognition)
- Tags
  - B: Begining
  - I: Inside
  - O: outside

- displacy: to visualize the NER (spacy)
- IOB format for NER (Wikipedia document)
  - IGN: ignore token

- data: CONLLPP dataset
  - 7 tokens

In [2]:
from datasets import load_dataset
import pandas as pd

# load data
data = load_dataset('conllpp', trust_remote_code=True)
print(data)

# to print the featues of the data
print(data['train'].features)
# B, I, O and PER, ORG, LOC, MIS

# single data point will looks like
print(data['train'][0])

# tags to string
tags = data['train'].features['ner_tags'].feature.names
int2str = {id: str for id, str in enumerate(tags)}

def tag_id2str(row):
  return {'ner_tag_names': [int2str[id] for id in row['ner_tags']]}

data = data.map(tag_id2str)
print(data['train'][0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/8.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.70k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/650k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/163k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/141k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})
{'id': Value(dtype='string', id=None), 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None), 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0], 'ner_tag_names': ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']}


## Model

In [3]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

chkpt = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(chkpt)

# tokenizer.is_fast
inputs = tokenizer(data['train'][0]['tokens'], is_split_into_words=True)
print(inputs.tokens())
print(inputs.word_ids())

# {0: 'O',
#  1: 'B-PER',
#  2: 'I-PER',
#  3: 'B-ORG',
#  4: 'I-ORG',
#  5: 'B-LOC',
#  6: 'I-LOC',
#  7: 'B-MISC',
#  8: 'I-MISC'}

# now need to check the alignment problem
# some words may divided into subword tokens
# second subword will start with '##' or input word ids will be same

# def align_labels_with_tokens(labels, word_ids):
#   new_labels = []
#   current_word = None

#   for word_id in word_ids:
#     if word_id != current_word:
#       current_word = word_id
#       # pytorch None index are replaced with -100
#       label = -100 if word_id is None else labels[word_id]
#       new_labels.append(label)
#     elif word_id is None:
#       new_labels.append(-100)
#     else:
#       label = labels[word_id]
#       if label%2 == 1:
#         label += 1
#   return new_labels

def align_labels_with_tokens(labels, word_ids):

  new_labels = []
  current_word = None

  for word_id in word_ids:

    if word_id != current_word:
      current_word = word_id
      label = -100 if word_id is None else labels[word_id]
      new_labels.append(label)

    # For subwords, simply append the label for B- and I-
    elif word_id is not None:
      label = labels[word_id]
      if label%2 == 1:
        label += 1
      new_labels.append(label)

    else:
      new_labels.append(-100)
  return new_labels

def tokenize_and_align_labels(examples):
  tokenized_inputs = tokenizer(
      examples['tokens'],
      is_split_into_words=True,
      truncation=True
  )

  labels = examples['ner_tags']

  aligned_labels = []
  for i, label in enumerate(labels):
    aligned_labels.append(align_labels_with_tokens(
        label, tokenized_inputs.word_ids(i)
        )
    )

  # tokenizer returns 3 items
  # input_ids, attention_mask, labels
  # need to replace labels with aligned_labels
  tokenized_inputs['labels'] = aligned_labels
  return tokenized_inputs

# tokenize the input and adjust the lenght of the labels as per the number of
# tokens
temp = data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=[
        'id',
        'tokens',
        'pos_tags',
        'chunk_tags',
        'ner_tags',
        'ner_tag_names'
        ]
    )

['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']
[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]




Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [4]:
from transformers import DataCollatorForTokenClassification
import evaluate
import numpy as np


# data collation for making the input lenght equal in a batch
datacollator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# load the metrics for evluate the model
metircs = evaluate.load('seqeval')

# need to remove label None from the accuracy predictions
def compute_metrics(eval_preds):
  # labels used for the dictionary storing the label and index (prediction)
  logits, act_labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  # Remove -100 from predictions -> remove preditions where the actual label is -100
  predictions = [[p for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, act_labels)]

  true_labels = [
       [int2str[l] for l in label if l != -100]
       for label in act_labels
      ]
  # Iterate over predictions when creating true_predictions to ensure correct alignment
  true_predictions = [
          [int2str[p] for p in prediction]
          for prediction in predictions
      ]

  metrics_value = metircs.compute(predictions=true_predictions, references=true_labels)
  return metrics_value

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

## Model training

In [5]:
from transformers import TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    chkpt,
    # num_labels=10,
    id2label=int2str
    )
# we can plug the id to label dictionary in the model
print(f'number of labels for the model: {model.config.num_labels}')

args = TrainingArguments(
    'distilbert-finetuned-ner',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=temp['train'],
    eval_dataset=temp['validation'],
    tokenizer=tokenizer,
    data_collator=datacollator,
    compute_metrics=compute_metrics
)

trainer.train()

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


number of labels for the model: 9


Epoch,Training Loss,Validation Loss,Loc,Misc,Org,Per,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.0897,0.086106,"{'precision': 0.9408866995073891, 'recall': 0.9357648339684268, 'f1': 0.9383187772925764, 'number': 1837}","{'precision': 0.7538910505836576, 'recall': 0.8405639913232104, 'f1': 0.7948717948717949, 'number': 922}","{'precision': 0.8258196721311475, 'recall': 0.901565995525727, 'f1': 0.8620320855614972, 'number': 1341}","{'precision': 0.9280730005367687, 'recall': 0.9386536373507057, 'f1': 0.9333333333333335, 'number': 1842}",0.87868,0.91417,0.896074,0.976173
2,0.0449,0.073951,"{'precision': 0.9441431670281996, 'recall': 0.9477408818726184, 'f1': 0.9459386036403151, 'number': 1837}","{'precision': 0.8238289205702648, 'recall': 0.8774403470715835, 'f1': 0.8497899159663866, 'number': 922}","{'precision': 0.8782098312545855, 'recall': 0.8926174496644296, 'f1': 0.8853550295857989, 'number': 1341}","{'precision': 0.94148655772272, 'recall': 0.9695982627578719, 'f1': 0.955335651243648, 'number': 1842}",0.909136,0.931168,0.92002,0.981913
3,0.0293,0.073623,"{'precision': 0.9446534121440086, 'recall': 0.9569951007076756, 'f1': 0.950784207679827, 'number': 1837}","{'precision': 0.8485477178423236, 'recall': 0.8872017353579176, 'f1': 0.8674443266171792, 'number': 922}","{'precision': 0.8806941431670282, 'recall': 0.9082774049217002, 'f1': 0.894273127753304, 'number': 1341}","{'precision': 0.9512847965738758, 'recall': 0.9647122692725298, 'f1': 0.9579514824797843, 'number': 1842}",0.916886,0.937563,0.927109,0.983811


Trainer is attempting to log a value of "{'precision': 0.9408866995073891, 'recall': 0.9357648339684268, 'f1': 0.9383187772925764, 'number': 1837}" of type <class 'dict'> for key "eval/LOC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.7538910505836576, 'recall': 0.8405639913232104, 'f1': 0.7948717948717949, 'number': 922}" of type <class 'dict'> for key "eval/MISC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8258196721311475, 'recall': 0.901565995525727, 'f1': 0.8620320855614972, 'number': 1341}" of type <class 'dict'> for key "eval/ORG" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9280730005367687, 'recall': 0.9386536373507057,

TrainOutput(global_step=5268, training_loss=0.0803661420658491, metrics={'train_runtime': 340.8592, 'train_samples_per_second': 123.579, 'train_steps_per_second': 15.455, 'total_flos': 460431563935266.0, 'train_loss': 0.0803661420658491, 'epoch': 3.0})

In [9]:
# to zip the trained model and download the file
# ! zip -r <target_file_name.zip> <foler path>
# download the zipped file to computer

# to do NER using the trained model
from transformers import pipeline

pipeline = pipeline(
    'token-classification',
    model='/content/distilbert-finetuned-ner/checkpoint-5268', # path to the model
    tokenizer = tokenizer,
    aggregation_strategy = 'simple'
)

pipeline('I am Aneesh Cherian K. I worked as a lead data scientist in Envestnet')

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity_group': 'PER',
  'score': 0.9389229,
  'word': 'Aneesh Cherian K.',
  'start': 5,
  'end': 22},
 {'entity_group': 'ORG',
  'score': 0.99805784,
  'word': 'Envestnet',
  'start': 60,
  'end': 69}]