<a href="https://colab.research.google.com/github/c-kartik/BERT_NER/blob/main/BERT_w_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
!pip install transformers datasets tokenizers seqeval -q

In [21]:
import datasets
import transformers
import numpy as np
from transformers import BertTokenizerFast, DataCollatorForTokenClassification, AutoModelForTokenClassification

conll2003 = datasets.load_dataset("conll2003")

In [22]:
conll2003

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [23]:
conll2003.shape

{'train': (14041, 5), 'validation': (3250, 5), 'test': (3453, 5)}

In [24]:
conll2003["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [25]:
conll2003["train"].features['ner_tags']

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [26]:
conll2003["train"].description

'The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses 

In [27]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [28]:
example_text = conll2003['train'][0]

tokenized_input = tokenizer(example_text['tokens'], is_split_into_words=True)
tokenized_input

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [29]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])
tokens

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [30]:
word_ids = tokenized_input.word_ids()
word_ids

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [31]:
len(example_text['ner_tags']), len(tokenized_input["input_ids"])

(9, 12)

# Function `tokenize_and_align_labels`

1.   Set -100 as the label for these special tokens and the subwords we wish to mask during training
2.   Mask the subword representations after the first subword
3.   Then we align the labels with the token ids using the strategy we picked


In [32]:
def tokenize_and_align_labels(example, label_all_tokens = True):
  tokenized_input = tokenizer(example['tokens'], truncation=True, is_split_into_words=True)
  labels = []

  for i, label in enumerate(example['ner_tags']):
    word_ids = tokenized_input.word_ids(batch_index=i)
    # word_ids(): return a list mapping of the tokens to their actual word in the initial sentence
    # It returns a list indicating the word corresponding to each token

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:
      if word_idx is None:
        # set -100 as the label for these special tokens
        label_ids.append(-100)
      elif word_idx != previous_word_idx:
        # if current word_idx is != prev then its the most regular case
        # and add the corresponding token
        label_ids.append(label[word_idx])
      else:
        # to take care of sub-words which have the same word_idx
        # set -100 as well for them, but only if label_all_tokens == False
        label_ids.append(label[word_idx] if label_all_tokens else -100)
      previous_word_idx = word_idx
    labels.append(label_ids)
  tokenized_input['labels'] = labels
  return tokenized_input

In [33]:
q = tokenize_and_align_labels(conll2003['train'][4:5])

print(q)

{'input_ids': [[101, 1860, 112, 188, 4702, 1106, 1103, 1735, 1913, 112, 188, 27431, 3914, 14651, 163, 7635, 4119, 1163, 1113, 9031, 11060, 1431, 4417, 8892, 3263, 2980, 1121, 2182, 1168, 1190, 2855, 1235, 1103, 3812, 5566, 1108, 27830, 119, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 5, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, -100]]}


# Before applying the `tokenize_and_align_labels()` the `tokenized_input` has 3 keys
*   `input_ids`
*   `token_type_ids`
*   `attention_mask`

But after applying `tokenize_and_align_labels()` we have an extra key - `labels`

In [34]:
for token, label in zip(tokenizer.convert_ids_to_tokens(q['input_ids'][0]), q['labels'][0]):
  print(f"{token: <40} {label}")

[CLS]                                    -100
Germany                                  5
'                                        0
s                                        0
representative                           0
to                                       0
the                                      0
European                                 3
Union                                    4
'                                        0
s                                        0
veterinary                               0
committee                                0
Werner                                   1
Z                                        2
##wing                                   2
##mann                                   2
said                                     0
on                                       0
Wednesday                                0
consumers                                0
should                                   0
buy                                      0
sheep   

In [35]:
tokenized_datasets = conll2003.map(tokenize_and_align_labels, batched=True)

In [36]:
model = AutoModelForTokenClassification.from_pretrained('bert-base-cased', num_labels=9)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
!pip install --upgrade transformers



In [38]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    #"test-ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
!pip install evaluate -q # Install the evaluate package which contains load_metric

In [None]:
import evaluate  # Import load_metric from evaluate instead of datasets

In [None]:
metric = evaluate.load('seqeval')

example = conll2003['train'][0]

label_list = conll2003['train'].features['ner_tags'].feature.names

label_list

In [None]:
labels = [label_list[i] for i in example['ner_tags']]

labels

In [None]:
metric.compute(predictions=[labels], references=[labels])

In [None]:
def compute_metrics(eval_preds):
  pred_logits, labels = eval_preds
  pred_logits = np.argmax(pred_logits, axis=-1)

  prediction = [
      [label_list[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100] for prediction, label in zip(pred_logits, labels)
  ]
  true_labels = [
      [label_list[l] for (eval_preds, l) in zip(prediction, label) if l != -100] for prediction, label in zip(pred_logits, labels)
  ]
  results = metric.compute(predictions=prediction, references=true_labels)

  return {
      'precision': results['overall_precision'],
      'recall': results['overall_recall'],
      'f1': results['overall_f1'],
      'accuracy': results['overall_accuracy']
  }

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [None]:
trainer.train()

model.save_pretrained('ner_model')

tokenizer.save_pretrained('tokenizer')

In [None]:
id2label = {
    str(i) : label for i, label in enumerate(label_list)
}

label2id = {
    label: str(i) for i, label in enumerate(label_list)
}

In [None]:
import json
config = json.load(open('ner_model/config.json'))
config['id2label'] = id2label
config['label2id'] = label2id

json.dump(config, open('ner_model/config.json', 'w'))

model_fine_tuned = AutoModelForTokenClassification.from_pretrained('ner_model')

# Testing

In [None]:
from transformers import pipeline

In [None]:
nlp = pipeline('ner', model=model_fine_tuned, tokenizer=tokenizer)
example = "Bill Gates is the Founder of Microsoft"

ner_results = nlp(example)
print(ner_results)