In [2]:
from datasets import load_dataset
from evaluate import load
import numpy as np 
from transformers import BertTokenizerFast 
from transformers import DataCollatorForTokenClassification 
from transformers import AutoModelForTokenClassification 

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fetch the CoNLL-2003 corpus and show its splits and columns.
# NOTE(review): trust_remote_code=True allows the dataset's own loading
# script to execute locally -- only keep it for sources you trust.
DATASET_NAME = "conll2003"
dataset = load_dataset(DATASET_NAME, trust_remote_code=True)
print(dataset)

# Tokenizer for the uncased BERT checkpoint; the label-alignment helper
# further down calls word_ids() on its output.
TOKENIZER_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(TOKENIZER_CHECKPOINT)


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


In [4]:
# Inspect the NER tag feature of the train split; the ClassLabel names in the
# displayed value give the integer-id -> tag-name mapping.
dataset["train"].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [5]:
def tokenize_and_align_labels(dataset, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and align NER tags to sub-tokens.

    Args:
        dataset: Batch mapping with a "tokens" entry (one list of words per
            sentence) and a "ner_tags" entry (one list of integer tag ids per
            sentence, one id per word).
        label_all_tokens: When True, continuation sub-tokens of a word are
            labelled as well; when False they receive -100.

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per sentence, aligned with the produced sub-tokens.
        Positions with no source word (word id None, e.g. special tokens)
        are labelled -100.

    Note:
        Continuation sub-tokens of a word tagged B-X are labelled I-X rather
        than B-X. Copying the B- tag would make a single multi-piece word look
        like several consecutive entities. This relies on the tag layout
        ['O', 'B-PER', 'I-PER', 'B-ORG', ...]: every B- id is odd and the
        matching I- id is the next integer.
    """
    tokenized_inputs = tokenizer(dataset["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(dataset["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                # No source word for this position.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of a word keeps the word's own tag.
                label_ids.append(label[word_idx])
            elif label_all_tokens:
                # Continuation sub-token: stay inside the entity (B-X -> I-X).
                tag = label[word_idx]
                if tag % 2 == 1:
                    tag += 1
                label_ids.append(tag)
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [14]:
# Run the alignment helper on a single training sentence (index 4) as a smoke test.
sample_batch = dataset["train"][4:5]
q = tokenize_and_align_labels(sample_batch)
print(q)

{'input_ids': [[101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 5, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, -100]]}


In [15]:
# Show each sub-token of the sample next to the label id assigned to it.
sample_pieces = tokenizer.convert_ids_to_tokens(q["input_ids"][0])
sample_label_ids = q["labels"][0]
for piece, label_id in zip(sample_pieces, sample_label_ids):
    print(f"{piece:_<40} {label_id}")

[CLS]___________________________________ -100
germany_________________________________ 5
'_______________________________________ 0
s_______________________________________ 0
representative__________________________ 0
to______________________________________ 0
the_____________________________________ 0
european________________________________ 3
union___________________________________ 4
'_______________________________________ 0
s_______________________________________ 0
veterinary______________________________ 0
committee_______________________________ 0
werner__________________________________ 1
z_______________________________________ 2
##wing__________________________________ 2
##mann__________________________________ 2
said____________________________________ 0
on______________________________________ 0
wednesday_______________________________ 0
consumers_______________________________ 0
should__________________________________ 0
buy_____________________________________ 0
sheep___

In [34]:
# Apply the tokenize/align helper to the dataset in batches. Note that
# `dataset` is rebound to the mapped result, so later cells see the
# tokenized version rather than the raw one.
dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 14041/14041 [00:01<00:00, 7856.17 examples/s]
Map: 100%|██████████| 3250/3250 [00:00<00:00, 7605.18 examples/s]
Map: 100%|██████████| 3453/3453 [00:00<00:00, 7762.55 examples/s]


In [4]:
# Tag names in id order (matches the ClassLabel names of the "ner_tags" feature).
label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

# Load pre-trained BERT with a freshly initialised token-classification head.
# id2label / label2id are stored in the model config, so a saved checkpoint
# reports real tag names ("B-PER", ...) instead of generic "LABEL_n" names.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={label: i for i, label in enumerate(label_list)},
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
from transformers import TrainingArguments, Trainer

# Training configuration. `eval_strategy` is the current name of the option
# formerly spelled `evaluation_strategy`; evaluation runs once per epoch.
args = TrainingArguments(
    output_dir="general_ner",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    use_cpu=True,  # force CPU training even if an accelerator is present
)

# Collator for token-classification batches, built from the tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

# seqeval metric, consumed by compute_metrics.
metric = load("seqeval")



In [36]:
def compute_metrics(eval_preds):
    """Turn raw evaluation output into overall seqeval scores.

    Args:
        eval_preds: Pair ``(logits, label_ids)``. The logits are reduced with
            an argmax over axis 2; label ids equal to -100 mark positions
            that are excluded from scoring.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        corresponding "overall_*" entries of ``metric.compute``.
    """
    logits, gold_ids = eval_preds

    # Argmax over logits equals argmax over probabilities, so softmax is skipped.
    predicted_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_references = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Skip every position whose gold label is the ignore value.
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_references.append(label_list[gold_id])
        predictions.append(kept_predictions)
        true_labels.append(kept_references)

    results = metric.compute(predictions=predictions, references=true_labels)

    scores = {}
    scores["precision"] = results["overall_precision"]
    scores["recall"] = results["overall_recall"]
    scores["f1"] = results["overall_f1"]
    scores["accuracy"] = results["overall_accuracy"]
    return scores

In [None]:
# Assemble the Trainer from the model, arguments, tokenized splits, collator
# and metric callback defined in earlier cells, then run fine-tuning.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

In [38]:
# Write the model and the tokenizer to two separate local directories.
MODEL_DIR = "general_ner_model"
TOKENIZER_DIR = "tokenizer"

model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(TOKENIZER_DIR)

('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\vocab.txt',
 'tokenizer\\added_tokens.json',
 'tokenizer\\tokenizer.json')

In [5]:
import json

# JSON object keys are always strings, so ids are stringified on the id2label
# side. label2id values stay integers: they are class ids, not text.
id2label = {str(i): label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

config_path = "general_ner_model/config.json"

# Read, patch and rewrite the saved config; `with` closes each handle promptly
# so the file is fully flushed before the checkpoint is loaded again.
with open(config_path, encoding="utf-8") as config_file:
    config = json.load(config_file)

config["id2label"] = id2label
config["label2id"] = label2id

with open(config_path, "w", encoding="utf-8") as config_file:
    json.dump(config, config_file)

In [6]:
# Load a token-classification model from the local "general_ner_model" directory.
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("general_ner_model")

In [7]:
from transformers import pipeline

# Build an "ner" pipeline around the reloaded model and the tokenizer.
nlp = pipeline(task="ner", model=model_fine_tuned, tokenizer=tokenizer)

# Tag one hand-written sentence and print the raw per-token predictions.
example = "Bill Gates is the Founder of Microsoft"
ner_results = nlp(example)
print(ner_results)


Device set to use cpu


[{'entity': 'B-PER', 'score': 0.9950805, 'index': 1, 'word': 'bill', 'start': 0, 'end': 4}, {'entity': 'I-PER', 'score': 0.9911732, 'index': 2, 'word': 'gates', 'start': 5, 'end': 10}, {'entity': 'B-ORG', 'score': 0.97507817, 'index': 7, 'word': 'microsoft', 'start': 29, 'end': 38}]
