In [1]:
import torch
import numpy as np
import pandas as pd

In [2]:
from transformers import BertForTokenClassification, BertTokenizerFast
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification

2024-04-05 11:53:24.364677: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-05 11:53:24.364782: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-05 11:53:24.496263: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Run on the first CUDA device when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [4]:
from datasets import load_dataset
dataset = load_dataset("conll2003") # load the CoNLL-2003 dataset (downloaded on first use, then reused from the local cache)

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 (download: 959.94 KiB, generated: 9.78 MiB, post-processed: Unknown size, total: 10.72 MiB) to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14042 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3454 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
dataset # display the DatasetDict: its splits, feature columns and row counts

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14042
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3251
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3454
    })
})

In [6]:
text_example = dataset['train'][0] # first row of the training split, reused below for sanity checks
text_example # display the row: word tokens plus integer-encoded POS, chunk and NER tags

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [7]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") # tokenizer matching the bert-base-uncased checkpoint; the fast variant provides the word_ids() mapping used below

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
tokenized_input = tokenizer(text_example['tokens'], is_split_into_words = True) # the sample is already a list of words, hence is_split_into_words
tokenized_input # inspect input_ids / token_type_ids / attention_mask to check the tokenizer is set up correctly

{'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids']) # map the ids back to token strings
tokens # note the [CLS] and [SEP] special tokens added around the sentence

['[CLS]',
 'eu',
 'rejects',
 'german',
 'call',
 'to',
 'boycott',
 'british',
 'lamb',
 '.',
 '[SEP]']

In [10]:
word_ids = tokenized_input.word_ids() # index of the source word behind each token
word_ids # the special tokens ([CLS]/[SEP]) map to None

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]

In [11]:
# Tokenization adds two special tokens, [CLS] and [SEP], that have no NER label. In addition, the tokenizer can split a single word into several sub-word pieces (not shown in the example above), so tokens and labels no longer line up one-to-one.
# Therefore, we define a function that handles both of these problems.

In [12]:
def neg100_plus_subword_adjust(input_data, label_all_tokens = True):
    """Tokenize a batch of pre-split sentences and align the NER labels to the tokens.

    Relies on the notebook-level `tokenizer`.

    Args:
        input_data: batch with a "tokens" entry (one list of words per sentence)
            and an "ner_tags" entry (one list of label ids per sentence).
        label_all_tokens: when True, every additional piece of a split word
            repeats that word's label; when False, those pieces are labelled -100.

    Returns:
        The tokenizer output with an extra "labels" entry holding one aligned
        label list per sentence. Tokens without a source word (the special
        tokens) are always labelled -100.
    """
    encoded = tokenizer(input_data['tokens'], truncation = True, is_split_into_words = True)

    def align(sentence_idx, word_labels):
        # Walk the tokens of one sentence, remembering which word the previous token came from.
        aligned = []
        last_word = None
        for current_word in encoded.word_ids(batch_index = sentence_idx):
            if current_word is None:
                # No source word: special token.
                aligned.append(-100)
            elif current_word != last_word or label_all_tokens:
                # First piece of a word, or a later piece when all pieces are labelled.
                aligned.append(word_labels[current_word])
            else:
                # Later piece of a split word, masked out.
                aligned.append(-100)
            last_word = current_word
        return aligned

    encoded["labels"] = [align(i, tags) for i, tags in enumerate(input_data["ner_tags"])]
    return encoded

In [13]:
z = neg100_plus_subword_adjust(dataset['train'][4:6]) # run the alignment function on a two-row slice of the training split
print(z) # print the result to check that the function works

{'input_ids': [[101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102], [101, 1000, 2057, 2079, 1050, 1005, 1056, 2490, 2151, 2107, 12832, 2138, 2057, 2079, 1050, 1005, 1056, 2156, 2151, 5286, 2005, 2009, 1010, 1000, 1996, 3222, 1005, 1055, 2708, 14056, 24794, 2271, 3158, 4315, 14674, 2409, 1037, 2739, 27918, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [14]:
# Show each token of the first sliced sentence next to its aligned label id
# (token left-aligned and padded with "_" to 40 characters).
for token, label in zip(tokenizer.convert_ids_to_tokens(z['input_ids'][0]), z['labels'][0]):
    print(f"{token:_<40} {label}") # token and its NER label id side by side

[CLS]___________________________________ -100
germany_________________________________ 5
'_______________________________________ 0
s_______________________________________ 0
representative__________________________ 0
to______________________________________ 0
the_____________________________________ 0
european________________________________ 3
union___________________________________ 4
'_______________________________________ 0
s_______________________________________ 0
veterinary______________________________ 0
committee_______________________________ 0
werner__________________________________ 1
z_______________________________________ 2
##wing__________________________________ 2
##mann__________________________________ 2
said____________________________________ 0
on______________________________________ 0
wednesday_______________________________ 0
consumers_______________________________ 0
should__________________________________ 0
buy_____________________________________ 0
sheep___

In [15]:
tokenized_dataset = dataset.map(neg100_plus_subword_adjust, batched = True) # apply the alignment function to every split, one batch of rows at a time

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [16]:
# Derive the label set from the dataset instead of hard-coding the class count,
# and hand the id <-> name mappings to the model so they are stored in its config.
ner_label_names = dataset['train'].features['ner_tags'].feature.names
model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = len(ner_label_names),
    id2label = dict(enumerate(ner_label_names)),
    label2id = {name: idx for idx, name in enumerate(ner_label_names)},
).to(device) # initialise BERT with a fresh token-classification head and move it to the selected device

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for the fine-tuning run; the first positional argument,
# "test-ner", is the output directory.
training_options = dict(
    evaluation_strategy = "epoch",        # evaluate once per epoch
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 1,
    weight_decay = 0.01,
)
args = TrainingArguments("test-ner", **training_options)

In [18]:
data_collator = DataCollatorForTokenClassification(tokenizer) # collator that batches the tokenized examples and pads them for token classification

In [19]:
pip install seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=b38d69ea62ef0f38cb70ba61089d9340eb8a9d27f624d6612354682d4c17bacd
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Note: you may need to restart the kernel to use updated packages.


In [20]:
from datasets import load_metric
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets` releases in favour of
# the separate `evaluate` package — confirm against the installed version before upgrading.
metric = load_metric('seqeval') # seqeval scores predicted label sequences against reference label sequences

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [21]:
# Display the metric's description and the inputs it expects (lists of string label sequences).
metric

Metric(name: "seqeval", features: {'predictions': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence')}, usage: """
Produces labelling scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
    references: List of List of reference labels (Ground truth (correct) target values)
    suffix: True if the IOB prefix is after type, False otherwise. default: False
    scheme: Specify target tagging scheme. Should be one of ["IOB1", "IOB2", "IOE1", "IOE2", "IOBES", "BILOU"].
        default: None
    mode: Whether to count correct entity labels with incorrect I/B tags as true positives or not.
        If you want to only count exact matches, pass mode="strict". default: None.
    sample_weight: Array-like of shape (n_samples,), weights for indi

In [22]:
# NER tag names of the dataset; a tag's position in the list is its integer label id.
label_list = dataset['train'].features['ner_tags'].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [23]:
labels = [label_list[i] for i in text_example["ner_tags"]] # tag names for text_example, the first row of the training split
metric.compute(predictions=[labels], references=[labels]) # sanity check: identical predictions and references should score 1.0 everywhere

{'MISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [24]:
def compute_metrics(eval_preds):
    """Convert evaluation output into overall seqeval scores.

    Relies on the notebook-level `label_list` and `metric`.

    Args:
        eval_preds: pair of (logits, label ids). The argmax over axis 2 implies
            the logits carry the class scores on their third axis.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    logits, gold_ids = eval_preds
    predicted_ids = np.argmax(logits, axis = 2)

    # Build string-label sequences for metric.compute, dropping every position
    # whose gold label is -100.
    predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(p, g) for p, g in zip(predicted_row, gold_row) if g != -100]
        predictions.append([label_list[p] for p, _ in kept])
        true_labels.append([label_list[g] for _, g in kept])

    results = metric.compute(predictions=predictions, references=true_labels)
    report_keys = {
        "precision": "overall_precision",
        "recall": "overall_recall",
        "f1": "overall_f1",
        "accuracy": "overall_accuracy",
    }
    return {name: results[source] for name, source in report_keys.items()}

In [25]:
# Wire the model, training arguments, tokenized splits, collator and metric
# function into a Trainer.
trainer = Trainer(
    model = model,
    args = args,
    data_collator = data_collator,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["validation"],
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [26]:
trainer.train() # run fine-tuning; the table below reports the validation metrics for each epoch

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2269,0.069167,0.908322,0.925495,0.916828,0.981032


TrainOutput(global_step=878, training_loss=0.16607801137589648, metrics={'train_runtime': 164.2349, 'train_samples_per_second': 85.499, 'train_steps_per_second': 5.346, 'total_flos': 341037325138356.0, 'train_loss': 0.16607801137589648, 'epoch': 1.0})

In [27]:
model.save_pretrained("ner_model") # save the fine-tuned model (including its config.json) to ./ner_model

In [28]:
tokenizer.save_pretrained("tokenizer") # save the tokenizer files to ./tokenizer

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [29]:
# Label mappings that are written into the saved model's config.json below.
# id2label keys are strings because JSON object keys are always strings.
id2label = {
    str(i): label for i,label in enumerate(label_list)
}
# label2id values are class indices, so they stay integers rather than strings.
label2id = {
    label: i for i,label in enumerate(label_list)
}

In [30]:
import json

In [31]:
# Read the saved model configuration; the context manager closes the file handle.
with open("ner_model/config.json") as config_file:
    config = json.load(config_file)

In [32]:
# Attach the label mappings to the loaded configuration.
config["id2label"] = id2label
config["label2id"] = label2id

In [33]:
# Write the patched configuration back; closing the file guarantees the content
# is flushed to disk before the model is reloaded from this directory.
with open("ner_model/config.json", "w") as config_file:
    json.dump(config, config_file)

In [34]:
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model") # reload the saved model together with the patched config.json

In [35]:
from transformers import pipeline #using pipeline

In [36]:
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer) # NER inference pipeline built from the reloaded model and the tokenizer used for training

In [37]:
# When trying out the model below, assume it is 2003 or earlier (the fine-tuning data is CoNLL-2003):

In [38]:
example = "Henry Ford, the founder of Ford Motor Company, was born in Michigan, United States of America" # sample sentence with person, organisation and location mentions
ner_results = nlp(example) # list of dicts, one per tagged token: entity, score, index, word, start, end
print(ner_results)

[{'entity': 'B-PER', 'score': 0.95669186, 'index': 1, 'word': 'henry', 'start': 0, 'end': 5}, {'entity': 'I-PER', 'score': 0.9199473, 'index': 2, 'word': 'ford', 'start': 6, 'end': 10}, {'entity': 'B-ORG', 'score': 0.8947865, 'index': 7, 'word': 'ford', 'start': 27, 'end': 31}, {'entity': 'I-ORG', 'score': 0.8595338, 'index': 8, 'word': 'motor', 'start': 32, 'end': 37}, {'entity': 'I-ORG', 'score': 0.9048448, 'index': 9, 'word': 'company', 'start': 38, 'end': 45}, {'entity': 'B-LOC', 'score': 0.9842651, 'index': 14, 'word': 'michigan', 'start': 59, 'end': 67}, {'entity': 'B-LOC', 'score': 0.97773564, 'index': 16, 'word': 'united', 'start': 69, 'end': 75}, {'entity': 'I-LOC', 'score': 0.7145643, 'index': 17, 'word': 'states', 'start': 76, 'end': 82}, {'entity': 'I-LOC', 'score': 0.5217796, 'index': 18, 'word': 'of', 'start': 83, 'end': 85}, {'entity': 'I-LOC', 'score': 0.85132134, 'index': 19, 'word': 'america', 'start': 86, 'end': 93}]
