In [1]:
from transformers import BertForTokenClassification, Trainer, TrainingArguments, BertTokenizerFast
from datasets import load_dataset
import torch

In [4]:
# Check if CUDA is available
print(torch.cuda.is_available())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

True


In [6]:
# Load dataset
dataset = load_dataset('conll2003')

In [None]:
# Load tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [14]:
# EDA
print(dataset['train'][0])

ner_labels = dataset['train'].features['ner_tags'].feature.names
print(ner_labels)

{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [5]:
# EDA with tokenization
examples = dataset['train'][:3]
tokenized_examples = tokenizer(
    examples['tokens'],
    max_length=128,
    is_split_into_words=True,  # Since 'tokens' are already split into words
    padding='max_length',
    truncation=True,
    return_tensors='pt'  # Return PyTorch tensors for easy use with models
)

# Display the tokenized output
print(tokenized_examples)

{'input_ids': tensor([[  101,  7270, 22961,  1528,  1840,  1106, 21423,  1418,  2495, 12913,
           119,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [6]:
# Tokenize function
def tokenize_and_align_labels(examples):
    # Tokenize inputs with padding and truncation enabled
    tokenized_inputs = tokenizer(
        examples['tokens'],
        max_length=128,
        truncation=True,
        padding='max_length',
        is_split_into_words=True,
        return_tensors='pt'
    )
    
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        for word_id in word_ids:
            # Special tokens have a word_id of None, so assign a label of -100
            if word_id is None:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_id])
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

for split in tokenized_dataset:
    print(f"{split} split length: {len(tokenized_dataset[split])}")
    # Check the shape of input_ids for the first example in this split
    print(f"{split} split input_ids shape: {len(tokenized_dataset[split][0]['input_ids'])}")
    print(f"{split} split attention_mask shape: {len(tokenized_dataset[split][0]['attention_mask'])}")

train split length: 14041
train split input_ids shape: 128
train split attention_mask shape: 128
validation split length: 3250
validation split input_ids shape: 128
validation split attention_mask shape: 128
test split length: 3453
test split input_ids shape: 128
test split attention_mask shape: 128


In [7]:
# Load BERT model
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(dataset['train'].features['ner_tags'].feature.names))
model.to(device)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [8]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir='./logs'
)




In [9]:
# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0824,0.071364
2,0.0389,0.064922
3,0.0233,0.063953


TrainOutput(global_step=5268, training_loss=0.07184880333772281, metrics={'train_runtime': 1043.5139, 'train_samples_per_second': 40.366, 'train_steps_per_second': 5.048, 'total_flos': 2751824963545344.0, 'train_loss': 0.07184880333772281, 'epoch': 3.0})

In [11]:
model.save_pretrained('model')
tokenizer.save_pretrained('tokenizer')

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [7]:
# Load the finetuned model and pretrained model
model = BertForTokenClassification.from_pretrained('model')
tokenizer = BertTokenizerFast.from_pretrained('tokenizer')
model.to(device)
base_model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(dataset['train'].features['ner_tags'].feature.names))
base_model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [12]:
# Inference function
def predict_entities(model, tokenizer, sentence):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model(**inputs).logits
    predictions = torch.argmax(outputs, dim=2)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    labels = [ner_labels[pred] for pred in predictions[0].cpu().numpy()]

    return list(zip(tokens, labels))

In [16]:
# Evaluation
test_sentence = "John Smith lives in New York and works for Google."
print("\nPredictions with fintuned BERT:")
print(predict_entities(model, tokenizer, test_sentence))


Predictions with fintuned BERT:
[('[CLS]', 'O'), ('John', 'B-PER'), ('Smith', 'I-PER'), ('lives', 'O'), ('in', 'O'), ('New', 'B-LOC'), ('York', 'I-LOC'), ('and', 'O'), ('works', 'O'), ('for', 'O'), ('Google', 'B-ORG'), ('.', 'O'), ('[SEP]', 'O')]


In [18]:
print("Predictions with base BERT:")
print(predict_entities(base_model, tokenizer, test_sentence))

Predictions with base BERT:
[('[CLS]', 'I-LOC'), ('John', 'I-LOC'), ('Smith', 'I-MISC'), ('lives', 'O'), ('in', 'I-PER'), ('New', 'O'), ('York', 'I-PER'), ('and', 'I-PER'), ('works', 'I-PER'), ('for', 'I-PER'), ('Google', 'I-MISC'), ('.', 'O'), ('[SEP]', 'O')]
