In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, pipeline
from datasets import load_dataset
import evaluate
import numpy as np



In [2]:
ds = load_dataset("conll2003", trust_remote_code=True)

In [3]:
ds["train"].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [4]:
ds["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [5]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [6]:
def preprocess_ds(examples):
    return tokenizer(examples["tokens"], is_split_into_words=True, truncation=True)

In [7]:
test = ds["train"].map(preprocess_ds, batched=True)

In [8]:
tokenizer.convert_ids_to_tokens(test[2]['input_ids'])

['[CLS]', 'brussels', '1996', '-', '08', '-', '22', '[SEP]']

In [9]:
test[2]["ner_tags"]

[5, 0]

In [10]:
test[2]["tokens"]

['BRUSSELS', '1996-08-22']

In [11]:
ds["train"] = ds["train"].select(range(4))

In [12]:
ds["train"]

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 4
})

In [13]:
def preprocess_ds(examples):
    tokenized_input = tokenizer(examples["tokens"], is_split_into_words=True, truncation=True)
    final_labels = list()
    for idx, _ in enumerate(tokenized_input["input_ids"]):
        word_ids = tokenized_input.word_ids(batch_index=idx)
        current_label = list()
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:
                current_label.append(-100)
            elif word_id != previous_word_id:
                previous_word_id = word_id
                current_ner_tag = examples["ner_tags"][idx]
                current_label.append(current_ner_tag[word_id])
            else:
                current_label.append(-100)
        final_labels.append(current_label)
    # print(final_labels)
    # print(tokenized_input["input_ids"])
    assert len(final_labels) == len(tokenized_input["input_ids"])
    tokenized_input["labels"] = final_labels
     
    return tokenized_input

In [14]:
test = ds["train"].map(preprocess_ds, batched=True)

In [15]:
test

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 4
})

In [16]:
ds = load_dataset("conll2003", trust_remote_code=True)

In [17]:
ds = ds.map(preprocess_ds, batched=True)

In [18]:
len(ds["train"]["labels"])

14041

In [19]:
len(ds["train"]["input_ids"])

14041

In [20]:
class_names = ds["train"].features["ner_tags"].feature.names
id2label = {idx: name for idx, name in enumerate(class_names)}
num_labels = len(class_names)

In [21]:
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=num_labels, id2label=id2label)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
datacollator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [23]:
seqeval = evaluate.load("seqeval")

In [24]:

def compute_metrics(output):
    predictions, labels = output
    predictions = np.argmax(predictions, axis=-1)
    new_predictions = [[class_names[p] for p, l in zip(prediction, label) if l!=-100] for prediction, label in zip(predictions, labels)]
    new_labels = [[class_names[l] for p, l in zip(prediction, label) if l!=-100] for prediction, label in zip(predictions, labels)]
    results = seqeval.compute(predictions=new_predictions, references=new_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }

In [None]:
# Define training arguments for Hugging Face Trainer
training_args = TrainingArguments(
    output_dir="test",                     # Directory where checkpoints, logs, etc. are saved
    per_device_train_batch_size=16,        # Training batch size per GPU/CPU
    per_device_eval_batch_size=16,         # Evaluation batch size per GPU/CPU
    num_train_epochs=3,                    # Number of training epochs
    eval_strategy="epoch",                 # Run evaluation at the end of each epoch
    save_strategy="epoch",                 # Save model checkpoint at the end of each epoch
    logging_strategy="epoch",              # Log training metrics (loss, etc.) once per epoch
    metric_for_best_model="eval_loss",     # Metric used to decide the "best model"
    save_total_limit=1,                    # Keep only the most recent checkpoint (older ones deleted)
    load_best_model_at_end=True,           # Load best model at the end (set True if you want best model)
    push_to_hub=False,                     # Push model to Hugging Face Hub
    fp16=False,                             # Use mixed precision (float16) for faster training on GPUs
    gradient_accumulation_steps=1,         # Accumulate gradients for 4 steps before backward/update -> Effective batch size = 16 * 4 = 64
    # lr_scheduler_type="cosine",            # Use cosine learning rate scheduler
    # report_to="mlflow"                     # Report logs & metrics to MLflow
)

# Initialize the Trainer
trainer = Trainer(
    args=training_args,                    # Training arguments defined above
    model=model,                           # Model to train (BERT sequence classification in this case)
    train_dataset=ds["train"],             # Training dataset
    eval_dataset=ds["test"],               # Evaluation dataset
    data_collator=datacollator,            # Function to batch and pad inputs
    processing_class=tokenizer,            # Alternate for datacollator with padding
    compute_metrics=compute_metrics,       # Function to compute custom metrics (accuracy, F1, etc.)
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], # Stop training if no improvement for 3 evaluation rounds
)

In [26]:
trainer.train()

  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1029,0.098922,0.879958,0.899433,0.889589,0.97728
2,0.0271,0.109452,0.896832,0.917316,0.906958,0.980575
3,0.0113,0.125107,0.903412,0.914129,0.908739,0.980876


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=2634, training_loss=0.04708535506495401, metrics={'train_runtime': 160.8641, 'train_samples_per_second': 261.855, 'train_steps_per_second': 16.374, 'total_flos': 1020143109346326.0, 'train_loss': 0.04708535506495401, 'epoch': 3.0})

In [27]:
trainer.save_model("best_model")

In [49]:
classifier = pipeline("ner", "best_model", grouped_entities=True)

Device set to use cuda:0


In [50]:
results = classifier("Vijay is going to cm of tamilnadu")
results

[{'entity_group': 'PER',
  'score': np.float32(0.99209166),
  'word': 'vijay',
  'start': 0,
  'end': 5},
 {'entity_group': 'LOC',
  'score': np.float32(0.96222115),
  'word': 'tamilnadu',
  'start': 24,
  'end': 33}]

In [51]:
from spacy import displacy

# Create a spaCy-style Doc for visualization
doc = {
    "text": "Vijay is going to cm of tamilnadu",
    "ents": [
        {"start": ent["start"], "end": ent["end"], "label": ent["entity_group"]}
        for ent in results
    ],
    "title": None
}

# 3. Use displaCy to render in Jupyter/Colab
displacy.render(doc, style="ent", manual=True, jupyter=True)