In [None]:
!pip install datasets
!pip install -U accelerate
!pip install -U transformers
!pip install seqeval



In [None]:
# Load the HiNER dataset ("cfilt/HiNER-original") with `load_dataset`.
# The returned object is indexed by split name further down
# ("train", "validation", "test").
from datasets import load_dataset, load_metric, concatenate_datasets
datasets = load_dataset("cfilt/HiNER-original")

In [None]:
# Tag inventory: the position of a tag in this list is its integer class id.
label_list = [
    'B-FESTIVAL', 'B-GAME', 'B-LANGUAGE', 'B-LITERATURE', 'B-LOCATION', 'B-MISC',
    'B-NUMEX', 'B-ORGANIZATION', 'B-PERSON', 'B-RELIGION', 'B-TIMEX',
    'I-FESTIVAL', 'I-GAME', 'I-LANGUAGE', 'I-LITERATURE', 'I-LOCATION', 'I-MISC',
    'I-NUMEX', 'I-ORGANIZATION', 'I-PERSON', 'I-RELIGION', 'I-TIMEX',
    'O',
]
# tag -> id and id -> tag lookups, both derived from the single list above.
labels_vocab = {tag: tag_id for tag_id, tag in enumerate(label_list)}
labels_vocab_reverse = dict(enumerate(label_list))

In [None]:
# Pull the three splits out of the loaded dataset under short names.
train_dataset, val_dataset, test_dataset = (
    datasets[split_name] for split_name in ("train", "validation", "test")
)

In [None]:
# Preprocessing: load the tokenizer that matches the checkpoint fine-tuned
# below ("bert-base-multilingual-cased").
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

In [None]:
import transformers
# Fail early if we did not get a "fast" tokenizer: tokenize_and_align_labels
# calls `word_ids()` on the tokenizer output to map tokens back to words.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [None]:
# Read by tokenize_and_align_labels: when False, only the first token of each
# word keeps the word's label and the remaining tokens of that word get -100;
# when True, every token of a word receives the word's label.
label_all_tokens = False

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to tokens.

    Args:
        examples: batch mapping with "tokens" (one list of words per example)
            and "ner_tags" (one list of per-word label ids per example).

    Returns:
        The tokenizer output for the batch with an added "labels" entry that
        holds one label id per token. Tokens whose word id is None get -100;
        the first token of a word gets the word's label; further tokens of
        the same word get the word's label if the module-level
        `label_all_tokens` flag is truthy, otherwise -100.
    """
    encodings = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(batch_index, word_labels):
        # Build the per-token label ids for one example of the batch.
        aligned = []
        last_word = None
        for word_idx in encodings.word_ids(batch_index=batch_index):
            if word_idx is None:
                # No originating word (special tokens): ignored via -100.
                aligned.append(-100)
            elif word_idx != last_word or label_all_tokens:
                # First token of a word, or every token when the flag is set.
                aligned.append(word_labels[word_idx])
            else:
                # Continuation token of the same word with the flag off.
                aligned.append(-100)
            last_word = word_idx
        return aligned

    encodings["labels"] = [
        align(batch_index, word_labels)
        for batch_index, word_labels in enumerate(examples["ner_tags"])
    ]
    return encodings

In [None]:
# Tokenize and label-align every split in batched mode.
train_tokenized, val_tokenized, test_tokenized = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/21657 [00:00<?, ? examples/s]

In [None]:
# Fine-tuning: load the pretrained encoder with a token-classification head
# sized to the tag inventory. label2id / id2label are stored on the model so
# that predictions can be reported as tag names instead of integer ids.
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-multilingual-cased", num_labels=len(label_list), label2id=labels_vocab, id2label=labels_vocab_reverse
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model_name = "bert-base-multilingual-cased"
# Training configuration; "HiNER" is the output directory.
# The recorded run finished after 4740 optimizer steps, so the previous
# eval/save interval of 10000 steps meant that step-based evaluation and
# checkpointing never triggered. An interval of 1000 steps yields several
# evaluations and checkpoints within the single epoch.
# NOTE(review): newer transformers releases name `evaluation_strategy`
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    "HiNER",
    evaluation_strategy = "steps",
    learning_rate=4e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=True,
    eval_steps=1000,
    save_steps=1000,
)

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer below; it is built from the same tokenizer
# that produced the tokenized splits.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# seqeval computes entity-level precision/recall/F1 for tagged sequences.
# trust_remote_code=True is passed explicitly: the loader warns that it will
# become mandatory for this metric in the next major `datasets` release.
metric = load_metric("seqeval", trust_remote_code=True)

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [None]:
import numpy as np

def compute_metrics(p):
    """Compute overall seqeval scores for a (predictions, labels) pair.

    Args:
        p: two-item pair of per-token class scores (arg-maxed over axis 2)
            and the matching gold label ids.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's "overall_*" entries. Positions whose gold label is -100 are
        excluded from both predictions and references.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Drop ignored positions once, then convert both sides to tag names.
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [None]:
# Store the Hugging Face Hub token needed for push_to_hub=True.
# The token is read from the HF_TOKEN environment variable, or prompted for
# without echo; it must never be written into the notebook itself.
# NOTE(review): a token was previously committed in this cell — revoke it on
# the Hub and issue a new one.
import os
from getpass import getpass

from huggingface_hub.hf_api import HfFolder

HfFolder.save_token(os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

In [None]:
# Assemble the Trainer. Evaluation during and after training uses the
# validation split; the test split is kept for the final
# `trainer.predict(test_tokenized)` report so it stays an untouched hold-out.
trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=4740, training_loss=0.12170462829654227, metrics={'train_runtime': 1460.1008, 'train_samples_per_second': 51.933, 'train_steps_per_second': 3.246, 'total_flos': 3646812394817118.0, 'train_loss': 0.12170462829654227, 'epoch': 1.0})

# Evaluating the Model

In [None]:
# Score the fine-tuned model on the Trainer's configured eval_dataset; the
# reported values come from compute_metrics (prefixed with "eval_").
trainer.evaluate()

{'eval_loss': 0.0908978134393692,
 'eval_precision': 0.887940218314317,
 'eval_recall': 0.8947925256328474,
 'eval_f1': 0.8913532028331861,
 'eval_accuracy': 0.9715378339100502,
 'eval_runtime': 140.5348,
 'eval_samples_per_second': 154.104,
 'eval_steps_per_second': 9.635,
 'epoch': 1.0}

In [None]:
# Predict on the tokenized test split and take the arg-max class id per token.
logits, labels, _ = trainer.predict(test_tokenized)
predictions = np.argmax(logits, axis=2)

# Keep only positions whose gold label is not the -100 ignore index and map
# the remaining ids to tag names for seqeval.
true_predictions = []
true_labels = []
for predicted_row, gold_row in zip(predictions, labels):
    kept_pairs = [
        (predicted_id, gold_id)
        for predicted_id, gold_id in zip(predicted_row, gold_row)
        if gold_id != -100
    ]
    true_predictions.append([label_list[predicted_id] for predicted_id, _gold in kept_pairs])
    true_labels.append([label_list[gold_id] for _pred, gold_id in kept_pairs])

# Full seqeval report (per-entity-type scores in addition to the overall ones).
results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'FESTIVAL': {'precision': 1.0,
  'recall': 0.02564102564102564,
  'f1': 0.05,
  'number': 39},
 'GAME': {'precision': 0.594017094017094,
  'recall': 0.7533875338753387,
  'f1': 0.6642771804062126,
  'number': 369},
 'LANGUAGE': {'precision': 0.9075907590759076,
  'recall': 0.9243697478991597,
  'f1': 0.9159034138218152,
  'number': 1190},
 'LITERATURE': {'precision': 0.6329787234042553,
  'recall': 0.664804469273743,
  'f1': 0.6485013623978202,
  'number': 179},
 'LOCATION': {'precision': 0.9538627959045055,
  'recall': 0.9462218007586345,
  'f1': 0.9500269345927866,
  'number': 40072},
 'MISC': {'precision': 0.6574898785425102,
  'recall': 0.7624413145539906,
  'f1': 0.7060869565217393,
  'number': 1065},
 'NUMEX': {'precision': 0.7227833894500562,
  'recall': 0.6942647692971108,
  'f1': 0.7082371054657429,
  'number': 4638},
 'ORGANIZATION': {'precision': 0.7564213066712636,
  'recall': 0.8200336385722294,
  'f1': 0.7869440459110474,
  'number': 5351},
 'PERSON': {'precision': 0.835

# Testing

In [None]:
# Move the model to the CPU before building the inference pipeline below.
model.to('cpu')

In [None]:
from transformers import pipeline
import pandas as pd

In [None]:
# Build an NER pipeline from the fine-tuned model and tokenizer.
# aggregation_strategy="simple" merges adjacent tokens of the same entity into
# one span; it is the supported replacement for the deprecated
# grouped_entities=True flag and produces the same grouping.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
example = "भारतीय अंतरिक्ष अनुसंधान संगठन या इसरो भारत की राष्ट्रीय अंतरिक्ष एजेंसी है, जिसका मुख्यालय बेंगलुरु में है। यह अंतरिक्ष विभाग के तहत संचालित होता है जिसकी देखरेख सीधे भारत के प्रधान मंत्री करते हैं जबकि इसरो के अध्यक्ष डीओएस के कार्यकारी के रूप में भी कार्य करते हैं।"

ner_results = nlp(example)

# One row per detected entity span.
pd.DataFrame(ner_results)