In [None]:
!pip install seqeval
!pip install datasets
!pip install transformers

In [None]:
!pip install torch==1.5.0

In [None]:
# !pip install datasets transformers[sentencepiece]
# !apt install git-lfs

# !git config --global user.email "you@example.com"
# !git config --global user.name "Your Name"

import numpy as np
import seqeval
import torch
from datasets import load_dataset, load_metric
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)


def main():
    """Fine-tune ``bert-base-cased`` for token classification on CoNLL-2003.

    The pipeline is: report the CUDA device, load the ``conll2003`` dataset,
    tokenize it while aligning word-level NER tags to sub-word tokens, build
    the model with the label mappings, and train it with ``Trainer``,
    evaluating with seqeval after every epoch. Checkpoints are written to the
    ``bert-finetuned-ner`` output directory.

    The Hub token, if any, is read from the ``HF_TOKEN`` environment variable
    so that no secret has to live in the source file.
    """
    # Local import: only this function needs ``os`` (to read the Hub token).
    import os

    def check_cuda():
        """Print the current CUDA device index, device count and device name.

        Prints ``"No GPU Available."`` when CUDA is not usable.
        """
        if not torch.cuda.is_available():
            print("No GPU Available.")
            return
        print(torch.cuda.current_device())
        print(torch.cuda.device_count())
        print(torch.cuda.get_device_name(0))

    check_cuda()

    # load raw dataset
    raw_datasets = load_dataset("conll2003")

    # get features from ner tags
    ner_feature = raw_datasets["train"].features["ner_tags"]
    # get feature label names
    label_names = ner_feature.feature.names

    # load bert model
    model_checkpoint = "bert-base-cased"
    # load tokeniser
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    def align_labels_with_tokens(labels, word_ids):
        """Expand word-level ``labels`` to one label per token.

        Args:
            labels: label ids, indexed by word position.
            word_ids: for each token, the index of the word it belongs to,
                or ``None`` for a special token.

        Returns:
            A list with one label id per entry of ``word_ids``. Special
            tokens get ``-100`` (the value ``compute_metrics`` filters out).
            The first token of a word keeps the word's label; continuation
            tokens get the word's label with B-XXX turned into I-XXX.
        """
        new_labels = []
        current_word = None
        for word_id in word_ids:
            if word_id != current_word:
                # Start of a new word!
                current_word = word_id
                label = -100 if word_id is None else labels[word_id]
                new_labels.append(label)
            elif word_id is None:
                # Special token
                new_labels.append(-100)
            else:
                # Same word as previous token
                label = labels[word_id]
                # If the label is B-XXX we change it to I-XXX.
                # Assumes the tag set orders labels so that every odd id is
                # a B- tag immediately followed by its I- tag.
                if label % 2 == 1:
                    label += 1
                new_labels.append(label)

        return new_labels

    def tokenize_and_align_labels(examples):
        """Tokenize a batch of pre-split sentences and attach aligned labels.

        Args:
            examples: a batch with ``"tokens"`` (lists of words) and
                ``"ner_tags"`` (lists of word-level label ids).

        Returns:
            The tokenizer output with an extra ``"labels"`` entry holding one
            token-aligned label list per example.
        """
        tokenized_inputs = tokenizer(
            examples["tokens"], truncation=True, is_split_into_words=True
        )
        tokenized_inputs["labels"] = [
            align_labels_with_tokens(labels, tokenized_inputs.word_ids(i))
            for i, labels in enumerate(examples["ner_tags"])
        ]
        return tokenized_inputs

    # tokenise datasets; the raw columns are dropped so that only the
    # tokenizer outputs and the aligned labels remain
    tokenized_datasets = raw_datasets.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
    )

    # data collator
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    # metrics
    metric = load_metric("seqeval")

    def compute_metrics(eval_preds):
        """Compute overall seqeval precision, recall, F1 and accuracy.

        Args:
            eval_preds: a ``(logits, labels)`` pair; the predicted class is
                the argmax of ``logits`` over the last axis.

        Returns:
            A dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
            ``"accuracy"``.
        """
        logits, labels = eval_preds
        predictions = np.argmax(logits, axis=-1)

        # Remove ignored index (special tokens) and convert to labels
        true_labels = [
            [label_names[label_id] for label_id in sequence if label_id != -100]
            for sequence in labels
        ]
        true_predictions = [
            [
                label_names[pred_id]
                for pred_id, label_id in zip(prediction, sequence)
                if label_id != -100
            ]
            for prediction, sequence in zip(predictions, labels)
        ]
        all_metrics = metric.compute(
            predictions=true_predictions, references=true_labels
        )
        return {
            "precision": all_metrics["overall_precision"],
            "recall": all_metrics["overall_recall"],
            "f1": all_metrics["overall_f1"],
            "accuracy": all_metrics["overall_accuracy"],
        }

    # label id mappings; ids are kept as ints on both sides so that label2id
    # maps each label name to a real class index rather than to a string
    id2label = dict(enumerate(label_names))
    label2id = {label: label_id for label_id, label in id2label.items()}

    # define model
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        id2label=id2label,
        label2id=label2id,
    )

    # define training arguments
    args = TrainingArguments(
        "bert-finetuned-ner",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        num_train_epochs=3,
        weight_decay=0.01,
        push_to_hub=False,
        # Never hard-code the token: take it from the environment. It is
        # None when HF_TOKEN is unset, which is fine while push_to_hub=False.
        hub_token=os.environ.get("HF_TOKEN"),
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )
    trainer.train()


# Run the fine-tuning pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()