# NER Transformer Notebook Training

This is a notebook detailing the training of a transformer NER model using HuggingFace transformers.

## 1. Installs and Imports

In [None]:
!pip install datasets transformers seqeval

In [None]:
import os
import random
from datetime import date

import numpy as np
import pandas as pd

# inference
import torch
import transformers
from datasets import ClassLabel, Sequence, load_dataset, load_from_disk, load_metric
from google.colab import drive
from IPython.display import HTML, display
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    pipeline,
)

print(transformers.__version__)

In [None]:
system = "COLAB"  # ["AWS", "COLAB"]

In [None]:
# Configure data/model locations for the selected execution environment.
if system == "AWS":
    # AWS-only packages are imported here (they are not imported at the top of the
    # notebook) so that the COLAB path does not require them to be installed.
    import s3fs
    import sagemaker

    fs = s3fs.S3FileSystem()
    s3_bucket = "govuk-data-infrastructure-integration"
    DATA_DIR = f"s3://{s3_bucket}/model-data/govner-data"
    # NOTE(review): MODEL_DIR is only assigned in the COLAB branch, yet later cells
    # read it - set it here once the S3 location for models is confirmed.
    for f in fs.ls(DATA_DIR):
        print(f)
    # Manage interactions with the Amazon SageMaker APIs and any other AWS services needed.
    # sagemaker session bucket -> used for uploading data, models and logs
    # sagemaker will automatically create this bucket if it does not exist
    sess = sagemaker.Session()
    sagemaker_session_bucket = s3_bucket
    if sagemaker_session_bucket is None and sess is not None:
        # set to default bucket if a bucket name is not given
        sagemaker_session_bucket = sess.default_bucket()

    role = sagemaker.get_execution_role()
    sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

    print(f"sagemaker role arn: {role}")
    print(f"sagemaker bucket: {sess.default_bucket()}")
    print(f"sagemaker session region: {sess.boto_region_name}")
elif system == "COLAB":
    # Mount Google Drive so the shared-drive folders below are visible on the local filesystem.
    drive.mount("/content/gdrive")
    DATA_DIR = os.path.join(
        "/content/gdrive/Shared drives/",
        "GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data",
    )
    MODEL_DIR = os.path.join(
        "/content/gdrive/Shared drives/",
        "GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models",
    )
else:
    # Fail here rather than with a NameError on DATA_DIR in a later cell.
    raise ValueError(f"Unknown system {system!r}; expected 'AWS' or 'COLAB'.")

In [None]:
# Sanity check: show each configured folder and the first three entries inside it.
print(f"Data Folder: {DATA_DIR}")
data_entries = os.listdir(DATA_DIR)
print(data_entries[:3])
print(f"Model Folder: {MODEL_DIR}")
model_entries = os.listdir(MODEL_DIR)
print(model_entries[:3])

## 2. Load Data

Define some variables that will be useful.

In [None]:
# Short identifiers that are combined into the output folder name for the fine-tuned model.
task = "ner"
dataset_name = "govuk"
# Pretrained checkpoint name passed to `from_pretrained` for both the tokenizer and the model.
model_checkpoint = "distilbert-base-uncased"
# Per-device batch size, used for both training and evaluation in the training arguments.
batch_size = 16

In [None]:
dataset_type = "SAMPLED"  # "FULL"

In [None]:
# Folder name of the saved HuggingFace dataset for each supported dataset type.
hf_data_by_type = {"SAMPLED": "samp_hf_govuk_data", "FULL": "hf_govuk_data"}
if dataset_type not in hf_data_by_type:
    # Fail here rather than with a NameError on hf_data_path when the dataset is loaded.
    raise ValueError(
        f"Unknown dataset_type {dataset_type!r}; expected one of {sorted(hf_data_by_type)}."
    )
hf_data = hf_data_by_type[dataset_type]
hf_data_path = f"{DATA_DIR}/{hf_data}"
print("Data path: {}".format(hf_data_path))

Load the dataset that has been saved to disk in a HuggingFace DatasetDict (Apache Arrow).

In [None]:
datasets = load_from_disk(hf_data_path)

In [None]:
datasets

Inspect an element

In [None]:
datasets["train"][5]

The labels are already coded as integer ids to be easily usable by our model, but the correspondence with the actual categories is stored in the features of the dataset:

In [None]:
datasets["train"].features[f"new_label_list_id"]

In [None]:
label_list = datasets["train"].features[f"new_label_list_id"].feature.names
label_list

Show some random examples from the dataset in HTML format - this makes it easier to read than from the json.

In [None]:
def show_random_elements(dataset, num_examples=10):
    """Display a random selection of distinct rows from ``dataset`` as an HTML table.

    Columns whose feature is a ``ClassLabel`` (or a ``Sequence`` of ``ClassLabel``)
    are shown with their label names instead of integer ids.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row indices,
            and a ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to show; must not exceed ``len(dataset)``.

    Raises:
        AssertionError: If ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(
        dataset
    ), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices in one call, replacing the manual
    # "draw again until unseen" loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Single label per row: map the id to its name.
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            # List of labels per row: map every id in the list to its name.
            df[column] = df[column].transform(
                lambda x: [typ.feature.names[i] for i in x]
            )
    display(HTML(df.to_html()))

In [None]:
show_random_elements(datasets["train"])

## 3. Tokenise the Data

Download tokeniser that will be used to tokenise the data.

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

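The assert keyword lets you test if a condition in your code returns True; if not, the program will raise an AssertionError. Here it checks that we have loaded a *fast* tokeniser, because the `word_ids()` method used below to align labels with tokens is only available on fast tokenisers.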

In [None]:
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

How does the tokeniser work?

In [None]:
# observe how the tokeniser works on a string
tokenizer("Hello, this is one sentence!")

In [None]:
# observe how the tokeniser works on a list of tokens
tokenizer(
    ["Hello", ",", "this", "is", "one", "sentence", "split", "into", "words", "."],
    is_split_into_words=True,
)

Try this out on an example: element 5 of the training set.

In [None]:
example = datasets["train"][5]
print(example["text_token"])

In [None]:
tokenized_input = tokenizer(example["text_token"], is_split_into_words=True)
print(tokenized_input)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

Print the number of token labels in the data and the length of the tokenised input. They differ because the tokeniser adds special tokens to the start and end of the sequence, and may also split a single word into several sub-word tokens.

In [None]:
len(example[f"new_label_list_id"]), len(tokenized_input["input_ids"])

If we look at the word ids of the tokenised input, we can see that the special tokens added to the start and end have a word id of `None`.

In [None]:
print(tokenized_input.word_ids())

We can align the labels with the tokens by inserting `-100` wherever the word id is `None`.

In [None]:
word_ids = tokenized_input.word_ids()
print(word_ids)
aligned_labels = [
    -100 if i is None else example[f"new_label_list_id"][i] for i in word_ids
]
print(aligned_labels)
print(len(aligned_labels), len(tokenized_input["input_ids"]))

We now need to tokenise each example and align the labels.


In [None]:
label_all_tokens = True

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenise a batch of pre-split examples and attach per-token label ids.

    Reads the word lists from ``examples["text_token"]`` and the per-word label ids
    from ``examples["new_label_list_id"]``. The aligned label lists are stored under
    the ``"labels"`` key of the tokenizer output, which is then returned.

    Tokens without a word id get ``-100``. The first token of each word gets that
    word's label. Any further tokens of the same word get the word's label when the
    notebook-level ``label_all_tokens`` flag is truthy, otherwise ``-100``.
    """
    tokenized_inputs = tokenizer(
        examples["text_token"], truncation=True, is_split_into_words=True
    )

    aligned = []
    for batch_idx, word_labels in enumerate(examples["new_label_list_id"]):
        row = []
        last_seen = None
        for current in tokenized_inputs.word_ids(batch_index=batch_idx):
            if current is None:
                # No originating word (special token): use -100 so it is ignored by the loss.
                row.append(-100)
            elif current == last_seen and not label_all_tokens:
                # Continuation of the previous word, and only first tokens are labelled.
                row.append(-100)
            else:
                # First token of a word, or a continuation token when labelling all tokens.
                row.append(word_labels[current])
            last_seen = current
        aligned.append(row)

    tokenized_inputs["labels"] = aligned
    return tokenized_inputs

Now we can tokenise and align training examples in the datasets.

In [None]:
tokenize_and_align_labels(datasets["train"][:5])

In [None]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

## 4. Modelling

First, instantiate a model that will be used, **make sure it is the same as the tokeniser you are using!** Use the number of labels that are in your label list - this ensures there will be an output class for each token.

In [None]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label_list)
)

Define the training arguments that will dictate how the model will train.

In [None]:
# Build a dated output folder under MODEL_DIR, named after the base checkpoint,
# task, dataset name and dataset type.
model_name = model_checkpoint.split("/")[-1]
tod_date = date.today().strftime("%d-%m-%Y")
full_model_name = (
    f"{model_name}-finetuned-{task}-{dataset_name}-{dataset_type}-{tod_date}"
)
OUTPUT_PATH = f"{MODEL_DIR}/{full_model_name}"
print(OUTPUT_PATH)

# Collect the training hyper-parameters in one place before building the arguments object.
training_config = {
    "output_dir": OUTPUT_PATH,
    "evaluation_strategy": "epoch",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": batch_size,
    "per_device_eval_batch_size": batch_size,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "push_to_hub": False,
}
args = TrainingArguments(**training_config)

The *Data Collator* in the trainer automatically pads the model inputs in a batch to the length of the longest example. This bypasses the need to set a global maximum sequence length, and in practice leads to faster training since we perform fewer redundant computations on the padded tokens and attention masks.

For token classification tasks, there is a dedicated *DataCollatorForTokenClassification* which expects a list of dicts, where each dict represents a single example in the dataset.



In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
metric = load_metric("seqeval")

In [None]:
label_list

In [None]:
labels = [label_list[i] for i in example[f"new_label_list_id"]]
print(labels)
metric.compute(predictions=[labels], references=[labels])

In [None]:
def compute_metrics(p):
    """Turn raw predictions and label ids into overall seqeval scores.

    ``p`` is unpacked into ``(predictions, labels)``. The predicted class for each
    position is the argmax over axis 2 of the predictions. Positions whose label id
    is ``-100`` are dropped, the remaining ids are mapped to names via ``label_list``,
    and the result is scored with the notebook-level ``metric``.

    Returns:
        dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken from the
        metric's ``overall_*`` entries.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predicted_ids, label_ids):
        kept_predictions = []
        kept_labels = []
        for pred_id, label_id in zip(pred_row, label_row):
            # Skip ignored positions (label id -100).
            if label_id == -100:
                continue
            kept_predictions.append(label_list[pred_id])
            kept_labels.append(label_list[label_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [None]:
# Wire the model, training arguments, tokenised train/test splits, collator,
# tokenizer and metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
tokenizer.save_pretrained(OUTPUT_PATH)

## 5. Load Model for Inference

Load the model and tokeniser from a local checkpoint folder.


In [None]:
model_name = model_checkpoint.split("/")[-1]
model_name

In [None]:
os.listdir(OUTPUT_PATH)

In [None]:
# Pick the highest-step "checkpoint-<step>" folder found in OUTPUT_PATH instead of
# hard-coding a step number, which only exists for one dataset size / epoch count.
checkpoint_prefix = "checkpoint-"
checkpoint_names = [
    name
    for name in os.listdir(OUTPUT_PATH)
    if name.startswith(checkpoint_prefix) and name[len(checkpoint_prefix):].isdigit()
]
if not checkpoint_names:
    raise FileNotFoundError(
        f"No '{checkpoint_prefix}<step>' folders found in {OUTPUT_PATH}"
    )
checkpoint = os.path.join(
    OUTPUT_PATH,
    # Compare numerically so that e.g. checkpoint-1500 ranks above checkpoint-500.
    max(checkpoint_names, key=lambda name: int(name[len(checkpoint_prefix):])),
)
checkpoint

In [None]:
local_tokenizer = AutoTokenizer.from_pretrained(checkpoint, local_files_only=True)
local_model = AutoModelForTokenClassification.from_pretrained(
    checkpoint, local_files_only=True
)

In [None]:
sequences = ["my name is rory"]

In [None]:
processed_tokens = local_tokenizer(
    sequences, padding=True, truncation=True, return_tensors="pt"
)

In [None]:
processed_tokens

In [None]:
output = local_model(**processed_tokens)

In [None]:
output

In [None]:
print(output.logits)

In [None]:
# Replace this with your own checkpoint
token_classifier = pipeline(
    "token-classification",
    model=local_model,
    tokenizer=local_tokenizer,
    aggregation_strategy="simple",
)

In [None]:
string = "The show is on the Disney Channel. It airs at 8pm. It will be shown in spanish and english."
print(string)
print(len(string))

In [None]:
# Classify the example sentence defined in the previous cell; reusing `string`
# keeps the text and the offsets used for display below in sync.
result = token_classifier(string)

In [None]:
result

In [None]:
label_list

## Visualise Entities

In [None]:
import spacy
from spacy import displacy

text = "Hi my name is Rory Hurley. I work for the Cabinet Office. I speak english and a little bit of spanish"

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
displacy.render(doc, style="ent", jupyter=True, options={"distance": 90})

In [None]:
text = "My name is John Smith and I live in Paris"
entities = [
    ("Employee", 11, 21),  # John Smith
    ("Location", 36, 41),  # Paris
]

In [None]:
import spacy


def display_entities(text, entities):
    """Render labelled character spans of ``text`` with displaCy.

    Args:
        text: The string to display.
        entities: Iterable of ``(label, start_char, end_char)`` items, indexed
            positionally as ``[0]``, ``[1]`` and ``[2]``.

    Entities whose character offsets do not line up with the blank English
    tokenizer's token boundaries cannot be turned into a span
    (``doc.char_span`` returns ``None``). They are reported and skipped, because
    assigning ``None`` into ``doc.ents`` would raise.
    """
    nlp = spacy.blank("en")
    doc = nlp(text)
    ents = []
    for ee in entities:
        span = doc.char_span(ee[1], ee[2], ee[0])
        if span is None:
            print(
                f"Skipping entity {ee[0]!r} at [{ee[1]}:{ee[2]}]: "
                "offsets do not align with token boundaries."
            )
            continue
        ents.append(span)
    doc.ents = ents
    displacy.render(doc, style="ent", jupyter=True, options={"distance": 90})


def tokenise_and_display(text):
    """Run the token classifier on ``text``, print its output, and render the entities.

    Prints the raw classifier result, then the derived list of
    ``(entity_group, start, end)`` tuples, and finally passes those tuples to
    ``display_entities``.
    """
    predictions = token_classifier(text)
    print(predictions)
    spans = []
    for pred in predictions:
        spans.append((pred["entity_group"], pred["start"], pred["end"]))
    print(spans)
    display_entities(text, entities=spans)

In [None]:
display_entities(text, entities)

In [None]:
result[:2]

In [None]:
res_ents = [(i["entity_group"], i["start"], i["end"]) for i in result]
res_ents

In [None]:
display_entities(string, res_ents)

In [None]:
tokenise_and_display(string)