# NER Transformer Notebook Training

This is a notebook detailing the training of a transformer NER model using HuggingFace transformers.

## 1. Installs and Imports

In [1]:
!pip install datasets transformers seqeval

Collecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 5.6 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 13.2 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.3 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 461 kB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 45.7 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 43.8 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_

In [2]:
import os
import random
import transformers
import pandas as pd
import pandas as pd
import numpy as np
from datetime import date
from google.colab import drive
from seqeval.metrics import accuracy_score
from IPython.display import display, HTML
from collections import defaultdict, Counter, OrderedDict
from datasets import load_dataset, load_metric, load_from_disk, ClassLabel, Sequence
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification

#inference
import torch
from transformers import pipeline

print(transformers.__version__)

4.12.5


In [3]:
system = "COLAB" #["AWS", "COLAB"]

In [4]:
if system=="AWS":
    fs = s3fs.S3FileSystem()    
    s3_bucket = "govuk-data-infrastructure-integration"
    DATA_DIR = f's3://{s3_bucket}/model-data/govner-data'
    for f in fs.ls(DATA_DIR):
        print(f)
    #Manage interactions with the Amazon SageMaker APIs and any other AWS services needed.
    # sagemaker session bucket -> used for uploading data, models and logs
    # sagemaker will automatically create this bucket if it not exists
    sess = sagemaker.Session() 
    sagemaker_session_bucket= s3_bucket
    if sagemaker_session_bucket is None and sess is not None:
        # set to default bucket if a bucket name is not given
        sagemaker_session_bucket = sess.default_bucket()
        
    role = sagemaker.get_execution_role()
    sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

    print(f"sagemaker role arn: {role}")
    print(f"sagemaker bucket: {sess.default_bucket()}")
    print(f"sagemaker session region: {sess.boto_region_name}")
elif system=="COLAB":
    drive.mount("/content/gdrive")
    DATA_DIR = os.path.join("/content/gdrive/Shared drives/", "GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data")
    MODEL_DIR = os.path.join("/content/gdrive/Shared drives/", "GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models")

Mounted at /content/gdrive


In [5]:
print("Data Folder: {}".format(DATA_DIR))
print(os.listdir(DATA_DIR)[:3])
print("Model Folder: {}".format(MODEL_DIR))
print(os.listdir(MODEL_DIR)[:3])

Data Folder: /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data
['label_map_09062020_more_ents.json', 'label_map_12062020_more_ents.json', 'line_by_line_NER_data_sampled_09062020_more_ents.csv']
Model Folder: /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models
['distilbert-base-uncased-finetuned-ner-conll2003', 'distilbert-base-uncased-finetuned-ner-govuk', 'distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-07-12-2021']


## 2. Load Data

Define some variables that will be useful.

In [6]:
task = "ner"
dataset_name = "govuk"
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

In [7]:
dataset_type = "SAMPLED" #"FULL"

In [8]:
if dataset_type == "SAMPLED":
  hf_data = 'samp_hf_govuk_data'
  hf_data_path = f'{DATA_DIR}/{hf_data}'
  print("Data path: {}".format(hf_data_path))
elif dataset_type == "FULL":
  hf_data = 'hf_govuk_data'
  hf_data_path = f'{DATA_DIR}/{hf_data}'
  print("Data path: {}".format(hf_data_path))

Data path: /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/samp_hf_govuk_data


Load the dataset that has been saved to disk in a HuggingFace DatasetDict (Apache Arrow).

In [9]:
datasets = load_from_disk(hf_data_path)

In [10]:
datasets

DatasetDict({
    train: Dataset({
        features: ['new_label_list_id', 'text_token'],
        num_rows: 8500
    })
    test: Dataset({
        features: ['new_label_list_id', 'text_token'],
        num_rows: 1500
    })
})

Inspect an element

In [11]:
datasets["train"][5]

{'new_label_list_id': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  5,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0],
 'text_token': ['If',
  'you',
  'can',
  '’',
  't',
  'use',
  'the',
  'form',
  'finder',
  'to',
  'download',
  'the',
  'application',
  'forms',
  'and',
  'guidance',
  'notes',
  'you',
  '’',
  'll',
  'need',
  '.']}

The labels are already coded as integer ids to be easily usable by our model, but the correspondence with the actual categories is stored in the features of the dataset:

In [12]:
datasets["train"].features[f"new_label_list_id"]

Sequence(feature=ClassLabel(num_classes=13, names=['O', 'CONTACT', 'DATE', 'EVENT', 'FINANCE', 'FORM', 'LOCATION', 'MISC', 'MONEY', 'ORGANIZATION', 'PERSON', 'SCHEME', 'STATE'], names_file=None, id=None), length=-1, id=None)

In [13]:
label_list = datasets["train"].features[f"new_label_list_id"].feature.names
label_list

['O',
 'CONTACT',
 'DATE',
 'EVENT',
 'FINANCE',
 'FORM',
 'LOCATION',
 'MISC',
 'MONEY',
 'ORGANIZATION',
 'PERSON',
 'SCHEME',
 'STATE']

Show some random examples from the dataset in HTML format - this makes it easier to read than from the json.

In [14]:
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [15]:
show_random_elements(datasets["train"])

Unnamed: 0,new_label_list_id,text_token
0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, EVENT, O, O, O, O, O, O]","[You, can, ’, t, use, :, a, soft-top, convertible, ,, a, car, with, a, 2+2, seating, arrangement, rather, than, full-size, rear, seats, .]"
1,"[O, O, CONTACT, O, O, O, O, O, FINANCE, O, FINANCE, FINANCE, FINANCE, FINANCE, FINANCE, FINANCE, O, FINANCE]","[The, following, guides, contain, more, detailed, information, :, vouchers, and, credit, tokens, Class, 1, National, Insurance, :, vouchers]"
2,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, CONTACT, O, O, O]","[The, data, collection, calendar, helps, to, enable, more, efficient, and, effective, planning, by, providing, a, structured, timetable, for, :, data, approvals, ,, requests, and, returns, .]"
3,"[O, O, O, O, O, O, O, O, LOCATION, LOCATION, O, O, LOCATION, O, O, O, O, O, O, O, O, O]","[The, subsequent, granting, of, autonomy, in, 1969, by, the, UK, led, to, Spain, closing, the, border, and, severing, all, communication, links, .]"
4,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, LOCATION, O]","[The, victims, were, a, man, who, fell, from, his, roof, as, he, attempted, to, repair, it, and, six, migrants, who, drowned, when, their, boat, sank, while, it, was, trying, to, reach, the, islands, from, Africa, .]"
5,"[ORGANIZATION, O, O, O, O, ORGANIZATION, ORGANIZATION, O, O, ORGANIZATION, O, O, O, O, O, O, O, DATE, O]","[Business, and, technical, specification, for, local, authorities, and, software, suppliers, to, prepare, for, the, alternative, provision, census, 2019, .]"
6,"[PERSON, PERSON, O, O, LOCATION, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, STATE, O, O, DATE, LOCATION, O, O, O, O, O, O, O, O, O]","[Mr., Bolton, is, visiting, Tokyo, to, observe, multinational, exercises, ,, held, under, the, U.S.-led, Proliferation, Security, Initiative, ,, aimed, at, training, troops, from, several, countries, to, intercept, weapons, of, mass, destruction, at, sea, .]"
7,"[O, O, O, O, O, O, O, O, ORGANIZATION, ORGANIZATION, ORGANIZATION, PERSON, O, O, O, O, O, FORM, O]","[Public, Guardian, practice, note, (, PN6, ), :, Court, of, Protection, visitors, and, the, release, of, their, reports, .]"
8,"[O, O, O, O, O, O, O, O, O, O, ORGANIZATION, O, O, O, LOCATION, O]","[It, also, provides, an, indicative, pipeline, of, upcoming, and, existing, government, steel, requirements, in, England, .]"
9,"[O, ORGANIZATION, O, O, ORGANIZATION, O, O, O, O, O, FINANCE, O, O, O, O, O, O, O, O, O, O, O, O]","[Disposal, Services, Authority, (, DSA, ), is, inviting, expressions, of, interest, for, the, tender, of, the, former, HMS, Endurance, (, A171, ), .]"


## 3. Tokenise the Data

Download tokeniser that will be used to tokenise the data.

In [16]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

The assert keyword lets you test if a condition in your code returns True, if not, the program will raise an AssertionError.

In [17]:
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

How does the tokeniser work?

In [18]:
#observe how the tokeniser works on a string
tokenizer("Hello, this is one sentence!")

{'input_ids': [101, 7592, 1010, 2023, 2003, 2028, 6251, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [19]:
#observe how the tokeniser works on a list of tokens
tokenizer(["Hello", ",", "this", "is", "one", "sentence", "split", "into", "words", "."], is_split_into_words=True)

{'input_ids': [101, 7592, 1010, 2023, 2003, 2028, 6251, 3975, 2046, 2616, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Try this out on example, tokens 4 from training set.

In [20]:
example = datasets["train"][5]
print(example["text_token"])

['If', 'you', 'can', '’', 't', 'use', 'the', 'form', 'finder', 'to', 'download', 'the', 'application', 'forms', 'and', 'guidance', 'notes', 'you', '’', 'll', 'need', '.']


In [21]:
tokenized_input = tokenizer(example["text_token"], is_split_into_words=True)
print(tokenized_input)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

{'input_ids': [101, 2065, 2017, 2064, 1521, 1056, 2224, 1996, 2433, 2424, 2121, 2000, 8816, 1996, 4646, 3596, 1998, 8606, 3964, 2017, 1521, 2222, 2342, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'if', 'you', 'can', '’', 't', 'use', 'the', 'form', 'find', '##er', 'to', 'download', 'the', 'application', 'forms', 'and', 'guidance', 'notes', 'you', '’', 'll', 'need', '.', '[SEP]']


Print the number of token labels in the data, and the length of the tokenised input. They are different, because special tokens are added to the start and end of a list when tokenised.

In [22]:
len(example[f"new_label_list_id"]), len(tokenized_input["input_ids"])

(22, 25)

If we look into these examples, we can see they are added to the start and end.

In [23]:
print(tokenized_input.word_ids())

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, None]


We can align these labels, by adding '-100' where there are None.

In [24]:
word_ids = tokenized_input.word_ids()
print(word_ids)
aligned_labels = [-100 if i is None else example[f"new_label_list_id"][i] for i in word_ids]
print(aligned_labels)
print(len(aligned_labels), len(tokenized_input["input_ids"]))

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, None]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 0, 1, 1, 0, 0, 0, 0, 0, -100]
25 25


We now need to tokenise each example and align the labels.


In [25]:
label_all_tokens = True

In [26]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["text_token"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"new_label_list_id"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

Now we can tokenise and align training examples in the datasets.

In [27]:
tokenize_and_align_labels(datasets['train'][:5])

{'input_ids': [[101, 2062, 2084, 2382, 1010, 2199, 2111, 2031, 2042, 2730, 1012, 102], [101, 2057, 1521, 2222, 2036, 4604, 2017, 1037, 2862, 1997, 1996, 5310, 18442, 2015, 2005, 2169, 2914, 1999, 2115, 3404, 1012, 102], [101, 29191, 7524, 2064, 14396, 2000, 6139, 1010, 2021, 2045, 2024, 2116, 2060, 4216, 1999, 1996, 4044, 6327, 2000, 4721, 1012, 102], [101, 5713, 7040, 19104, 2231, 11099, 1999, 1996, 2770, 1997, 2120, 2374, 8924, 1012, 102], [101, 1996, 2087, 2590, 8066, 1999, 7748, 2003, 4851, 3980, 2306, 1037, 7748, 4512, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, -100], [-100, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, -100], [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [28]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Loading cached processed dataset at /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/samp_hf_govuk_data/train/cache-d85c3e8645c648c7.arrow
Loading cached processed dataset at /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/samp_hf_govuk_data/test/cache-ac5e35aabc4af1ff.arrow


## 3. Metrics

We can use common metrics, such as those used by NER evaluation strategies - however, our training data is not organised in any of the IOB1, IOB2, IOE1, IOE2, IOBES or IO formats.

For this reason, we may need to use custom metrics. But we can try with both and assess outputs.

### 3A. 'seqeval' metrics

In [29]:
metric = load_metric("seqeval")

Downloading:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

In [None]:
label_list

['O',
 'CONTACT',
 'DATE',
 'EVENT',
 'FINANCE',
 'FORM',
 'LOCATION',
 'MISC',
 'MONEY',
 'ORGANIZATION',
 'PERSON',
 'SCHEME',
 'STATE']

In [None]:
labels = [label_list[i] for i in example[f"new_label_list_id"]]
print(labels)
metric.compute(predictions=[labels], references=[labels])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'FORM', 'FORM', 'O', 'CONTACT', 'CONTACT', 'O', 'O', 'O', 'O', 'O']




{'ONTACT': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'ORM': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'overall_accuracy': 1.0,
 'overall_f1': 1.0,
 'overall_precision': 1.0,
 'overall_recall': 1.0}

In [None]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

### 3B. Custom Metrics

In [33]:
def custom_get_entities(llist):
    prev_tag = ""
    indices = []
    for i, ent in enumerate(llist):
        if ent!=prev_tag:
            indices.append([ent,i,i])
        else:
            indices[-1][2] = i
        prev_tag = ent
    return [tuple(i) for i in indices if i[0]!="O"]    

def custom_f1_score(y_true, y_pred):
    true_entities = set(custom_get_entities(y_true))
    pred_entities = set(custom_get_entities(y_pred))
    
    # intersection of predicted and true indexed named
    # entities
    nb_correct = len(true_entities & pred_entities)
    nb_pred = len(pred_entities)
    nb_true = len(true_entities)
    
    p = nb_correct / nb_pred if nb_pred > 0 else 0
    r = nb_correct / nb_true if nb_true > 0 else 0

    return 2 * p * r / (p + r) if p + r > 0 else 0

def custom_precision_score(y_true, y_pred):
    true_entities = set(custom_get_entities(y_true))
    pred_entities = set(custom_get_entities(y_pred))
    
    nb_correct = len(true_entities & pred_entities)
    nb_pred = len(pred_entities)
    
    return nb_correct / nb_pred if nb_pred > 0 else 0


def custom_recall_score(y_true, y_pred):
    true_entities = set(custom_get_entities(y_true))
    pred_entities = set(custom_get_entities(y_pred))
    
    nb_correct = len(true_entities & pred_entities)
    nb_true = len(true_entities)

    return nb_correct / nb_true if nb_true > 0 else 0

def custom_classification_report(y_true, y_pred, digits=2):
    true_entities = set(custom_get_entities(y_true))
    pred_entities = set(custom_get_entities(y_pred))

    name_width = 0
    d1 = defaultdict(set)
    d2 = defaultdict(set)
    for e in true_entities:
        d1[e[0]].add((e[1], e[2]))
        name_width = max(name_width, len(e[0]))
    for e in pred_entities:
        d2[e[0]].add((e[1], e[2]))

    last_line_heading = 'macro avg'
    width = max(name_width, len(last_line_heading), digits)

    headers = ["precision", "recall", "f1-score", "support"]
    head_fmt = u'{:>{width}s} ' + u' {:>9}' * len(headers)
    report = head_fmt.format(u'', *headers, width=width)
    report += u'\n\n'

    row_fmt = u'{:>{width}s} ' + u' {:>9.{digits}f}' * 3 + u' {:>9}\n'

    ps, rs, f1s, s = [], [], [], []
    for type_name, true_entities in d1.items():
        pred_entities = d2[type_name]
        nb_correct = len(true_entities & pred_entities)
        nb_pred = len(pred_entities)
        nb_true = len(true_entities)

        p = nb_correct / nb_pred if nb_pred > 0 else 0
        r = nb_correct / nb_true if nb_true > 0 else 0
        f1 = 2 * p * r / (p + r) if p + r > 0 else 0

        report += row_fmt.format(*[type_name, p, r, f1, nb_true], 
                                 width=width, digits=digits)

        ps.append(p)
        rs.append(r)
        f1s.append(f1)
        s.append(nb_true)

    report += u'\n'

    # compute averages
    report += row_fmt.format('micro avg',
                             precision_score(y_true, y_pred),
                             recall_score(y_true, y_pred),
                             f1_score(y_true, y_pred),
                             np.sum(s),
                             width=width, digits=digits)
    report += row_fmt.format(last_line_heading,
                             np.average(ps, weights=s),
                             np.average(rs, weights=s),
                             np.average(f1s, weights=s),
                             np.sum(s),
                             width=width, digits=digits)

    return report

y_true = ['a','a','b','o','o','i','a']
y_pred = ['a','a','O','o']
print(custom_precision_score(y_true, y_pred))
print(custom_recall_score(y_true, y_pred))
print(custom_f1_score(y_true, y_pred))
print(accuracy_score(y_true, y_pred))

0.5
0.2
0.28571428571428575
0.42857142857142855


In [34]:
def custom_compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    #results = metric.compute(predictions=true_predictions, references=true_labels)
    results = {'overall_precision':custom_precision_score(true_labels, true_predictions),
               'overall_recall':custom_recall_score(true_labels, true_predictions),
               'overall_f1':custom_f1_score(true_labels, true_predictions),
               'overall_accuracy':accuracy_score(true_labels, true_predictions)}

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## 4. Modelling

First, instantiate a model that will be used, **make sure it is the same as the tokeniser you are using!** Use the number of labels that are in your label list - this ensures there will be an output class for each token.

In [35]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

Define the training arguments that will dictate how the model will train.

In [36]:
model_name = model_checkpoint.split("/")[-1]
tod_date = date.today().strftime("%d-%m-%Y")
# full_model_name = f"{model_name}-finetuned-{task}-{dataset_name}-{dataset_type}-{tod_date}"
# print(full_model_name)
# print(MODEL_DIR)
OUTPUT_PATH = f"{MODEL_DIR}/{model_name}-finetuned-{task}-{dataset_name}-{dataset_type}-{tod_date}"
print(OUTPUT_PATH)

args = TrainingArguments(
    output_dir=OUTPUT_PATH,
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-08-12-2021


The *Data Collator* in the trainer, automatically pads the model inputs in a batch to the length of the longest example. This bypasses the need to set a global maximum sequence length, and in practice leads to faster training since we perform fewer redundant computations on the padded tokens and attention masks.

For token classification tasks, there is a dedicated *DataCollatorForTokenClassification* which expects a list of dicts, where each dict represents a single example in the dataset.



In [37]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [38]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=custom_compute_metrics
)

In [39]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: text_token, new_label_list_id.
***** Running training *****
  Num examples = 8500
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1596


Epoch,Training Loss,Validation Loss


Saving model checkpoint to /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-08-12-2021/checkpoint-500
Configuration saved in /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-08-12-2021/checkpoint-500/config.json
Model weights saved in /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-08-12-2021/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-08-12-2021/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk

TypeError: ignored

In [None]:
tokenizer.save_pretrained(OUTPUT_PATH)

tokenizer config file saved in /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-07-12-2021/tokenizer_config.json
Special tokens file saved in /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-07-12-2021/special_tokens_map.json


('/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-07-12-2021/tokenizer_config.json',
 '/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-07-12-2021/special_tokens_map.json',
 '/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-07-12-2021/vocab.txt',
 '/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-07-12-2021/added_tokens.json',
 '/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-07-12-2021/tokenizer.json')

# Load Model for Inference

Load model from local


In [None]:
model_name = model_checkpoint.split("/")[-1]
model_name

'distilbert-base-uncased'

In [None]:
os.listdir(OUTPUT_PATH)

['runs',
 'checkpoint-500',
 'checkpoint-1000',
 'checkpoint-1500',
 'tokenizer_config.json',
 'special_tokens_map.json',
 'vocab.txt',
 'tokenizer.json']

In [None]:
checkpoint = os.path.join(OUTPUT_PATH, 'checkpoint-1500')
checkpoint

'/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-07-12-2021/checkpoint-1500'

In [None]:
local_tokenizer = AutoTokenizer.from_pretrained(checkpoint, local_files_only=True)
local_model = AutoModelForTokenClassification.from_pretrained(checkpoint, local_files_only=True)

Didn't find file /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-07-12-2021/checkpoint-1500/added_tokens.json. We won't load it.
loading file /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-07-12-2021/checkpoint-1500/vocab.txt
loading file /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-07-12-2021/checkpoint-1500/tokenizer.json
loading file None
loading file /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-07-12-2021/checkpoint-1500/special_tokens_map.json
loading file /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-SAMPLED

In [None]:
sequences = ["my name is rory"]

In [None]:
processed_tokens = local_tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

In [None]:
processed_tokens

{'input_ids': tensor([[  101,  2026,  2171,  2003, 14285,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [None]:
output = local_model(**processed_tokens)

In [None]:
output

TokenClassifierOutput([('logits',
                        tensor([[[ 2.8993,  0.1470, -0.6616, -0.5108, -0.7076, -1.0041,  0.5169,
                                  -0.1816, -0.8108, -0.7580,  0.6449, -0.2432, -0.2667],
                                 [ 5.6104,  0.1993, -0.2681, -1.0564, -0.8452, -0.5151, -0.3158,
                                  -0.3174, -1.5181, -0.5763, -0.1526, -1.2987,  0.0631],
                                 [ 4.8367,  0.4680, -0.6447, -0.6074, -1.1769, -0.6730, -0.3024,
                                   0.1316, -1.6352, -0.3909,  0.8501, -2.0412,  0.7347],
                                 [ 6.5752,  0.0176, -0.5423, -0.6202, -0.9750, -0.6830, -0.4983,
                                  -0.1236, -1.3810, -0.8320, -0.0702, -1.7313,  0.7919],
                                 [ 4.9944, -1.1337, -0.7831,  0.2343, -0.8367, -1.0392,  0.2068,
                                  -0.3020, -1.8287, -1.1084,  1.8468, -1.1778,  0.7143],
                                 [ 2

In [None]:
print(output.logits)

tensor([[[ 2.8993,  0.1470, -0.6616, -0.5108, -0.7076, -1.0041,  0.5169,
          -0.1816, -0.8108, -0.7580,  0.6449, -0.2432, -0.2667],
         [ 5.6104,  0.1993, -0.2681, -1.0564, -0.8452, -0.5151, -0.3158,
          -0.3174, -1.5181, -0.5763, -0.1526, -1.2987,  0.0631],
         [ 4.8367,  0.4680, -0.6447, -0.6074, -1.1769, -0.6730, -0.3024,
           0.1316, -1.6352, -0.3909,  0.8501, -2.0412,  0.7347],
         [ 6.5752,  0.0176, -0.5423, -0.6202, -0.9750, -0.6830, -0.4983,
          -0.1236, -1.3810, -0.8320, -0.0702, -1.7313,  0.7919],
         [ 4.9944, -1.1337, -0.7831,  0.2343, -0.8367, -1.0392,  0.2068,
          -0.3020, -1.8287, -1.1084,  1.8468, -1.1778,  0.7143],
         [ 2.9916, -0.5779,  0.0218, -0.2238, -1.0729, -1.1140,  0.1425,
           0.2258, -0.9614,  0.3077,  0.8430, -0.4378, -0.9266]]],
       grad_fn=<AddBackward0>)


In [None]:
# Replace this with your own checkpoint
token_classifier = pipeline(
    "token-classification", model=local_model, tokenizer=local_tokenizer, aggregation_strategy="simple"
)

In [None]:
string = "The show is on the Disney Channel. It airs at 8pm. It will be shown in spanish and english."
print(string)
print(len(string))

The show is on the Disney Channel. It airs at 8pm. It will be shown in spanish and english.
91


In [None]:
result = token_classifier("The show is on the Disney Channel. It airs at 8pm. It will be shown in spanish and english.")

In [None]:
result

[{'end': 18,
  'entity_group': 'LABEL_0',
  'score': 0.9975546,
  'start': 0,
  'word': 'the show is on the'},
 {'end': 25,
  'entity_group': 'LABEL_9',
  'score': 0.67886204,
  'start': 19,
  'word': 'disney'},
 {'end': 45,
  'entity_group': 'LABEL_0',
  'score': 0.9279997,
  'start': 26,
  'word': 'channel. it airs at'},
 {'end': 49,
  'entity_group': 'LABEL_8',
  'score': 0.3829682,
  'start': 46,
  'word': '8pm'},
 {'end': 70,
  'entity_group': 'LABEL_0',
  'score': 0.99888295,
  'start': 49,
  'word': '. it will be shown in'},
 {'end': 78,
  'entity_group': 'LABEL_6',
  'score': 0.5059153,
  'start': 71,
  'word': 'spanish'},
 {'end': 82,
  'entity_group': 'LABEL_0',
  'score': 0.99886703,
  'start': 79,
  'word': 'and'},
 {'end': 90,
  'entity_group': 'LABEL_7',
  'score': 0.55656165,
  'start': 83,
  'word': 'english'},
 {'end': 91,
  'entity_group': 'LABEL_0',
  'score': 0.99909663,
  'start': 90,
  'word': '.'}]

In [None]:
label_list

['O',
 'CONTACT',
 'DATE',
 'EVENT',
 'FINANCE',
 'FORM',
 'LOCATION',
 'MISC',
 'MONEY',
 'ORGANIZATION',
 'PERSON',
 'SCHEME',
 'STATE']

## Visualise Entites

In [None]:
import spacy
from spacy import displacy

text = "Hi my name is Rory Hurley. I work for the Cabinet Office. I speak english and a little bit of spanish"

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
displacy.render(doc, style='ent', jupyter=True, options={'distance': 90})

In [None]:
text = "My name is John Smith and I live in Paris"
entities = [
    ("Employee", 11, 21),  # John Smith
    ("Location", 36, 41),  # Paris
]

In [None]:
import spacy

def display_entities(text, entities):
  nlp = spacy.blank("en")
  doc = nlp(text)
  ents = []
  for ee in entities:
      ents.append(doc.char_span(ee[1], ee[2], ee[0]))
  doc.ents = ents
  displacy.render(doc, style='ent', jupyter=True, options={'distance': 90})

def tokenise_and_display(text):
  result = token_classifier(text)
  print(result)
  res_ents = [(i['entity_group'], i['start'], i['end']) for i in result]
  print(res_ents)
  display_entities(text, entities=res_ents)

In [None]:
display_entities(text, entities)

In [None]:
result[:2]

[{'end': 18,
  'entity_group': 'LABEL_0',
  'score': 0.9975546,
  'start': 0,
  'word': 'the show is on the'},
 {'end': 25,
  'entity_group': 'LABEL_9',
  'score': 0.67886204,
  'start': 19,
  'word': 'disney'}]

In [None]:
res_ents = [(i['entity_group'], i['start'], i['end']) for i in result]
res_ents

[('LABEL_0', 0, 18),
 ('LABEL_9', 19, 25),
 ('LABEL_0', 26, 45),
 ('LABEL_8', 46, 49),
 ('LABEL_0', 49, 70),
 ('LABEL_6', 71, 78),
 ('LABEL_0', 79, 82),
 ('LABEL_7', 83, 90),
 ('LABEL_0', 90, 91)]

In [None]:
display_entities(string, res_ents)

In [None]:
tokenise_and_display(string)

[{'entity_group': 'LABEL_0', 'score': 0.9975546, 'word': 'the show is on the', 'start': 0, 'end': 18}, {'entity_group': 'LABEL_9', 'score': 0.67886204, 'word': 'disney', 'start': 19, 'end': 25}, {'entity_group': 'LABEL_0', 'score': 0.9279997, 'word': 'channel. it airs at', 'start': 26, 'end': 45}, {'entity_group': 'LABEL_8', 'score': 0.3829682, 'word': '8pm', 'start': 46, 'end': 49}, {'entity_group': 'LABEL_0', 'score': 0.99888295, 'word': '. it will be shown in', 'start': 49, 'end': 70}, {'entity_group': 'LABEL_6', 'score': 0.5059153, 'word': 'spanish', 'start': 71, 'end': 78}, {'entity_group': 'LABEL_0', 'score': 0.99886703, 'word': 'and', 'start': 79, 'end': 82}, {'entity_group': 'LABEL_7', 'score': 0.55656165, 'word': 'english', 'start': 83, 'end': 90}, {'entity_group': 'LABEL_0', 'score': 0.99909663, 'word': '.', 'start': 90, 'end': 91}]
[('LABEL_0', 0, 18), ('LABEL_9', 19, 25), ('LABEL_0', 26, 45), ('LABEL_8', 46, 49), ('LABEL_0', 49, 70), ('LABEL_6', 71, 78), ('LABEL_0', 79, 82)