In [1]:
# Transformers installation
! pip -q install transformers datasets 

# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

# Fine-tune a pretrained model for NER task

## Prepare a dataset

In [2]:
from datasets import load_dataset
import datasets

# Load the "conllpp" corpus; the result holds every split of the dataset.
dataset = load_dataset("conllpp")

# Show the split overview, then a one-row slice of the training split.
print(dataset)
print(dataset["train"][4:5])



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})
{'id': ['4'], 'tokens': [['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.']], 'pos_tags': [[22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 22, 38, 15, 22, 24, 20, 37, 21, 15, 24, 16, 15, 22, 15, 12, 16, 21, 38, 17, 7]], 'chunk_tags': [[11, 11, 12, 13, 11, 12, 12, 11, 12, 12, 12, 12, 21, 13, 11, 12, 21, 22, 11, 13, 11, 1, 13, 11, 17, 11, 12, 12, 21

In [3]:
# Show the split names, column names and row counts of the loaded corpus again.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


In [4]:
# Tag names are read from the dataset schema; the rest of the notebook maps a
# tag id to its name with `label_list[tag_id]`.
ner_tag_feature = dataset["train"].features["ner_tags"]
label_list = ner_tag_feature.feature.names

label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

As you now know, you need a tokenizer to process the text and include a padding and truncation strategy to handle any variable sequence lengths. To process your dataset in one step, use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/process.html#map) method to apply a preprocessing function over the entire dataset:

In [5]:
from transformers import AutoTokenizer

# GPT-2 tokenizer, created with `add_prefix_space=True` for use on pre-split words.
tokenizer = AutoTokenizer.from_pretrained("gpt2", add_prefix_space=True)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of sentences that are already split into words.

    Args:
        examples: batch mapping whose "tokens" entry holds one word list per sentence.

    Returns:
        The tokenizer output for the batch (truncation enabled).
    """
    word_lists = examples["tokens"]
    return tokenizer(word_lists, truncation=True, is_split_into_words=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [6]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split sentences and align word-level NER tags to sub-word tokens.

    Args:
        examples: batch mapping with "tokens" (one word list per sentence) and
            "ner_tags" (one tag-id list per sentence, parallel to "tokens").
        label_all_tokens: when True, every sub-word token of a word receives
            that word's tag. When False, only the first sub-word token keeps
            the tag and the remaining ones are marked with -100.

    Returns:
        The tokenizer output with an extra "labels" entry holding one list of
        label ids per sentence, one id per produced token.
    """
    # -100 marks positions that must not count as a real tag. Label 0 is the
    # genuine class 'O', so it cannot be used as a mask value. -100 is also the
    # value `compute_metrics` filters out and the default `ignore_index` of
    # PyTorch's cross-entropy loss.
    ignore_index = -100

    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        # word_ids() maps every token to the index of the word it came from
        # (None for tokens that belong to no word, e.g. special tokens).
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Token without a source word: exclude it from loss and metrics.
                label_ids.append(ignore_index)
            elif word_idx != previous_word_idx:
                # First sub-word token of a word: use the word's tag.
                label_ids.append(label[word_idx])
            else:
                # Continuation sub-word token: repeat the word's tag, or mask it
                # out when only first sub-words should be labelled.
                label_ids.append(label[word_idx] if label_all_tokens else ignore_index)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [7]:
# Align a single training sentence and show the raw tokenizer output.
q = tokenize_and_align_labels(dataset['train'][4:5])
print(q)

# Print every sub-word token next to the tag it was assigned, so the alignment
# can be checked by eye (the label was previously zipped in but never shown).
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]), q["labels"][0]):
    tag = label_list[label] if label != -100 else "IGNORED"
    print(f"{token}\t{tag}")

{'input_ids': [[4486, 705, 82, 8852, 284, 262, 3427, 4479, 705, 82, 38435, 5583, 48642, 1168, 5469, 9038, 531, 319, 3583, 7008, 815, 2822, 15900, 41495, 422, 2678, 584, 621, 5491, 1566, 262, 5654, 5608, 373, 22363, 764]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[5, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0]]}
ĠGermany
Ġ'
s
Ġrepresentative
Ġto
Ġthe
ĠEuropean
ĠUnion
Ġ'
s
Ġveterinary
Ġcommittee
ĠWerner
ĠZ
wing
mann
Ġsaid
Ġon
ĠWednesday
Ġconsumers
Ġshould
Ġbuy
Ġsheep
meat
Ġfrom
Ġcountries
Ġother
Ġthan
ĠBritain
Ġuntil
Ġthe
Ġscientific
Ġadvice
Ġwas
Ġclearer
Ġ.


In [8]:
# Tokenize the dataset in batches; `tokenize_and_align_labels` adds the "labels" column.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

If you like, you can create a smaller subset of the full dataset to fine-tune on to reduce the time it takes:

In [9]:
# Shuffle with a fixed seed, then keep the first rows as the reduced subsets.
shuffled_train = tokenized_datasets["train"].shuffle(seed=42)
shuffled_eval = tokenized_datasets["validation"].shuffle(seed=42)

small_train_dataset = shuffled_train.select(range(2000))
small_eval_dataset = shuffled_eval.select(range(600))
# The test split is shuffled but no rows are dropped.
small_test_dataset = tokenized_datasets["test"].shuffle(seed=42)

# Print the first example of each subset.
for subset in (small_train_dataset, small_eval_dataset, small_test_dataset):
    print(subset[0])

{'id': '1469', 'tokens': ['"', 'Neither', 'the', 'National', 'Socialists', '(', 'Nazis', ')', 'nor', 'the', 'communists', 'dared', 'to', 'kidnap', 'an', 'American', 'citizen', ',', '"', 'he', 'shouted', ',', 'in', 'an', 'oblique', 'reference', 'to', 'his', 'extradition', 'to', 'Germany', 'from', 'Denmark', '.', '"'], 'pos_tags': [0, 12, 12, 22, 23, 4, 23, 5, 10, 12, 24, 38, 35, 37, 12, 16, 21, 6, 0, 28, 38, 6, 15, 12, 16, 21, 35, 29, 21, 35, 22, 15, 22, 7, 0], 'chunk_tags': [0, 11, 11, 12, 12, 0, 11, 0, 0, 11, 12, 21, 22, 22, 11, 12, 12, 0, 0, 11, 21, 0, 13, 11, 12, 12, 13, 11, 12, 13, 11, 13, 11, 12, 0], 'ner_tags': [0, 0, 0, 7, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 5, 0, 0], 'input_ids': [366, 16126, 262, 2351, 5483, 1023, 357, 19147, 1267, 4249, 262, 40938, 28765, 284, 49679, 281, 1605, 9511, 837, 366, 339, 17293, 837, 287, 281, 909, 41522, 4941, 284, 465, 34908, 284, 4486, 422, 16490, 764, 366], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1

<a id='trainer'></a>

## Train

At this point, you should follow the section corresponding to the framework you want to use. You can use the links
in the right sidebar to jump to the one you want - and if you want to hide all of the content for a given framework,
just use the button at the top-right of that framework's block!

## Train with PyTorch Trainer

🤗 Transformers provides a [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) class optimized for training 🤗 Transformers models, making it easier to start training without manually writing your own training loop. The [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) API supports a wide range of training options and features such as logging, gradient accumulation, and mixed precision.

Start by loading your model and specifying the number of expected labels. From the `label_list` extracted above (see also the CoNLL++ [dataset card](https://huggingface.co/datasets/conllpp)), you know there are nine NER labels:

In [10]:
from transformers import GPT2ForTokenClassification

# Size the classification head from the dataset's tag list instead of a
# hard-coded 9, and store the id <-> tag-name mapping in the model config so
# saved checkpoints carry readable label names.
model = GPT2ForTokenClassification.from_pretrained(
    "gpt2",
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={name: idx for idx, name in enumerate(label_list)},
)
# Mirror the tokenizer setup, where the pad token was set to the EOS token.
model.config.pad_token_id = model.config.eos_token_id

model

Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['classifier.bias', 'classifier.weight', 'h.3.attn.masked_bias', 'h.8.attn.masked_bias', 'h.11.attn.masked_bias', 'h.10.attn.masked_bias', 'h.9.attn.masked_bias', 'h.5.attn.masked_bias', 'h.2.attn.masked_bias', 'h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.4.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForTokenClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dr

### Training hyperparameters

Next, create a [TrainingArguments](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments) class which contains all the hyperparameters you can tune as well as flags for activating different training options. For this tutorial you can start with the default training [hyperparameters](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments), but feel free to experiment with these to find your optimal settings.

Specify where to save the checkpoints from your training:

In [11]:
from transformers import TrainingArguments

# Only the output directory is set here; checkpoints are saved under "test_trainer".
training_args = TrainingArguments(output_dir="test_trainer")

### Evaluate

In [12]:
!pip -q install seqeval

import numpy as np

from datasets import load_metric
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


Call `compute` on `metric` to calculate the precision, recall, F1 and accuracy of your predictions. Before passing your predictions to `compute`, you need to convert the logits to predictions (remember all 🤗 Transformers models return logits):

In [13]:
import numpy as np

def compute_metrics(p):
    """Turn model outputs into seqeval precision, recall, F1 and accuracy.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` holds per-token class
            scores with the class axis at index 2; ``labels`` holds the
            reference label ids, where -100 marks positions to skip.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the metric's "overall_*" results.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, label_id in zip(predicted_row, label_row):
            # Positions labelled -100 are excluded from both sequences.
            if label_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[label_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
 

If you'd like to monitor your evaluation metrics during fine-tuning, specify the `evaluation_strategy` parameter in your training arguments to report the evaluation metric at the end of each epoch:

In [14]:
from transformers import TrainingArguments, Trainer

# Evaluate once per epoch and log every 100 steps; checkpoints go to "test_trainer".
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    evaluation_strategy="epoch",
    logging_steps=100,
)

### Trainer

Create a [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) object with your model, training arguments, training and evaluation datasets, data collator, and evaluation function:

In [15]:
from transformers import DataCollatorForTokenClassification 
# Batches are assembled by a token-classification collator built from the tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Wire model, hyperparameters, data subsets and the metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [16]:
import torch
# Clear PyTorch's CUDA cache, then print the full (non-abbreviated) memory summary.
torch.cuda.empty_cache()
memory_report = torch.cuda.memory_summary(device=None, abbreviated=False)
print(memory_report)


|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  499195 KB |  499195 KB |  499195 KB |       0 B  |
|       from large pool |  486400 KB |  486400 KB |  486400 KB |       0 B  |
|       from small pool |   12795 KB |   12795 KB |   12795 KB |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |  499195 KB |  499195 KB |  499195 KB |       0 B  |
|       from large pool |  486400 KB |  486400 KB |  486400 KB |       0 B  |
|       from small pool |   12795 KB |   12795 KB |   12795 KB |       0 B  |
|---------------------------------------------------------------

Then fine-tune your model by calling [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train):

In [17]:
# Run fine-tuning with the Trainer created earlier in the notebook.
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2ForTokenClassification.forward` and have been ignored: chunk_tags, pos_tags, ner_tags, tokens, id. If chunk_tags, pos_tags, ner_tags, tokens, id are not expected by `GPT2ForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2000
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 5000
  Number of trainable parameters = 124446729
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3067,0.248665,0.634979,0.631002,0.632984,0.921384
2,0.1955,0.171955,0.732523,0.754697,0.743445,0.947646
3,0.1459,0.171168,0.76696,0.772965,0.769951,0.952065
4,0.1002,0.1892,0.765301,0.789666,0.777293,0.953425
5,0.0886,0.210309,0.757301,0.784969,0.770887,0.95283
6,0.0587,0.223951,0.751249,0.784969,0.767739,0.952065
7,0.0449,0.226826,0.781568,0.801148,0.791237,0.95691
8,0.0498,0.24096,0.763591,0.799061,0.780923,0.95402
9,0.0409,0.246911,0.767964,0.803236,0.785204,0.95538
10,0.0395,0.244298,0.770771,0.803758,0.786919,0.95504


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test_trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in test_trainer/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `GPT2ForTokenClassification.forward` and have been ignored: chunk_tags, pos_tags, ner_tags, tokens, id. If chunk_tags, pos_tags, ner_tags, tokens, id are not expected by `GPT2ForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 600
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test_trainer/checkpoint-1000/toke

TrainOutput(global_step=5000, training_loss=0.1321024639606476, metrics={'train_runtime': 548.3775, 'train_samples_per_second': 36.471, 'train_steps_per_second': 9.118, 'total_flos': 339029499951072.0, 'train_loss': 0.1321024639606476, 'epoch': 10.0})

In [29]:
# Run inference on whatever device the model currently lives on instead of
# assuming CUDA is present.
device = model.device

# Inference mode: disable dropout and skip gradient bookkeeping.
model.eval()

yPredList = []
yTrueList = []
print(len(small_test_dataset))

with torch.no_grad():
    for example in small_test_dataset:
        # Add an explicit batch dimension so the logits are (1, seq_len, num_labels).
        input_ids = torch.tensor(example["input_ids"], device=device).unsqueeze(0)
        logits = model(input_ids=input_ids).logits[0]
        # The class axis is always the last one, whatever the leading shape.
        pred_ids = logits.argmax(dim=-1).tolist()

        yPred = []
        yTrue = []
        for pred_id, label_id in zip(pred_ids, example["labels"]):
            # Skip masked positions, matching `compute_metrics`; indexing
            # `label_list` with -100 would raise an IndexError.
            if label_id == -100:
                continue
            yPred.append(label_list[pred_id])
            yTrue.append(label_list[label_id])

        yPredList.append(yPred)
        yTrueList.append(yTrue)

results = metric.compute(predictions=yPredList, references=yTrueList)
report = {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}
print(report)

3453
{'precision': 0.7246600443630051, 'recall': 0.7450669310857709, 'f1': 0.7347218148039504, 'accuracy': 0.9392402545743834}


In [31]:
import torch
from torch.nn import functional as F
# Device the fine-tuned model lives on (no hard-coded assumption that CUDA exists).
device = model.device

def classifySentences(sentences):
    """Print each sentence's sub-word tokens together with their predicted NER tags.

    Args:
        sentences: iterable of raw (untokenized) sentence strings.

    Returns:
        None. For every sentence this prints the list of decoded tokens, the
        list of predicted tag names, and a blank separator.
    """
    # Inference mode: disable dropout.
    model.eval()
    for sent in sentences:
        encoded_prompt = tokenizer(sent)
        token_ids = encoded_prompt["input_ids"]

        if not token_ids:
            # Nothing to classify; keep the output shape without calling the model.
            print([])
            print([])
            print("\n")
            continue

        # Explicit batch dimension: logits come back as (1, seq_len, num_labels).
        input_ids = torch.tensor(token_ids, device=model.device).unsqueeze(0)
        with torch.no_grad():
            logits = model(input_ids=input_ids).logits[0]
        # The class axis is the last one.
        pred = logits.argmax(dim=-1).tolist()

        # Ids are decoded one at a time, so a character that spans several
        # byte-level tokens (e.g. a curly quote) is shown as '\ufffd' pieces.
        sentList = [tokenizer.decode(token_id).strip() for token_id in token_ids]
        labels = [label_list[class_id] for class_id in pred]
        print(sentList)
        print(labels)
        print("\n")
# Free-text sentences (not taken from the dataset) used to try out the fine-tuned model.
sentences = [
    "Alan Yeung is studying natural language processing with GPT in Hong Kong.",
    "Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday .",
    "Tax incentives ‘not enough’ to reverse Hong Kong’s declining birth rate",
    "Liu Guangyuan, commissioner of local Ministry of Foreign Affairs Office, warns US Consul General Gregory May not to endanger national security or slander city’s prospects"
]

# Prints tokens and predicted tags for each sentence; nothing is returned.
classifySentences(sentences)        

['Alan', 'Ye', 'ung', 'is', 'studying', 'natural', 'language', 'processing', 'with', 'G', 'PT', 'in', 'Hong', 'Kong', '.']
['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'B-ORG', 'O', 'B-LOC', 'I-LOC', 'O']


['Japan', 'began', 'the', 'defence', 'of', 'their', 'Asian', 'Cup', 'title', 'with', 'a', 'lucky', '2', '-', '1', 'win', 'against', 'Syria', 'in', 'a', 'Group', 'C', 'championship', 'match', 'on', 'Friday', '.']
['B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O']


['Tax', 'incentives', '�', '�', 'not', 'enough', '�', '�', 'to', 'reverse', 'Hong', 'Kong', '�', '�', 's', 'declining', 'birth', 'rate']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O']


['Liu', 'Guang', 'y', 'uan', ',', 'commissioner', 'of', 'local', 'Ministry', 'of', 'Foreign', 'Affairs', 'Office', ',', 'warns', 'US', 'Cons', 'ul', 'Gener

<a id='pytorch_native'></a>

<a id='additional-resources'></a>