# Baseline for the NER task
Baseline tagger for NER task on ewt dataset. The baseline is based on the notebook from HuggingFace:
https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/token_classification.ipynb#scrollTo=DDtsaJeVIrJT

The baseline is a BERT model that was fine tuned on the ewt train dataset. The model is trained on the training set and evaluated on the development set. The model is evaluated using the accuracy, precision, recall and F1 score.

In [2]:
from datetime import datetime

import numpy as np
import pandas as pd
import transformers
from evaluate import load as load_metric
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

from utils import load_into_datasetdict

### Hyperparameters and settings

In [4]:
# Set the task and name of the pretrained model and the batch size for finetuning
task = "ner"
model_name = "distilbert-base-multilingual-cased"     # "bert-base-multilingual-cased" "bert-base-cased"
batch_size = 32

# Flag to indicate whether to label all tokens or just the first token of each word
label_all_tokens = False

# File paths to splits of the chosen dataset
file_paths = {
    "train": "data/baseline/en_ewt_nn_train.conll",
    "validation": "data/baseline/en_ewt_nn_answers_dev.conll",
    "test": "data/baseline/en_ewt_nn_answers_test.conll",
}

### Load dataset and tokenizer
Loading the EWT into the desired format by the huggingface pretrained models

In [5]:
# Load the datasets into a DatasetDict
datasets = load_into_datasetdict(file_paths)

# Get the label names from the datasets
label_list = datasets["train"].features["tags"].feature.names

# Save label list to csv
with open('label_list.txt', 'w') as f:
    for item in label_list:
        f.write(item + "\n")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Initialize model and tokenizer

In [6]:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

# Load in the model from the pretrained checkpoint
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

### Tokenize to sub-word level and align labels

In [7]:
# Function to tokenize and align the labels on a sub-word level
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize and align the labels on a sub-word level for all datasets
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/12544 [00:00<?, ? examples/s]

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Map:   0%|          | 0/439 [00:00<?, ? examples/s]

### Setup the trainer for finetuning

In [8]:
# Arguments for the trainer object
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model = "accuracy",
)

# datacollator to pad the sentences and labels to the maximum length of the sequences in the examples given
data_collator = DataCollatorForTokenClassification(tokenizer)

# Load the metrics function
metric = load_metric("seqeval")

# Function for computing the metrics
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

### Model fine-tuning

In [9]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.096223,0.682927,0.664032,0.673347,0.972827
2,0.121000,0.090762,0.688259,0.671937,0.68,0.97514
3,0.035500,0.097084,0.673387,0.660079,0.666667,0.974754


TrainOutput(global_step=1176, training_loss=0.07017471190212535, metrics={'train_runtime': 259.46, 'train_samples_per_second': 145.04, 'train_steps_per_second': 4.532, 'total_flos': 627663432594240.0, 'train_loss': 0.07017471190212535, 'epoch': 3.0})

### Evaluation on dev set

In [10]:
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'LOC': {'precision': 0.7516778523489933,
  'recall': 0.7320261437908496,
  'f1': 0.7417218543046359,
  'number': 153},
 'MISC': {'precision': 0.5428571428571428,
  'recall': 0.5135135135135135,
  'f1': 0.5277777777777778,
  'number': 37},
 'ORG': {'precision': 0.6363636363636364,
  'recall': 0.6363636363636364,
  'f1': 0.6363636363636364,
  'number': 44},
 'PER': {'precision': 0.5789473684210527,
  'recall': 0.5789473684210527,
  'f1': 0.5789473684210527,
  'number': 19},
 'overall_precision': 0.6882591093117408,
 'overall_recall': 0.6719367588932806,
 'overall_f1': 0.6799999999999999,
 'overall_accuracy': 0.9751397186355752}

In [11]:
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'LOC': {'precision': 0.8549618320610687,
  'recall': 0.875,
  'f1': 0.8648648648648648,
  'number': 128},
 'MISC': {'precision': 0.6111111111111112,
  'recall': 0.6111111111111112,
  'f1': 0.6111111111111112,
  'number': 54},
 'ORG': {'precision': 0.6521739130434783,
  'recall': 0.42857142857142855,
  'f1': 0.5172413793103448,
  'number': 70},
 'PER': {'precision': 0.5483870967741935,
  'recall': 0.7391304347826086,
  'f1': 0.6296296296296297,
  'number': 23},
 'overall_precision': 0.732824427480916,
 'overall_recall': 0.6981818181818182,
 'overall_f1': 0.7150837988826817,
 'overall_accuracy': 0.9684980311269454}

### Saving the model

In [12]:
path_to_model = f"models/{model_name}-finetuned-{task}"

path_to_model

'models/distilbert-base-multilingual-cased-finetuned-ner'

In [13]:
trainer.save_model(path_to_model)

In [18]:
import os
import math

def convert_size(size_bytes):
   if size_bytes == 0:
       return "0B"
   size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
   i = int(math.floor(math.log(size_bytes, 1024)))
   p = math.pow(1024, i)
   s = round(size_bytes / p, 2)
   return "%s %s" % (s, size_name[i])

size = os.path.getsize("label_list.txt")

convert_size(size)

'52.0 B'

## Try Loading model

In [22]:
# test data path
test_file_paths = {
    "test": "data/baseline/en_ewt_nn_answers_test.conll",
}

# load data into dataset
test_datasets = load_into_datasetdict(test_file_paths)
test_datasets

DatasetDict({
    test: Dataset({
        features: ['id', 'tokens', 'tags'],
        num_rows: 439
    })
})

In [23]:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

# Load in the model from the pretrained checkpoint
model = AutoModelForTokenClassification.from_pretrained(path_to_model)

# Function to tokenize and align the labels on a sub-word level
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize and align the labels on a sub-word level for all datasets
tokenized_test_datasets = test_datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/439 [00:00<?, ? examples/s]

In [24]:
test_trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_test_datasets["test"],
    eval_dataset=tokenized_test_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [27]:
predictions, labels, _ = test_trainer.predict(tokenized_test_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'LOC': {'precision': 0.8536585365853658,
  'recall': 0.8203125,
  'f1': 0.8366533864541833,
  'number': 128},
 'MISC': {'precision': 0.7073170731707317,
  'recall': 0.5370370370370371,
  'f1': 0.6105263157894738,
  'number': 54},
 'ORG': {'precision': 0.5769230769230769,
  'recall': 0.42857142857142855,
  'f1': 0.4918032786885245,
  'number': 70},
 'PER': {'precision': 0.5588235294117647,
  'recall': 0.8260869565217391,
  'f1': 0.6666666666666667,
  'number': 23},
 'overall_precision': 0.732,
 'overall_recall': 0.6654545454545454,
 'overall_f1': 0.6971428571428572,
 'overall_accuracy': 0.9656853553347085}

In [29]:
print(true_labels)

print(true_predictions)

[['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O'], ['B-LOC'], ['B-LOC'], ['O', 'O', 'O', 'O', 'O', 'O'], ['O'], ['O', 'O', 'O'], ['O', 'O'], ['O', 'O', 'O', 'B-LOC', 'O'], ['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O'], ['O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O'], ['O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['O', 'O', 'O', 'O'], ['O', 'O'], ['O', 'O', 'O', 'B-LOC', 'O'], ['B-LOC', 'O', 'O', 'B-LOC', 'O', 'O'], ['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O'], ['O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O'], ['O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O'],

In [20]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
