# Imports

In [1]:
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, \
                         TrainingArguments, Trainer
from torch.optim import AdamW
from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


# Load and Process Dataset

We start by loading the conll data via the hugging face api and its load_dataset function.

In [2]:
raw_datasets = load_dataset("wnut_17")

In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1287
    })
})

In [4]:
raw_datasets["train"][0]["tokens"]

['@paulwalk',
 'It',
 "'s",
 'the',
 'view',
 'from',
 'where',
 'I',
 "'m",
 'living',
 'for',
 'two',
 'weeks',
 '.',
 'Empire',
 'State',
 'Building',
 '=',
 'ESB',
 '.',
 'Pretty',
 'bad',
 'storm',
 'here',
 'last',
 'evening',
 '.']

In [5]:
raw_datasets["train"][0]["ner_tags"]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 7,
 8,
 8,
 0,
 7,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

Followingly, we extract the named entitiy recognition features and the respective IOB labels.

In [6]:
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-corporation', 'I-corporation', 'B-creative-work', 'I-creative-work', 'B-group', 'I-group', 'B-location', 'I-location', 'B-person', 'I-person', 'B-product', 'I-product'], id=None), length=-1, id=None)

We see that the train body features labels about corporations, creative-work, groups, locations, perople, and products.

In [7]:
label_names = ner_feature.feature.names
label_names

['O',
 'B-corporation',
 'I-corporation',
 'B-creative-work',
 'I-creative-work',
 'B-group',
 'I-group',
 'B-location',
 'I-location',
 'B-person',
 'I-person',
 'B-product',
 'I-product']

Based on this, we can express the decoded information and a respective sentence jointly:

In [8]:
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

@paulwalk It 's the view from where I 'm living for two weeks . Empire     State      Building   = ESB        . Pretty bad storm here last evening . 
O         O  O  O   O    O    O     O O  O      O   O   O     O B-location I-location I-location O B-location O O      O   O     O    O    O       O 


We will use a pretrained bert model to evaluate the contents of the WNUT dataset. This dataset features a lot of rare entities and thereby allows for testing models on largely unseen information.

In [9]:
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [10]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)

Next, we will give the tokens new labels which align with their new labels, which express their purpose numerically.

In [11]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [12]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [13]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

Having composed a function to tokenise and align the labels, we finally arrive at a preprocessed and tokenised dataset.

In [14]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

In [15]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Setting Evaluation

In [16]:
metric = evaluate.load("seqeval")

In [17]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [18]:
def evaluate_token_classification(predictions, labels, entity_types):
    predictions = np.argmax(predictions, axis=-1)
    labels = [[label_names[l] for l in label if l != -100] for label in labels]
    predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    precision_dict = {}
    recall_dict = {}
    f1_dict = {}
    support_dict = {}
    
    for entity_type in entity_types:
        precision_dict[entity_type] = {}
        recall_dict[entity_type] = {}
        f1_dict[entity_type] = {}
        support_dict[entity_type] = {}
        b_true_labels_binary = [[l if f"B-{entity_type}" == l else 'O' for l in label] for label in labels]
        b_pred_labels_binary = [[l if f"B-{entity_type}" == l else 'O' for l in label] for label in predictions]
        
        b_metrics = metric.compute(predictions=b_pred_labels_binary, references=b_true_labels_binary)
        
        precision_dict[entity_type]['B-label'] = b_metrics[entity_type]['precision']
        recall_dict[entity_type]['B-label'] = b_metrics[entity_type]['recall']
        f1_dict[entity_type]['B-label'] = b_metrics[entity_type]['f1']
        support_dict[entity_type]['B-label'] = b_metrics[entity_type]['number']
        
        i_true_labels_binary = [[l if f"I-{entity_type}" == l else 'O' for l in label] for label in labels]
        i_pred_labels_binary = [[l if f"I-{entity_type}" == l else 'O' for l in label] for label in predictions]
    
        i_metrics = metric.compute(predictions=i_pred_labels_binary, references=i_true_labels_binary)
        
        precision_dict[entity_type]['I-label'] = i_metrics[entity_type]['precision']
        recall_dict[entity_type]['I-label'] = i_metrics[entity_type]['recall']
        f1_dict[entity_type]['I-label'] = i_metrics[entity_type]['f1']
        support_dict[entity_type]['I-label'] = i_metrics[entity_type]['number']

    entity_metrics = metric.compute(predictions=predictions, references=labels)
    
    for entity_type in entity_types:
        precision_dict[entity_type]['entity'] = entity_metrics[entity_type]['precision']
        recall_dict[entity_type]['entity'] = entity_metrics[entity_type]['recall']
        f1_dict[entity_type]['entity'] = entity_metrics[entity_type]['f1']
        support_dict[entity_type]['entity'] = entity_metrics[entity_type]['number']
        
    f1_scores_list = [f1_dict[entity_type]["entity"] for entity_type in f1_dict] 
    support_list = [support_dict[entity_type]["entity"] for entity_type in support_dict]
    weights_support_list = [support / len(support_list) for support in support_list]
    
    final_dict = {
        "precision": precision_dict,
        "recall": recall_dict,
        "f1": f1_dict,
        "macro_f1": sum(f1_scores_list) / len(f1_scores_list),
        "micro_f1": sum([f1_score * support for f1_score, support in zip(f1_scores_list, weights_support_list)])
    }
    return final_dict

In [19]:
entity_types = ['corporation', 'creative-work', 'group', 'location', 'person', 'product']

# Fine-tune Model

In [20]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

### Baseline model

We will use the bert model with baseline parameters for our baseline model.

In [21]:
args_baseline = TrainingArguments("bert-finetuned-baseline")
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

trainer_baseline = Trainer(
    model=model,
    args=args_baseline,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer_baseline.train()

test_pred_baseline = trainer_baseline.predict(test_dataset=tokenized_datasets["test"])
test_pred_baseline.metrics

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.1117
1000,0.0419


{'test_loss': 0.24935071170330048,
 'test_precision': 0.47183098591549294,
 'test_recall': 0.31047265987025024,
 'test_f1': 0.37451089994410286,
 'test_accuracy': 0.9348994974874372,
 'test_runtime': 102.8448,
 'test_samples_per_second': 12.514,
 'test_steps_per_second': 1.565}

In [25]:
macro_micro_baseline = evaluate_token_classification(predictions=test_pred_baseline.predictions, \
                                                     labels=test_pred_baseline.label_ids, \
                                                     entity_types=entity_types)
pprint(macro_micro_baseline)

{'f1': {'corporation': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
        'creative-work': {'B-label': 0.0,
                          'I-label': 0.010256410256410256,
                          'entity': 0.0},
        'group': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
        'location': {'B-label': 0.030769230769230767,
                     'I-label': 0.0,
                     'entity': 0.007246376811594203},
        'person': {'B-label': 0.009022556390977444,
                   'I-label': 0.0,
                   'entity': 0.0029112081513828236},
        'product': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0}},
 'macro_f1': 0.0016929308271628378,
 'micro_f1': 0.38931080311372696,
 'precision': {'corporation': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
               'creative-work': {'B-label': 0.0,
                                 'I-label': 0.015151515151515152,
                                 'entity': 0.0},
               'group': {'B-label': 0.0, 'I-label': 

Overall, the results are rather poor. Next, we will apply different optimisation approaches to improve the performance.

### AdamW Optimization

We begin by implementing the AdamW optimiser, which makes use of the general Adam optimisation approach and adds L2 regularisation via a decay in the parameter weights at each iteration. By implementing weight decay, we ensure that the model is more generaliseable and performs better on unseen data. The hyperparameters are again based on a a premade bert configuration.

In [26]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)
optimizer1 = AdamW(model.parameters(), lr=2e-5)
args_adam1 = TrainingArguments(
    "bert-finetuned-adam1",
    evaluation_strategy="epoch",
)

trainer_adam1 = Trainer(
    model=model,
    args=args_adam1,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    optimizers=(optimizer1, None)
)

trainer_adam1.train()
test_pred_adam1 = trainer_adam1.predict(test_dataset=tokenized_datasets["test"])
test_pred_adam1.metrics

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.167101,0.526126,0.349282,0.419842,0.915547
2,0.120500,0.180026,0.588138,0.427033,0.494802,0.922826
3,0.046500,0.173017,0.585227,0.492823,0.535065,0.927696


{'test_loss': 0.23202502727508545,
 'test_precision': 0.5274390243902439,
 'test_recall': 0.3206672845227062,
 'test_f1': 0.3988472622478386,
 'test_accuracy': 0.9359798994974874,
 'test_runtime': 9.4282,
 'test_samples_per_second': 136.506,
 'test_steps_per_second': 17.076}

In [27]:
macro_micro_adam1 = evaluate_token_classification(predictions=test_pred_adam1.predictions, \
                                                  labels=test_pred_adam1.label_ids, \
                                                  entity_types=entity_types)
pprint(macro_micro_adam1)

{'f1': {'corporation': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
        'creative-work': {'B-label': 0.0,
                          'I-label': 0.01015228426395939,
                          'entity': 0.0},
        'group': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
        'location': {'B-label': 0.04743083003952569,
                     'I-label': 0.0,
                     'entity': 0.0},
        'person': {'B-label': 0.011782032400589101,
                   'I-label': 0.0,
                   'entity': 0.0},
        'product': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0}},
 'macro_f1': 0.0,
 'micro_f1': 0.0,
 'precision': {'corporation': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
               'creative-work': {'B-label': 0.0,
                                 'I-label': 0.014705882352941176,
                                 'entity': 0.0},
               'group': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
               'location': {'B-label': 0.058252427

We directly observe an improvement in precision and recall.

We continue by manually specifying a larger batch size. While this will speed up computation, it may also harm generalisability.

In [21]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)
optimizer2 = AdamW(model.parameters(), lr=2e-5)
args_adam2 = TrainingArguments(
    "bert-finetuned-adam2",
    evaluation_strategy="epoch",
    per_device_train_batch_size=32,
)

trainer_adam2 = Trainer(
    model=model,
    args=args_adam2,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    optimizers=(optimizer2, None)
)

trainer_adam2.train()
test_pred_adam2 = trainer_adam2.predict(test_dataset=tokenized_datasets["test"])
test_pred_adam2.metrics

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.206411,0.529412,0.107656,0.178926,0.890875
2,No log,0.187617,0.607229,0.301435,0.402878,0.909874
3,No log,0.178684,0.550909,0.36244,0.437229,0.916457


  _warn_prf(average, modifier, msg_start, len(result))


{'test_loss': 0.21888786554336548,
 'test_precision': 0.4740882917466411,
 'test_recall': 0.2289156626506024,
 'test_f1': 0.30874999999999997,
 'test_accuracy': 0.9302261306532663,
 'test_runtime': 139.8478,
 'test_samples_per_second': 9.203,
 'test_steps_per_second': 1.151}

In [22]:
macro_micro_adam2 = evaluate_token_classification(predictions=test_pred_adam2.predictions, \
                                                  labels=test_pred_adam2.label_ids, \
                                                  entity_types=entity_types)
pprint(macro_micro_adam2)

  _warn_prf(average, modifier, msg_start, len(result))


{'f1': {'corporation': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
        'creative-work': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
        'group': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
        'location': {'B-label': 0.022988505747126433,
                     'I-label': 0.0,
                     'entity': 0.014598540145985401},
        'person': {'B-label': 0.006201550387596899,
                   'I-label': 0.00404040404040404,
                   'entity': 0.0},
        'product': {'B-label': 0.0,
                    'I-label': 0.01183431952662722,
                    'entity': 0.0}},
 'macro_f1': 0.0024330900243309003,
 'micro_f1': 0.36496350364963503,
 'precision': {'corporation': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
               'creative-work': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
               'group': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
               'location': {'B-label': 0.02702702702702703,
                 

Indeed, the model took less time to run 3 epochs, however suffered in performance. Particularly recall and thus F1 were harmed.

Next, we increase the learning rate. This may also have a positive impact on computation time, while also acting as regularisation. Perhaps, the impact will not be as dire as with batch computation.

In [21]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)
optimizer3 = AdamW(model.parameters(), lr=9e-5)
args_adam3 = TrainingArguments(
    "bert-finetuned-adam3",
    evaluation_strategy="epoch",
)

trainer_adam3 = Trainer(
    model=model,
    args=args_adam3,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    optimizers=(optimizer3, None)
)

trainer_adam3.train()
test_pred_adam3 = trainer_adam3.predict(test_dataset=tokenized_datasets["test"])
test_pred_adam3.metrics

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.162601,0.522763,0.398325,0.452138,0.917527
2,0.102600,0.213299,0.616906,0.410287,0.492816,0.91817
3,0.036600,0.212423,0.562315,0.453349,0.501987,0.922933


{'test_loss': 0.25418907403945923,
 'test_precision': 0.45779685264663805,
 'test_recall': 0.2965708989805375,
 'test_f1': 0.3599550056242969,
 'test_accuracy': 0.9331155778894472,
 'test_runtime': 89.2379,
 'test_samples_per_second': 14.422,
 'test_steps_per_second': 1.804}

In [22]:
macro_micro_adam3 = evaluate_token_classification(predictions=test_pred_adam3.predictions, \
                                                  labels=test_pred_adam3.label_ids, \
                                                  entity_types=entity_types)
pprint(macro_micro_adam3)

{'f1': {'corporation': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
        'creative-work': {'B-label': 0.0,
                          'I-label': 0.010810810810810811,
                          'entity': 0.0},
        'group': {'B-label': 0.008583690987124463,
                  'I-label': 0.0,
                  'entity': 0.0},
        'location': {'B-label': 0.03187250996015937,
                     'I-label': 0.0,
                     'entity': 0.0},
        'person': {'B-label': 0.015128593040847202,
                   'I-label': 0.004048582995951417,
                   'entity': 0.005822416302765647},
        'product': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0}},
 'macro_f1': 0.0009704027171276078,
 'micro_f1': 0.41630276564774377,
 'precision': {'corporation': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
               'creative-work': {'B-label': 0.0,
                                 'I-label': 0.017857142857142856,
                                 'entity': 0.0},
 

The model actually has better precision than the baseline. However, recall still underperforms, leading to an overall lower F1. It appears that the true positives of the test data do not resemble the training data too well, making the recall worse.
We will try the same learning rate, including the increased batch size.


In [21]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)
optimizer4 = AdamW(model.parameters(), lr=9e-5)
args_adam4 = TrainingArguments(
    "bert-finetuned-adam4",
    evaluation_strategy="epoch",
    per_device_train_batch_size=32,
)

trainer_adam4 = Trainer(
    model=model,
    args=args_adam4,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    optimizers=(optimizer4, None)
)

trainer_adam4.train()
test_pred_adam4 = trainer_adam4.predict(test_dataset=tokenized_datasets["test"])
test_pred_adam4.metrics

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.207578,0.457198,0.2811,0.348148,0.903345
2,No log,0.180311,0.599641,0.399522,0.479541,0.91635
3,No log,0.17767,0.52819,0.425837,0.471523,0.92015


  _warn_prf(average, modifier, msg_start, len(result))


{'test_loss': 0.2290385514497757,
 'test_precision': 0.4583333333333333,
 'test_recall': 0.29564411492122333,
 'test_f1': 0.3594366197183098,
 'test_accuracy': 0.9332663316582914,
 'test_runtime': 144.5128,
 'test_samples_per_second': 8.906,
 'test_steps_per_second': 1.114}

In [22]:
macro_micro_adam4 = evaluate_token_classification(predictions=test_pred_adam4.predictions, \
                                                  labels=test_pred_adam4.label_ids, \
                                                  entity_types=entity_types)
pprint(macro_micro_adam4)

{'f1': {'corporation': {'B-label': 0.014492753623188406,
                        'I-label': 0.0,
                        'entity': 0.0},
        'creative-work': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
        'group': {'B-label': 0.009389671361502348,
                  'I-label': 0.0,
                  'entity': 0.0},
        'location': {'B-label': 0.01556420233463035,
                     'I-label': 0.0,
                     'entity': 0.0},
        'person': {'B-label': 0.008771929824561403,
                   'I-label': 0.00404040404040404,
                   'entity': 0.0028653295128939827},
        'product': {'B-label': 0.0,
                    'I-label': 0.01098901098901099,
                    'entity': 0.0}},
 'macro_f1': 0.0004775549188156638,
 'micro_f1': 0.20487106017191978,
 'precision': {'corporation': {'B-label': 0.013888888888888888,
                               'I-label': 0.0,
                               'entity': 0.0},
               'creative-work': {'

As it has proven challenging to improve the model, we will also try a lower learning rate than at baseline.

In [21]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)
optimizer5 = AdamW(model.parameters(), lr=1e-5)
args_adam5 = TrainingArguments(
    "bert-finetuned-adam5",
    evaluation_strategy="epoch",
)

trainer_adam5 = Trainer(
    model=model,
    args=args_adam5,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    optimizers=(optimizer5, None)
)

trainer_adam5.train()
test_pred_adam5 = trainer_adam5.predict(test_dataset=tokenized_datasets["test"])
test_pred_adam5.metrics

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.179638,0.536398,0.334928,0.412371,0.910142
2,0.141800,0.18405,0.593066,0.388756,0.469653,0.91817
3,0.061700,0.173764,0.578295,0.446172,0.503714,0.922023


{'test_loss': 0.2258116602897644,
 'test_precision': 0.4864864864864865,
 'test_recall': 0.283595922150139,
 'test_f1': 0.3583138173302108,
 'test_accuracy': 0.9335175879396985,
 'test_runtime': 88.1595,
 'test_samples_per_second': 14.599,
 'test_steps_per_second': 1.826}

In [22]:
macro_micro_adam5 = evaluate_token_classification(predictions=test_pred_adam5.predictions, \
                                                  labels=test_pred_adam5.label_ids, \
                                                  entity_types=entity_types)
pprint(macro_micro_adam5)

{'f1': {'corporation': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
        'creative-work': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
        'group': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
        'location': {'B-label': 0.030303030303030304,
                     'I-label': 0.0,
                     'entity': 0.0},
        'person': {'B-label': 0.00904977375565611,
                   'I-label': 0.0,
                   'entity': 0.0},
        'product': {'B-label': 0.0, 'I-label': 0.03125, 'entity': 0.0}},
 'macro_f1': 0.0,
 'micro_f1': 0.0,
 'precision': {'corporation': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
               'creative-work': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
               'group': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
               'location': {'B-label': 0.03508771929824561,
                            'I-label': 0.0,
                            'entity': 0.0},
               'person': {'B-label': 0.0128205128

The decrease in model performance is negligible.
Again, we also try to increase the batch size to 32.

In [21]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)
optimizer6 = AdamW(model.parameters(), lr=1e-5)
args_adam6 = TrainingArguments(
    "bert-finetuned-adam6",
    evaluation_strategy="epoch",
    per_device_train_batch_size=32,
)

trainer_adam6 = Trainer(
    model=model,
    args=args_adam6,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    optimizers=(optimizer6, None)
)

trainer_adam6.train()
test_pred_adam6 = trainer_adam6.predict(test_dataset=tokenized_datasets["test"])
test_pred_adam6.metrics

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.242225,0.0,0.0,0.0,0.88167
2,No log,0.2215,0.408696,0.05622,0.098843,0.88836
3,No log,0.212453,0.506977,0.130383,0.207422,0.892695


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'test_loss': 0.23187460005283356,
 'test_precision': 0.5045454545454545,
 'test_recall': 0.10287303058387395,
 'test_f1': 0.17090069284064663,
 'test_accuracy': 0.9205778894472362,
 'test_runtime': 143.5492,
 'test_samples_per_second': 8.966,
 'test_steps_per_second': 1.122}

In [22]:
macro_micro_adam6 = evaluate_token_classification(predictions=test_pred_adam6.predictions, \
                                                  labels=test_pred_adam6.label_ids, \
                                                  entity_types=entity_types)
pprint(macro_micro_adam6)

{'f1': {'corporation': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
        'creative-work': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
        'group': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
        'location': {'B-label': 0.0,
                     'I-label': 0.017543859649122806,
                     'entity': 0.0},
        'person': {'B-label': 0.0,
                   'I-label': 0.019569471624266144,
                   'entity': 0.0},
        'product': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0}},
 'macro_f1': 0.0,
 'micro_f1': 0.0,
 'precision': {'corporation': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
               'creative-work': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
               'group': {'B-label': 0.0, 'I-label': 0.0, 'entity': 0.0},
               'location': {'B-label': 0.0,
                            'I-label': 0.05263157894736842,
                            'entity': 0.0},
               'person': {'B-label': 0.0,
          

This model struggles a lot to correctly identify true positives from false negatives, leading to a terrible Recall and F1 score.