Code borrowed from the Hugging Face token classification example notebook:
https://github.com/huggingface/notebooks/blob/main/examples/token_classification.ipynb

In [1]:
LOCAL = False   # training on local Mac vs. in Colab

import os
import sys
import time

if LOCAL:
    USE_MPS_DEVICE = True   # for training on M1 chip
    BASE_PATH = "/Users/carolanderson/Dropbox/"

else:
    USE_MPS_DEVICE = False
    BASE_PATH = "/content/drive/My Drive/"
    ! pip install comet-ml
    ! pip install datasets
    ! pip install evaluate
    ! pip install transformers[torch]
    ! pip install seqeval
    from google.colab import drive
    drive.mount('/content/drive')

Collecting comet-ml
  Downloading comet_ml-3.33.5-py3-none-any.whl (549 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/549.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m389.1/549.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m549.1/549.1 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Collecting python-box<7.0.0 (from comet-ml)
  Downloading python_box-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests-toolbelt>=0.8.0 (from comet-ml)
  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting semantic-version>=2.8.0 (from c

In [2]:
import comet_ml
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import evaluate
import numpy as np
import transformers
from transformers import AutoTokenizer, \
                        AutoConfig, \
                        DataCollatorForTokenClassification, \
                        AutoModelForTokenClassification, \
                        TrainingArguments, \
                        Trainer, \
                        EarlyStoppingCallback

In [3]:
# Initialise Comet for this session, with 'food-ner' as the project name.
comet_ml.init(project_name='food-ner')

In [4]:
def get_current_time():
    """Return the current local wall-clock time as a string of the form ``HH_MM_SS``."""
    return time.strftime("%H_%M_%S", time.localtime())


def read_conll_file(file):
    """
    Given a file in CoNLL format, read in tokens and labels. Treat each sentence as a training example.

    :param file: path to a file in CoNLL format; on every non-blank line the token is taken from the
        first whitespace-separated column and the label from the last column.
    :returns: a nested list, in which each sublist is a sentence and contains a sublist
        [token, label] for each token.
    :raises ValueError: if a non-blank line has fewer than two columns.

    .. note:: Ignores document boundaries and treats each sentence as an independent training example.
    """
    documents = []  # completed sentences, in file order
    sentence = []   # [token, label] pairs of the sentence currently being read
    with open(file, 'r', encoding='utf-8') as infile:
        for line in infile:
            if '-DOCSTART-' in line:  # document marker; skipped because sentences are the training unit
                continue
            columns = line.split()
            if not columns:  # blank line: the current sentence (if any) is finished
                if sentence:
                    documents.append(sentence)
                sentence = []
            else:
                token, *other_columns, label = columns
                sentence.append([token, label])
    # A file that does not end with a blank line still has a sentence pending here;
    # without this flush the final sentence would be silently lost.
    if sentence:
        documents.append(sentence)
    return documents


def convert_to_dataset(data, label_map):
    """
    Build a ``Dataset`` with a "tokens" column and a "ner_tags" column from parsed sentences.

    :param data: list of sentences, each a list of [token, label] pairs.
    :param label_map: dict used to look up the value stored in "ner_tags" for each label.
    :returns: the result of ``Dataset.from_dict`` with one row per sentence.
    """
    token_column = [[pair[0] for pair in sentence] for sentence in data]
    tag_column = [[label_map[pair[1]] for pair in sentence] for sentence in data]
    return Dataset.from_dict({"tokens": token_column, "ner_tags": tag_column})


def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split sentences and produce one label per resulting token.

    Uses the module-level ``tokenizer``. For each word, only its first token receives
    the word's label; every other position (tokens with no word id, and the remaining
    tokens of a word) is labelled -100.

    :param examples: batch with "tokens" (lists of words) and "ner_tags" (lists of label ids).
    :returns: the tokenizer output with an added "labels" entry.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding=True
    )
    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for word in encoded.word_ids(batch_index=batch_idx):
            # Label a position only when it starts a new word.
            if word is not None and word != last_word:
                row.append(word_labels[word])
            else:
                row.append(-100)
            last_word = word
        aligned_labels.append(row)
    encoded["labels"] = aligned_labels
    return encoded


def compute_metrics(p):
    """
    Score one evaluation pass with the module-level ``metric`` and log the scores to Comet.

    Relies on the module-level ``label_list`` (id -> label string) and ``metric``.

    :param p: pair ``(predictions, labels)``; ``predictions`` is reduced with argmax over
        axis 2, and positions whose label is -100 are excluded from scoring.
    :returns: dict with the keys "precision", "recall", "f1" and "accuracy".
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[pred_id] for (pred_id, label_id) in zip(prediction, label) if label_id != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[label_id] for (pred_id, label_id) in zip(prediction, label) if label_id != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    result_dict = {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

    # There may be no active Comet experiment (e.g. when evaluating outside a training
    # run); in that case still return the metrics instead of failing on None.
    experiment = comet_ml.get_global_experiment()
    if experiment is not None:
        experiment.log_metrics(result_dict)
    return result_dict

# Load datasets

In [5]:
# Read the three gold-standard splits; all of them live in the same directory under BASE_PATH.
train_data, validation_data, test_data = (
    read_conll_file(os.path.join(BASE_PATH, "nlp_data", "recipe_data", file_name))
    for file_name in (
        "20200523_food_gold_train.conll",
        "20200523_food_gold_dev.conll",
        "20200523_food_gold_test.conll",
    )
)

In [6]:
# Label vocabulary: every distinct label in the training split, sorted so ids are stable.
label_list = sorted({token_data[1] for sentence in train_data for token_data in sentence})
label_map = {label: index for index, label in enumerate(label_list)}

# mappings for config
label2id = label_map
id2label = {index: label for label, index in label2id.items()}

# Convert each split with the same label map.
train_dataset, validation_dataset, test_dataset = (
    convert_to_dataset(split, label_map)
    for split in (train_data, validation_data, test_data)
)

food_datasets = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset,
})

# Tokenize and align labels to tokens

In [8]:
# Name of the pretrained checkpoint used for the tokenizer, the config and the model.
model_checkpoint = "roberta-base"

In [12]:
# Load the tokenizer that matches the checkpoint. The inputs are already split into words
# (tokenize_and_align_labels passes is_split_into_words=True).
# NOTE(review): add_prefix_space=True is presumably what RoBERTa's tokenizer requires for
# pre-split input — confirm against the tokenizer documentation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

In [13]:
# Fail early unless a fast tokenizer was loaded; tokenize_and_align_labels calls word_ids()
# on the tokenizer output. NOTE(review): word_ids() is presumably fast-tokenizer-only — confirm.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [14]:
# Tokenize every split and attach the aligned "labels" column, processing examples in batches.
tokenized_datasets = food_datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/592 [00:00<?, ? examples/s]

Map:   0%|          | 0/195 [00:00<?, ? examples/s]

Map:   0%|          | 0/194 [00:00<?, ? examples/s]

# Set up training

In [33]:
task = "ner"
batch_size = 4
# Unique id for this run, of the form YYYYMMDD_HH_MM_SS. The date is taken from the clock
# (not hard-coded) so reruns on another day are labelled correctly; date and time come
# from a single clock reading so they cannot disagree around midnight.
experiment_id = time.strftime("%Y%m%d_%H_%M_%S", time.localtime())

In [34]:
# Last path component of the checkpoint name (the whole name when it contains no "/").
model_name = model_checkpoint.rpartition("/")[2]
args = TrainingArguments(
    # where checkpoints are written
    output_dir=os.path.join(BASE_PATH, "food_ner_models", f"{experiment_id}-{model_name}-finetuned-{task}"),
    push_to_hub=False,
    use_mps_device=USE_MPS_DEVICE,
    # optimisation
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate and save once per epoch, then restore the checkpoint with the best "f1"
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=20,
)

In [35]:
# Collator that assembles token-classification batches using this tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [36]:
# Checkpoint config extended with the label <-> id mappings built from the training labels.
config = AutoConfig.from_pretrained(model_checkpoint, label2id=label2id, id2label=id2label)

In [37]:
# Load the pretrained weights into a token-classification model built from `config`.
# The classifier weights are newly initialised (see the warning below), so the model must be trained.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, config=config)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

In [38]:
# seqeval metric object; compute_metrics reads it as a module-level name.
metric = evaluate.load("seqeval")

In [39]:
# Stop training once the tracked metric has failed to improve for 2 consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Train

In [40]:
# Run fine-tuning. With the arguments above, evaluation and checkpointing happen once per
# epoch and the early-stopping callback may end the run before num_train_epochs is reached.
trainer.train()

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/carolmanderson/food-ner/81e29a1194d94011bbe50af0d342818c



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0733,0.043431,0.92605,0.937872,0.931924,0.986352
2,0.0315,0.036146,0.946565,0.949787,0.948173,0.989191
3,0.0239,0.037944,0.938912,0.954894,0.946835,0.988825
4,0.0135,0.038335,0.955707,0.954894,0.9553,0.990474
5,0.0206,0.038603,0.953586,0.961702,0.957627,0.99084
6,0.0095,0.047093,0.95611,0.945532,0.950792,0.990382
7,0.0078,0.044359,0.94941,0.958298,0.953833,0.990748


[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/carolmanderson/food-ner/81e29a1194d94011bbe50af0d342818c
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     accuracy [7]                   : (0.986351561784373, 0.9908399743519282)
[1;38;5;39mCOMET INFO:[0m     epoch [59]                     : (0.14, 7.0)
[1;38;5;39mCOMET INFO:[0m     eval/accuracy [7]              : (0.986351561784373, 0.9908399743519282)
[1;38;5;39mCOMET INFO:[0m     eval/f1 [7]                    : (0.9319238900634249, 0.9576271186440678)
[1;38;5;39mCOMET INFO:[0m     

TrainOutput(global_step=1036, training_loss=0.03952890948093996, metrics={'train_runtime': 374.5643, 'train_samples_per_second': 15.805, 'train_steps_per_second': 3.951, 'total_flos': 634466923574400.0, 'train_loss': 0.03952890948093996, 'epoch': 7.0})