Steps:
1. Implement a custom pipeline, following https://huggingface.co/docs/transformers/v4.28.1/en/add_new_pipeline
2. Follow the TensorFlow token-classification example notebook: https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb

Goals:
1. Load the dataset as a Hugging Face `datasets` dataset

Target architecture
1. DistilBERT
2. RoBERTa
3. T5

In [1]:
from datasets import load_dataset
from src.utility import Constants

# One JSON file is sliced into 70% train / 20% validation / 10% test.
DATA_FILE = "data/all_data.json"

train_ds = load_dataset("json", data_files=DATA_FILE, split="train[:70%]")
val_ds = load_dataset("json", data_files=DATA_FILE, split="train[70%:90%]")
test_ds = load_dataset("json", data_files=DATA_FILE, split="train[90%:]")

# Sanity check: show a single training record.
print(train_ds[1])
# print(val_ds[1])
# print(test_ds[1])

from transformers import AutoTokenizer
# Tokenizer for the checkpoint named by Constants.MODEL_CHECKPOINT.
tokenizer = AutoTokenizer.from_pretrained(Constants.MODEL_CHECKPOINT)

Found cached dataset json (/Users/mohimenul.admin/.cache/huggingface/datasets/json/default-6d06769e317d80a9/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
Found cached dataset json (/Users/mohimenul.admin/.cache/huggingface/datasets/json/default-6d06769e317d80a9/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
Found cached dataset json (/Users/mohimenul.admin/.cache/huggingface/datasets/json/default-6d06769e317d80a9/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


{'sentence': 'vasconcelos was diagnosed with lung cancer in mid 2015 .', 'ner_tags': [0, 0, 0, 0, 58, 25, 0, 0, 0, 0]}


In [2]:
# When True, every sub-token of a word gets the word's label; when False only
# the first sub-token is labelled and the rest are masked with -100.
label_all_tokens = True
task = Constants.TASK


def tokenize_and_align_labels(examples):
    """Tokenize a batch of sentences and align the word-level tags to tokens.

    Args:
        examples: A batch (dict of lists) with a "sentence" column of
            whitespace-tokenized strings and a f"{task}_tags" column holding
            one tag ID per whitespace-separated word.

    Returns:
        The tokenizer output for the batch with an added "labels" entry: one
        list per sentence, containing a label ID per token, with -100 for
        positions the loss must ignore.
    """
    # The tags are aligned with whitespace-separated words, so the tokenizer
    # must be given those exact words. Passing the raw string lets the
    # tokenizer pick its own word boundaries (e.g. splitting "'s" in two),
    # which shifts word_ids relative to the tag list.
    # NOTE(review): byte-level BPE tokenizers such as RoBERTa's need to be
    # created with add_prefix_space=True to accept pre-split input — confirm
    # before switching Constants.MODEL_CHECKPOINT.
    words_per_sentence = [sentence.split() for sentence in examples["sentence"]]
    tokenized_inputs = tokenizer(
        words_per_sentence, truncation=True, is_split_into_words=True,
    )

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx >= len(label):
                # Special tokens have no word id; a word without a tag (tag
                # list shorter than the sentence) is treated the same way.
                # -100 makes the loss function ignore the position.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First sub-token of a word, or any sub-token when
                # label_all_tokens is set: use the word's tag.
                label_ids.append(label[word_idx])
            else:
                # Continuation sub-token with label_all_tokens disabled.
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# tokenize_and_align_labels(train_ds[:])

# Run the tokenize-and-align step over each split, in batched mode.
tokenized_train_ds, tokenized_val_ds, tokenized_test_ds = [
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_ds, val_ds, test_ds)
]

# Inspect the first ten processed training rows.
print(tokenized_train_ds[:10])

Loading cached processed dataset at /Users/mohimenul.admin/.cache/huggingface/datasets/json/default-6d06769e317d80a9/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-433531d9476d68e1.arrow
Loading cached processed dataset at /Users/mohimenul.admin/.cache/huggingface/datasets/json/default-6d06769e317d80a9/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-753887e45470d01b.arrow
Loading cached processed dataset at /Users/mohimenul.admin/.cache/huggingface/datasets/json/default-6d06769e317d80a9/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-862899f0e0883648.arrow


{'sentence': ['in the new york times writer jon caramanica ranked it as 29th best song of year on his end list .', 'vasconcelos was diagnosed with lung cancer in mid 2015 .', 'junior ( licence built walter ) with 82 kw 110 hp nominal power and 90 120 take off .', 'the leones de ponce basketball team is one of leading teams island winning 12 championships during their tenure .', 'to orlando magic 2028 second round pickcash considerations', "google leading a $ 1 billion investment in 2017 lyft which could support waymo 's robotaxi strategy .", 'saint columba blessed it for him .', 'the next part of his career was spent in hong kong where he involved coaching .', 'the music score by composed marvin hamlisch .', 'when the temple was excavated in 1764 by karl jakob weber many remnants of life were found .'], 'ner_tags': [[0, 0, 1, 1, 1, 0, 63, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 58, 25, 0, 0, 0, 0], [13, 0, 0, 0, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 40, 7,

In [3]:
from transformers import TFAutoModelForTokenClassification
from src.preprocess import PreProcess

# readLabelInfo returns three values: the label list (indexed by label ID
# elsewhere in this notebook) and the two mappings handed to the model config.
label_info = PreProcess.readLabelInfo("data/label_info.json", True)
unique_labels, id2label, label2id = label_info

# Token-classification head sized to the number of labels.
model = TFAutoModelForTokenClassification.from_pretrained(
    Constants.MODEL_CHECKPOINT,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
)

Metal device set to: Apple M1 Pro


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForTokenClassification: ['vocab_projector', 'activation_13', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferenc

In [9]:
from transformers import create_optimizer

# Total optimizer steps = full batches per epoch * number of epochs.
steps_per_epoch = len(tokenized_train_ds) // Constants.BATCH_SIZE
num_train_epochs = Constants.MAX_EPOCH
num_train_steps = steps_per_epoch * num_train_epochs

# Optimizer with weight decay, no warmup, and its learning-rate schedule.
optimizer, lr_schedule = create_optimizer(
    num_warmup_steps=0,
    weight_decay_rate=0.01,
    num_train_steps=num_train_steps,
    init_lr=2e-5,
)

In [10]:
import tensorflow as tf
from transformers import DataCollatorForTokenClassification

# No loss argument is given to compile().
model.compile(optimizer=optimizer)

# Collator that pads each batch and returns NumPy arrays.
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="np")

# Settings common to both tf.data pipelines.
shared_dataset_kwargs = dict(
    batch_size=Constants.BATCH_SIZE,
    collate_fn=data_collator,
)

train_set = model.prepare_tf_dataset(
    tokenized_train_ds, shuffle=True, **shared_dataset_kwargs
)
validation_set = model.prepare_tf_dataset(
    tokenized_val_ds, shuffle=False, **shared_dataset_kwargs
)

print(train_set)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(64, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(64, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(64, None), dtype=tf.int64, name=None))>


In [11]:
from datasets import load_metric
import numpy as np
from transformers.keras_callbacks import KerasMetricCallback

# metric = load_metric("seqeval")
# labels = [unique_labels[i] for i in example[f"{task}_tags"]]
# metric.compute(predictions=[labels], references=[labels])

# Cached seqeval metric object; populated on first use.
_seqeval_metric = None


def _get_seqeval_metric():
    """Load the seqeval metric on first call and reuse it afterwards."""
    global _seqeval_metric
    if _seqeval_metric is None:
        _seqeval_metric = load_metric("seqeval")
    return _seqeval_metric


def compute_metrics(p):
    """Compute overall seqeval scores for a batch of token predictions.

    Args:
        p: A (predictions, labels) pair. `predictions` is reduced with argmax
            over axis 2 to obtain a label ID per token; `labels` holds the
            reference label IDs, with -100 marking positions to ignore.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy",
        taken from the metric's overall_* results.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predictions, labels):
        # Keep only positions whose reference label is not the ignore index.
        kept = [
            (pred_id, label_id)
            for pred_id, label_id in zip(pred_row, label_row)
            if label_id != -100
        ]
        true_predictions.append([unique_labels[pred_id] for pred_id, _ in kept])
        true_labels.append([unique_labels[label_id] for _, label_id in kept])

    # Reuse the loaded metric instead of calling load_metric on every evaluation.
    results = _get_seqeval_metric().compute(
        predictions=true_predictions, references=true_labels
    )
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


# Keras callback that evaluates compute_metrics on the validation set.
metric_callback = KerasMetricCallback(
    eval_dataset=validation_set,
    metric_fn=compute_metrics,
)

In [12]:
# from transformers.keras_callbacks import PushToHubCallback
# model_name = model_checkpoint.split("/")[-1]
# push_to_hub_model_id = f"{model_name}-finetuned-{task}"
# push_to_hub_callback = PushToHubCallback(
#     output_dir="./tc_model_save",
#     tokenizer=tokenizer,
#     hub_model_id=push_to_hub_model_id,
# )
import os
# NOTE(review): TensorFlow has already been imported and a device selected in
# earlier cells, so these settings may be read too late to take effect — confirm,
# and move them before the first TensorFlow import if they are needed.
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
os.environ['TF_GPU_THREAD_COUNT'] = '16'

# NOTE(review): this checkpoint callback is created but not included in
# `callbacks` below, so nothing is written to ./model/model_best — confirm
# whether that is intentional.
modelCheckpoint_callback = tf.keras.callbacks.ModelCheckpoint('./model/model_best', monitor='val_loss', save_best_only=True, mode='min')

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs")

# Stop after 6 epochs without val_loss improvement. restore_best_weights puts
# the best-epoch weights back on the model; without it the model would keep the
# weights from the last (worse) epoch, and no checkpoint is being saved.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=6,
    mode='min',
    verbose=1,
    restore_best_weights=True,
)

# metric_callback stays first in the list, ahead of the logging and
# early-stopping callbacks.
callbacks = [metric_callback, tensorboard_callback, early_stopping_callback]

model.fit(
    train_set,
    validation_data=validation_set,
    epochs=Constants.MAX_EPOCH,
    callbacks=callbacks,
)

Epoch 1/1000

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/1000


  tensor = as_tensor(value)


 24/193 [==>...........................] - ETA: 4:57 - loss: 1.5775

KeyboardInterrupt: 