Steps:
1. Implement a pipeline https://huggingface.co/docs/transformers/v4.28.1/en/add_new_pipeline
2. https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb

Goals:
1. Load dataset in hugging face dataset

Target architecture
1. DistilBERT
2. RoBERTA
3. T5

In [36]:
from datasets import load_dataset

train_ds = load_dataset("json", data_files="data/all_data.json", split=f"train[:70%]" )
val_ds = load_dataset("json", data_files="data/all_data.json", split=f"train[70%:90%]")
test_ds = load_dataset("json", data_files="data/all_data.json", split="train[90%:]")
print(train_ds[1])
print(val_ds[1])
print(test_ds[1])



Found cached dataset json (/Users/mohimenul.admin/.cache/huggingface/datasets/json/default-d7164a1d0bbc86eb/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
Found cached dataset json (/Users/mohimenul.admin/.cache/huggingface/datasets/json/default-d7164a1d0bbc86eb/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
Found cached dataset json (/Users/mohimenul.admin/.cache/huggingface/datasets/json/default-d7164a1d0bbc86eb/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


{'sentence': 'the perianth is bright pink with a green central tube that less than long .', 'label_array': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}
{'sentence': 'the palm oil used is non hydrogenated and certified by roundtable on sustainable .', 'label_array': ['O', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O']}
{'sentence': 'in the summer of 2013 company raised $ 3 million funding by existing investor spark capital .', 'label_array': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PrivateCorp', 'I-PrivateCorp', 'O']}


#### Findings

1. I only need sentence for tokenization, so dataset should have sentence column
2. Need to figure out how to provide labels

In [37]:
from transformers import AutoTokenizer

# Same as before
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(sample):
    return tokenizer(sample["sentence"], add_special_tokens=False, truncation=True, return_tensors="tf", padding="max_length", max_length=35)

tokenized_ds = test_ds.map(tokenize_function)
print(tokenized_ds[0])


Map:   0%|          | 0/1765 [00:00<?, ? examples/s]

{'sentence': 'the irish times hailed him as one of most creative writers our time .', 'label_array': ['O', 'I-WrittenWork', 'I-WrittenWork', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'input_ids': [[1996, 3493, 2335, 16586, 2032, 2004, 2028, 1997, 2087, 5541, 4898, 2256, 2051, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}


In [28]:
# Add padding to label
from src.utility import Constants

def addLabelPadding(sample):
    label_count = len(sample["label_array"])
    for i in range(label_count, Constants.SEQUENCE_LENGTH):
        sample["label_array"].append(Constants.PAD_TOKEN)
    return sample

padded_ds = tokenized_ds.map(addLabelPadding)
input_size = len(padded_ds[0]['input_ids'][0])
label_array_size = len(padded_ds[0]['label_array'])
print(padded_ds[0], input_size, label_array_size)

Map:   0%|          | 0/1765 [00:00<?, ? examples/s]

{'sentence': 'the irish times hailed him as one of most creative writers our time .', 'label_array': ['O', 'I-WrittenWork', 'I-WrittenWork', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>'], 'input_ids': [[1996, 3493, 2335, 16586, 2032, 2004, 2028, 1997, 2087, 5541, 4898, 2256, 2051, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} 35 35


In [41]:
# Add label in the map as well
from src.label_process import LabelProcess
from src.utility import Constants

train_label_array = LabelProcess.getLabelArray("../data/en-train.conll")
dev_label_array = LabelProcess.getLabelArray("../data/en-dev.conll")

label_array = train_label_array + dev_label_array
label_array = list(set(label_array))

lp_class = LabelProcess(label_array)

def addEncoding(sample):
    sample["labels"] = list(map(lambda x: LabelProcess.encode(x, label_array), sample['label_array']))
    return sample

final_ds = padded_ds.map(addEncoding)
print(final_ds[0])

Loading cached processed dataset at /Users/mohimenul.admin/.cache/huggingface/datasets/json/default-d7164a1d0bbc86eb/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-5021842c30c90c2e.arrow


1765
{'sentence': 'the irish times hailed him as one of most creative writers our time .', 'label_array': ['O', 'I-WrittenWork', 'I-WrittenWork', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>'], 'input_ids': [[1996, 3493, 2335, 16586, 2032, 2004, 2028, 1997, 2087, 5541, 4898, 2256, 2051, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'labels': [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [42]:
# Convert ta dataset to tf dataset
tf_train_dataset = final_ds.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols=["labels"],
    shuffle=False,
    batch_size=8,
)

print(tf_train_dataset)

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 1, 35), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 1, 35), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 35, 68), dtype=tf.float32, name=None))>


In [44]:
# Start training loop
import tensorflow as tf
import numpy as np
from transformers import TFAutoModelForTokenClassification

# Same as before
checkpoint = "distilbert-base-uncased"
model = TFAutoModelForTokenClassification.from_pretrained(checkpoint)

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer="adam",
    loss=loss,
    metrics=["accuracy"],
)
model.fit(
    tf_train_dataset,
    validation_data=tf_train_dataset,
    batch_size=8,
)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForTokenClassification: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_119', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

ValueError: in user code:

    File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/keras/engine/training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/transformers/modeling_tf_utils.py", line 1567, in train_step
        y_pred = self(x, training=True)
    File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_file426wjxtz.py", line 36, in tf__run_call_with_unpacked_inputs
        retval_ = ag__.converted_call(ag__.ld(func), (ag__.ld(self),), dict(**ag__.ld(unpacked_inputs)), fscope)
    File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_filejwvr8fxe.py", line 14, in tf__call
        outputs = ag__.converted_call(ag__.ld(self).distilbert, (), dict(input_ids=ag__.ld(input_ids), attention_mask=ag__.ld(attention_mask), head_mask=ag__.ld(head_mask), inputs_embeds=ag__.ld(inputs_embeds), output_attentions=ag__.ld(output_attentions), output_hidden_states=ag__.ld(output_hidden_states), return_dict=ag__.ld(return_dict), training=ag__.ld(training)), fscope)
    File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_file426wjxtz.py", line 36, in tf__run_call_with_unpacked_inputs
        retval_ = ag__.converted_call(ag__.ld(func), (ag__.ld(self),), dict(**ag__.ld(unpacked_inputs)), fscope)
    File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_file_v8c68s9.py", line 92, in tf__call
        embedding_output = ag__.converted_call(ag__.ld(self).embeddings, (ag__.ld(input_ids),), dict(inputs_embeds=ag__.ld(inputs_embeds)), fscope)
    File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_file2m_t8175.py", line 53, in tf__call
        final_embeddings = ag__.converted_call(ag__.ld(self).LayerNorm, (), dict(inputs=ag__.ld(final_embeddings)), fscope)

    ValueError: Exception encountered when calling layer 'tf_distil_bert_for_token_classification_5' (type TFDistilBertForTokenClassification).
    
    in user code:
    
        File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/transformers/modeling_tf_utils.py", line 825, in run_call_with_unpacked_inputs  *
            return func(self, **unpacked_inputs)
        File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py", line 831, in call  *
            outputs = self.distilbert(
        File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_file426wjxtz.py", line 36, in tf__run_call_with_unpacked_inputs
            retval_ = ag__.converted_call(ag__.ld(func), (ag__.ld(self),), dict(**ag__.ld(unpacked_inputs)), fscope)
        File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_file_v8c68s9.py", line 92, in tf__call
            embedding_output = ag__.converted_call(ag__.ld(self).embeddings, (ag__.ld(input_ids),), dict(inputs_embeds=ag__.ld(inputs_embeds)), fscope)
        File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_file2m_t8175.py", line 53, in tf__call
            final_embeddings = ag__.converted_call(ag__.ld(self).LayerNorm, (), dict(inputs=ag__.ld(final_embeddings)), fscope)
    
        ValueError: Exception encountered when calling layer 'distilbert' (type TFDistilBertMainLayer).
        
        in user code:
        
            File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/transformers/modeling_tf_utils.py", line 825, in run_call_with_unpacked_inputs  *
                return func(self, **unpacked_inputs)
            File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py", line 409, in call  *
                embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds)  # (bs, seq_length, dim)
            File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler  **
                raise e.with_traceback(filtered_tb) from None
            File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_file2m_t8175.py", line 53, in tf__call
                final_embeddings = ag__.converted_call(ag__.ld(self).LayerNorm, (), dict(inputs=ag__.ld(final_embeddings)), fscope)
        
            ValueError: Exception encountered when calling layer 'embeddings' (type TFEmbeddings).
            
            in user code:
            
                File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py", line 131, in call  *
                    final_embeddings = self.LayerNorm(inputs=final_embeddings)
                File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler  **
                    raise e.with_traceback(filtered_tb) from None
            
                ValueError: Exception encountered when calling layer 'LayerNorm' (type LayerNormalization).
                
                Cannot reshape a tensor with 768 elements to shape [1,1,35,1] (35 elements) for '{{node tf_distil_bert_for_token_classification_5/distilbert/embeddings/LayerNorm/Reshape}} = Reshape[T=DT_FLOAT, Tshape=DT_INT32](tf_distil_bert_for_token_classification_5/distilbert/embeddings/LayerNorm/Reshape/ReadVariableOp, tf_distil_bert_for_token_classification_5/distilbert/embeddings/LayerNorm/Reshape/shape)' with input shapes: [768], [4] and with input tensors computed as partial shapes: input[1] = [1,1,35,1].
                
                Call arguments received by layer 'LayerNorm' (type LayerNormalization):
                  • inputs=tf.Tensor(shape=(None, 1, 35, 768), dtype=float32)
            
            
            Call arguments received by layer 'embeddings' (type TFEmbeddings):
              • input_ids=tf.Tensor(shape=(None, 1, 35), dtype=int32)
              • position_ids=None
              • inputs_embeds=None
              • training=True
        
        
        Call arguments received by layer 'distilbert' (type TFDistilBertMainLayer):
          • input_ids=tf.Tensor(shape=(None, 1, 35), dtype=int32)
          • attention_mask=tf.Tensor(shape=(None, 1, 35), dtype=int32)
          • head_mask=None
          • inputs_embeds=None
          • output_attentions=False
          • output_hidden_states=False
          • return_dict=True
          • training=True
    
    
    Call arguments received by layer 'tf_distil_bert_for_token_classification_5' (type TFDistilBertForTokenClassification):
      • input_ids={'input_ids': 'tf.Tensor(shape=(None, 1, 35), dtype=int64)', 'attention_mask': 'tf.Tensor(shape=(None, 1, 35), dtype=int64)'}
      • attention_mask=None
      • head_mask=None
      • inputs_embeds=None
      • output_attentions=None
      • output_hidden_states=None
      • return_dict=None
      • labels=None
      • training=True


In [14]:
# Explore version 2
from datasets import load_dataset
from src.utility import Constants

train_ds = load_dataset("json", data_files="data/all_data.json", split=f"train[:70%]" )
val_ds = load_dataset("json", data_files="data/all_data.json", split=f"train[70%:90%]")
test_ds = load_dataset("json", data_files="data/all_data.json", split="train[90%:]")

print(train_ds[1])
# print(val_ds[1])
# print(test_ds[1])

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(Constants.MODEL_CHECKPOINT)

Found cached dataset json (/Users/mohimenul.admin/.cache/huggingface/datasets/json/default-6d06769e317d80a9/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
Found cached dataset json (/Users/mohimenul.admin/.cache/huggingface/datasets/json/default-6d06769e317d80a9/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
Found cached dataset json (/Users/mohimenul.admin/.cache/huggingface/datasets/json/default-6d06769e317d80a9/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


{'sentence': 'vasconcelos was diagnosed with lung cancer in mid 2015 .', 'ner_tags': [0, 0, 0, 0, 58, 25, 0, 0, 0, 0]}


In [15]:
label_all_tokens = True
task= Constants.TASK

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["sentence"], truncation=True,
    )

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                try:
                    label_ids.append(label[word_idx])
                except IndexError:
                    label_ids.append(-100)
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                try:
                    label_ids.append(label[word_idx] if label_all_tokens else -100)
                except IndexError:
                    label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# tokenize_and_align_labels(train_ds[:])

tokenized_train_ds = train_ds.map(tokenize_and_align_labels, batched=True)
tokenized_val_ds = val_ds.map(tokenize_and_align_labels, batched=True)
tokenized_test_ds = test_ds.map(tokenize_and_align_labels, batched=True)

print(
tokenized_train_ds["labels"][:10],
tokenized_val_ds["labels"][1000:1010],
tokenized_test_ds["labels"][300:310]
)

Loading cached processed dataset at /Users/mohimenul.admin/.cache/huggingface/datasets/json/default-6d06769e317d80a9/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-433531d9476d68e1.arrow


Map:   0%|          | 0/3530 [00:00<?, ? examples/s]

Loading cached processed dataset at /Users/mohimenul.admin/.cache/huggingface/datasets/json/default-6d06769e317d80a9/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-2ca1bb3e83849440.arrow


[[-100, 0, 0, 1, 1, 1, 0, 63, 30, 30, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100], [-100, 0, 0, 0, 0, 0, 0, 0, 58, 25, 0, 0, 0, 0, -100], [-100, 13, 0, 0, 0, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100], [-100, 0, 40, 40, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100], [-100, 0, 40, 7, 0, 0, 0, 0, 0, 0, 0, 0, -100], [-100, 43, 0, 0, 0, 0, 0, 0, 0, 0, 43, 43, 43, 0, 0, 0, 61, 61, 0, 0, 0, 0, 0, 0, -100, -100], [-100, 0, 60, 60, 60, 0, 0, 0, 0, 0, -100], [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 54, 21, 0, 0, 0, 0, 0, -100], [-100, 0, 0, 0, 0, 0, 63, 30, 30, 30, 0, -100], [-100, 0, 0, 0, 0, 0, 0, 0, 0, 42, 9, 9, 0, 0, 0, 0, 0, 0, 0, -100]] [[-100, 62, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 36, 36, 0, -100], [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40, 7, 0, -100], [-100, 0, 0, 0, 0, 34, 1, 0, 50, 0, -100], [-100, 42, 9, 9, 0, 0, 49, 49, 49, 0, 0, 0, 35, 2, 2, 35, 2, 2, 2, -100, -100], [-100, 0, 0, 0, 0, 0, 0, 0, 0, 41, 0, 0, 0, 0, -100], [-100, 0, 0, 0, 49, 16, 16, 16, 0

In [16]:
from transformers import TFAutoModelForTokenClassification
from src.preprocess import PreProcess

unique_labels, id2label, label2id = PreProcess.readLabelInfo("data/label_info.json", True)

model = TFAutoModelForTokenClassification.from_pretrained(
    Constants.MODEL_CHECKPOINT, num_labels=len(unique_labels), id2label=id2label, label2id=label2id
)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForTokenClassification: ['vocab_layer_norm', 'vocab_projector', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_39', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferenc

In [17]:
from transformers import create_optimizer

num_train_epochs = 3
num_train_steps = (len(tokenized_train_ds) // Constants.BATCH_SIZE) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=1e-1,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

In [18]:
import tensorflow as tf
from transformers import DataCollatorForTokenClassification

model.compile(optimizer=optimizer)
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="np")

train_set = model.prepare_tf_dataset(
    tokenized_train_ds,
    shuffle=True,
    batch_size=Constants.BATCH_SIZE,
    collate_fn=data_collator,
)

validation_set = model.prepare_tf_dataset(
    tokenized_val_ds,
    shuffle=False,
    batch_size=Constants.BATCH_SIZE,
    collate_fn=data_collator,
)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [19]:
from datasets import load_metric
import numpy as np
from transformers.keras_callbacks import KerasMetricCallback

metric = load_metric("seqeval")
# labels = [unique_labels[i] for i in example[f"{task}_tags"]]
# metric.compute(predictions=[labels], references=[labels])


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [unique_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [unique_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics, eval_dataset=validation_set
)

In [24]:
# from transformers.keras_callbacks import PushToHubCallback
# model_name = model_checkpoint.split("/")[-1]
# push_to_hub_model_id = f"{model_name}-finetuned-{task}"
# push_to_hub_callback = PushToHubCallback(
#     output_dir="./tc_model_save",
#     tokenizer=tokenizer,
#     hub_model_id=push_to_hub_model_id,
# )
import os
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
os.environ['TF_GPU_THREAD_COUNT'] = '16'

modelCheckpoint_callback = tf.keras.callbacks.ModelCheckpoint('./model/model_best.h5', monitor='val_loss', save_best_only=True, mode='min')

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs")

early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=6, mode='min', verbose=1)

callbacks = [metric_callback, tensorboard_callback, modelCheckpoint_callback, early_stopping_callback]

model.fit(
    train_set,
    validation_data=validation_set,
    epochs=num_train_epochs,
    callbacks=callbacks,
)

Epoch 1/3


  tensor = as_tensor(value)


111/772 [===>..........................] - ETA: 19:30 - loss: 9.5678

KeyboardInterrupt: 