Steps:
1. Implement a pipeline https://huggingface.co/docs/transformers/v4.28.1/en/add_new_pipeline

Goals:
1. Load dataset in hugging face dataset

Target architecture
1. DistilBERT
2. RoBERTA
3. T5

In [36]:
from datasets import load_dataset

train_ds = load_dataset("json", data_files="data/all_data.json", split=f"train[:70%]" )
val_ds = load_dataset("json", data_files="data/all_data.json", split=f"train[70%:90%]")
test_ds = load_dataset("json", data_files="data/all_data.json", split="train[90%:]")
print(train_ds[1])
print(val_ds[1])
print(test_ds[1])



Found cached dataset json (/Users/mohimenul.admin/.cache/huggingface/datasets/json/default-d7164a1d0bbc86eb/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
Found cached dataset json (/Users/mohimenul.admin/.cache/huggingface/datasets/json/default-d7164a1d0bbc86eb/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
Found cached dataset json (/Users/mohimenul.admin/.cache/huggingface/datasets/json/default-d7164a1d0bbc86eb/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


{'sentence': 'the perianth is bright pink with a green central tube that less than long .', 'label_array': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}
{'sentence': 'the palm oil used is non hydrogenated and certified by roundtable on sustainable .', 'label_array': ['O', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O']}
{'sentence': 'in the summer of 2013 company raised $ 3 million funding by existing investor spark capital .', 'label_array': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PrivateCorp', 'I-PrivateCorp', 'O']}


#### Findings

1. I only need sentence for tokenization, so dataset should have sentence column
2. Need to figure out how to provide labels

In [33]:
# from src.utility import Utility
#
# def add_sentence(sample):
#     sample["sentence"] = Utility.make_sentence(list(sample.keys()))
#     return sample
#
# formatted_ds = test_ds.map(add_sentence)
# print(formatted_ds[5])

In [37]:
from transformers import AutoTokenizer

# Same as before
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(sample):
    return tokenizer(sample["sentence"], add_special_tokens=False, truncation=True, return_tensors="tf", padding="max_length", max_length=35)

tokenized_ds = test_ds.map(tokenize_function)
print(tokenized_ds[0])


Map:   0%|          | 0/1765 [00:00<?, ? examples/s]

{'sentence': 'the irish times hailed him as one of most creative writers our time .', 'label_array': ['O', 'I-WrittenWork', 'I-WrittenWork', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'input_ids': [[1996, 3493, 2335, 16586, 2032, 2004, 2028, 1997, 2087, 5541, 4898, 2256, 2051, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}


In [28]:
# Add padding to label
from src.utility import Constants

def addLabelPadding(sample):
    label_count = len(sample["label_array"])
    for i in range(label_count, Constants.SEQUENCE_LENGTH):
        sample["label_array"].append(Constants.PAD_TOKEN)
    return sample

padded_ds = tokenized_ds.map(addLabelPadding)
input_size = len(padded_ds[0]['input_ids'][0])
label_array_size = len(padded_ds[0]['label_array'])
print(padded_ds[0], input_size, label_array_size)

Map:   0%|          | 0/1765 [00:00<?, ? examples/s]

{'sentence': 'the irish times hailed him as one of most creative writers our time .', 'label_array': ['O', 'I-WrittenWork', 'I-WrittenWork', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>'], 'input_ids': [[1996, 3493, 2335, 16586, 2032, 2004, 2028, 1997, 2087, 5541, 4898, 2256, 2051, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} 35 35


In [41]:
# Add label in the map as well
from src.label_process import LabelProcess
from src.utility import Constants

train_label_array = LabelProcess.getLabelArray("../data/en-train.conll")
dev_label_array = LabelProcess.getLabelArray("../data/en-dev.conll")

label_array = train_label_array + dev_label_array
label_array = list(set(label_array))

lp_class = LabelProcess(label_array)

def addEncoding(sample):
    sample["labels"] = list(map(lambda x: LabelProcess.encode(x, label_array), sample['label_array']))
    return sample

final_ds = padded_ds.map(addEncoding)
print(final_ds[0])

Loading cached processed dataset at /Users/mohimenul.admin/.cache/huggingface/datasets/json/default-d7164a1d0bbc86eb/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-5021842c30c90c2e.arrow


1765
{'sentence': 'the irish times hailed him as one of most creative writers our time .', 'label_array': ['O', 'I-WrittenWork', 'I-WrittenWork', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>'], 'input_ids': [[1996, 3493, 2335, 16586, 2032, 2004, 2028, 1997, 2087, 5541, 4898, 2256, 2051, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'labels': [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [42]:
# Convert ta dataset to tf dataset
tf_train_dataset = final_ds.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols=["labels"],
    shuffle=False,
    batch_size=8,
)

print(tf_train_dataset)

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 1, 35), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 1, 35), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 35, 68), dtype=tf.float32, name=None))>


In [44]:
# Start training loop
import tensorflow as tf
import numpy as np
from transformers import TFAutoModelForTokenClassification

# Same as before
checkpoint = "distilbert-base-uncased"
model = TFAutoModelForTokenClassification.from_pretrained(checkpoint)

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer="adam",
    loss=loss,
    metrics=["accuracy"],
)
model.fit(
    tf_train_dataset,
    validation_data=tf_train_dataset,
    batch_size=8,
)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForTokenClassification: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_119', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

ValueError: in user code:

    File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/keras/engine/training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/transformers/modeling_tf_utils.py", line 1567, in train_step
        y_pred = self(x, training=True)
    File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_file426wjxtz.py", line 36, in tf__run_call_with_unpacked_inputs
        retval_ = ag__.converted_call(ag__.ld(func), (ag__.ld(self),), dict(**ag__.ld(unpacked_inputs)), fscope)
    File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_filejwvr8fxe.py", line 14, in tf__call
        outputs = ag__.converted_call(ag__.ld(self).distilbert, (), dict(input_ids=ag__.ld(input_ids), attention_mask=ag__.ld(attention_mask), head_mask=ag__.ld(head_mask), inputs_embeds=ag__.ld(inputs_embeds), output_attentions=ag__.ld(output_attentions), output_hidden_states=ag__.ld(output_hidden_states), return_dict=ag__.ld(return_dict), training=ag__.ld(training)), fscope)
    File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_file426wjxtz.py", line 36, in tf__run_call_with_unpacked_inputs
        retval_ = ag__.converted_call(ag__.ld(func), (ag__.ld(self),), dict(**ag__.ld(unpacked_inputs)), fscope)
    File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_file_v8c68s9.py", line 92, in tf__call
        embedding_output = ag__.converted_call(ag__.ld(self).embeddings, (ag__.ld(input_ids),), dict(inputs_embeds=ag__.ld(inputs_embeds)), fscope)
    File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_file2m_t8175.py", line 53, in tf__call
        final_embeddings = ag__.converted_call(ag__.ld(self).LayerNorm, (), dict(inputs=ag__.ld(final_embeddings)), fscope)

    ValueError: Exception encountered when calling layer 'tf_distil_bert_for_token_classification_5' (type TFDistilBertForTokenClassification).
    
    in user code:
    
        File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/transformers/modeling_tf_utils.py", line 825, in run_call_with_unpacked_inputs  *
            return func(self, **unpacked_inputs)
        File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py", line 831, in call  *
            outputs = self.distilbert(
        File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_file426wjxtz.py", line 36, in tf__run_call_with_unpacked_inputs
            retval_ = ag__.converted_call(ag__.ld(func), (ag__.ld(self),), dict(**ag__.ld(unpacked_inputs)), fscope)
        File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_file_v8c68s9.py", line 92, in tf__call
            embedding_output = ag__.converted_call(ag__.ld(self).embeddings, (ag__.ld(input_ids),), dict(inputs_embeds=ag__.ld(inputs_embeds)), fscope)
        File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_file2m_t8175.py", line 53, in tf__call
            final_embeddings = ag__.converted_call(ag__.ld(self).LayerNorm, (), dict(inputs=ag__.ld(final_embeddings)), fscope)
    
        ValueError: Exception encountered when calling layer 'distilbert' (type TFDistilBertMainLayer).
        
        in user code:
        
            File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/transformers/modeling_tf_utils.py", line 825, in run_call_with_unpacked_inputs  *
                return func(self, **unpacked_inputs)
            File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py", line 409, in call  *
                embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds)  # (bs, seq_length, dim)
            File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler  **
                raise e.with_traceback(filtered_tb) from None
            File "/var/folders/ln/90446hl52_qfwrvp7fhtg0400000gq/T/__autograph_generated_file2m_t8175.py", line 53, in tf__call
                final_embeddings = ag__.converted_call(ag__.ld(self).LayerNorm, (), dict(inputs=ag__.ld(final_embeddings)), fscope)
        
            ValueError: Exception encountered when calling layer 'embeddings' (type TFEmbeddings).
            
            in user code:
            
                File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py", line 131, in call  *
                    final_embeddings = self.LayerNorm(inputs=final_embeddings)
                File "/Users/mohimenul.admin/miniforge3/envs/transformer_fine_tuning/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler  **
                    raise e.with_traceback(filtered_tb) from None
            
                ValueError: Exception encountered when calling layer 'LayerNorm' (type LayerNormalization).
                
                Cannot reshape a tensor with 768 elements to shape [1,1,35,1] (35 elements) for '{{node tf_distil_bert_for_token_classification_5/distilbert/embeddings/LayerNorm/Reshape}} = Reshape[T=DT_FLOAT, Tshape=DT_INT32](tf_distil_bert_for_token_classification_5/distilbert/embeddings/LayerNorm/Reshape/ReadVariableOp, tf_distil_bert_for_token_classification_5/distilbert/embeddings/LayerNorm/Reshape/shape)' with input shapes: [768], [4] and with input tensors computed as partial shapes: input[1] = [1,1,35,1].
                
                Call arguments received by layer 'LayerNorm' (type LayerNormalization):
                  • inputs=tf.Tensor(shape=(None, 1, 35, 768), dtype=float32)
            
            
            Call arguments received by layer 'embeddings' (type TFEmbeddings):
              • input_ids=tf.Tensor(shape=(None, 1, 35), dtype=int32)
              • position_ids=None
              • inputs_embeds=None
              • training=True
        
        
        Call arguments received by layer 'distilbert' (type TFDistilBertMainLayer):
          • input_ids=tf.Tensor(shape=(None, 1, 35), dtype=int32)
          • attention_mask=tf.Tensor(shape=(None, 1, 35), dtype=int32)
          • head_mask=None
          • inputs_embeds=None
          • output_attentions=False
          • output_hidden_states=False
          • return_dict=True
          • training=True
    
    
    Call arguments received by layer 'tf_distil_bert_for_token_classification_5' (type TFDistilBertForTokenClassification):
      • input_ids={'input_ids': 'tf.Tensor(shape=(None, 1, 35), dtype=int64)', 'attention_mask': 'tf.Tensor(shape=(None, 1, 35), dtype=int64)'}
      • attention_mask=None
      • head_mask=None
      • inputs_embeds=None
      • output_attentions=None
      • output_hidden_states=None
      • return_dict=None
      • labels=None
      • training=True


In [46]:
https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb#scrollTo=5m4ebkAM4ph5