# Sequential Labelling

## Import Modules

In [2]:
!pip install evaluate nusacrowd seqeval
import evaluate
import numpy as np
import transformers
import tensorflow as tf

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForTokenClassification, create_optimizer, TFAutoModelForTokenClassification
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nusacrowd
  Downloading nusacrowd-0.1.2-py3-none-any.whl (384 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m384.2/384.2 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting loguru>=0.5.3 (from nusacrowd)
  Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bioc>=1.3.7 (from nusacrowd)
  Downloading bioc-2.1-py3-none-any.whl (33 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Download



## Import Dataset

In [3]:
nergrit = load_dataset('NusaCrowd/nergrit')

Downloading builder script:   0%|          | 0.00/6.33k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/15.0M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [4]:
print(nergrit["train"][0])

{'index': '0', 'tokens': ['Indonesia', 'mengekspor', 'produk', 'industri', 'skala', 'besar', 'ke', 'Amerika', 'Serikat', '.', 'Ekspor', 'dilakukan', 'melalui', 'Pelabuhan', 'Tanjung', 'Priok', ',', 'Jakarta', 'Utara', ',', 'Selasa', '(', '15', '/', '8', '/', '2018', ')', '.', 'Komoditas', 'yang', 'dikirim', 'terdiri', 'dari', '50', 'persen', 'sepatu', ',', '15', 'persen', 'garmen', ',', '10', 'persen', 'produk', 'karet', ',', 'ban', 'dan', 'turunannya', ',', 'alat', '-', 'alat', 'elektronik', '10', 'persen', ',', 'dan', 'produk', 'lainnya', '15', 'persen', '.', '"', 'Bukan', 'bahan', 'mentah', ',', 'tetapi', 'sudah', 'bahan', '-', 'bahan', 'produksi', ',', 'produk', '-', 'produk', 'industri', 'yang', 'kita', 'harapkan', 'ini', 'akan', 'meningkatkan', 'ekspor', 'kita', ',', '"', 'kata', 'Presiden', 'dalam', 'sambutannya', 'pada', 'acara', 'pelepasan', 'ekspor', 'di', 'Jakarta', 'International', 'Container', 'Terminal', '(', 'JICT', ')', ',', 'Selasa', '(', '15', '/', '5', '/', '2018', '

In [5]:
# Extract the 'ner_tag' column from the training set
ner_tags_list = nergrit["train"]["ner_tag"]

# Flatten the list of lists
flattened_ner_tags = [tag for sublist in ner_tags_list for tag in sublist]

# Get the unique labels
unique_labels = list(set(flattened_ner_tags))

print(unique_labels)

['I-PRC', 'B-NOR', 'I-QTY', 'I-PER', 'I-CRD', 'B-DAT', 'B-FAC', 'B-REG', 'B-ORD', 'B-LAN', 'B-EVT', 'I-FAC', 'B-LOC', 'B-PRD', 'B-WOA', 'B-ORG', 'I-TIM', 'I-LAN', 'I-REG', 'B-TIM', 'I-EVT', 'I-LOC', 'I-GPE', 'B-QTY', 'B-MON', 'I-DAT', 'B-PRC', 'I-MON', 'B-PER', 'B-CRD', 'B-GPE', 'I-ORG', 'B-LAW', 'I-NOR', 'I-PRD', 'O', 'I-LAW', 'I-ORD', 'I-WOA']


Based on the documentation in https://huggingface.co/datasets/NusaCrowd/nergrit.
<br>
Label:

'CRD': Cardinal

'DAT': Date

'EVT': Event

'FAC': Facility

'GPE': Geopolitical Entity

'LAW': Law Entity (such as Undang-Undang)

'LOC': Location

'MON': Money

'NOR': Political Organization

'ORD': Ordinal

'ORG': Organization

'PER': Person

'PRC': Percent

'PRD': Product

'QTY': Quantity

'REG': Religion

'TIM': Time

'WOA': Work of Art

'LAN': Language

In [6]:
labels_dict = {
    'O': 0,
    'B-CRD': 1, 'I-CRD': 2,
    'B-DAT': 3, 'I-DAT': 4,
    'B-EVT': 5, 'I-EVT': 6,
    'B-FAC': 7, 'I-FAC': 8,
    'B-GPE': 9, 'I-GPE': 10,
    'B-LAW': 11, 'I-LAW': 12,
    'B-LOC': 13, 'I-LOC': 14,
    'B-MON': 15, 'I-MON': 16,
    'B-NOR': 17, 'I-NOR': 18,
    'B-ORD': 19, 'I-ORD': 20,
    'B-ORG': 21, 'I-ORG': 22,
    'B-PER': 23, 'I-PER': 24,
    'B-PRC': 25, 'I-PRC': 26,
    'B-PRD': 27, 'I-PRD': 28,
    'B-QTY': 29, 'I-QTY': 30,
    'B-REG': 31, 'I-REG': 32,
    'B-TIM': 33, 'I-TIM': 34,
    'B-WOA': 35, 'I-WOA': 36,
    'B-LAN': 37, 'I-LAN': 38
}

## Preprocessing

In [7]:
# use IndoBERT
tokenizer = AutoTokenizer.from_pretrained('indolem/indobert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/234k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [8]:
example = nergrit["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens[:10]

['[CLS]',
 'indonesia',
 'mengekspor',
 'produk',
 'industri',
 'skala',
 'besar',
 'ke',
 'amerika',
 'serikat']

Based on the documentation:

> This adds some special tokens [CLS] and [SEP] and the subword tokenization creates a mismatch between the input and labels. A single word corresponding to a single label may now be split into two subwords. You'll need to realign the tokens and labels by:

1. Mapping all tokens to their corresponding word with the word_ids method.
2. Assigning the label -100 to the special tokens [CLS] and [SEP] so they're ignored by the PyTorch loss function.
3. Only labeling the first token of a given word. Assign -100 to other subtokens from the same word.

So it is needed to realign the token and labels, and truncate the sequence if it is longer than the models maximum length

Based on the reference:

> It's more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

So, creating the data collator.

In [9]:
# Rename the 'labels' column to 'ner_tag'
nergrit["train"] = nergrit["train"].rename_column("ner_tag", "labels")
nergrit["validation"] = nergrit["validation"].rename_column("ner_tag", "labels")
nergrit["test"] = nergrit["test"].rename_column("ner_tag", "labels")

In [10]:
def tokenize_and_align_labels(dataset):
    tokenized_inputs = tokenizer(dataset["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, labels_per_example in enumerate(dataset["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                token_labels = [labels_dict[label] for label in labels_per_example]
                label_ids.append(token_labels[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [11]:
tokenized_nergrit = nergrit.map(tokenize_and_align_labels, batched=True) #processing in batch

Map:   0%|          | 0/12551 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/2402 [00:00<?, ? examples/s]

Map:   0%|          | 0/2526 [00:00<?, ? examples/s]

In [12]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

## Metrics Evaluation

I will be using just accuracy

In [13]:
seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [14]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [unique_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [unique_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "accuracy": results["overall_accuracy"],
    }

## Fine-Tuning the Model

In [15]:
labels = list(labels_dict.keys())

id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in id2label.items()}


In [16]:
len(labels)

39

In [17]:
batch_size = 16
num_train_epochs = 3
num_train_steps = (len(tokenized_nergrit["train"]) // batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

In [18]:
class LoRALayer(tf.keras.layers.Layer):
    def __init__(self, original_dim, rank, **kwargs):
        super(LoRALayer, self).__init__(**kwargs)
        self.original_dim = original_dim
        self.rank = rank

    def build(self, input_shape):
        self.u = self.add_weight(name='u', shape=(self.original_dim, self.rank), initializer='uniform')
        self.v = self.add_weight(name='v', shape=(self.rank, self.original_dim), initializer='uniform')
        super(LoRALayer, self).build(input_shape)

    def call(self, x):
        low_rank_matrix = tf.matmul(tf.matmul(self.u, self.v), x)
        return x + low_rank_matrix
    
class LoRAModel(TFAutoModelForTokenClassification):
    def __init__(self, *args, **kwargs):
        super(LoRAModel, self).__init__(*args, **kwargs)
        self.lora = LoRALayer(original_dim=self.config.hidden_size, rank=32)  # You can adjust the rank

    def call(self, inputs):
        # Apply LoRA to embeddings
        embeddings = super(LoRAModel, self).bert.embeddings(inputs)
        embeddings = self.lora(embeddings)
        
        # Continue with the rest of the model's forward pass
        outputs = super(LoRAModel, self).bert(inputs, inputs_embeds=embeddings)
        return outputs

model = LoRAModel.from_pretrained(
    "indolem/indobert-base-uncased", num_labels=39, id2label=id2label, label2id=label2id, from_pt=True
)

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
print(tokenized_nergrit["train"][0])

{'index': '0', 'tokens': ['Indonesia', 'mengekspor', 'produk', 'industri', 'skala', 'besar', 'ke', 'Amerika', 'Serikat', '.', 'Ekspor', 'dilakukan', 'melalui', 'Pelabuhan', 'Tanjung', 'Priok', ',', 'Jakarta', 'Utara', ',', 'Selasa', '(', '15', '/', '8', '/', '2018', ')', '.', 'Komoditas', 'yang', 'dikirim', 'terdiri', 'dari', '50', 'persen', 'sepatu', ',', '15', 'persen', 'garmen', ',', '10', 'persen', 'produk', 'karet', ',', 'ban', 'dan', 'turunannya', ',', 'alat', '-', 'alat', 'elektronik', '10', 'persen', ',', 'dan', 'produk', 'lainnya', '15', 'persen', '.', '"', 'Bukan', 'bahan', 'mentah', ',', 'tetapi', 'sudah', 'bahan', '-', 'bahan', 'produksi', ',', 'produk', '-', 'produk', 'industri', 'yang', 'kita', 'harapkan', 'ini', 'akan', 'meningkatkan', 'ekspor', 'kita', ',', '"', 'kata', 'Presiden', 'dalam', 'sambutannya', 'pada', 'acara', 'pelepasan', 'ekspor', 'di', 'Jakarta', 'International', 'Container', 'Terminal', '(', 'JICT', ')', ',', 'Selasa', '(', '15', '/', '5', '/', '2018', '

In [20]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_nergrit["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_nergrit["validation"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [21]:
model.compile(optimizer=optimizer)

In [22]:
# create the callback for the model
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics, 
    eval_dataset=tf_validation_set,
)

push_to_hub_callback = PushToHubCallback(
    output_dir="indobert-base-uncased-lora-nergrit",
    tokenizer=tokenizer
)

callbacks = [metric_callback, push_to_hub_callback]

Cloning https://huggingface.co/apwic/indobert-base-uncased-lora-nergrit into local empty directory.


In [23]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=10, callbacks=callbacks)

Epoch 1/10

  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f1706dfa770>