In [None]:
# Mount Google Drive under /content/drive so the notebook can reach files stored there
# (this import only exists inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the Colab Notebooks folder so relative paths used later
# (e.g. "nn_model.weights.h5") resolve inside Drive.
%cd /content/drive/My Drive/Colab Notebooks/

/content/drive/My Drive/Colab Notebooks


In [None]:
!pip install keras_nlp
!pip install jax

Collecting keras_nlp
  Downloading keras_nlp-0.11.1-py3-none-any.whl (515 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/515.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/515.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m389.1/515.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.3/515.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras-core (from keras_nlp)
  Downloading keras_core-0.1.7-py3-none-any.whl (950 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-text (from keras_nlp)
  Downloading tensorflow_text-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━

# Import Libraries

In [None]:
import os
os.environ["KERAS_BACKEND"] = "jax" # # you can also use tensorflow or torch

import keras
import keras_nlp
from keras import ops
import tensorflow as tf

import json
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

import plotly.graph_objs as go
import plotly.express as px

# Configuration

In [None]:
class CFG:
    """Single place for every tunable value used by the notebook."""

    seed = 42  # random seed passed to every seeded call
    preset = "deberta_v3_base_en"  # name of pretrained backbone

    # --- training input ---
    train_seq_len = 1024  # max size of input sequence for training
    train_batch_size = 8  # size of the input batch in training, x 2 as two GPUs

    # --- inference input ---
    infer_seq_len = 2000  # max size of input sequence for inference
    infer_batch_size = 2  # size of the input batch in inference, x 2 as two GPUs

    # --- optimisation ---
    epochs = 10  # number of epochs to train
    lr_mode = "exp"  # lr scheduler mode from one of "cos", "step", "exp"

    # BIO tags; the position of a tag in this list is its integer label
    labels = [
        "B-EMAIL",
        "B-ID_NUM",
        "B-NAME_STUDENT",
        "B-PHONE_NUM",
        "B-STREET_ADDRESS",
        "B-URL_PERSONAL",
        "B-USERNAME",
        "I-ID_NUM",
        "I-NAME_STUDENT",
        "I-PHONE_NUM",
        "I-STREET_ADDRESS",
        "I-URL_PERSONAL",
        "O",
    ]
    id2label = {idx: name for idx, name in enumerate(labels)}  # integer label -> BIO tag
    label2id = {name: idx for idx, name in enumerate(labels)}  # BIO tag -> integer label
    num_labels = len(labels)  # number of PII (NER) tags

    train = False  # whether to train or use already trained ckpt

# Reproducibility
Sets the random seed so that each run produces similar results.

In [None]:
# Seed the random number generators through Keras, using the seed from CFG.
keras.utils.set_random_seed(CFG.seed)

In [None]:
# List the accelerator devices Keras can see (the recorded run shows a single 'gpu:0')
devices = keras.distribution.list_devices()
print("Device:", devices)

# Only configure a distribution when more than one device is available;
# with a single device the default placement is left untouched.
if len(devices) > 1:
    # Data parallelism across all listed devices
    data_parallel = keras.distribution.DataParallel(devices=devices)

    # Set the global distribution.
    keras.distribution.set_distribution(data_parallel)

Device: ['gpu:0']


In [None]:
# Check GPU availability as seen by TensorFlow (which runs the tf.data input pipeline);
# this is independent of the Keras device list printed earlier.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [None]:
# Set Keras' global dtype policy to "mixed_float16". The model's final softmax layer is
# created with dtype="float32" explicitly, so predictions stay in float32.
keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
# Folder that holds train.json.
# NOTE(review): the trailing slash here plus the "/" added where the path is built
# yields a double slash ("…Notebooks//train.json") — harmless on POSIX, but worth normalising.
BASE_PATH = "/content/drive/My Drive/Colab Notebooks/"

In [None]:
# Train-Valid data: read the annotated documents. The context manager closes the
# file handle as soon as parsing is done instead of leaking it for the whole session.
with open(f"{BASE_PATH}/train.json") as f:
    data = json.load(f)

# Pre-allocate object arrays: each element holds one variable-length NumPy array,
# because every document has its own number of words.
words = np.empty(len(data), dtype=object)
labels = np.empty(len(data), dtype=object)

# Fill the arrays
for i, x in tqdm(enumerate(data), total=len(data)):
    words[i] = np.array(x["tokens"])  # word strings of document i
    labels[i] = np.array([CFG.label2id[label] for label in x["labels"]])  # BIO tag -> integer label

  0%|          | 0/6807 [00:00<?, ?it/s]

# Exploratory Data Analysis

In [None]:
# Count how often each integer label occurs across all documents
all_labels = np.array([x for label in labels for x in label])
unique_labels, present_counts = np.unique(all_labels, return_counts=True)

# np.unique only reports labels that actually occur. Scatter the counts into a
# vector with one slot per entry of CFG.labels so that bar i always belongs to
# CFG.labels[i]; a label that never occurs keeps a count of 0 instead of shifting
# every following bar onto the wrong name.
label_counts = np.zeros(CFG.num_labels, dtype=present_counts.dtype)
label_counts[unique_labels] = present_counts

# Plotting (log scale: the "O" class dwarfs the PII classes)
fig = go.Figure(data=go.Bar(x=CFG.labels, y=label_counts))
fig.update_layout(
    title="Label Distribution",
    xaxis_title="Labels",
    yaxis_title="Count",
    yaxis_type="log",
)

# Print the raw count above each bar
fig.update_traces(text=label_counts, textposition="outside")
fig.show()


In [None]:
# Hold out 20% of the documents for validation; the split is seeded with CFG.seed
# so the same documents end up in each part on every run.
train_words, valid_words, train_labels, valid_labels = train_test_split(
    words, labels, test_size=0.2, random_state=CFG.seed
)

In [None]:
# To convert string input or list of strings input to numerical tokens
tokenizer = keras_nlp.models.DebertaV3Tokenizer.from_preset(
    CFG.preset,
)

# Preprocessing layer to add special tokens: [CLS], [SEP], [PAD]
# NOTE(review): this module-level packer (sequence_length=10) is not referenced by the
# visible pipeline — process_data() builds its own packer with the real sequence length.
packer = keras_nlp.layers.MultiSegmentPacker(
    start_value=tokenizer.cls_token_id,
    end_value=tokenizer.sep_token_id,
    sequence_length=10,
)

Downloading from https://www.kaggle.com/api/v1/models/keras/deberta_v3/keras/deberta_v3_base_en/2/download/metadata.json...
100%|██████████| 141/141 [00:00<00:00, 180kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/deberta_v3/keras/deberta_v3_base_en/2/download/tokenizer.json...
100%|██████████| 424/424 [00:00<00:00, 384kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/deberta_v3/keras/deberta_v3_base_en/2/download/assets/tokenizer/vocabulary.spm...
100%|██████████| 2.35M/2.35M [00:00<00:00, 125MB/s]


In [None]:
# Switch on TensorFlow's NumPy-compatible tensor behavior.
# NOTE(review): presumably required by the NumPy-style expressions in the preprocessing
# functions (e.g. `tokens != 0`, slice comparisons) — confirm before removing.
tf.experimental.numpy.experimental_enable_numpy_behavior()

# Data Processing


In [None]:
def get_tokens(words, seq_len, packer):
    """Tokenize the words of one document and pack them into model inputs.

    Args:
        words: the words of a single document.
        seq_len: maximum number of tokens kept after packing.
        packer: layer that adds the special tokens and padding.

    Returns:
        A tuple `(inputs, tokens, token_words)` where `inputs` is the dict fed to the
        model (`token_ids`, `padding_mask`), `tokens` is the packed token id vector
        and `token_words` is the per-word tokenization.
    """
    # Per-word tokenization,
    # ex: (words) ["It's", "a", "cat"] ->  (token_words) [[1, 2], [3], [4]]
    token_words = tf.expand_dims(tokenizer(words), axis=-1)
    # Flatten to a single token stream,
    # ex: (token_words) [[1, 2], [3], [4]] -> (flat_tokens) [1, 2, 3, 4]
    flat_tokens = tf.reshape(token_words, [-1])
    # Add special tokens / padding, keep the first segment, cap at seq_len
    tokens = packer(flat_tokens)[0][:seq_len]
    # Token id 0 is treated as padding, everything else as a real token
    return {"token_ids": tokens, "padding_mask": tokens != 0}, tokens, token_words


def get_token_ids(token_words):
    """Build, for every sub-word token, the index of the word it came from.

    Args:
        token_words: per-word tokenization as returned by `get_tokens`
            (assumed to have one row per word — TODO confirm it is ragged).

    Returns:
        A 1-D tensor in which word index `i` is repeated once per token of word `i`.
    """
    # Get word indices: 0 .. number_of_words - 1
    word_ids = tf.range(tf.shape(token_words)[0])
    # Get size of each word (number of tokens in its row), flattened to 1-D
    word_size = tf.reshape(tf.map_fn(lambda word: tf.shape(word)[0:1], token_words), [-1])
    # Repeat word_id with size of word to get token_id
    token_ids = tf.repeat(word_ids, word_size)
    return token_ids


def get_token_labels(word_labels, token_ids, seq_len):
    """Expand word-level labels to a token-level label vector of length `seq_len`.

    Only the first token of each word carries the word's label; every other
    position (continuation tokens, special tokens, padding) is set to -100.

    Args:
        word_labels: integer label per word.
        token_ids: word index per token, as produced by `get_token_ids`.
        seq_len: length of the returned vector.

    Returns:
        A 1-D label tensor of length `seq_len`.
    """
    # True where a token opens a new word (the very first token always does)
    starts_word = tf.concat([[True], token_ids[1:] != token_ids[:-1]], axis=0)
    # Copy each word's label onto its tokens, then blank out the non-first tokens
    aligned = tf.where(starts_word, tf.gather(word_labels, token_ids), -100)
    # Leave room for the two special tokens ([CLS], [SEP])
    aligned = aligned[: seq_len - 2]
    # One slot in front for [CLS]; the rest behind covers [SEP] and [PAD]
    n_trailing = seq_len - tf.shape(aligned)[0] - 1
    return tf.pad(aligned, [[1, n_trailing]], constant_values=-100)


def process_token_ids(token_ids, seq_len):
    """Truncate and pad the token -> word index vector to length `seq_len`.

    Positions that belong to special tokens or padding are filled with -1 so they
    can be told apart from real word indices later on.
    """
    # Leave room for the two special tokens ([CLS], [SEP])
    kept = token_ids[: seq_len - 2]
    # One slot in front for [CLS]; the rest behind covers [SEP] and [PAD]
    n_trailing = seq_len - tf.shape(kept)[0] - 1
    return tf.pad(kept, [[1, n_trailing]], constant_values=-1)


def process_data(seq_len=720, has_label=True, return_ids=False):
    """Create the per-sample mapping function used by the dataset pipeline.

    Args:
        seq_len: fixed length of every produced sequence.
        has_label: if True the mapper returns `(inputs, token_labels)`.
        return_ids: if True (and `has_label` is False) the mapper returns the padded
            token -> word index vector instead of model inputs.

    Returns:
        A function that takes a dict with key "words" (and "labels" when
        `has_label` is True) and returns the processed sample.
    """
    # Layer that adds the special tokens: [CLS], [SEP], [PAD]
    packer = keras_nlp.layers.MultiSegmentPacker(
        start_value=tokenizer.cls_token_id,
        end_value=tokenizer.sep_token_id,
        sequence_length=seq_len,
    )

    def process(x):
        # Model inputs plus the per-word tokenization needed for alignment
        inputs, _, words_int = get_tokens(x["words"], seq_len, packer)
        # Token -> word index mapping
        token_ids = get_token_ids(words_int)

        if has_label:
            # Training / validation sample: inputs with token-level labels
            return inputs, get_token_labels(x["labels"], token_ids, seq_len)
        if return_ids:
            # Only the mapping, padded to line up with the packed tokens
            return process_token_ids(token_ids, seq_len)
        # Plain inference sample
        return inputs

    return process

# Dataloader

In [None]:
def build_dataset(words, labels=None, return_ids=False, batch_size=4,
                  seq_len=512, shuffle=False, cache=True, drop_remainder=True):
    """Build a batched `tf.data` pipeline from per-document word (and label) arrays.

    Args:
        words: sequence of per-document word arrays (variable length).
        labels: optional sequence of per-document integer label arrays aligned with
            `words`. When given, the dataset yields `(inputs, token_labels)`.
        return_ids: when `labels` is None, yield the padded token -> word index
            vector instead of model inputs.
        batch_size: number of samples per batch.
        seq_len: fixed token length of every sample.
        shuffle: shuffle samples (buffer of 1024, seeded with `CFG.seed`).
        cache: cache the processed samples after the first pass.
        drop_remainder: drop the final batch if it is smaller than `batch_size`.

    Returns:
        The resulting `tf.data.Dataset`, batched and prefetched.
    """
    AUTO = tf.data.AUTOTUNE

    # Documents differ in length, so they are stored as ragged tensors
    slices = {"words": tf.ragged.constant(words)}
    if labels is not None:
        slices["labels"] = tf.ragged.constant(labels)

    ds = tf.data.Dataset.from_tensor_slices(slices)
    ds = ds.map(
        process_data(seq_len=seq_len,
                     has_label=labels is not None,
                     return_ids=return_ids),
        num_parallel_calls=AUTO,
    )  # apply processing
    if cache:
        ds = ds.cache()  # cache processed samples
    if shuffle:
        ds = ds.shuffle(1024, seed=CFG.seed)
        opt = tf.data.Options()
        # `deterministic` is the supported option name; the previously used
        # `experimental_deterministic` is its deprecated alias.
        opt.deterministic = False
        ds = ds.with_options(opt)
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)  # batch dataset
    ds = ds.prefetch(AUTO)  # prefetch next batch
    return ds

## Build Train & Valid Dataloader

In [None]:
# Training pipeline: shuffled, fixed training sequence length and batch size
train_ds = build_dataset(train_words, train_labels,  batch_size=CFG.train_batch_size,
                         seq_len=CFG.train_seq_len, shuffle=True)

# Validation pipeline used during training: same shapes as training, but not shuffled
valid_ds = build_dataset(valid_words, valid_labels, batch_size=CFG.train_batch_size,
                         seq_len=CFG.train_seq_len, shuffle=False)

In [None]:
# Pull one batch from each pipeline to inspect what the model receives
X,y = next(iter(train_ds))
X_val_data,y_val_data = next(iter(valid_ds))


In [None]:
# Names of the input features contained in a batch
X.keys()

dict_keys(['token_ids', 'padding_mask'])

In [None]:
# Turn the sampled training batch into one row per token position:
# column 0 = token id, column 1 = padding mask.
token_ids = X['token_ids'].numpy().reshape(-1, 1)
padding_masks = X['padding_mask'].numpy().reshape(-1, 1)
X_train = np.hstack((token_ids, padding_masks))
y_train = y.numpy().flatten()

# Same layout for the sampled validation batch
token_ids_val = X_val_data['token_ids'].numpy().reshape(-1, 1)
padding_masks_val = X_val_data['padding_mask'].numpy().reshape(-1, 1)
X_test = np.hstack((token_ids_val, padding_masks_val))
y_test = y_val_data.numpy().flatten()

X_train,y_train

(array([[   1,    1],
        [6738,    1],
        [ 877,    1],
        ...,
        [   0,    0],
        [   0,    0],
        [   0,    0]], dtype=int32),
 array([-100,   12,   12, ..., -100, -100, -100], dtype=int32))

# Loss & Metric

## Loss: CrossEntropy

In [None]:
class CrossEntropy(keras.losses.SparseCategoricalCrossentropy):
    """Sparse categorical cross-entropy averaged over the non-ignored tokens only.

    Args:
        ignore_class: label value excluded from the loss (default -100). With
            `None`, the plain mean over all tokens is returned.
        reduction: forwarded to the parent loss (default `None`).
        **args: any further keyword arguments for the parent loss.
    """

    def __init__(self, ignore_class=-100, reduction=None, **args):
        super().__init__(reduction=reduction, **args)
        self.ignore_class = ignore_class

    def call(self, y_true, y_pred):
        # Collapse batch and sequence dimensions: one row per token
        flat_true = ops.reshape(y_true, [-1])
        flat_pred = ops.reshape(y_pred, [-1, CFG.num_labels])
        per_token = super().call(flat_true, flat_pred)

        if self.ignore_class is None:
            return ops.mean(per_token)

        # Zero the contribution of ignored tokens ...
        keep = ops.not_equal(
            flat_true, ops.cast(self.ignore_class, flat_pred.dtype)
        )
        total = ops.sum(ops.where(keep, per_token, 0.0))
        # ... and divide by the number of kept tokens (at least 1 to avoid 0/0)
        n_kept = ops.maximum(ops.sum(ops.cast(keep, total.dtype)), 1)
        return total / n_kept


## Metric: FBetaScore ($\beta = 5$)

In [None]:
class FBetaScore(keras.metrics.FBetaScore):
    """F-beta score over token predictions that skips selected label values.

    Tokens whose true label is listed in `ignore_classes` contribute to none of
    the true-positive / false-positive / false-negative counts.

    Args:
        ignore_classes: iterable of integer labels to exclude. Defaults to
            `(-100, 12)`; a falsy value disables the filtering.
        average: averaging mode forwarded to the parent metric.
        beta: recall weight forwarded to the parent metric.
        name: metric name (checkpointing monitors it as `val_f5_score`).
        **args: any further keyword arguments for the parent metric.
    """

    # The default is an immutable tuple so it cannot be mutated and shared between instances.
    def __init__(self, ignore_classes=(-100, 12), average="micro", beta=5.0,
                 name="f5_score", **args):
        super().__init__(beta=beta, average=average, name=name, **args)
        self.ignore_classes = list(ignore_classes) if ignore_classes else []

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate TP / FP / FN for one batch.

        `sample_weight` is accepted for API compatibility but is not used.
        """
        y_pred = ops.convert_to_tensor(y_pred, dtype=self.dtype)
        y_pred = ops.reshape(y_pred, [-1, CFG.num_labels])

        # Keep the labels as integers: they are compared against the ignored ids
        # and used as one-hot indices, neither of which should go through floats.
        y_true = ops.cast(ops.reshape(ops.convert_to_tensor(y_true), [-1]), "int32")

        # Build the mask with boolean ops and cast once at the end, instead of
        # combining an int32 tensor with boolean tensors through `&`.
        keep = ops.ones_like(y_true, dtype="bool")
        for ignore_class in self.ignore_classes:
            keep = ops.logical_and(keep, ops.not_equal(y_true, ignore_class))
        valid_mask = ops.expand_dims(ops.cast(keep, self.dtype), axis=-1)

        y_true = ops.one_hot(y_true, CFG.num_labels)

        if not self._built:
            self._build(y_true.shape, y_pred.shape)

        # Turn probabilities into a hard prediction: mark the arg-max class of each
        # token, and require a non-zero score so an all-zero row predicts nothing.
        threshold = ops.max(y_pred, axis=-1, keepdims=True)
        y_pred = ops.logical_and(
            y_pred >= threshold, ops.abs(y_pred) > 1e-9
        )

        y_pred = ops.cast(y_pred, dtype=self.dtype)
        y_true = ops.cast(y_true, dtype=self.dtype)

        tp = ops.sum(y_pred * y_true * valid_mask, self.axis)
        fp = ops.sum(y_pred * (1 - y_true) * valid_mask, self.axis)
        fn = ops.sum((1 - y_pred) * y_true * valid_mask, self.axis)

        self.true_positives.assign_add(tp)
        self.false_positives.assign_add(fp)
        self.false_negatives.assign_add(fn)

In [None]:
from sklearn.metrics import precision_score, recall_score
def calculate_f5_score(y_true, y_pred, average = 'weighted', beta = 5):
    """Compute an F-beta score from averaged precision and recall.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        average: averaging mode passed to both `precision_score` and `recall_score`.
        beta: recall weight; the default of 5 gives the F5 score.

    Returns:
        (1 + beta^2) * P * R / (beta^2 * P + R), or 0.0 wherever precision and
        recall are both zero (the formula would otherwise evaluate 0/0 to NaN).
    """
    # Calculate precision and recall
    precision = precision_score(y_true, y_pred, average=average)
    recall = recall_score(y_true, y_pred, average=average)

    beta_sq = beta ** 2
    numerator = np.asarray((1 + beta_sq) * (precision * recall), dtype=float)
    denominator = np.asarray((beta_sq * precision) + recall, dtype=float)

    # Divide only where the denominator is non-zero; other positions stay 0.0.
    # Working on arrays keeps average=None (per-class scores) functional.
    f_beta = np.divide(
        numerator, denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )

    # Hand back a scalar for averaged scores, an array for per-class scores
    return f_beta[()] if f_beta.ndim == 0 else f_beta


In [None]:
# Build Token Classification model

# Pretrained encoder selected by CFG.preset
backbone = keras_nlp.models.DebertaV3Backbone.from_preset(
    CFG.preset,
)
# Classification head: two hidden Dense+Dropout stages on top of the backbone output
out = backbone.output
out = keras.layers.Dense(backbone.hidden_dim, activation="relu")(out)
out = keras.layers.Dropout(0.1)(out)
out = keras.layers.Dense(backbone.hidden_dim, activation="relu")(out)
out = keras.layers.Dropout(0.1)(out)
# One logit per label for every token
out = keras.layers.Dense(CFG.num_labels, name="logits")(out)
# NOTE(review): this dropout acts directly on the logits (after the output layer),
# so individual class scores are randomly zeroed during training — confirm this is intended.
out = keras.layers.Dropout(0.1)(out)
# Softmax is forced to float32 even though the global policy is mixed_float16
out = keras.layers.Activation("softmax", dtype="float32", name="prediction")(out)
model = keras.models.Model(backbone.input, out)

# Compile model for optimizer, loss and metric
# (the learning rate given here is the initial value; when training runs with the
# LearningRateScheduler callback, that callback supplies the per-epoch rate)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=2e-5),
    loss=CrossEntropy(),
    metrics=[FBetaScore()],
)

# Summary of the model architecture
model.summary()

Downloading from https://www.kaggle.com/api/v1/models/keras/deberta_v3/keras/deberta_v3_base_en/2/download/config.json...
100%|██████████| 540/540 [00:00<00:00, 745kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/deberta_v3/keras/deberta_v3_base_en/2/download/model.weights.h5...
100%|██████████| 702M/702M [00:06<00:00, 121MB/s]


# LR Schedule

In [None]:
import math

def get_lr_callback(batch_size=8, mode='cos', epochs=10, plot=False):
    """Create a per-epoch learning-rate schedule callback (warm-up, then decay).

    The rate rises linearly from 6e-6 to `2.5e-6 * batch_size` over the first
    4 epochs, then decays towards 1e-6 according to `mode`.

    Args:
        batch_size: scales the peak learning rate.
        mode: decay shape after the warm-up, one of 'exp', 'step', 'cos'.
        epochs: total number of epochs; sets the length of the cosine decay and
            the range of the optional plot.
        plot: if True, draw the schedule with plotly.

    Returns:
        A `keras.callbacks.LearningRateScheduler`.

    Raises:
        ValueError: if `mode` is not one of the supported values.
    """
    # Fail immediately on a bad mode. Otherwise `lr` is never assigned in `lrfn`
    # and the error only surfaces once the warm-up epochs are over.
    valid_modes = ('exp', 'step', 'cos')
    if mode not in valid_modes:
        raise ValueError(f"mode must be one of {valid_modes}, got {mode!r}")

    lr_start, lr_max, lr_min = 6e-6, 2.5e-6 * batch_size, 1e-6
    lr_ramp_ep, lr_sus_ep, lr_decay = 4, 0, 0.75

    def lrfn(epoch):  # Learning rate update function
        decay_epoch_index = epoch - lr_ramp_ep - lr_sus_ep  # epochs spent decaying
        if epoch < lr_ramp_ep:
            # Linear warm-up from lr_start to lr_max
            lr = (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
        elif epoch < lr_ramp_ep + lr_sus_ep:
            # Hold at the peak (skipped while lr_sus_ep is 0)
            lr = lr_max
        elif mode == 'exp':
            # Geometric decay towards lr_min
            lr = (lr_max - lr_min) * lr_decay**decay_epoch_index + lr_min
        elif mode == 'step':
            # Same decay factor, applied once every two epochs
            lr = lr_max * lr_decay**(decay_epoch_index // 2)
        elif mode == 'cos':
            # Half cosine stretched over the decay epochs plus 3 extra
            decay_total_epochs = epochs - lr_ramp_ep - lr_sus_ep + 3
            phase = math.pi * decay_epoch_index / decay_total_epochs
            lr = (lr_max - lr_min) * 0.5 * (1 + math.cos(phase)) + lr_min
        return lr

    if plot:  # Plot lr curve if plot is True
        fig = px.line(x=np.arange(epochs),
                      y=[lrfn(epoch) for epoch in np.arange(epochs)],
                      title='LR Scheduler',
                      markers=True,
                      labels={'x': 'epoch', 'y': 'lr'})
        fig.update_layout(
            yaxis = dict(
                showexponent = 'all',
                exponentformat = 'e'
            )
        )
        fig.show()

    return keras.callbacks.LearningRateScheduler(lrfn, verbose=False)  # Create lr callback

In [None]:
# Pass the configured epoch count explicitly so the schedule (and its plot) follows
# CFG.epochs instead of silently relying on the function's default of 10.
lr_cb = get_lr_callback(CFG.train_batch_size, mode=CFG.lr_mode, epochs=CFG.epochs, plot=True)

In [None]:
# Restore previously saved weights into the freshly built model.
# NOTE(review): this runs before the training cell regardless of CFG.train and needs the
# file to exist — confirm the warm start is intended. The recorded output shows the saved
# optimizer state being skipped because the variable counts differ.
model.load_weights("nn_model.weights.h5")


Skipping variable loading for optimizer 'loss_scale_optimizer', because it has 4 variables whereas the saved optimizer has 414 variables. 


Skipping variable loading for optimizer 'adam', because it has 2 variables whereas the saved optimizer has 410 variables. 



In [None]:
# Save the whole model to a .keras file.
# NOTE(review): this cell sits before the training cell, so it stores the weights loaded
# from nn_model.weights.h5 rather than the result of a new training run.
model.save("nn_model_fin.keras")

# Training

In [None]:
if not CFG.train:
    # Inference-only run: reuse the stored checkpoint
    model.load_weights("nn_model.weights.h5")
else:
    # Keep only the weights of the epoch with the highest validation F5 score
    mc = keras.callbacks.ModelCheckpoint(
        filepath='nn_model.weights.h5',
        monitor='val_f5_score',
        mode='max',
        save_best_only=True,
        save_weights_only=True,
        verbose=1,
    )
    # Train with the learning-rate schedule and the checkpoint callback
    history = model.fit(
        train_ds,
        validation_data=valid_ds,
        epochs=CFG.epochs,
        callbacks=[lr_cb, mc],
        verbose=1,
    )

# Evaluation


In [None]:
# Build Validation dataloader with "infer_seq_len" (longer sequences, smaller batches than
# during training; caching is turned off).
# NOTE(review): drop_remainder is left at build_dataset's default (True), so a final
# incomplete batch is excluded from the evaluation — confirm this is acceptable.
valid_ds = build_dataset(valid_words, valid_labels, return_ids=False, batch_size=CFG.infer_batch_size,
                        seq_len=CFG.infer_seq_len, shuffle=False, cache=False)

In [None]:
# Evaluate on the validation loader; return_dict=True gives the results keyed by
# name (the recorded run shows 'f5_score' and 'loss').
model.evaluate(valid_ds, return_dict=True, verbose=0)

{'f5_score': 0.940966010093689, 'loss': 0.0003325996804051101}

# Prediction

In [None]:
# Test data
text_data = "Waseem Mabunda 591 Smith Centers Apt. 656 Joshuamouth, RI 95963 ( The Netherlands) 410.526.1667 vpi@mn.nlMind Mapping, Challenge: For several years I have been working for an Asset manager in the Netherlands. During this period I have been involved in many projects. Certainly in the world of asset management, much has changed in recent years in the area of Law and Regulations. What I mainly experience in these projects is that all departments have a different interest in starting a new project. This certainly does not benefit the project. How do you get everyone to complete a project in the common interest and how do you motivate everyone who participate in the project? Selection: An improvement project can be approached in different ways. The most common way is the scrum approach. We work in multidisciplinary teams that work in short sprints, with a fixed length of 1 to 4 weeks. Cooperation is very important and everyone must be able to respond quickly to changing circumstances. Scrum is based on the theory of empirical process control, or empiricism. Empiricism assumes that knowledge arises from experience and making decisions based on what is known. I chose mind mapping because I am looking for a way to show the creativity colleagues always have at the start of a project, to keep this up to date and very important to keep it visible. But also with the thoughts to keep colleagues motivated and to show how their creativity contributes to the project. So I want to see if scrum can be combined with Design Thinking and especially with Mind mapping. Application: When starting a new project at work, I checked whether it is workable to combine the scrum approach with Mind Mapping. The central theme was to increase the STP (Straight through processing) rate for a specific product that we trade with an x percentage. As a scrum team, we have tried to provide insight into the various topics related to the 'increase STP rate' via a paper diagram. 
Each team member could indicate in this diagram his or her creativity which related to increasing the STP rate. After this we went to see if there was a connection between certain ideas. We quickly learned that certain ideas could be combined and that certain steps in the project could be skipped. By combining scrum work and mind mapping, we were able to go live with implementation faster and increase STP speed step by step. By making the project visible through a diagram, colleagues also indicated that this gave them more energy to participate in the project.D e s i g n T h i n k i n gInsight: The insight I got to combine scrum with mind mapping (Design thinking) is that if you make everyone's creativity and thinking visible through Mind mapping, you will come sooner to a solid solution to complete a project. The feedback we received is that it also gives more energy to colleagues who have participated in this project. The biggest challenge was to create support for this new way of working. At the beginning of the project, we showed a short video of how mind mapping works. This gave us immediate support from our fellow team members to combine scrum with mind mapping. https://www.youtube.com/watch?v=tIBN9VJ0S4a The conclusion is that you definitely can combine scrum and Design thinking. Approach: In terms of approach, I wouldn't be much different from what I did in this project. I only see advantages of combining scrum with mind mapping. As described in the alinia insight, there are only benefitsn"
# Wrap the raw text in the same record layout as the training documents;
# words are obtained by splitting on single spaces.
tokenize = text_data.split(" ")
texts_data = {"document": 0, "full_text": text_data, "tokens": tokenize}
test_data = [texts_data]

# Ensure number of samples is divisible by number of devices:
# repeat the last sample `need_samples` times (removed again after prediction)
need_samples = len(devices) - len(test_data) % len(devices)
test_data += [test_data[-1]] * need_samples

# One variable-length word array and one document id per sample
test_words = np.empty(len(test_data), dtype=object)
test_docs = np.empty(len(test_data), dtype=np.int32)
for i, x in enumerate(tqdm(test_data)):
    test_words[i] = np.array(x["tokens"])
    test_docs[i] = x["document"]

# Token -> word index mapping for every sample, fetched as one single batch
id_ds = build_dataset(test_words, return_ids=True, batch_size=len(test_words),
                      seq_len=CFG.infer_seq_len, shuffle=False, cache=False,
                      drop_remainder=False)
test_token_ids = ops.convert_to_numpy(next(iter(id_ds)))

# Model inputs for the same samples
test_ds = build_dataset(test_words, return_ids=False, batch_size=CFG.infer_batch_size,
                        seq_len=CFG.infer_seq_len, shuffle=False, cache=False,
                        drop_remainder=False)

  0%|          | 0/2 [00:00<?, ?it/s]

## Inference

In [None]:
# Do inference: the model ends in a softmax, so this yields per-token class probabilities
test_preds = model.predict(test_ds, verbose=1)

# Convert probabilities to class labels via max confidence (arg-max over the label axis)
test_preds = np.argmax(test_preds, axis=-1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step


In [None]:
# Drop the samples that were duplicated to fill up the device count.
# need_samples is always >= 1 as computed (len(devices) - n % len(devices)),
# so the negative slice never degenerates into an empty `[:-0]`.
# Not idempotent: re-running this cell trims real samples.
test_docs = test_docs[:-need_samples]
test_token_ids = test_token_ids[:-need_samples]
test_preds = test_preds[:-need_samples]
test_words = test_words[:-need_samples]

In [None]:
# Inspect the token -> word index mapping (-1 marks special-token and padding positions)
test_token_ids

array([[-1,  0,  0, ..., -1, -1, -1]], dtype=int32)

## Post-Processing

In [None]:
# Flat result columns, one entry per word predicted as PII
document_list, token_id_list, label_id_list, token_list = [], [], [], []

for doc, token_ids, preds, tokens in tqdm(
    zip(test_docs, test_token_ids, test_preds, test_words), total=len(test_words)
):
    # A position is kept only if it ...
    is_word_start = np.concatenate(([True], token_ids[1:] != token_ids[:-1]))  # opens a word
    is_entity = preds != 12  # is not labelled `O` (integer label 12)
    is_real_token = token_ids != -1  # is not [CLS], [SEP] or [PAD]
    keep = is_word_start & is_entity & is_real_token

    token_ids = token_ids[keep]
    preds = preds[keep]

    # Nothing flagged in this document
    if len(token_ids) == 0:
        continue

    # token_ids are word indices, so they select the flagged words directly
    token_list.extend(tokens[token_ids])
    document_list.extend([doc] * len(token_ids))
    token_id_list.extend(token_ids)
    label_id_list.extend(preds)

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Show the flagged words next to their predicted integer labels
print(token_list,label_id_list)

['Waseem', 'Mabunda', 'RI', '95963', '410.526.1667', 'vpi@mn.nlMind'] [2, 8, 10, 1, 1, 0]


In [None]:
# Assemble the predictions into a table: one row per flagged word, with a running
# `row_id` column in front.
pred_df = (
    pd.DataFrame(
        {
            "document": document_list,
            "token": token_id_list,
            "label_id": label_id_list,
            "token_string": token_list,
        }
    )
    .rename_axis("row_id")
    .reset_index()
)
# Map integer label to BIO format label
pred_df["label"] = pred_df["label_id"].map(CFG.id2label)
pred_df.head(10)

Unnamed: 0,row_id,document,token,label_id,token_string,label
0,0,0,0,2,Waseem,B-NAME_STUDENT
1,1,0,1,8,Mabunda,I-NAME_STUDENT
2,2,0,8,10,RI,I-STREET_ADDRESS
3,3,0,9,1,95963,B-ID_NUM
4,4,0,13,1,410.526.1667,B-ID_NUM
5,5,0,14,0,vpi@mn.nlMind,B-EMAIL
