In [None]:
from google.cloud import bigquery
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Authenticated


In [None]:
# read the (original) data from the bigquery
def read_bq(project_id, dataset_id, table_id):

    query = f"""
        SELECT *
        FROM {project_id}.{dataset_id}.{table_id}
    """

    query_job = bigquery_client.query(query)

    # Convert the result into a Pandas DataFrame
    df = query_job.to_dataframe()

    return df

In [None]:
PROJECT_ID, DATASET_ID, TABLE_ID = "intern-project-415606", "Criminal_Dataset", "criminal_data_self"
bigquery_client = bigquery.Client(project=PROJECT_ID)

In [None]:
dataset = read_bq(PROJECT_ID, DATASET_ID, TABLE_ID)
dataset = dataset.sort_values(by='extract_id')
dataset.head()

Unnamed: 0,extract_id,text,ner_underthesea,tag_underthesea,self_label
469929,0,:,O,CH,other
469930,0,–,O,CH,other
469931,0,:,O,CH,other
111155,0,bà,O,N,other
111152,0,Nghề nghiệp,O,N,other


## Import Libraries

In [None]:
%pip install conlleval
%pip install datasets

Collecting conlleval
  Downloading conlleval-0.2-py3-none-any.whl (5.4 kB)
Installing collected packages: conlleval
Successfully installed conlleval-0.2
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134

In [None]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import os
import keras
import numpy as np
import tensorflow as tf
from keras import layers
from collections import Counter
import pickle

## Data preprocessing

In [None]:
dataset

Unnamed: 0,extract_id,text,ner_underthesea,tag_underthesea,self_label
469929,0,:,O,CH,other
469930,0,–,O,CH,other
469931,0,:,O,CH,other
111155,0,bà,O,N,other
111152,0,Nghề nghiệp,O,N,other
...,...,...,...,...,...
722973,2440,H,I-LOC,Np,other
722988,2440,Việt Nam,B-LOC,Np,other
722980,2440,Trại,B-LOC,Np,other
722989,2440,Kinh,B-PER,Np,other


In [None]:
# eliminate all the CH (punctuation)
dataset = dataset[dataset['tag_underthesea'] != 'CH']
# Define the mapping
label_map = {"N": 0, "M": 1, "other": 2}

# Replace numeric labels with NER labels
dataset['self_label'] = dataset['self_label'].map(label_map)

dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['self_label'] = dataset['self_label'].map(label_map)


Unnamed: 0,extract_id,text,ner_underthesea,tag_underthesea,self_label
111155,0,bà,O,N,2
111152,0,Nghề nghiệp,O,N,2
111153,0,Lao động,O,N,2
111154,0,Con,O,N,2
111156,0,phiên,O,N,2
...,...,...,...,...,...
722973,2440,H,I-LOC,Np,2
722988,2440,Việt Nam,B-LOC,Np,2
722980,2440,Trại,B-LOC,Np,2
722989,2440,Kinh,B-PER,Np,2


In [None]:
dataset_group = {'id':[], 'token':[], 'ner_tag':[], 'pos_tag':[], 'self_tag':[]}

for index, row in dataset.iterrows():
    extract_id = row['extract_id']
    if extract_id in dataset_group['id']:
        group_index = dataset_group['id'].index(extract_id)
        dataset_group['token'][group_index].append(row['text'])
        dataset_group['ner_tag'][group_index].append(row['ner_underthesea'])
        dataset_group['pos_tag'][group_index].append(row['tag_underthesea'])
        dataset_group['self_tag'][group_index].append(row['self_label'])
    else:
        dataset_group['id'].append(extract_id)
        dataset_group['token'].append([row['text']])
        dataset_group['ner_tag'].append([row['ner_underthesea']])
        dataset_group['pos_tag'].append([row['tag_underthesea']])
        dataset_group['self_tag'].append([row['self_label']])

for group in dataset_group['id']:
    group_index = dataset_group['id'].index(group)
    print('Group ID:', group)
    print('Token:', dataset_group['token'][group_index])
    print('NER Tag:', dataset_group['ner_tag'][group_index])
    print('POS Tag:', dataset_group['pos_tag'][group_index])
    print('Self Tag:', dataset_group['self_tag'][group_index])
    print()
    break

Group ID: 0
Token: ['bà', 'Nghề nghiệp', 'Lao động', 'Con', 'phiên', '25/01/1994', 'Dân tộc', 'Tôn giáo', 'Thiên chúa giáo', 'Trình độ', 'tự do', 'văn hóa', '9/12 Nơi', 'thôn', 'tên', 'tại', 'ông', 'con', 'Không', 'Giới tính', 'ngày', 'xã', 'huyện', 'tỉnh', 'Nguyễn Thị Hà', 'và', '12', 'Nguyễn Văn C', 'cư trú', 'Tiền án', 'tiền sự', 'Có mặt', 'họp', 'Nam Sinh', 'Kinh', 'T', 'A', 'Họ', 'N', 'Nguyễn Văn Trung']
NER Tag: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'B-LOC', 'B-PER', 'O', 'I-LOC', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'B-PER', 'I-LOC', 'I-LOC', 'O', 'I-LOC', 'B-PER']
POS Tag: ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'A', 'N', 'N', 'N', 'N', 'E', 'Nc', 'Nc', 'R', 'N', 'N', 'N', 'N', 'N', 'Np', 'C', 'M', 'Np', 'V', 'V', 'V', 'V', 'V', 'Np', 'Np', 'Np', 'Np', 'P', 'Np', 'Np']
Self Tag: [2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0, 2, 2, 2, 2

In [None]:
from sklearn.model_selection import train_test_split

from datasets import Dataset, DatasetDict

# Assuming dataset_group is already constructed as in your previous code

# Convert dataset_group to a DataFrame
import pandas as pd
dataset_df = pd.DataFrame(dataset_group)

# Split data into train and test sets
train_df, test_df = train_test_split(dataset_df, test_size=0.2, random_state=42)

# Create train and test datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Construct DatasetDict
dataset_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})

# Print DatasetDict
print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['id', 'token', 'ner_tag', 'pos_tag', 'self_tag', '__index_level_0__'],
        num_rows: 1952
    })
    test: Dataset({
        features: ['id', 'token', 'ner_tag', 'pos_tag', 'self_tag', '__index_level_0__'],
        num_rows: 489
    })
})


In [None]:
print(type(dataset_dict['train']['self_tag'][0][0]))

<class 'int'>


## Turn into tensorflow acceptable **datatype**

In [None]:
def export_to_file(export_file_path, data):
    with open(export_file_path, "w") as f:
        for record in data:
            ner_tags = record["self_tag"]
            tokens = record["token"]
            if len(tokens) > 0:
                f.write(
                    str(len(tokens))
                    + "\t"
                    + "\t".join(tokens)
                    + "\t"
                    + "\t".join(map(str, ner_tags))
                    + "\n"
                )
try:
    os.mkdir('./data')
except:
    print('directory exist')
export_to_file("./data/crime_train.txt", dataset_dict['train'])
export_to_file("./data/crime_val.txt", dataset_dict['test'])

In [None]:
def make_tag_lookup_table():
    ner_labels = ["[PAD]", "N", "M", "other"]
    return dict(zip(range(len(ner_labels)), ner_labels))

mapping = make_tag_lookup_table()
print(mapping)

{0: '[PAD]', 1: 'N', 2: 'M', 3: 'other'}


In [None]:
all_tokens = sum(dataset_dict["train"]["token"], [])
all_tokens_array = np.array(all_tokens)

counter = Counter(all_tokens_array)
print(len(counter))

num_tags = len(mapping)
vocab_size = 25000

# We only take (vocab_size - 2) most common words from the training data since
# the `StringLookup` class uses 2 additional tokens - one denoting an unknown
# token and another one denoting a masking token
vocabulary = [token for token, count in counter.most_common(vocab_size - 2)]

# Save the list to a file
with open('./data/vocabulary.pkl', 'wb') as f:
    pickle.dump(vocabulary, f)


# The StringLookup class will convert tokens to token IDs
lookup_layer = keras.layers.StringLookup(vocabulary=vocabulary)


22093


In [None]:
print(vocabulary)

['ngày', 'năm', 'sinh', 'tỉnh', 'Không', 'con', 'và', 'tại', 'huyện', 'bị', 'tạm', 'có', 'xã', 'cư trú', 'tháng', 'ông', 'bà', 'Việt Nam', 'tiền sự', 'Bị cáo', 'nơi', 'đến', 'giam', 'từ', 'Nam', 'Kinh', 'về', 'giữ', 'thành phố', 'văn hóa', 'không', 'có mặt', 'tên', 'trình độ', 'xử phạt', 'đã', 'Nơi', 'giới tính', 'dân tộc', 'biện pháp', 'tôn giáo', 'nghề nghiệp', 'bị cáo', '2', 'tiền án', 'quốc tịch', 'Con', 'Tiền án', 'là', 'vợ', 'số', 'tù', 'Nghề nghiệp', 'học vấn', 'phiên', '02', 'Tòa án', 'Dân tộc', 'nhân dân', 'Ngày', 'Tôn giáo', 'khác', 'T', 'tội', 'Trình độ', 'áp dụng', 'Công an', 'đi', 'tự do', '1', 'Quốc tịch', 'khỏi', 'tòa', 'chết', 'bắt', 'gọi', 'ở', 'Giới tính', 'được', 'Lao động', 'ngăn chặn', '01', 'phường', 'Sinh', 'nay', 'Có mặt', 'Họ', 'lớn', 'đang', 'chưa', 'tài sản', 'H', 'của', 'nhất', '3', 'xong', 'hành chính', 'cho', 'B', '< Page', 'hành vi', 'chấp hành', 'thường trú', '03', '12/12', 'C', 'Đ', 'Có', 'người', 'chỗ', 'Tiền sự', 'L', 'cấm', 'nhỏ', 'Bản án', 'Nhân thâ

In [None]:
train_data = tf.data.TextLineDataset("./data/crime_train.txt")
val_data = tf.data.TextLineDataset("./data/crime_val.txt")

In [None]:
print(list(train_data.take(1).as_numpy_iterator()))

[b'135\tt\xc3\xb2a\tt\xe1\xbb\xb1 do\tkh\xc3\xa1c\t\xc4\x90\t\xc4\x91\xc3\xa3\t\xc4\x91\xe1\xbb\x81u\tt\xe1\xba\xa1i\tn\xc4\x83m\th\xc3\xa0nh ch\xc3\xadnh\tO x\xe1\xbb\xad ph\xe1\xba\xa1t\tphi\xc3\xaan\thuy\xe1\xbb\x87n\tHi\xe1\xbb\x87n\tb\xe1\xbb\x8b c\xc3\xa1o\tt\xe1\xba\xa1i ngo\xe1\xba\xa1i\tx\xc3\xa3\thuy\xe1\xbb\x87n\t02/10/2020\t1.500.000\tt\xc3\xaan\tt\xc3\xaan\tng\xc3\xa0y\t02/3/1971\tn\xc6\xa1i\tx\xc3\xa3\tt\xe1\xbb\x89nh\tngh\xe1\xbb\x81 nghi\xe1\xbb\x87p\tLao \xc4\x91\xe1\xbb\x99ng\ttr\xc3\xacnh \xc4\x91\xe1\xbb\x99\th\xe1\xbb\x8dc v\xe1\xba\xa5n\t5/12\tx\xc3\xa3\tKinh\tNguy\xe1\xbb\x85n Xu\xc3\xa2n V\t\xc4\x90\xc6\xb0\xe1\xbb\x9dng Th\xe1\xbb\x8b A\tO\tTh\xc3\xb4n \xc4\x90\tt\xc3\xb4n gi\xc3\xa1o\td\xc3\xa2n t\xe1\xbb\x99c\tVi\xe1\xbb\x87t Nam\thuy\xe1\xbb\x87n\tV\xc4\xa9nh Ph\xc3\xbac\tY\tbi\xe1\xbb\x87n ph\xc3\xa1p\tn\xc6\xa1i\t\xc4\x91\xc6\xb0\xe1\xbb\xa3c\tm\xe1\xbb\xa9c\th\xc3\xa0nh vi\tB\xe1\xbb\x8b c\xc3\xa1o\tng\xc3\xa0y\tng\xc3\xa0y\tbi\xe1\xbb\x87n ph\xc3\xa1p\t\

## Transformer block layer

In [None]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.ffn = keras.Sequential(
            [
                keras.layers.Dense(ff_dim, activation="relu"),
                keras.layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

## Token and Position Embedding Layer

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, inputs):
        maxlen = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        position_embeddings = self.pos_emb(positions)
        token_embeddings = self.token_emb(inputs)
        return token_embeddings + position_embeddings

## NER model class

In [None]:
class NERModel(keras.Model):
    def __init__(
        self, num_tags, vocab_size, maxlen=128, embed_dim=32, num_heads=2, ff_dim=32
    ):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        x = self.embedding_layer(inputs)
        x = self.transformer_block(x)
        x = self.dropout1(x, training=training)
        x = self.ff(x)
        x = self.dropout2(x, training=training)
        x = self.ff_final(x)
        return x

## custom loss function

In [None]:
class CustomNonPaddingTokenLoss(keras.losses.Loss):
    def __init__(self, reduction=keras.losses.Reduction.AUTO, name="custom_ner_loss"):
        super().__init__(reduction=reduction, name=name)

    def call(self, y_true, y_pred):
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=False, reduction=self.reduction  # Pass the reduction argument here
        )
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast((y_true > 0), dtype=tf.float32)
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)
loss = CustomNonPaddingTokenLoss()

In [None]:
class CustomNonPaddingTokenLoss(keras.losses.Loss):
    def __init__(self, name="custom_ner_loss"):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=False, reduction=tf.keras.losses.Reduction.SUM
        )
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast((y_true > 0), dtype=tf.float32)
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

loss = CustomNonPaddingTokenLoss()

In [None]:
train_data

<TextLineDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

## modify dataset since every sentence have different length

In [None]:
from sklearn.preprocessing import LabelEncoder

def map_record_to_training_data(record):
    record = tf.strings.split(record, sep="\t")
    length = tf.strings.to_number(record[0], out_type=tf.int32)
    tokens = record[1 : length + 1]
    tags = record[length + 1 :]
    tags = tf.strings.to_number(tags, out_type=tf.int64)
    tags += 1
    return tokens, tags

def lowercase_and_convert_to_ids(tokens):
    # No need to lowercase Vietnamese characters
    return lookup_layer(tokens)

# We use `padded_batch` here because each record in the dataset has a
# different length.
batch_size = 8



# Map records to training data and pad the batches
train_dataset = (
    train_data.map(map_record_to_training_data)
    .map(lambda x, y: (lowercase_and_convert_to_ids(x),  y))
    .padded_batch(batch_size)
)

# Map records to validation data and pad the batches
val_dataset = (
    val_data.map(map_record_to_training_data)
    .map(lambda x, y: (lowercase_and_convert_to_ids(x), y))
    .padded_batch(batch_size)
)

# Define and initialize your NER model
ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)

In [None]:
for a, b in train_dataset:
    print(a)
    print(b)
    break

tf.Tensor(
[[  73   69   62 ...    0    0    0]
 [   7    7   77 ...    0    0    0]
 [ 480   73   55 ...    0    0    0]
 ...
 [ 172   51  185 ...  737  125  374]
 [8851   26   16 ...    0    0    0]
 [ 142   17 1524 ...    0    0    0]], shape=(8, 687), dtype=int64)
tf.Tensor(
[[3 3 3 ... 0 0 0]
 [3 3 3 ... 0 0 0]
 [3 3 3 ... 0 0 0]
 ...
 [3 3 3 ... 3 3 3]
 [1 3 3 ... 0 0 0]
 [2 3 3 ... 0 0 0]], shape=(8, 687), dtype=int64)


## Compile and fit the model

In [None]:
from keras.losses import SparseCategoricalCrossentropy

# Compile the model with a specified loss function and optimizer
#ner_model.compile(optimizer="adam", loss=SparseCategoricalCrossentropy(reduction='auto'))
ner_model.compile(optimizer="adam", loss=loss)
ner_model.fit(train_dataset, epochs=20)

def tokenize_and_convert_to_ids(text):
    tokens = text.split()
    return lowercase_and_convert_to_ids(tokens)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
#Assuming `ner_model` is your TensorFlow model object
ner_model.save("./data/ner_model")

In [None]:
def calculate_metrics(dataset):
    all_true_tag_ids, all_predicted_tag_ids = [], []

    for x, y in dataset:
        output = ner_model.predict(x, verbose=0)
        predictions = np.argmax(output, axis=-1)
        predictions = np.reshape(predictions, [-1])

        true_tag_ids = np.reshape(y, [-1])

        mask = (true_tag_ids > 0) & (predictions > 0)
        true_tag_ids = true_tag_ids[mask]
        predicted_tag_ids = predictions[mask]

        all_true_tag_ids.append(true_tag_ids)
        all_predicted_tag_ids.append(predicted_tag_ids)

    all_true_tag_ids = np.concatenate(all_true_tag_ids)
    all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)

    predicted_tags = [mapping[tag] for tag in all_predicted_tag_ids]
    real_tags = [mapping[tag] for tag in all_true_tag_ids]
    print(predicted_tags)
    print(real_tags)
    print(len(predicted_tags), len(predicted_tags))
    return predicted_tags, real_tags
    #evaluate(real_tags, predicted_tags)


y_pred, y_val = calculate_metrics(val_dataset)

['other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'N', 'N', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other',

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tabulate import tabulate

def calculate_metrics(y_true, y_pred, labels):
    # Mapping labels to numeric indices
    label_to_index = {label: idx for idx, label in enumerate(labels)}
    index_to_label = {idx: label for label, idx in label_to_index.items()}

    y_true_mapped = [label_to_index[label] for label in y_true]
    y_pred_mapped = [label_to_index[label] for label in y_pred]

    # Overall metrics
    accuracy = accuracy_score(y_true_mapped, y_pred_mapped)
    precision = precision_score(y_true_mapped, y_pred_mapped, average='weighted')
    recall = recall_score(y_true_mapped, y_pred_mapped, average='weighted', zero_division=0)  # Set zero_division to 0
    f1 = f1_score(y_true_mapped, y_pred_mapped, average='weighted')

    # Per-label metrics
    per_label_accuracy = {}
    per_label_precision = {}
    per_label_recall = {}
    per_label_f1 = {}

    for label in labels:
        label_index = label_to_index[label]
        label_indices = [i for i, true_label in enumerate(y_true) if true_label == label]
        if len(label_indices) > 0:
            per_label_accuracy[label] = accuracy_score([y_true_mapped[i] for i in label_indices], [y_pred_mapped[i] for i in label_indices])
            per_label_precision[label] = precision_score([y_true_mapped[i] for i in label_indices], [y_pred_mapped[i] for i in label_indices], average='weighted')
            per_label_recall[label] = recall_score([y_true_mapped[i] for i in label_indices], [y_pred_mapped[i] for i in label_indices], average='weighted', zero_division=0)  # Set zero_division to 0
            per_label_f1[label] = f1_score([y_true_mapped[i] for i in label_indices], [y_pred_mapped[i] for i in label_indices], average='weighted')
        else:
            per_label_accuracy[label] = 0
            per_label_precision[label] = 0
            per_label_recall[label] = 0
            per_label_f1[label] = 0

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'per_label_accuracy': per_label_accuracy,
        'per_label_precision': per_label_precision,
        'per_label_recall': per_label_recall,
        'per_label_f1': per_label_f1,
        'index_to_label': index_to_label
    }

#labels
labels = ['M', 'N', 'other']

metrics = calculate_metrics(y_val, y_pred, labels)

# Constructing table
table_data = []
for label in labels:
    table_data.append([label,
                       metrics['per_label_accuracy'][label],
                       metrics['per_label_precision'][label],
                       metrics['per_label_recall'][label],
                       metrics['per_label_f1'][label]])
overall_accuracy = calculate_metrics(y_val, y_pred, labels)
print("Overall Accuracy:".ljust(20), overall_accuracy['accuracy'])
print("Overall Precision:".ljust(20), overall_accuracy['precision'])
print("Overall Recall:".ljust(20), overall_accuracy['recall'])
print("Overall F1-score:".ljust(20), overall_accuracy['f1'])
print(tabulate(table_data, headers=["Label", "Accuracy", "Precision", "Recall", "F1-score"], tablefmt="grid"))


Overall Accuracy:    0.9832128008943857
Overall Precision:   0.9826584366359037
Overall Recall:      0.9832128008943857
Overall F1-score:    0.9829131150192979
+---------+------------+-------------+----------+------------+
| Label   |   Accuracy |   Precision |   Recall |   F1-score |
| M       |   0.921607 |           1 | 0.921607 |   0.959204 |
+---------+------------+-------------+----------+------------+
| N       |   0.246377 |           1 | 0.246377 |   0.395349 |
+---------+------------+-------------+----------+------------+
| other   |   0.991424 |           1 | 0.991424 |   0.995694 |
+---------+------------+-------------+----------+------------+


In [None]:
#average length of
dataset.describe()

Unnamed: 0,extract_id,self_label
count,562083.0,562083.0
mean,1241.475494,1.896291
std,699.638632,0.321072
min,0.0,0.0
25%,632.0,2.0
50%,1245.0,2.0
75%,1874.0,2.0
max,2440.0,2.0


In [None]:
import re
import pickle
import keras
import tensorflow as tf
import numpy as np

class CustomNonPaddingTokenLoss(keras.losses.Loss):
    def __init__(self, reduction=keras.losses.Reduction.AUTO, name="custom_ner_loss"):
        super().__init__(reduction=reduction, name=name)

    def call(self, y_true, y_pred):
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=False, reduction=self.reduction
        )
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast((y_true > 0), dtype=tf.float32)
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

def map_record_to_training_data(record):
    record = tf.strings.split(record, sep="\t")
    length = tf.strings.to_number(record[0], out_type=tf.int32)
    tokens = record[1 : length + 1]
    tags = record[length + 1 :]
    tags = tf.strings.to_number(tags, out_type=tf.int64)
    tags += 1
    return tokens, tags

def lookup(tokens):
    # Load the list from the file
    with open('/content/drive/MyDrive/NER/vocabulary.pkl', 'rb') as f:
        loaded_list = pickle.load(f)
    # The StringLookup class will convert tokens to token IDs
    lookup_layer = keras.layers.StringLookup(vocabulary=loaded_list)

    # No need to lowercase Vietnamese characters
    return lookup_layer(tokens)

def format_datatype(data):
    tokens =  [re.sub(r'[;,]', '', d) for d in data.split(' ')]
    #default is 0, since is for prediction
    ner_tags = [0 for d in data.split(' ')]

    #tab to separate
    string_input = str(len(tokens))+ "\t"+ "\t".join(tokens)+ "\t"+ "\t".join(map(str, ner_tags))
    string_input = tf.data.Dataset.from_tensor_slices([string_input])


    finalize_input = (string_input.map(map_record_to_training_data)
                      .map(lambda x, y: (lookup(x),  y))
                      .padded_batch(1)
                      )

    return finalize_input

def prediction(data):
    # Register the custom loss function with TensorFlow
    tf.keras.utils.get_custom_objects()['CustomNonPaddingTokenLoss'] = CustomNonPaddingTokenLoss
    # Assuming `ner_model` is your TensorFlow model object
    loaded_model = tf.keras.models.load_model("/content/drive/MyDrive/NER/ner_model")

    all_predicted_tag_ids = []

    for x, _ in data:
        print("Input Tensor Info:")
        print("Data Type:", x.dtype)
        print("Shape:", x.shape)
        output = loaded_model(x, training=False)
        predictions = np.argmax(output, axis=-1)
        predictions = np.reshape(predictions, [-1])
        all_predicted_tag_ids.append(predictions)

    all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)

    ner_labels = ["[PAD]", "N", "M", "other"]
    mapping =  dict(zip(range(len(ner_labels)), ner_labels))
    predicted_tags = [mapping[tag] for tag in all_predicted_tag_ids]

    return predicted_tags

sample_input = "1/ Trần Văn T, sinh ngày 01 tháng 01 năm 1987 tại Quảng Nam; Nơi cư trú: thôn 04, xã TG, huyện Bắc Trà My, tỉnh Quảng Nam; nghề nghiệp: nông; trình độ văn hoá: 03/12; dân tộc: Cadong; giới tính: nam; tôn giáo: không; quốc tịch: Việt Nam; con ông Trần Văn Tiếu và bà Thanh Thị Liên; vợ tên Phạm Thị Hiếm và 02 con; tiền án, tiền sự: không; Bị cáo bị áp dụng biện pháp ngăn chặn: “Cấm đi khỏi nơi cư trú”, có mặt tại phiên tòa. 2/ Đinh Tấn M, sinh ngày 21 tháng 6 năm 1995 tại Quảng Nam; Nơi cư trú: thôn 04, xã TG, huyện Bắc Trà My, tỉnh Quảng Nam; nghề nghiệp: nông; trình độ"
sample_input = "Hello world, my name is John, I live in New York, my birthday is 10/02/1990."
result = prediction(format_datatype(sample_input))
print(result)
print(len(result))

Input Tensor Info:
Data Type: <dtype: 'int64'>
Shape: (1, 15)
['[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', 'M', '[PAD]', 'other', 'other', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
15


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp /content/data/vocabulary.pkl /content/drive/MyDrive/NER/vocabulary.pkl

In [None]:
!cp -r /content/data/ner_model /content/drive/MyDrive/NER/ner_model

In [None]:
import tensorflow as tf
tf.__version__

'2.15.0'