In [1]:
%pip install tensorflow==2.15.0
%pip install spacy
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


#### Import Libraries

In [2]:
import os
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from tensorflow import keras
from tensorflow.keras import layers
from google.cloud import bigquery

2024-05-08 09:37:58.519753: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-08 09:37:59.403735: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-08 09:37:59.403847: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-08 09:37:59.604441: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-08 09:38:00.007334: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-08 09:38:00.011817: I tensorflow/core/platform/cpu_feature_guard.cc:1

#### Setup Base Model

In [3]:
@keras.utils.register_keras_serializable(package="CustomModels")
class NERModel(keras.Model):
    def __init__(
        self, num_tags, vocab_size, maxlen=512, embed_dim=32, num_heads=2, ff_dim=32, **kwargs
    ):
        super().__init__()
        self.num_tags = num_tags
        self.vocab_size = vocab_size
        self.maxlen = maxlen
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim

        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        x = self.embedding_layer(inputs)
        x = self.transformer_block(x)
        x = self.dropout1(x, training=training)
        x = self.ff(x)
        x = self.dropout2(x, training=training)
        x = self.ff_final(x)
        return x
    
    def get_config(self):
        config = {
            'num_tags': self.num_tags,
            'vocab_size': self.vocab_size,
            'maxlen': self.maxlen,
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
        }
        base_config = super(NERModel, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.ffn = keras.Sequential(
            [
                keras.layers.Dense(ff_dim, activation="relu"),
                keras.layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
    
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, inputs):
        maxlen = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        position_embeddings = self.pos_emb(positions)
        token_embeddings = self.token_emb(inputs)
        return token_embeddings + position_embeddings
    
class CustomNonPaddingTokenLoss(keras.losses.Loss):
    def __init__(self, reduction='sum', name="custom_ner_loss"):
        super().__init__(reduction='sum', name=name)

    def call(self, y_true, y_pred):
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=False, reduction=self.reduction
        )
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast((y_true > 0), dtype=tf.float32)
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

def map_record_to_training_data(record):
    record = tf.strings.split(record, sep="\t")
    length = tf.strings.to_number(record[0], out_type=tf.int32)
    tokens = record[1 : length + 1]
    tags = record[length + 1 :]
    tags = tf.strings.to_number(tags, out_type=tf.int64)
    tags += 1
    return tokens, tags

def lookup(tokens):
    # Load the list from the file
    with open('vocabulary.pkl', 'rb') as f:
        loaded_list = pickle.load(f)
    # The StringLookup class will convert tokens to token IDs
    lookup_layer = keras.layers.StringLookup(vocabulary=loaded_list)

    # No need to lowercase Vietnamese characters
    return lookup_layer(tokens)

def format_datatype(data):
    tokens = word_tokenize(data)
    #tokens =  [re.sub(r'[;,]', '', d) for d in data.split(' ')]
    #default is 0, since is for prediction
    ner_tags = [0 for d in data.split(' ')]

    #tab to separate
    string_input = str(len(tokens))+ "\t"+ "\t".join(tokens)+ "\t"+ "\t".join(map(str, ner_tags))
    string_input = tf.data.Dataset.from_tensor_slices([string_input])


    finalize_input = (string_input.map(map_record_to_training_data)
                      .map(lambda x, y: (lookup(x),  y))
                      .padded_batch(1)
                      )

    return finalize_input

#### Load Pretrained Model

In [4]:
tf.keras.utils.get_custom_objects()['CustomNonPaddingTokenLoss'] = CustomNonPaddingTokenLoss
# Load model
loaded_model = tf.keras.models.load_model("ner_model.keras")

embedding_layer = loaded_model.embedding_layer

#### Load Original Data

In [5]:
def read_bq(project_id, dataset_id, table_id, bigquery_client):
    query = f"""
        SELECT *
        FROM {project_id}.{dataset_id}.{table_id}
    """
    query_job = bigquery_client.query(query)
    df = query_job.to_dataframe()
    return df

In [6]:
PROJECT_ID, DATASET_ID, TABLE_ID = "intern-project-415606", "Criminal_Dataset", "criminal_data"
bigquery_client = bigquery.Client(project=PROJECT_ID)
dataset = read_bq(PROJECT_ID, DATASET_ID, TABLE_ID, bigquery_client)
dataset.head()

Unnamed: 0,JLR_LINK,TRANS_TYPE_OF_CASE,TRANS_LEGAL_RELATIONSHIP,PDF_TEXT,EXTRACT,ID,NAME,Year,Month,Day,GENDER,BIRTH
0,https://congbobanan.toaan.gov.vn/2ta1016089t1c...,刑事,盜竊財產罪,<Page:1>TÒA ÁN NHÂN DÂN HUYỆN BẮC HÀ CỘNG HÒA ...,"1. Sùng Seo Q, sinh ngày 13/7/2003 tại huyện ...",No_Id,Sùng Seo Q,2003,7,13,Male,2003-07-13 00:00:00
1,https://congbobanan.toaan.gov.vn/2ta791888t1cv...,刑事,犯罪賭博,<Page:1>1\nTÒA ÁN NHÂN DÂN CỘNG HÒA XÃ HỘI CHỦ...,"1. Đỗ Đình H1, sinh ngày 20/6/1961 tại thôn H...",No_Id,Vũ Văn Th,1959,3,8,Male,1959-03-08 00:00:00
2,https://congbobanan.toaan.gov.vn/2ta536274t1cv...,採取行政處理措施的決定,放入強制性戒毒機構,<Page:1>TOÀ ÁN NHÂN DÂN CỘNG HOÀ XÃ HỘI CHỦ NG...,Họ và tên: Nguyễn Văn H. Giới tính: Nam; Sinh...,No_Id,Nguyễn Văn H,1991,2,16,Male,1991-02-16 00:00:00
3,https://congbobanan.toaan.gov.vn/2ta290704t1cv...,採取行政處理措施的決定,放入強制性戒毒機構,<Page:1>TÒA ÁN NHÂN DÂN CỘNG HÒA XÃ HỘI CHỦ NG...,Họ và tên: Nguyễn Văn T; Giới T: Nam; sinh ng...,No_Id,Nguyễn Văn T,1970,9,13,Male,1970-09-13 00:00:00
4,https://congbobanan.toaan.gov.vn/2ta946636t1cv...,刑事,犯罪組織賭博或持賭,<Page:1>TÒA ÁN NHÂN DÂN CỘNG HÒA XÃ HỘI CHỦ NG...,"1. Nguyễn Kim P, (tên gọi khác: Không), sinh ...",No_Id,Nguyễn Thu H,1992,1,19,Female,1992-01-19 00:00:00


#### Build vector database table

In [7]:
def create_table():
    schema = [
        bigquery.SchemaField("JLR_LINK", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("NAME", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("VECTOR", "STRING", mode="REQUIRED")
    ]
    project_id, dataset_id, table_id_write = 'intern-project-415606', 'Criminal_Dataset', 'criminal_name_vector'
    table = bigquery.Table(f"{project_id}.{dataset_id}.{table_id_write}", schema=schema)
    try:
        table = bigquery_client.create_table(table)
    except Exception as e:
        print(f"Table {project_id}.{dataset_id}.{table_id_write} already exists.")

In [8]:
def transform_and_insert(name, jlr_link):
    embedding_vectors = embedding_layer(lookup([name]))

    embedding_vector_list = embedding_vectors.numpy().tolist()[0]
    
    return embedding_vector_list

In [None]:
create_table()
names = dataset['NAME']
jlr_links = dataset['JLR_LINK']

client = bigquery.Client()

for i in range(32654, len(names)):
    vectors = transform_and_insert(names[i], jlr_links[i])
    query = f"""
        INSERT INTO `intern-project-415606.Criminal_Dataset`.criminal_name_vector (`JLR_LINK`, `NAME`, `VECTOR`)
        VALUES ('{jlr_links[i]}', '{names[i]}', '{str(vectors)}')
    """
    client.query(query)
    if i % 500 == 0:
        print(i)

Table intern-project-415606.Criminal_Dataset.criminal_name_vector already exists.
