In [1]:
%pip install tensorflow==2.15.0

Note: you may need to restart the kernel to use updated packages.


### Import Libraries

In [60]:
import os
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from google.cloud import bigquery

### Setup base model

In [4]:
@keras.utils.register_keras_serializable(package="CustomModels")
class NERModel(keras.Model):
    def __init__(
        self, num_tags, vocab_size, maxlen=512, embed_dim=32, num_heads=2, ff_dim=32, **kwargs
    ):
        super().__init__()
        self.num_tags = num_tags
        self.vocab_size = vocab_size
        self.maxlen = maxlen
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim

        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        x = self.embedding_layer(inputs)
        x = self.transformer_block(x)
        x = self.dropout1(x, training=training)
        x = self.ff(x)
        x = self.dropout2(x, training=training)
        x = self.ff_final(x)
        return x
    
    def get_config(self):
        config = {
            'num_tags': self.num_tags,
            'vocab_size': self.vocab_size,
            'maxlen': self.maxlen,
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
        }
        base_config = super(NERModel, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.ffn = keras.Sequential(
            [
                keras.layers.Dense(ff_dim, activation="relu"),
                keras.layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
    
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, inputs):
        maxlen = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        position_embeddings = self.pos_emb(positions)
        token_embeddings = self.token_emb(inputs)
        return token_embeddings + position_embeddings
    
class CustomNonPaddingTokenLoss(keras.losses.Loss):
    def __init__(self, reduction='sum', name="custom_ner_loss"):
        super().__init__(reduction='sum', name=name)

    def call(self, y_true, y_pred):
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=False, reduction=self.reduction
        )
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast((y_true > 0), dtype=tf.float32)
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

def map_record_to_training_data(record):
    record = tf.strings.split(record, sep="\t")
    length = tf.strings.to_number(record[0], out_type=tf.int32)
    tokens = record[1 : length + 1]
    tags = record[length + 1 :]
    tags = tf.strings.to_number(tags, out_type=tf.int64)
    tags += 1
    return tokens, tags

def lookup(tokens):
    # Load the list from the file
    with open('vocabulary.pkl', 'rb') as f:
        loaded_list = pickle.load(f)
    # The StringLookup class will convert tokens to token IDs
    lookup_layer = keras.layers.StringLookup(vocabulary=loaded_list)

    # No need to lowercase Vietnamese characters
    return lookup_layer(tokens)

def format_datatype(data):
    tokens = word_tokenize(data)
    #tokens =  [re.sub(r'[;,]', '', d) for d in data.split(' ')]
    #default is 0, since is for prediction
    ner_tags = [0 for d in data.split(' ')]

    #tab to separate
    string_input = str(len(tokens))+ "\t"+ "\t".join(tokens)+ "\t"+ "\t".join(map(str, ner_tags))
    string_input = tf.data.Dataset.from_tensor_slices([string_input])


    finalize_input = (string_input.map(map_record_to_training_data)
                      .map(lambda x, y: (lookup(x),  y))
                      .padded_batch(1)
                      )

    return finalize_input

### Load pretrained model

In [5]:
tf.keras.utils.get_custom_objects()['CustomNonPaddingTokenLoss'] = CustomNonPaddingTokenLoss
# Load model
loaded_model = tf.keras.models.load_model("ner_model.keras")

embedding_layer = loaded_model.embedding_layer

### Read data

In [6]:
def read_bq(project_id, dataset_id, table_id, bigquery_client):
    query = f"""
        SELECT *
        FROM {project_id}.{dataset_id}.{table_id}
    """
    query_job = bigquery_client.query(query)
    df = query_job.to_dataframe()
    return df

In [7]:
PROJECT_ID, DATASET_ID, TABLE_ID = "intern-project-415606", "Criminal_Dataset", "criminal_data"
bigquery_client = bigquery.Client(project=PROJECT_ID)
dataset = read_bq(PROJECT_ID, DATASET_ID, TABLE_ID, bigquery_client)
dataset

Unnamed: 0,JLR_LINK,TRANS_TYPE_OF_CASE,TRANS_LEGAL_RELATIONSHIP,PDF_TEXT,EXTRACT,ID,NAME,Year,Month,Day,GENDER,BIRTH
0,https://congbobanan.toaan.gov.vn/2ta1016089t1c...,刑事,盜竊財產罪,<Page:1>TÒA ÁN NHÂN DÂN HUYỆN BẮC HÀ CỘNG HÒA ...,"1. Sùng Seo Q, sinh ngày 13/7/2003 tại huyện ...",No_Id,Sùng Seo Q,2003,7,13,Male,2003-07-13 00:00:00
1,https://congbobanan.toaan.gov.vn/2ta791888t1cv...,刑事,犯罪賭博,<Page:1>1\nTÒA ÁN NHÂN DÂN CỘNG HÒA XÃ HỘI CHỦ...,"1. Đỗ Đình H1, sinh ngày 20/6/1961 tại thôn H...",No_Id,Vũ Văn Th,1959,3,8,Male,1959-03-08 00:00:00
2,https://congbobanan.toaan.gov.vn/2ta536274t1cv...,採取行政處理措施的決定,放入強制性戒毒機構,<Page:1>TOÀ ÁN NHÂN DÂN CỘNG HOÀ XÃ HỘI CHỦ NG...,Họ và tên: Nguyễn Văn H. Giới tính: Nam; Sinh...,No_Id,Nguyễn Văn H,1991,2,16,Male,1991-02-16 00:00:00
3,https://congbobanan.toaan.gov.vn/2ta290704t1cv...,採取行政處理措施的決定,放入強制性戒毒機構,<Page:1>TÒA ÁN NHÂN DÂN CỘNG HÒA XÃ HỘI CHỦ NG...,Họ và tên: Nguyễn Văn T; Giới T: Nam; sinh ng...,No_Id,Nguyễn Văn T,1970,9,13,Male,1970-09-13 00:00:00
4,https://congbobanan.toaan.gov.vn/2ta946636t1cv...,刑事,犯罪組織賭博或持賭,<Page:1>TÒA ÁN NHÂN DÂN CỘNG HÒA XÃ HỘI CHỦ NG...,"1. Nguyễn Kim P, (tên gọi khác: Không), sinh ...",No_Id,Nguyễn Thu H,1992,1,19,Female,1992-01-19 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
36545,https://congbobanan.toaan.gov.vn/2ta219347t1cv...,刑事,犯罪非法持有毒品,<Page:1>TÒA ÁN NHÂN DÂN CỘNG HÒA XÃ HỘI CHỦ NG...,<Page:1>TÒA ÁN NHÂN DÂN CỘNG HÒA XÃ HỘI CHỦ NG...,No_Id,Bùi Văn K,1991,4,7,Male,1991-04-07 00:00:00
36546,https://congbobanan.toaan.gov.vn/2ta506383t1cv...,採取行政處理措施的決定,放入強制性戒毒機構,<Page:1>TOÀ ÁN NHÂN DÂN CỘNG HOÀ XÃ HỘI CHỦ NG...,"Họ và tên: Vũ Trường Đ, sinh ngày 18/6/1980; ...",No_Id,Vũ Trường Đ,1980,6,18,Male,1980-06-18 00:00:00
36547,https://congbobanan.toaan.gov.vn/2ta355439t1cv...,刑事,故意對他人造成傷害或損害健康的罪行,<Page:1>TOÀ ÁN NHÂN DÂN CỘNG HOÀ XÃ HỘI CHỦ NG...,"1. Bị cáo: Dương Quang N, sinh ngày 04/8/1997...",No_Id,Dương Quang N,1997,8,4,Male,1997-08-04 00:00:00
36548,https://congbobanan.toaan.gov.vn/2ta434496t1cv...,刑事,犯罪非法持有毒品,<Page:1>TÒA ÁN NHÂN DÂN CỘNG HÒA XÃ HỘI CHỦ NG...,"Nguyễn Quang L, tên gọi khác: Không; Giới tín...",No_Id,Nguyễn Quang L,1974,10,9,Male,1974-10-09 00:00:00


### Transform embedding vector

In [18]:
names = dataset["NAME"].to_list()
jlr_link = dataset['JLR_LINK'].to_list()

In [57]:
# Suppose input_tokens is a list of token sequences
# make them into sentences
input_tokens = [names[0]]

# Convert input tokens to integer sequences using your vocabulary lookup
input_sequences = lookup(input_tokens)
# Get the embedding vectors
embedding_vectors = embedding_layer(input_sequences)
print(input_tokens)
print(embedding_vectors)
print(type(embedding_vectors))

['Sùng Seo Q']
tf.Tensor(
[[ 0.05358159  0.01016463  0.00314485  0.06904604 -0.03862119 -0.1101924
   0.00972524  0.00094707 -0.1852332  -0.09839417  0.02609934  0.08843916
   0.02439241 -0.04689216 -0.06946589 -0.10178622 -0.15645982  0.09789144
   0.1355621  -0.01114203 -0.01158124 -0.15398215 -0.02306271  0.1564846
   0.02118216 -0.09064715  0.08215839  0.05770294 -0.06573923  0.15941034
   0.0586537  -0.10636423]], shape=(1, 32), dtype=float32)
<class 'tensorflow.python.framework.ops.EagerTensor'>


In [14]:
embedding_vector_list = embedding_vectors.numpy().tolist()[0]
embedding_vector_list

[0.053581587970256805,
 0.010164630599319935,
 0.00314484816044569,
 0.0690460354089737,
 -0.038621194660663605,
 -0.11019240319728851,
 0.009725235402584076,
 0.0009470721706748009,
 -0.1852332055568695,
 -0.09839417040348053,
 0.02609933726489544,
 0.08843915909528732,
 0.02439240738749504,
 -0.046892162412405014,
 -0.06946589052677155,
 -0.10178621858358383,
 -0.15645982325077057,
 0.0978914424777031,
 0.13556210696697235,
 -0.011142034083604813,
 -0.011581244878470898,
 -0.15398214757442474,
 -0.023062709718942642,
 0.15648460388183594,
 0.021182164549827576,
 -0.0906471461057663,
 0.0821583941578865,
 0.05770294368267059,
 -0.0657392293214798,
 0.15941034257411957,
 0.05865370109677315,
 -0.10636422783136368]

### Transform and Insert to BQ

In [42]:
def create_table():
    schema = [
        bigquery.SchemaField("JLR_LINK", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("NAME", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("VECTOR", "STRING", mode="REQUIRED")
    ]
    project_id, dataset_id, table_id_write = 'intern-project-415606', 'Criminal_Dataset', 'criminal_name_vector'
    table = bigquery.Table(f"{project_id}.{dataset_id}.{table_id_write}", schema=schema)
    try:
        table = bigquery_client.create_table(table)
    except Exception as e:
        print(f"Table {project_id}.{dataset_id}.{table_id_write} already exists.")
    
def transform_and_insert(names, jlr_link):
    create_table()
    client = bigquery.Client()
    for i in range(0, 5):#len(names)):
        embedding_vectors = embedding_layer(lookup([names[i]]))
        
        embedding_vector_list = embedding_vectors.numpy().tolist()[0]
        
        query = f"""
            INSERT INTO `intern-project-415606.Criminal_Dataset`.criminal_name_vector (`JLR_LINK`, `NAME`, `VECTOR`)
            VALUES ('{jlr_link[i]}', '{names[i]}', '{str(embedding_vector_list)}')
        """
        
        try:
            print(client.query(query))
        except Exception as e:
            print(f"Error:{e}")

In [43]:
transform_and_insert(names, jlr_link)

Table intern-project-415606.Criminal_Dataset.criminal_name_vector already exists.
QueryJob<project=intern-project-415606, location=US, id=7c11260a-0984-4c6c-9983-4bc40137bf1a>
QueryJob<project=intern-project-415606, location=US, id=01e9b188-ee6d-4c44-8f20-be490b9ddda8>
QueryJob<project=intern-project-415606, location=US, id=f8f5680b-1b18-40e0-9976-5c06b3321b1b>
QueryJob<project=intern-project-415606, location=US, id=5a590501-c80d-4cd6-9289-8bcb45d35f03>
QueryJob<project=intern-project-415606, location=US, id=1bfbf92c-65df-40f3-ac53-8797c86fa8de>


### Read the bq and change it to tensor (vector)

In [44]:
def read_bq():
    client = bigquery.Client()
    
    query = f"""
        SELECT *
        FROM `intern-project-415606.Criminal_Dataset.criminal_name_vector`
    """
    
    try:
        query_job = client.query(query)
        df = query_job.to_dataframe()
        return df
    except Exception as e:
        print(f"Error: {e}")

In [61]:
df = read_bq()
df

Unnamed: 0,JLR_LINK,NAME,VECTOR
0,https://congbobanan.toaan.gov.vn/2ta946636t1cv...,Nguyễn Thu H,"[0.034973084926605225, 0.024029400199651718, 0..."
1,https://congbobanan.toaan.gov.vn/2ta1016089t1c...,Sùng Seo Q,"[0.053581587970256805, 0.010164630599319935, 0..."
2,https://congbobanan.toaan.gov.vn/2ta791888t1cv...,Vũ Văn Th,"[-0.01057322695851326, -0.05474351346492767, 0..."
3,https://congbobanan.toaan.gov.vn/2ta290704t1cv...,Nguyễn Văn T,"[0.05989045649766922, -0.02766093984246254, 0...."
4,https://congbobanan.toaan.gov.vn/2ta290704t1cv...,Nguyễn Văn T,"[0.05989045649766922, -0.02766093984246254, 0...."
5,https://congbobanan.toaan.gov.vn/2ta536274t1cv...,Nguyễn Văn H,"[0.07151448726654053, -0.04318667948246002, 0...."
6,https://congbobanan.toaan.gov.vn/2ta1016089t1c...,Sùng Seo Q,"[0.053581587970256805, 0.010164630599319935, 0..."
7,https://congbobanan.toaan.gov.vn/2ta791888t1cv...,Vũ Văn Th,"[-0.01057322695851326, -0.05474351346492767, 0..."
8,https://congbobanan.toaan.gov.vn/2ta536274t1cv...,Nguyễn Văn H,"[0.07151448726654053, -0.04318667948246002, 0...."
9,https://congbobanan.toaan.gov.vn/2ta946636t1cv...,Nguyễn Thu H,"[0.034973084926605225, 0.024029400199651718, 0..."


In [62]:
def convert_to_tensor(vector_list):
    # Convert the list to a NumPy array
    np_array = np.array(vector_list)
    # Convert the NumPy array to a TensorFlow tensor
    tf_tensor = tf.convert_to_tensor(np_array, dtype=tf.float32)
    return tf_tensor

In [66]:
# Assuming df is your DataFrame containing the 'VECTOR' column
df['VECTOR'] = df['VECTOR'].apply(lambda x: convert_to_tensor(eval(x)))

# Check the modified DataFrame
df.head()

Unnamed: 0,JLR_LINK,NAME,VECTOR
0,https://congbobanan.toaan.gov.vn/2ta946636t1cv...,Nguyễn Thu H,"(tf.Tensor(0.034973085, shape=(), dtype=float3..."
1,https://congbobanan.toaan.gov.vn/2ta1016089t1c...,Sùng Seo Q,"(tf.Tensor(0.053581588, shape=(), dtype=float3..."
2,https://congbobanan.toaan.gov.vn/2ta791888t1cv...,Vũ Văn Th,"(tf.Tensor(-0.010573227, shape=(), dtype=float..."
3,https://congbobanan.toaan.gov.vn/2ta290704t1cv...,Nguyễn Văn T,"(tf.Tensor(0.059890456, shape=(), dtype=float3..."
4,https://congbobanan.toaan.gov.vn/2ta290704t1cv...,Nguyễn Văn T,"(tf.Tensor(0.059890456, shape=(), dtype=float3..."


In [67]:
df['VECTOR'][0]

<tf.Tensor: shape=(32,), dtype=float32, numpy=
array([ 3.49730849e-02,  2.40294002e-02,  6.00001290e-02,  8.48811120e-04,
        2.97953710e-02, -1.30276412e-01, -3.14006209e-03,  6.67454302e-03,
       -1.77457511e-01,  1.86333954e-02, -1.94507968e-02,  1.84292734e-01,
        1.05679601e-01, -1.10097066e-01, -8.13347921e-02,  1.23821199e-04,
       -7.92872757e-02,  6.21909276e-02,  1.49518356e-01, -9.65251476e-02,
        1.18960300e-02,  1.70368105e-02,  4.89422977e-02, -1.57194585e-03,
        5.50542437e-02, -5.43453805e-02,  2.09632330e-02,  7.07250237e-02,
       -6.99996576e-02,  8.70450139e-02,  6.43595010e-02, -9.11898091e-02],
      dtype=float32)>

In [None]:
type()