In [1]:
%pip install tensorflow
%pip install scipy

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import ast
import os
import pickle
from difflib import SequenceMatcher

import numpy as np
import pandas as pd
import tensorflow as tf
from google.cloud import bigquery
from scipy.spatial.distance import cosine
from tensorflow import keras
from tensorflow.keras import layers
from tqdm import tqdm

2024-05-09 07:16:10.355596: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-09 07:16:10.716678: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-09 07:16:12.140457: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
@keras.utils.register_keras_serializable(package="CustomModels")
class NERModel(keras.Model):
    """Per-token tagger: embeddings -> transformer block -> dense softmax head.

    Args:
        num_tags: Number of units in the final softmax layer.
        vocab_size: Size of the token-embedding table.
        maxlen: Size of the position-embedding table.
        embed_dim: Width of the token and position embeddings.
        num_heads: Number of attention heads in the transformer block.
        ff_dim: Width of the feed-forward layers.
        **kwargs: Standard ``keras.Model`` arguments (for example ``name``),
            forwarded to the base class.
    """

    def __init__(
        self, num_tags, vocab_size, maxlen=512, embed_dim=32, num_heads=2, ff_dim=32, **kwargs
    ):
        # Forward base-class arguments instead of silently dropping them:
        # get_config() includes the base config, and Keras passes those keys
        # back to __init__ when the model is rebuilt from its config.
        super().__init__(**kwargs)
        self.num_tags = num_tags
        self.vocab_size = vocab_size
        self.maxlen = maxlen
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim

        # Attribute names and creation order are kept as-is so that weights
        # saved by the previous version of this class still map onto them.
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        """Run the forward pass and return the softmax output of ``ff_final``."""
        x = self.embedding_layer(inputs)
        x = self.transformer_block(x)
        x = self.dropout1(x, training=training)
        x = self.ff(x)
        x = self.dropout2(x, training=training)
        x = self.ff_final(x)
        return x

    def get_config(self):
        """Return the base config merged with this model's constructor arguments."""
        base_config = super().get_config()
        own_config = {
            'num_tags': self.num_tags,
            'vocab_size': self.vocab_size,
            'maxlen': self.maxlen,
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
        }
        # Own keys win on collision, as in the original list-concatenation merge.
        return {**base_config, **own_config}

class TransformerBlock(layers.Layer):
    """Self-attention followed by a two-layer feed-forward network.

    The output of each of the two sub-layers goes through dropout, is added
    back to that sub-layer's input and is then layer-normalised.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        # Attention over the sequence itself; key_dim is set to embed_dim.
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Feed-forward part: widen to ff_dim with ReLU, project back to embed_dim.
        widen = layers.Dense(ff_dim, activation="relu")
        project = layers.Dense(embed_dim)
        self.ffn = keras.Sequential([widen, project])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        # Attention sub-layer (query and value are both `inputs`) + residual.
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        normed = self.layernorm1(inputs + attended)
        # Feed-forward sub-layer + residual.
        transformed = self.dropout2(self.ffn(normed), training=training)
        return self.layernorm2(normed + transformed)
    
class TokenAndPositionEmbedding(layers.Layer):
    """Adds a position embedding to a token embedding.

    ``vocab_size`` and ``maxlen`` are the sizes of the token and position
    embedding tables; both tables produce vectors of width ``embed_dim``.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, inputs):
        # Position ids 0..n-1, where n is the size of the last axis of `inputs`.
        seq_len = tf.shape(inputs)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        # Position table is looked up first, then the token table.
        position_part = self.pos_emb(position_ids)
        token_part = self.token_emb(inputs)
        return token_part + position_part
    
class CustomNonPaddingTokenLoss(keras.losses.Loss):
    """Sparse categorical cross-entropy averaged over non-padding positions.

    Positions whose label in ``y_true`` is 0 are treated as padding and are
    excluded from both the summed loss and the divisor.

    Args:
        reduction: Reduction applied by the base ``Loss`` class to the value
            returned by ``call``. ``call`` already returns a single scalar.
        name: Name of the loss.
    """

    def __init__(self, reduction='sum', name="custom_ner_loss"):
        # Pass the caller's value through; it used to be ignored in favour of
        # a hard-coded 'sum'. The default is unchanged.
        super().__init__(reduction=reduction, name=name)

    def call(self, y_true, y_pred):
        """Return the mean cross-entropy over positions where ``y_true > 0``.

        The result is NaN (0/0) when ``y_true`` has no positive label.
        """
        # The inner loss must return one value per position. Building it with
        # this loss's own reduction ('sum') collapsed it to a scalar first, so
        # the mask cancelled out and padded positions were counted in the loss.
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=False, reduction="none"
        )
        loss = loss_fn(y_true, y_pred)
        # Follow the loss dtype rather than assuming float32.
        mask = tf.cast((y_true > 0), dtype=loss.dtype)
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

def map_record_to_training_data(record):
    """Split one tab-separated record into ``(tokens, tags)``.

    The first field is the token count ``n``, the next ``n`` fields are the
    tokens and every remaining field is parsed as an int64 tag. Each tag is
    increased by one before being returned.
    """
    fields = tf.strings.split(record, sep="\t")
    n_tokens = tf.strings.to_number(fields[0], out_type=tf.int32)
    tokens = fields[1 : n_tokens + 1]
    raw_tags = fields[n_tokens + 1 :]
    # +1 shift: presumably keeps 0 free for padding (the loss in this file
    # masks on y_true > 0) -- TODO confirm.
    tags = tf.strings.to_number(raw_tags, out_type=tf.int64) + 1
    return tokens, tags

def lookup(tokens, vocab_path='vocabulary.pkl'):
    """Convert string tokens to token IDs using a pickled vocabulary.

    The vocabulary file is re-read and a new ``StringLookup`` layer is built
    on every call.

    Args:
        tokens: Strings to convert; passed straight to the lookup layer.
        vocab_path: Path of the pickle file holding the vocabulary given to
            ``StringLookup``. Defaults to ``'vocabulary.pkl'``.

    Returns:
        The result of applying ``keras.layers.StringLookup`` to ``tokens``.
    """
    # SECURITY: pickle.load can execute arbitrary code while unpickling.
    # Only point vocab_path at files from a trusted source.
    with open(vocab_path, 'rb') as f:
        vocabulary = pickle.load(f)

    # StringLookup maps each token to its ID in the vocabulary.
    lookup_layer = keras.layers.StringLookup(vocabulary=vocabulary)

    # Tokens are looked up exactly as given; no lowercasing is applied.
    return lookup_layer(tokens)

def format_datatype(data):
    """Turn one raw text string into a batched dataset for prediction.

    The text is tokenised, serialised as a tab-separated record
    (``<count> <tokens...> <tags...>``), parsed back with
    ``map_record_to_training_data``, converted to token IDs with ``lookup``
    and batched with ``padded_batch(1)``.

    Args:
        data: The raw input text.

    Returns:
        A ``tf.data.Dataset`` yielding ``(token_ids, tags)`` batches.
    """
    # NOTE(review): word_tokenize is not imported in the visible import cell;
    # confirm which tokenizer is intended and import it.
    tokens = word_tokenize(data)

    # One placeholder tag (0) per token, since this is for prediction. The
    # count must come from `tokens`: counting data.split(' ') could disagree
    # with the tokenizer and leave tokens and tags with different lengths.
    ner_tags = [0] * len(tokens)

    # Tab-separated record: token count, then the tokens, then the tags.
    string_input = "\t".join([str(len(tokens)), *tokens, *map(str, ner_tags)])
    record_ds = tf.data.Dataset.from_tensor_slices([string_input])

    return (
        record_ds.map(map_record_to_training_data)
        .map(lambda x, y: (lookup(x), y))
        .padded_batch(1)
    )

In [4]:
def load_embedding_layer():
    """Load ``ner_model.keras`` and return its ``embedding_layer`` attribute.

    Side effect: registers ``CustomNonPaddingTokenLoss`` in Keras' global
    custom-object registry before the model is loaded.
    """
    registry = tf.keras.utils.get_custom_objects()
    registry['CustomNonPaddingTokenLoss'] = CustomNonPaddingTokenLoss

    ner_model = tf.keras.models.load_model("ner_model.keras")
    return ner_model.embedding_layer

In [5]:
# Load the embedding layer once; word_embedding() reads this module-level name.
embedding_layer = load_embedding_layer()

def word_embedding(input):
    """Return the embedding of ``input`` as a plain Python list.

    ``input`` is wrapped in a one-element list, converted to a token ID by
    ``lookup`` and passed through the module-level ``embedding_layer``; the
    first row of the result is returned.
    """
    token_ids = lookup([input])
    embedded = embedding_layer(token_ids)
    return embedded.numpy().tolist()[0]

  saveable.load_own_variables(weights_store.get(inner_path))


In [6]:
def list_to_tensor(vector_list):
    """Convert a Python list of numbers into a ``tf.float32`` tensor.

    The list goes through a NumPy array first and is then handed to
    ``tf.convert_to_tensor``.
    """
    return tf.convert_to_tensor(np.array(vector_list), dtype=tf.float32)

In [7]:
def read_bq():
    """Fetch every row of the ``criminal_name_vector`` table as a DataFrame.

    Returns:
        pandas.DataFrame: the result of ``SELECT *`` on the table.

    Raises:
        Exception: Any error raised while running the query or converting
            the result. The error is printed and then re-raised.
    """
    client = bigquery.Client()

    # Plain string: the query has no placeholders, so no f-string is needed.
    query = """
        SELECT *
        FROM `intern-project-415606.Criminal_Dataset.criminal_name_vector`
    """

    try:
        return client.query(query).to_dataframe()
    except Exception as e:
        print(f"Error: {e}")
        # Re-raise instead of implicitly returning None: the caller indexes
        # the result straight away, so None only turned the real failure into
        # an unrelated "'NoneType' object is not subscriptable" error.
        raise

In [8]:
# Load the name/vector table from BigQuery
df = read_bq()

# VECTOR is parsed from its string form into a Python list. ast.literal_eval
# only accepts Python literals, whereas eval() would execute any expression
# stored in the table.
df['VECTOR'] = df['VECTOR'].apply(ast.literal_eval)

# Check the modified DataFrame
df

Unnamed: 0,JLR_LINK,NAME,VECTOR
0,https://congbobanan.toaan.gov.vn/2ta129537t1cv...,Nông Văn T,"[0.06448426842689514, 0.0034767701290547848, 0..."
1,https://congbobanan.toaan.gov.vn/2ta109074t1cv...,Vũ Văn L,"[0.019767554476857185, -0.02712906524538994, 0..."
2,https://congbobanan.toaan.gov.vn/2ta1065938t1c...,Quách Văn T,"[0.06412803381681442, 0.02595774084329605, 0.0..."
3,https://congbobanan.toaan.gov.vn/2ta1137715t1c...,Hà Văn H,"[0.03470417857170105, -0.04582866653800011, 0...."
4,https://congbobanan.toaan.gov.vn/2ta863405t1cv...,Nguyễn Sỹ T,"[0.07035490870475769, -0.06170082837343216, 0...."
...,...,...,...
36544,https://congbobanan.toaan.gov.vn/2ta315460t1cv...,Phạm Thị H,"[0.022265102714300156, -0.034367285668849945, ..."
36545,https://congbobanan.toaan.gov.vn/2ta172151t1cv...,Cao Thị T,"[0.02409677766263485, -0.024656902998685837, 0..."
36546,https://congbobanan.toaan.gov.vn/2ta692520t1cv...,Bùi Quang H,"[0.05726223438978195, 0.0030422864947468042, 0..."
36547,https://congbobanan.toaan.gov.vn/2ta197918t1cv...,Nguyễn Thị Hồng H,"[-0.0019262349233031273, 0.01050012931227684, ..."


In [32]:
# Query name used by the embedding-based and string-based similarity searches
input_name = "Matt"

In [33]:
# Embed three sample names and print their pairwise cosine similarities.
a = word_embedding('Matt')  # the one we are looking for
b = word_embedding('Lƣơng Văn Th')  # influencing factor
c = word_embedding('Đặng Văn N')  # the one we are looking for
for pair_label, (left, right) in (("a b", (a, b)), ("a c", (a, c)), ("b c", (b, c))):
    print(f"similarity {pair_label}:", 1 - cosine(left, right))

similarity a b: 0.8823302762864892
similarity a c: 0.7660522751244297
similarity b c: 0.8880456865880312


In [35]:
# Number of best matches to keep. The slice has always taken 100 rows, even
# though the old comments and variable names said "top 10".
TOP_K = 100

vector_list = word_embedding(input_name)

# Cosine similarity between the query embedding and every stored vector
cosine_similarities = [1 - cosine(vector_list, v) for v in df['VECTOR']]
similarity_array = np.asarray(cosine_similarities)

# Row positions of the TOP_K most similar vectors, best match first
top_k_indices = np.argsort(similarity_array)[-TOP_K:][::-1]

# Retrieve the matching rows
top_k_rows = df.iloc[top_k_indices]
result = pd.DataFrame({
    'index': top_k_rows.index,
    'JLR_LINK': top_k_rows['JLR_LINK'],
    'NAME': top_k_rows['NAME'],
    'similarity': similarity_array[top_k_indices]
})

# NOTE(review): the recorded output of this cell shows similarity 1.0 for
# every displayed row, which suggests many names share the same embedding
# (possibly out-of-vocabulary inputs) -- confirm before relying on the ranking.
# Display the result
result

Unnamed: 0,index,JLR_LINK,NAME,similarity
26308,26308,https://congbobanan.toaan.gov.vn/2ta1195196t1c...,Đàm Xuân L,1.0
26687,26687,https://congbobanan.toaan.gov.vn/2ta1136058t1c...,Nguyễn Thị Trúc T,1.0
26685,26685,https://congbobanan.toaan.gov.vn/2ta57334t1cvn...,Ngô Khắc V,1.0
26684,26684,https://congbobanan.toaan.gov.vn/2ta609609t1cv...,Lữ Trọng A,1.0
26683,26683,https://congbobanan.toaan.gov.vn/2ta823451t1cv...,Lê Khắc T,1.0
...,...,...,...,...
26650,26650,https://congbobanan.toaan.gov.vn/2ta460426t1cv...,Phạm Kim C,1.0
26649,26649,https://congbobanan.toaan.gov.vn/2ta795952t1cv...,Đặng Tòn G,1.0
26648,26648,https://congbobanan.toaan.gov.vn/2ta411033t1cv...,Hồ Quốc L,1.0
26647,26647,https://congbobanan.toaan.gov.vn/2ta1139413t1c...,TRẦN TRỌNG D,1.0


In [36]:
def similar(a, b):
    """Return the ``difflib.SequenceMatcher`` ratio of ``a`` and ``b``."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

# Number of best matches to keep. The slice has always taken 100 rows, even
# though the old comments and variable names said "top 10".
TOP_K = 100

# String similarity between the query name and every stored name
similarities = df['NAME'].apply(lambda x: similar(input_name, x))

# Row positions (not index labels) of the TOP_K most similar names, best first
top_k_indices = np.argsort(similarities.to_numpy())[-TOP_K:][::-1]

# Retrieve the matching rows
top_k_rows = df.iloc[top_k_indices]

# Create a new DataFrame with the desired columns
result = pd.DataFrame({
    'index': top_k_rows.index,
    'JLR_LINK': top_k_rows['JLR_LINK'],
    'NAME': top_k_rows['NAME'],
    # argsort yields positions, so select with .iloc; label-based [] only
    # happened to work because df has a default 0..n-1 index.
    'similarity': similarities.iloc[top_k_indices]
})

# Display the result
result


Unnamed: 0,index,JLR_LINK,NAME,similarity
21001,21001,https://congbobanan.toaan.gov.vn/2ta1063542t1c...,Mai Tất T,0.461538
24606,24606,https://congbobanan.toaan.gov.vn/2ta486335t1cv...,Mai Nhật K,0.428571
23871,23871,https://congbobanan.toaan.gov.vn/2ta776973t1cv...,Mai Nhất N,0.428571
34564,34564,https://congbobanan.toaan.gov.vn/2ta598600t1cv...,Mai Việt H,0.428571
26830,26830,https://congbobanan.toaan.gov.vn/2ta133067t1cv...,Mai Nhật T,0.428571
...,...,...,...,...
30246,30246,https://congbobanan.toaan.gov.vn/2ta632800t1cv...,Mai Văn H,0.307692
3812,3812,https://congbobanan.toaan.gov.vn/2ta1166556t1c...,Mai Thị K,0.307692
13808,13808,https://congbobanan.toaan.gov.vn/2ta1020694t1c...,Mai Văn H,0.307692
11078,11078,https://congbobanan.toaan.gov.vn/2ta638771t1cv...,Mùa Thị M,0.307692
