In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from azure.cosmos import CosmosClient, PartitionKey
from transformers import AutoTokenizer, AutoModel
from rapidfuzz import fuzz
from pythainlp.transliterate import romanize
from rapidfuzz.distance import Levenshtein
import pandas as pd
import json
import uuid
import torch
import jellyfish

# Initialize the Cosmos client
# NOTE(review): the four values below are placeholders; supply real settings
# from configuration or the environment rather than committing them here.
cosmos_url = "YOUR_COSMOS_URL"
cosmos_key = "YOUR_COSMOS_KEY"
database_name = "YOUR_DATABASE_NAME"
container_name = "YOUR_CONTAINER_NAME"

# Client authenticated with the account URL and key defined above.
client_cosmos = CosmosClient(cosmos_url, cosmos_key)

# Obtain the database handle, creating the database if it is missing.
database = client_cosmos.create_database_if_not_exists(id=database_name)

# Obtain the container handle (partitioned on "/id"), creating it if missing.
# `container` is the module-level handle the query code relies on.
container = database.create_container_if_not_exists(
    id=container_name,
    partition_key=PartitionKey(path="/id"),
)

# NOTE(review): this message is printed unconditionally, so it also appears
# when the container already existed and nothing was created.
print("Container created successfully in a serverless account.")

In [None]:
# Input records to look up; read from a placeholder parquet path.
DT = pd.read_parquet('YOUR_PATH')

# Batch run with Levenshtein: load the embedding model and its tokenizer once
# at module level so every batch reuses the same instances.
model_name = "intfloat/multilingual-e5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embedding(texts, tokenizer, model, batch_size=64):
    """Embed one or more texts using attention-masked mean pooling.

    Padding positions are excluded from the average, so the embedding of a
    text does not depend on which other texts share its batch.

    NOTE(review): vectors already stored in the database should have been
    produced with the same pooling; confirm before comparing scores.

    Args:
        texts: A single string, a pandas Series, or any iterable of strings.
        tokenizer: Callable invoked as
            ``tokenizer(batch, return_tensors="pt", padding=True,
            truncation=True)``; its result must contain ``attention_mask``.
        model: Callable invoked as ``model(**inputs)`` whose result exposes
            ``last_hidden_state``.
        batch_size: Maximum number of texts per forward pass.

    Returns:
        A list with one embedding (a list of floats) per input text, in input
        order. Empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if isinstance(texts, pd.Series):
        texts = texts.tolist()
    elif isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    embeddings = []
    # Process in slices so a large input is never pushed through the model in
    # a single forward pass.
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            hidden = model(**inputs).last_hidden_state
            # Zero out padded positions and divide by the number of real
            # tokens; clamp guards against division by zero.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        embeddings.extend(pooled.tolist())
    return embeddings

def run_query(query_embedding):
    """Fetch the single closest stored record for ``query_embedding``.

    The vector is passed as the ``@embedding`` query parameter instead of
    being spliced into the SQL text, which keeps the query string constant
    and avoids embedding a large JSON literal in it twice.

    Args:
        query_embedding: The query vector as a list of floats.

    Returns:
        A pandas DataFrame with the columns selected by the query
        (``uuid``, ``id_card``, ``full_name``, ``SimilarityScore``); it is
        empty when the query returns no items.
        NOTE(review): how ``SimilarityScore`` should be read depends on the
        distance function configured for the container - confirm.
    """
    results = container.query_items(
        query='''
        SELECT TOP 1 c.uuid, c.id_card, c.full_name, VectorDistance(c.embedding, @embedding) AS SimilarityScore
        FROM c
        ORDER BY VectorDistance(c.embedding, @embedding)
        ''',
        parameters=[{"name": "@embedding", "value": query_embedding}],
        enable_cross_partition_query=True,
        populate_query_metrics=False,
        populate_index_metrics=False,
    )
    # Materialize the paged iterator before handing it to pandas.
    return pd.DataFrame(list(results))

def vector_search_batch(inputs):
    """Embed every input and run one vector query per embedding in parallel.

    Args:
        inputs: A pandas Series or any iterable of texts to look up.

    Returns:
        A list of query results (one per input, in input order), each being
        whatever ``run_query`` returns for that input's embedding. A progress
        line is printed every time a query finishes.
    """
    names = inputs.tolist() if isinstance(inputs, pd.Series) else list(inputs)
    vectors = get_embedding(names, tokenizer, model)

    total = len(names)
    results_list = [None] * total

    with ThreadPoolExecutor() as executor:
        # Remember which input position each submitted query belongs to.
        position_of = {}
        for position, vector in enumerate(vectors):
            position_of[executor.submit(run_query, vector)] = position

        # Queries finish in arbitrary order; store each result at its
        # original position and report progress.
        for completed, finished in enumerate(as_completed(position_of), start=1):
            results_list[position_of[finished]] = finished.result()
            print(f"Completed {completed}/{total} records")

    return results_list

def calculate_levenshtein_distance_optimized(inputs):
    """Look up each input name and score the match by Levenshtein distance.

    Args:
        inputs: A pandas Series or any iterable of names. It is materialized
            once, so one-shot iterables are supported.

    Returns:
        A pandas DataFrame with the columns ``uuid``, ``id_card``,
        ``full_name``, ``SimilarityScore``, ``LevenshteinDistance`` and
        ``InputName``. Inputs without a match contribute one placeholder row
        whose fields are ``None`` except ``InputName``. Empty input yields an
        empty frame with those columns.
    """
    columns = [
        'uuid', 'id_card', 'full_name', 'SimilarityScore',
        'LevenshteinDistance', 'InputName',
    ]

    # Materialize once: the names are needed both for the search and for
    # pairing with the results afterwards.
    if isinstance(inputs, pd.Series):
        names = inputs.tolist()
    else:
        names = list(inputs)

    if not names:
        # Nothing to search; pd.concat would raise on an empty list.
        return pd.DataFrame(columns=columns)

    vector_search_results = vector_search_batch(names)

    def process_result(input_name, result_df):
        """Score one input/result pair, or build its placeholder row."""
        if result_df is None or result_df.empty:
            # If no results, return a placeholder row
            return pd.DataFrame({
                'uuid': [None],
                'id_card': [None],
                'full_name': [None],
                'SimilarityScore': [None],
                'LevenshteinDistance': [None],
                'InputName': [input_name]
            })

        # Work on a copy so the search result itself is left untouched.
        scored = result_df.copy()
        scored['LevenshteinDistance'] = scored['full_name'].apply(
            lambda x: Levenshtein.distance(input_name, x)
        )
        # Add the input name for context
        scored['InputName'] = input_name
        return scored

    with ThreadPoolExecutor() as executor:
        # executor.map keeps the results in input order.
        results_with_distance = list(
            executor.map(process_result, names, vector_search_results)
        )

    # Combine all results into a single DataFrame
    return pd.concat(results_with_distance, ignore_index=True)

def calculate_levenshtein_distance(inputs):
    """Look up each input name and score the match on raw and romanized text.

    For every input, the nearest stored record is fetched through
    ``vector_search_batch``. Each match gets the Levenshtein distance between
    the input and the matched ``full_name``, plus the same distance computed
    on their romanized forms (``romanize`` with the "thai2rom" engine).

    Args:
        inputs: A pandas Series or any iterable of names. It is materialized
            once, so one-shot iterables are supported.

    Returns:
        A pandas DataFrame that always contains the columns ``uuid``,
        ``id_card``, ``full_name``, ``SimilarityScore``,
        ``LevenshteinDistance``, ``Output_Romanize``, ``Input_Romanize``,
        ``Score_Romanize`` and ``InputName``. Inputs without a match
        contribute one placeholder row carrying only ``InputName``. Empty
        input yields an empty frame with those columns.
    """
    expected_columns = [
        'uuid', 'id_card', 'full_name', 'SimilarityScore',
        'LevenshteinDistance', 'Output_Romanize', 'Input_Romanize',
        'Score_Romanize', 'InputName',
    ]

    # Materialize once: the names are needed both for the search and for
    # pairing with the results afterwards.
    if isinstance(inputs, pd.Series):
        names = inputs.tolist()
    else:
        names = list(inputs)

    if not names:
        # Nothing to search; pd.concat would raise on an empty list.
        return pd.DataFrame(columns=expected_columns)

    vector_search_results = vector_search_batch(names)
    results_with_distance = []

    for input_name, result_df in zip(names, vector_search_results):
        if result_df is None or result_df.empty:
            # If no results, append a placeholder row
            results_with_distance.append(pd.DataFrame({
                'uuid': [None],
                'id_card': [None],
                'full_name': [None],
                'SimilarityScore': [None],
                'LevenshteinDistance': [None],
                'InputName': [input_name]
            }))
            continue

        result_df = result_df.copy()  # Avoid modifying the original result DataFrame
        # Calculate Levenshtein distance for each match
        result_df['LevenshteinDistance'] = result_df['full_name'].apply(
            lambda x: Levenshtein.distance(input_name, x)
        )

        # Romanize the input once; it is the same for every matched row
        input_romanize = romanize(input_name, engine="thai2rom")

        # Romanize each matched name and keep it for inspection
        result_df['Output_Romanize'] = result_df['full_name'].apply(
            lambda x: romanize(x, engine="thai2rom")
        )
        result_df['Input_Romanize'] = input_romanize

        # Levenshtein distance between the two romanized forms
        result_df['Score_Romanize'] = result_df['Output_Romanize'].apply(
            lambda x: Levenshtein.distance(input_romanize, x)
        )

        # Add the input name for context
        result_df['InputName'] = input_name
        results_with_distance.append(result_df)

    # Combine all results into a single DataFrame
    combined = pd.concat(results_with_distance, ignore_index=True)

    # When every lookup came back empty, the romanization columns never get
    # created; add them so downstream code can rely on a stable schema.
    for column in expected_columns:
        if column not in combined.columns:
            combined[column] = None
    return combined

# Run the lookup and scoring for every name in DT['full_name'].
# Progress is printed as the individual vector queries complete.
DF = calculate_levenshtein_distance(DT['full_name'])

In [None]:
# Apply the conditions and assign 'Label' and 'Case'.
#
# Rules are applied in order; each one writes its label and case number to
# the rows its mask selects. The masks only read SimilarityScore,
# LevenshteinDistance and Score_Romanize, none of which are modified here.
#
# NOTE(review): rows with SimilarityScore < 0.99 and LevenshteinDistance == 0,
# and rows without a match, satisfy no rule and are left unlabelled - confirm
# that this is intended.
low_similarity = DF['SimilarityScore'] < 0.99
lev_distance = DF['LevenshteinDistance']
rom_distance = DF['Score_Romanize']

label_rules = [
    # (row mask, Label, Case)
    (DF['SimilarityScore'] >= 0.99, 'Same Person', 1),

    (low_similarity & (lev_distance == 1) & rom_distance.isin([0, 1]), 'Same Person', 2),
    (low_similarity & (lev_distance == 1) & ~rom_distance.isin([0, 1]), 'Not Sure', 3),

    (low_similarity & (lev_distance == 2) & (rom_distance == 0), 'Same Person', 4),
    (low_similarity & (lev_distance == 2) & rom_distance.isin([1, 2, 3]), 'Not Sure', 5),
    (low_similarity & (lev_distance == 2) & ~rom_distance.isin([0, 1, 2, 3]), 'Different Person', 6),

    (low_similarity & (lev_distance == 3) & (rom_distance == 0), 'Same Person', 7),
    (low_similarity & (lev_distance == 3) & rom_distance.isin([1, 2, 3]), 'Not Sure', 8),
    # Score 0 must be excluded here as well; otherwise this rule overwrites
    # the rows that case 7 has just labelled 'Same Person'.
    (low_similarity & (lev_distance == 3) & ~rom_distance.isin([0, 1, 2, 3]), 'Different Person', 9),

    (low_similarity & (lev_distance == 4) & rom_distance.isin([0, 1]), 'Not Sure', 10),
    (low_similarity & (lev_distance == 4) & ~rom_distance.isin([0, 1]), 'Different Person', 11),

    (low_similarity & (lev_distance >= 5), 'Different Person', 12),
]

for rule_mask, rule_label, rule_case in label_rules:
    DF.loc[rule_mask, ['Label', 'Case']] = [rule_label, rule_case]

# Write the labelled results to an Excel workbook.
DF.to_excel('result.xlsx')