In [None]:
import json
import os
import uuid

from azure.cosmos import CosmosClient, PartitionKey

# Connection settings for Azure Cosmos DB. Each value is taken from an
# environment variable when it is set and otherwise falls back to the
# placeholder, so real endpoints and keys do not have to be saved in the notebook.
cosmos_url = os.environ.get("COSMOS_URL", "YOUR_COSMOS_URL")
cosmos_key = os.environ.get("COSMOS_KEY", "YOUR_COSMOS_KEY")
database_name = os.environ.get("COSMOS_DATABASE_NAME", "YOUR_DATABASE_NAME")
container_name = os.environ.get("COSMOS_CONTAINER_NAME", "YOUR_CONTAINER_NAME")

# Initialize the Cosmos client
client_cosmos = CosmosClient(cosmos_url, cosmos_key)

# Create database if not exists
database = client_cosmos.create_database_if_not_exists(id=database_name)

# Create the container (if it does not exist) without throughput configuration;
# documents are partitioned by their own "id" field.
container = database.create_container_if_not_exists(
    id=container_name,
    partition_key=PartitionKey(path="/id"),
)

print("Container created successfully in a serverless account.")

In [None]:
import pandas as pd
# Load the Parquet data, keep only the identity/name columns and give the
# name columns their descriptive labels.
df = (
    pd.read_parquet("/Users/noppavitkanchitavorakul/Desktop/Azure-AI/Data")
    .loc[:, ['id_card', 'fname', 'lname', 'full_name']]
    .rename(columns={'fname': 'first_name', 'lname': 'last_name'})
)

# Cut the frame into consecutive slices of at most `chunk_size` rows.
chunk_size = 50000
num_chunks = -(-len(df) // chunk_size)  # ceiling division: a partial last chunk still counts
df_chunks = [
    df.iloc[start:start + chunk_size]
    for start in range(0, len(df), chunk_size)
]

In [None]:
# Convert the chunk at index 4 (the fifth chunk) into a list of per-row dicts.
# With 50,000-row chunks this corresponds to positional rows 200000-249999 of df.
data = df_chunks[4].to_dict(orient="records")

In [None]:
import uuid
import torch
from transformers import AutoTokenizer, AutoModel

# Number of records embedded and uploaded per round; adjust it to your
# requirements and API limits.
batch_size = 2000

# Load the Hugging Face encoder and its matching tokenizer from one checkpoint
model_name = "intfloat/multilingual-e5-large"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def get_embedding(text, tokenizer, model):
    """
    Generate a mean-pooled embedding using a Hugging Face model.

    The last hidden state is averaged over the token axis, weighted by the
    tokenizer's attention mask so that padding positions never contribute.
    For a single text (no padding) this equals a plain mean over all tokens;
    for a list of texts of different lengths it keeps every row's average
    restricted to that row's real tokens.

    NOTE(review): the text is embedded exactly as given. E5-family model cards
    recommend "query: " / "passage: " prefixes - confirm whether they are
    wanted before changing anything, since stored vectors would need re-embedding.

    Args:
        text (str): The input text for which embedding is to be generated.
            A list of strings is also accepted and is embedded as one batch.
        tokenizer (object): The tokenizer instance.
        model (object): The model instance.

    Returns:
        list: The embedding for the input text (a flat list of floats for a
        single text, a list of such lists for a batch of several texts).
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state

    mask = inputs.get("attention_mask")
    if mask is None:
        # No mask available: every position is treated as a real token.
        embeddings = hidden.mean(dim=1)
    else:
        # Zero out padded positions, then divide by the number of real tokens.
        weights = mask.unsqueeze(-1).to(hidden.dtype)
        token_counts = weights.sum(dim=1).clamp(min=1)  # guard against an all-zero mask row
        embeddings = (hidden * weights).sum(dim=1) / token_counts
    return embeddings.squeeze().tolist()

def upload_batch_data(batch, container):
    """
    Write every record of a batch to Cosmos DB as its own document.

    Each record is copied into a new document holding a freshly generated
    UUID string as ``id`` plus the record's ``id_card``, ``embedding``,
    ``first_name``, ``last_name`` and ``full_name`` values (``None`` for any
    key the record lacks). All documents are built first and are then
    upserted one at a time.

    Args:
        batch (list): Records (dict-like objects) to upload.
        container (object): Cosmos DB container object.
    """
    copied_fields = ("id_card", "embedding", "first_name", "last_name", "full_name")

    # Build the complete document list before touching the database
    documents = [
        {"id": str(uuid.uuid4()), **{field: record.get(field) for field in copied_fields}}
        for record in batch
    ]

    # Upsert the prepared documents sequentially
    for document in documents:
        container.upsert_item(document)

# Process data in batches: embed each record's full name, then upload the batch.
total_batches = (len(data) + batch_size - 1) // batch_size
for i in range(0, len(data), batch_size):
    # Slice the data to create the current batch
    batch = data[i:i + batch_size]

    # Keep the records whose full_name is a non-blank string. The records
    # themselves are kept (not just their names) so that each embedding can be
    # attached to the record it was computed from; pairing the filtered names
    # with the unfiltered batch would shift every embedding after the first
    # skipped record onto the wrong person.
    valid_records = [
        record for record in batch
        if isinstance(record.get("full_name", ""), str) and record.get("full_name", "").strip()
    ]
    names = [record["full_name"].strip() for record in valid_records]

    # Progress output
    print(f"Processing batch {i // batch_size + 1}/{total_batches}")
    print(f"Number of valid names in batch: {len(names)}")

    # Generate embeddings using the `get_embedding` function
    embeddings = [get_embedding(name, tokenizer, model) for name in names]

    # Attach each embedding to the record that produced it
    for record, embedding in zip(valid_records, embeddings):
        record["embedding"] = embedding

    # Upload the whole batch; records without a valid name carry no embedding
    upload_batch_data(batch, container)

In [None]:
# Define the range of batches you want to process (1-based, inclusive)
start_batch = 20
end_batch = 25

# NOTE(review): upload_batch_data assigns a fresh UUID to every document, so
# re-running batches that were already uploaded adds duplicates - confirm
# this range has not been uploaded before.

# Process only the specified batches. The stop index is clamped to the data
# length so batch numbers past the end do not produce empty iterations.
total_batches = (len(data) + batch_size - 1) // batch_size
first_index = (start_batch - 1) * batch_size
stop_index = min(end_batch * batch_size, len(data))
for i in range(first_index, stop_index, batch_size):
    # Slice the data to create the current batch
    batch = data[i:i + batch_size]

    # Keep the records whose full_name is a non-blank string. The records
    # themselves are kept (not just their names) so that each embedding can be
    # attached to the record it was computed from; pairing the filtered names
    # with the unfiltered batch would shift every embedding after the first
    # skipped record onto the wrong person.
    valid_records = [
        record for record in batch
        if isinstance(record.get("full_name", ""), str) and record.get("full_name", "").strip()
    ]
    names = [record["full_name"].strip() for record in valid_records]

    # Progress output
    print(f"Processing batch {i // batch_size + 1}/{total_batches}")
    print(f"Number of valid names in batch: {len(names)}")

    # Generate embeddings using the `get_embedding` function
    embeddings = [get_embedding(name, tokenizer, model) for name in names]

    # Attach each embedding to the record that produced it
    for record, embedding in zip(valid_records, embeddings):
        record["embedding"] = embedding

    # Upload the whole batch; records without a valid name carry no embedding
    upload_batch_data(batch, container)

In [None]:
def vector_search(query, num_results=1):
    """
    Find the stored documents whose embeddings are closest to a query text.

    The query text is embedded with `get_embedding` (using the notebook-level
    `tokenizer` and `model`) and the resulting vector is passed as a parameter
    to a cross-partition Cosmos DB query on the notebook-level `container`.
    The query selects the top `num_results` documents ordered by
    VectorDistance between the stored embedding and the query embedding.

    Args:
        query (str): Text to search for.
        num_results (int): Maximum number of documents to return.

    Returns:
        list: One dict per match with the keys ``id``, ``id_card``,
        ``full_name`` and ``SimilarityScore``. The query pager is drained into
        a list so the result can be checked for emptiness, measured with
        ``len`` and iterated more than once.
    """
    query_embedding = get_embedding(query, tokenizer, model)
    results = container.query_items(
            query='SELECT TOP @num_results c.id, c.id_card, c.full_name, VectorDistance(c.embedding ,@embedding, true) AS SimilarityScore  FROM c ORDER BY VectorDistance(c.embedding,@embedding, true)',
            parameters=[
                {"name": "@embedding", "value": query_embedding},
                {"name": "@num_results", "value": num_results}
            ],
            enable_cross_partition_query=True)

    # query_items returns a lazy pager that is always truthy and can only be
    # consumed once; materialize it so callers get a plain list.
    return list(results)

# Run one example search and print the score, id card and name of each match
query = "สุชญา โตกุญาลัย"
results = vector_search(query)
for result in results:
    print(
        f"Similarity Score: {result['SimilarityScore']}",
        f"id_card: {result['id_card']}",
        f"full_name: {result['full_name']}\n",
        sep="\n",
    )

In [None]:
def _collect_search_results(test_cases):
    """
    Run `vector_search` for every test name and tabulate the matches.

    Args:
        test_cases (dict): Maps a category label (e.g. "basic_wrong") to the
            list of name strings to search for.

    Returns:
        pandas.DataFrame: One row per returned match, holding the fields of
        the match plus ``Case`` (the category label) and ``Test_Data`` (the
        name that was searched). Empty when no search returned a match.
    """
    rows = []
    for category, names in test_cases.items():
        for name in names:
            # Iterating works for both a list and a lazy query pager; an
            # empty result simply contributes no rows.
            for result in vector_search(name):
                result['Case'] = category
                result['Test_Data'] = name
                rows.append(result)
    return pd.DataFrame(rows)


# Name variants for the first target name, grouped by category label
test1 = {
    "basic_wrong": [
        "สุชยา โตคุณาลัย",
        "สุชญา โตคุลาลัย",
        "สุชญา โตกุณาลัย",
        "สุชญา โตคุณาลัย์",
        "สุชญา โตคุณะลัย"
    ],
    "medium_wrong": [
        "สุชย่า โตคุนาลัย",
        "สุชญา โตกุญาลัย",
        "สุชณา โตคุลาลัย",
        "สุชะยา โตคุณลัย",
        "สุชณา โตคูณาลัย"
    ],
    "high_wrong": [
        "สุชญา โตคุณลัยส์",
        "สุชา โตคุนะไลน์",
        "สุฌญา โตคูญาลัย",
        "สุชา โตคุณไลน์",
        "สุชนญา โตคุญลัย"
    ],
    "extreme_wrong": [
        "สุชนญาร์ โตครุณละลัย",
        "สุชาหญา โตคลุนาย",
        "สุชะญา โตกุณาครัล",
        "สุฌา โตคลุยาญ",
        "สุชาหญ่า โตคนูยัลลาย"
    ]
}

df1 = _collect_search_results(test1)

## df2

test2 = {
    "basic_wrong": [
        "สุชยา สุขปิติ",
        "สุชญา สุคปิติ",
        "สุชณา สุขปิติ",
        "สุชญา สุกปิติ",
        "สุชญ่า สุขปิติ"
    ],
    "medium_wrong": [
        "สุชย่า สุขปิฏิ",
        "สุชะญา สุคปิฏิ",
        "สุชณา สุคปิตี",
        "สุชญา สุกพิถิ",
        "สุชณญา สุขปิติ"
    ],
    "high_wrong": [
        "สุชา สุคปิถิ",
        "สุฌญา สุคปิฐิ",
        "สุชณญา สุกปิฏี",
        "สุชาหญา สุกปิตี",
        "สุชา สุคปิฏี"
    ],
    "extreme_wrong": [
        "สุชนญาร์ สุกปิถี",
        "สุชะหญ่า สุคปิฐะ",
        "สุฌา สุกปิฏิฐะ",
        "สุชหญ่า สุกปิฏิถี",
        "สุชนญา สุคปิทิถะ"
    ]
}

df2 = _collect_search_results(test2)

## df3

test3 = {
    "basic_wrong": [
        "สุกันญา โตไพร",
        "สุกัญยา โตไพร",
        "สุกัญญา โตไพ",
        "สุกัญญา โตพร",
        "สุกัญญา โตภาย"
    ],
    "medium_wrong": [
        "สุกันยา โตพาย",
        "สุกัณญา โตไพร์",
        "สุกัญญ์ โตพรัย",
        "สุกัญญา โทไพ",
        "สุกัญญา โตภัย"
    ],
    "high_wrong": [
        "สุคันญา โตพรย์",
        "สุกัญย่า โทพราย",
        "สุกันย่า โตพะไย",
        "สุคัญญา โตพรัย",
        "สุกัญญ์ โทไพรย"
    ],
    "extreme_wrong": [
        "สุคัญญา โทพรัยย์",
        "สุกันหญา โตพายญ์",
        "สุขัญยา โตไพรยณ์",
        "สุกัณหยา โตพรันย์",
        "สุคัญย่า โทภายย์"
    ]
}

df3 = _collect_search_results(test3)


# Stack the three per-target tables into one frame with a fresh 0..n-1 index
final_result = pd.concat([df1, df2, df3], ignore_index=True)

In [None]:
# Display the combined search-result table.
# NOTE(review): "50000" presumably labels the run made with 50,000 records stored - confirm.
final_result #50000

In [None]:
# Display the combined search-result table.
# NOTE(review): "100000" presumably labels the run made with 100,000 records stored - confirm.
final_result #100000

In [None]:
# Display the combined search-result table.
# NOTE(review): "150000" presumably labels the run made with 150,000 records stored - confirm.
final_result #150000

In [None]:
# Display the combined search-result table.
# NOTE(review): "200000" presumably labels the run made with 200,000 records stored - confirm.
final_result #200000

In [None]:
# Save the combined result table to an Excel file in the working directory,
# leaving out the DataFrame index column.
final_result.to_excel('e5-200000.xlsx', index=False)