# Khmer Semantic Similarity Testing (Fast Version)

Quick demonstration of semantic similarity detection with 24 test pairs.

In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
import warnings
# Install a global "ignore" filter so no Python warnings are shown from here on
# (presumably to keep the notebook output uncluttered).
# NOTE(review): this also hides deprecation warnings from TensorFlow/Keras — confirm that is intended.
warnings.filterwarnings('ignore')

# Report the TensorFlow version in use
print(f"TensorFlow: {tf.__version__}")

In [None]:
# Load vocabulary
# The file is read as one "<index>\t<character>" entry per line and turned into
# a character -> index mapping. Lines that do not contain a tab are skipped.
VOCAB_PATH = "/Users/mac/Downloads/For_NLP_Projects_Kru_vanny/large_processed_data/char2idx.txt"
char2idx = {}

with open(VOCAB_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        # Remove only the line terminator. A bare strip() would also remove
        # whitespace that is itself a vocabulary entry (e.g. the space
        # character), leaving a single field and silently dropping that entry.
        parts = line.rstrip('\r\n').split('\t', 1)
        # Require a non-empty character field; an empty one cannot match any
        # input character and was never stored before either.
        if len(parts) == 2 and parts[1]:
            idx, char = parts
            char2idx[char] = int(idx)

# Size is reported as highest index + 1; default=-1 makes an empty vocabulary
# print 0 instead of raising ValueError from max().
print(f"Vocabulary: {max(char2idx.values(), default=-1) + 1} characters")

In [None]:
# Load model and extract encoder
# Path of the saved Keras model file (HDF5, per the .h5 extension)
MODEL_PATH = "/Users/mac/Downloads/best_large_model.h5"
print("Loading model...")
full_model = keras.models.load_model(MODEL_PATH)

# Build a sub-model that maps the first model input to the output of the layer
# named 'latent'; get_layer raises if no layer has that name.
# NOTE(review): indexing input[0] assumes the saved model has several inputs and
# that the first one feeds the encoder — confirm against the training code.
latent_layer = full_model.get_layer('latent')
encoder = keras.Model(full_model.input[0], latent_layer.output)
print(f"Encoder ready: {encoder.output.shape}")

In [None]:
# Helper functions
# Reserved indices: padding, start-of-sequence, end-of-sequence, unknown character
PAD_IDX, SOS_IDX, EOS_IDX, UNK_IDX = 0, 1, 2, 3
# Fixed length of every encoded sequence (including SOS/EOS and padding)
MAX_LEN = 202


def text_to_sequence(text, vocab=None, max_len=None):
    """Encode *text* as a fixed-length list of character indices.

    The result is ``[SOS] + one index per character + [EOS]``, right-padded
    with ``PAD_IDX`` up to ``max_len``. Characters missing from the vocabulary
    map to ``UNK_IDX``.

    Args:
        text: String to encode, one index per character.
        vocab: Character -> index mapping. Defaults to the module-level
            ``char2idx``, looked up at call time.
        max_len: Length of the returned list. Defaults to the module-level
            ``MAX_LEN``, looked up at call time.

    Returns:
        A list of exactly ``max_len`` ints.

    Note:
        Inputs longer than ``max_len - 2`` characters are cut at ``max_len``,
        so the trailing EOS marker is lost for them.
    """
    if vocab is None:
        vocab = char2idx
    if max_len is None:
        max_len = MAX_LEN

    seq = [SOS_IDX]
    seq.extend(vocab.get(ch, UNK_IDX) for ch in text)
    seq.append(EOS_IDX)

    # Cut first, then pad: the pad count is <= 0 (no-op) whenever a cut happened.
    seq = seq[:max_len]
    seq += [PAD_IDX] * (max_len - len(seq))
    return seq

def cosine_sim(a, b):
    """Return the cosine similarity of vectors *a* and *b*.

    Computed as ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero
    norm the ratio is undefined (0/0 would yield NaN, which fails every
    threshold comparison), so 0.0 is returned instead.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

print("Functions ready")

In [None]:
# Warmup (first prediction is slow)
# Push a single throwaway one-word batch through the encoder; the output is discarded.
print("Warmup...")
warmup_batch = np.array([text_to_sequence("តេស្ត")])
_ = encoder.predict(warmup_batch, verbose=0)
print("Ready!")

In [None]:
# Test pairs (24 total - more comprehensive)
# Each entry is (word_a, word_b, expected_label, english_description).
# expected_label is one of "identical", "similar" or "dissimilar"; the
# similarity threshold applied to each label is chosen where the tests are run.
# NOTE(review): a few Khmer strings (the "Sleep", "Eat" and "Durian" words)
# appear to place a vowel sign before its base consonant, which is not the
# usual Unicode character order for Khmer — confirm they match the spelling
# of the text the model was trained on.
test_pairs = [
    # Identical
    ("កម្ពុជា", "កម្ពុជា", "identical", "Cambodia vs Cambodia"),
    
    # Geography (similar)
    ("កម្ពុជា", "ភ្នំពេញ", "similar", "Cambodia vs Phnom Penh"),
    ("ទន្លេ", "ទឹក", "similar", "River vs Water"),
    ("ភ្នំ", "ដី", "similar", "Mountain vs Land"),
    
    # Language (similar)
    ("ភាសា", "ខ្មែរ", "similar", "Language vs Khmer"),
    ("និយាយ", "ពាក្យ", "similar", "Speak vs Word"),
    
    # Sleep/Rest (similar - synonyms)
    ("េដក", "េគង", "similar", "Sleep vs Sleep (synonym)"),
    ("សម្រាក", "េគង", "similar", "Rest vs Sleep"),
    
    # Country/Nation (similar)
    ("ប្រទេស", "ជាតិ", "similar", "Country vs Nation"),
    
    # Education (similar)
    ("សៀវភៅ", "អាន", "similar", "Book vs Read"),
    ("សិស្ស", "សាលា", "similar", "Student vs School"),
    ("គ្រូ", "បង្រៀន", "similar", "Teacher vs Teach"),
    
    # Food (similar)
    ("អាហារ", "ញុំា", "similar", "Food vs Eat"),
    ("បាយ", "អាហារ", "similar", "Rice vs Food"),
    
    # Emotions (similar)
    ("សប្បាយ", "រីករាយ", "similar", "Happy vs Joyful"),
    
    # Dissimilar pairs
    ("កម្ពុជា", "ធុេរន", "dissimilar", "Cambodia vs Durian"),
    ("ភាសា", "ផ្លែឈើ", "dissimilar", "Language vs Fruit"),
    ("េដក", "ធុេរន", "dissimilar", "Sleep vs Durian"),
    ("ប្រទេស", "អាហារ", "dissimilar", "Country vs Food"),
    ("សៀវភៅ", "រថយន្ត", "dissimilar", "Book vs Car"),
    ("ទឹក", "ភ្លើង", "dissimilar", "Water vs Fire"),
    ("ថ្ងៃ", "យប់", "dissimilar", "Day vs Night"),
    ("ក្មេង", "ចាស់", "dissimilar", "Young vs Old"),
    ("ខ្ពស់", "ទាប", "dissimilar", "High vs Low")
]

# Report the actual number of pairs defined above
print(f"Testing {len(test_pairs)} pairs...\n")

In [None]:
# Batch process all words
# Gather every distinct word once, in order of first appearance
# (dict keys keep insertion order, so fromkeys de-duplicates stably).
all_words = list(dict.fromkeys(
    word
    for left, right, _, _ in test_pairs
    for word in (left, right)
))

# Encode the whole vocabulary of the test set in a single predict call
sequences = np.array([text_to_sequence(word) for word in all_words])
embeddings = encoder.predict(sequences, verbose=0)

# Pair each word with its row of the encoder output
word_emb = dict(zip(all_words, embeddings))

print(f"Encoded {len(all_words)} unique words")

In [None]:
# Run tests
rule = "=" * 70

# expected label -> (pass predicate on the similarity, threshold text for the report)
criteria = {
    "identical": (lambda s: s > 0.99, ">0.99"),
    "similar": (lambda s: s > 0.7, ">0.70"),
}
# Any label not listed above is judged as a dissimilar pair
fallback_rule = (lambda s: s < 0.5, "<0.50")

print(rule)
print("SEMANTIC SIMILARITY TEST RESULTS")
print(rule)
print()

results = []
for number, (w1, w2, expected, desc) in enumerate(test_pairs, start=1):
    sim = cosine_sim(word_emb[w1], word_emb[w2])

    # Look up the rule for this pair's label and apply it
    check, threshold = criteria.get(expected, fallback_rule)
    passed = check(sim)
    results.append(passed)
    status = "✗ FAIL" if not passed else "✓ PASS"

    # One report entry per pair; the trailing empty item yields the blank separator line
    print(
        f"Test {number}: {desc}",
        f"  '{w1}' vs '{w2}'",
        f"  Expected: {expected.upper()} ({threshold})",
        f"  Similarity: {sim:.4f}",
        f"  Status: {status}",
        "",
        sep="\n",
    )

# Summary
n_total = len(results)
n_passed = sum(results)

print(rule)
print("SUMMARY")
print(rule)
print(f"Total tests: {n_total}")
print(f"Passed: {n_passed}")
print(f"Failed: {n_total - n_passed}")
print(f"Pass rate: {100 * n_passed / n_total:.1f}%")
print()

# Overall verdict: at least 80% of the pairs must meet their threshold
if n_passed / n_total >= 0.8:
    print("✓ Model SUCCESSFULLY distinguishes similar from dissimilar words")
    print("✓ Ready for semantic similarity applications")
else:
    print("! Model needs improvement")

print(rule)