In [1]:
# !uv pip install git+https://github.com/willccbb/verifiers.git

In [2]:
import os
from speedy_utils import *
from llm_utils import *

In [3]:
# Client for the endpoint served at http://localhost:7999/v1.
# NOTE(review): `MOpenAI` is not imported by name in the visible cells — presumably
# it comes from one of the star imports above; `client` is not used again in the
# visible cells.
client = MOpenAI(base_url='http://localhost:7999/v1')

In [4]:
# Handle bound to local port 7999; the cell output reports which model was picked.
# NOTE(review): `lm` is not used again in the visible cells.
lm = AsyncLM(port=7999)

Using model: Qwen/Qwen3-235B-A22B-Instruct-2507-FP8


In [5]:
# Embedding service at http://localhost:8000/v1. The explicit model_name is left
# commented out; the cell output shows the model name being inferred instead.
vector_service = VectorCache(url_or_model='http://localhost:8000/v1')#, model_name='poly-embed-0.6b-250904')

Auto-detected backend: openai
Infer model name: poly-embed-0.6b-250904


In [15]:
# Load the example pairs; the frame displayed later shows `source` and `target` columns.
# NOTE(review): absolute, user-specific path — this cell only runs on a machine with
# the same directory layout. `pd` is not imported by name in the visible cells;
# presumably it is provided by one of the star imports — verify.
df = pd.read_csv('/home/anhvth5/projects/TRANSLATE_UI/assets/LC_STANDARD/GS3_TN_ZH_VI/examples.csv')

In [16]:
# Embed every `source` string, in row order, so position i in `embeds` matches df row i.
# `embeds` is later passed to filter_duplicated with its defaults (cosine metric,
# assume_normalized=True), which expects a 2-D [N, D] array of unit-norm rows —
# TODO confirm that vector_service returns exactly that.
embeds = vector_service(df.source.tolist())

Retrieved 256482 embeddings in 3.73 seconds


In [25]:
from __future__ import annotations

from typing import List, Literal
from collections import defaultdict
import numpy as np
import faiss


class DisjointSet:
    """Union-find over the integers ``0 .. n-1`` with union by size.

    ``parent[i]`` is the parent of element ``i`` (a root is its own parent) and
    ``size[r]`` is the number of elements below root ``r``; ``size`` is only
    meaningful at roots.
    """

    def __init__(self, n: int) -> None:
        # Every element starts out as the root of its own singleton set.
        self.size = np.ones(n, dtype=np.int64)
        self.parent = np.arange(n, dtype=np.int64)

    def find(self, x: int) -> int:
        """Return the root of the set containing ``x``, shortening the path walked."""
        parent = self.parent
        while True:
            above = parent[x]
            if above == x:
                return x
            # Point x at its grandparent, then continue the walk from there.
            parent[x] = parent[above]
            x = parent[x]

    def union(self, a: int, b: int) -> None:
        """Merge the sets containing ``a`` and ``b``; no-op if already merged."""
        root_a = self.find(a)
        root_b = self.find(b)
        if root_a != root_b:
            # The larger set keeps its root; on a tie the root of `a` wins.
            if self.size[root_a] >= self.size[root_b]:
                keep, absorb = root_a, root_b
            else:
                keep, absorb = root_b, root_a
            self.parent[absorb] = keep
            self.size[keep] += self.size[absorb]


def filter_duplicated(
    matrix: np.ndarray,
    eps: float = 0.08,
    *,
    metric: Literal['cosine', 'l2'] = 'cosine',
    assume_normalized: bool = True,
    k_neighbors: int = 16,
    batch_size: int = 8192,
    hnsw_m: int = 16,
    ef_search: int = 64,
) -> List[List[int]]:
    """
    Group vectors that are 'too close' (within eps) and return index groups.

    Uses HNSW k-NN (batched) + union-find to avoid O(N^2) memory/time.
    Two rows end up in the same group when they are connected by a chain of
    neighbour links, each within ``eps``; members of one group are therefore
    not guaranteed to be pairwise within ``eps``. Only the ``k`` neighbours
    returned by the (approximate) HNSW search are examined per row.

    Args:
        matrix: array-like of shape [N, D]. It is never modified.
        eps: distance threshold. For 'cosine' it is a cosine distance
            (1 - cosine similarity); for 'l2' it is a Euclidean distance.
        metric: 'cosine' or 'l2'.
        assume_normalized: only used for 'cosine'. When True the rows are
            trusted to be unit-norm; when False a normalised copy is made.
        k_neighbors: neighbours fetched per row (at least 2 are always used,
            because a row's nearest neighbour is normally itself).
        batch_size: number of query rows per search call.
        hnsw_m: HNSW graph connectivity passed to ``faiss.IndexHNSWFlat``.
        ef_search: HNSW ``efSearch`` setting used for the queries.

    Returns:
        A partition of ``range(N)``: each group is an ascending list of row
        indices and the groups are ordered by their smallest index.

    Raises:
        ValueError: if ``metric`` is not recognised, ``batch_size`` is not
            positive, or ``matrix`` is not 2-D.
    """
    if metric not in ('cosine', 'l2'):
        # Previously any other string silently took the L2 branch.
        raise ValueError("metric must be 'cosine' or 'l2'")
    if batch_size < 1:
        # A negative step would make the search loop below a silent no-op.
        raise ValueError('batch_size must be a positive integer')

    source = np.asarray(matrix)
    if source.ndim != 2:
        raise ValueError('matrix must be 2D [N, D]')

    n, d = source.shape
    if n == 0:
        return []

    xb = np.asarray(source, dtype=np.float32, order='C')

    if metric == 'cosine' and not assume_normalized:
        # normalize_L2 works in place, and asarray hands back the caller's own
        # buffer when it is already float32/C-contiguous: copy before touching it.
        if np.may_share_memory(xb, source):
            xb = xb.copy()
        faiss.normalize_L2(xb)

    index = faiss.IndexHNSWFlat(d, hnsw_m)
    index.hnsw.efSearch = ef_search
    index.add(xb)

    # The index reports squared L2 distances. For unit vectors
    # 0.5 * ||u - v||^2 equals the cosine distance; for 'l2' compare with eps^2.
    if metric == 'cosine':
        scale, limit = 0.5, eps
    else:
        scale, limit = 1.0, eps * eps

    dsu = DisjointSet(n)
    k = max(2, k_neighbors)

    for start in range(0, n, batch_size):
        stop = min(n, start + batch_size)
        dist2, nbrs = index.search(xb[start:stop], k)

        # Select the linking pairs with one vectorised mask so the Python-level
        # loop only visits pairs that actually need a union.
        query_ids = np.arange(start, stop, dtype=nbrs.dtype)[:, None]
        linked = (nbrs >= 0) & (nbrs != query_ids)  # drop padding (-1) and self-hits
        linked &= scale * dist2.astype(np.float64) <= limit

        rows, _ = np.nonzero(linked)
        for i, j in zip((rows + start).tolist(), nbrs[linked].tolist()):
            dsu.union(i, j)

    comp_to_members: dict[int, List[int]] = defaultdict(list)
    for i in range(n):
        comp_to_members[int(dsu.find(i))].append(i)

    # Rows are visited in ascending order, so each group is already sorted and
    # the dict's insertion order already ranks groups by their smallest member.
    return list(comp_to_members.values())


In [21]:
# NOTE(review): this is the same call as the `list_gs = ...` cell that follows, and
# `dedup_ids` is not used again in the visible cells — looks like a leftover from
# an earlier run (its execution count also predates the function-definition cell).
dedup_ids = filter_duplicated(embeds, 0.08)

In [26]:
# Group near-duplicate rows: each entry of list_gs is a list of row positions whose
# embeddings were linked at eps=0.08 under the default cosine metric.
list_gs = filter_duplicated(embeds, 0.08)

In [40]:
# Largest groups first; sorted() already returns a new list and is stable,
# so groups of equal size keep their previous relative order.
list_gs = sorted(list_gs, key=len, reverse=True)

In [69]:
# Number of groups divided by number of rows, i.e. the fraction of the data that
# would remain if exactly one row were kept per group.
len(list_gs)/len(df)

0.7702996701522914

In [66]:
# Pick one group to inspect by eye. NOTE(review): 1000 appears to be an arbitrary
# position in the size-sorted list — confirm there is no special meaning.
g = list_gs[1000]

In [67]:
# Show the rows of the selected group; g holds positional row indices, hence iloc.
df.iloc[g]

Unnamed: 0,source,target
17020,誓约之礼-椅子1,Lễ Thề Ước-Ghế 1
17021,誓约之礼-椅子2,Lễ Thề Ước-Ghế 2
17022,誓约之礼-椅子3,Lễ Thề Ước-Ghế 3
17023,誓约之礼-椅子4,Lễ Thề Ước-Ghế 4
17024,誓约之礼-椅子5,Lễ Thề Ước-Ghế 5
17025,誓约之礼-椅子6,Lễ Thề Ước-Ghế 6
17026,誓约之礼-椅子7,Lễ Thề Ước-Ghế 7
17027,誓约之礼-椅子8,Lễ Thề Ước-Ghế 8
