<a href="https://colab.research.google.com/github/agmCorp/colab/blob/main/textSim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from typing import List, Tuple, Dict
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity


def find_near_duplicates_char_ngrams(
    texts: List[str],
    top_k: int = 10,
    similarity_threshold: float = 0.75,
    analyzer: str = "char",
    ngram_range: Tuple[int, int] = (2, 5),
) -> List[Tuple[int, int, float]]:
    """
    Return index pairs of texts that look like near-duplicates.

    Texts are vectorized with TF-IDF over n-grams and compared with cosine
    similarity; for every text only its ``top_k`` nearest neighbours are
    inspected. No manual normalization or equivalence dictionary is applied
    (the vectorizer only lowercases the input).

    Args:
        texts: Strings to compare. Positions in this list are the indices
            used in the result.
        top_k: Maximum number of neighbours (other than the text itself)
            examined per text.
        similarity_threshold: Minimum cosine similarity for a pair to be
            reported. Typical values are 0.72 - 0.85; calibrate on real data.
        analyzer: Forwarded to ``TfidfVectorizer``; intended values are
            ``"char"`` or ``"char_wb"``.
        ngram_range: ``(min_n, max_n)`` n-gram sizes forwarded to
            ``TfidfVectorizer``.

    Returns:
        A list of ``(i, j, similarity)`` tuples with ``i < j``, one entry per
        unordered pair, sorted by similarity in descending order. Empty when
        fewer than two texts are given or ``top_k`` is 0.

    Raises:
        ValueError: If ``top_k`` is negative, or propagated from
            ``TfidfVectorizer`` when the texts yield an empty vocabulary.
    """
    # A pair needs at least two texts; skip the fitting work otherwise.
    if not texts or len(texts) < 2:
        return []
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")
    if top_k == 0:
        return []

    vectorizer = TfidfVectorizer(
        analyzer=analyzer,
        ngram_range=ngram_range,
        min_df=1,
        lowercase=True,  # the default, kept explicit on purpose
    )
    X = vectorizer.fit_transform(texts)

    # Ask for one extra neighbour because each query row is also part of the
    # fitted data and normally shows up among its own neighbours.
    n_neighbors = min(top_k + 1, len(texts))
    nn = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine")
    nn.fit(X)
    distances, indices = nn.kneighbors(X, return_distance=True)

    # Best similarity seen for each unordered pair (a, b) with a < b.
    best: Dict[Tuple[int, int], float] = {}

    for i, (neighbor_ids, neighbor_dists) in enumerate(zip(indices, distances)):
        examined = 0
        for raw_j, dist in zip(neighbor_ids, neighbor_dists):
            j = int(raw_j)
            # Skip the text itself by index rather than by rank: when exact
            # duplicates tie at distance 0 the query row is not guaranteed to
            # be returned at rank 0.
            if j == i:
                continue
            # If the row itself was pushed out of the neighbour list by ties,
            # there are top_k + 1 other rows; honour the top_k limit.
            if examined == top_k:
                break
            examined += 1

            sim = 1.0 - float(dist)  # similarity = 1 - cosine distance
            if sim < similarity_threshold:
                continue

            pair = (i, j) if i < j else (j, i)
            prev = best.get(pair)
            # Compare against None instead of a 0.0 default so that a pair
            # with similarity exactly 0.0 is still recorded when the
            # threshold allows it.
            if prev is None or sim > prev:
                best[pair] = sim

    # Highest similarity first.
    out = [(a, b, s) for (a, b), s in best.items()]
    out.sort(key=lambda item: item[2], reverse=True)
    return out


def print_pairwise_similarities(
    texts: List[str],
    analyzer: str = "char",
    ngram_range: Tuple[int, int] = (2, 5),
) -> None:
    """
    Debug helper: print the cosine similarity of every pair ``i < j``.

    The texts are vectorized with TF-IDF using the given ``analyzer`` and
    ``ngram_range``, and one line ``"i j -> score"`` (three decimals) is
    printed per pair. Useful for choosing a similarity threshold. When fewer
    than two texts are supplied, a short notice is printed instead.
    """
    count = len(texts)
    if count < 2:
        print("Need at least 2 texts.")
        return

    tfidf = TfidfVectorizer(
        lowercase=True,
        min_df=1,
        ngram_range=ngram_range,
        analyzer=analyzer,
    )
    sim_matrix = cosine_similarity(tfidf.fit_transform(texts))

    # Walk the strict upper triangle so each unordered pair appears once.
    upper_pairs = (
        (row, col) for row in range(count) for col in range(row + 1, count)
    )
    for row, col in upper_pairs:
        print(f"{row} {col} -> {sim_matrix[row, col]:.3f}")


if __name__ == "__main__":
    # Five spellings of similar car-model descriptions used as demo input.
    samples = [
        "CRUZE 5 1.4T LTZ PLUS EXTRA FULL 5P. AUT.",
        "CRUZE automatic. 5 1.4T LTZ PLUS EXTRA FULL 5Puertas",
        "CRUZE 1.4T LTZ PLUS EXTRA FULL 4P. AUT. (ARG)",
        "CRUZE 1.4T 4 puertas LTZ PLUS EXTRA FULL AUT. (ARG)",
        "CRUZE 5 1.4T LTZ PLUS EXTRA FULL 5Ptas AT.",
    ]

    # Same vectorizer settings for the debug dump and the actual search,
    # so the printed scores are the ones the search compares to the threshold.
    vectorizer_opts = {"analyzer": "char", "ngram_range": (2, 5)}

    # 1) Optional debug output: inspect raw scores to calibrate the threshold.
    print("Pairwise similarities (debug):")
    print_pairwise_similarities(samples, **vectorizer_opts)
    print()

    # 2) Near-duplicate search.
    hits = find_near_duplicates_char_ngrams(
        samples,
        top_k=5,
        similarity_threshold=0.75,
        **vectorizer_opts,
    )

    if hits:
        print("Matches:")
        for a, b, sim in hits:
            print(f"{sim:.3f} | {samples[a]}  <->  {samples[b]}")
    else:
        print("No matches found. Try lowering similarity_threshold (e.g. 0.70).")


Pairwise similarities (debug):
0 1 -> 0.505
0 2 -> 0.575
0 3 -> 0.405
0 4 -> 0.678
1 2 -> 0.345
1 3 -> 0.400
1 4 -> 0.469
2 3 -> 0.612
2 4 -> 0.406
3 4 -> 0.371

No matches found. Try lowering similarity_threshold (e.g. 0.70).
