In [None]:
import hashlib
import pandas as pd
from langchain_openai import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from dotenv import load_dotenv

load_dotenv()

True

In [5]:
# Sample warranty statements. The list deliberately contains one verbatim
# repeat and several rewordings so both deduplication passes have work to do.
data = [
    "The warranty covers accidental screen damage and liquid damage.",
    "Accidental damage and water exposure are covered under warranty.",
    "Battery replacements are provided free within one year.",
    "The warranty does not include intentional physical damage.",
    "Intentional damage or modification voids the warranty.",
    "Battery replacements are provided free within one year.",  # verbatim repeat of the third entry
    "Free battery replacement is available during the first year.",  # reworded version of the third entry
    "The warranty includes coverage for accidental screen damage.",
    "Manufacturing defects are always covered under the warranty.",
    "Any manufacturing issue will be covered free of cost under the warranty.",
    "Water and screen damages are part of warranty protection plan."
]

# Single-column frame; the column name "text" is what later cells read from.
df = pd.DataFrame(data, columns=["text"])
print(f"🔹 Original Dataset ({len(df)} records)")
df

🔹 Original Dataset (11 records)


Unnamed: 0,text
0,The warranty covers accidental screen damage a...
1,Accidental damage and water exposure are cover...
2,Battery replacements are provided free within ...
3,The warranty does not include intentional phys...
4,Intentional damage or modification voids the w...
5,Battery replacements are provided free within ...
6,Free battery replacement is available during t...
7,The warranty includes coverage for accidental ...
8,Manufacturing defects are always covered under...
9,Any manufacturing issue will be covered free o...


### Hash-Based Deduplication (Exact Match Filter)
✅ **Use Case:** When ingesting raw documents (PDFs, text files, etc.), where we want to remove exact duplicates (after trimming surrounding whitespace and ignoring letter case) before embedding.

**⚡ Fast and cheap** — best as the first filter.

In [6]:
def hash_deduplicate(texts):
    """
    Drop repeated texts, keeping the first occurrence of each in input order.

    Two texts count as the same when the SHA-256 digests of their stripped,
    lower-cased forms match, so entries that differ only in letter case or
    surrounding whitespace collapse into one. The returned strings are the
    original, un-normalized texts.

    Cheap enough to run before generating embeddings, which saves cost and time.
    """
    # Dicts preserve insertion order, so the values come back in the order in
    # which each distinct digest was first encountered.
    first_by_digest = {}
    for text in texts:
        normalized = text.strip().lower()
        digest = hashlib.sha256(normalized.encode()).hexdigest()
        # setdefault leaves an existing entry alone: the first text wins.
        first_by_digest.setdefault(digest, text)
    return list(first_by_digest.values())

# Run the cheap exact-match pass over the raw text column.
raw_texts = df["text"].tolist()
hash_deduped_texts = hash_deduplicate(raw_texts)

print(f"✅ After hash deduplication: {len(hash_deduped_texts)} unique records")
for text in hash_deduped_texts:
    print(f"- {text}")


✅ After hash deduplication: 10 unique records
- The warranty covers accidental screen damage and liquid damage.
- Accidental damage and water exposure are covered under warranty.
- Battery replacements are provided free within one year.
- The warranty does not include intentional physical damage.
- Intentional damage or modification voids the warranty.
- Free battery replacement is available during the first year.
- The warranty includes coverage for accidental screen damage.
- Manufacturing defects are always covered under the warranty.
- Any manufacturing issue will be covered free of cost under the warranty.
- Water and screen damages are part of warranty protection plan.


### Semantic Deduplication (Cosine Similarity on Embeddings)

✅ **Use Case:** When we want to remove semantically similar text that is reworded but carries the same meaning (e.g., “covered under warranty” vs “warranty includes coverage”).

🧩 **Slightly expensive** (embedding generation) but intelligent — used for final corpus cleanup.

In [8]:
# Initialize embedding model
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed all texts through the batched embed_documents API instead of issuing
# one embed_query request per text. Row i of the resulting array is the
# embedding of hash_deduped_texts[i].
embedded_vectors = np.array(embeddings.embed_documents(hash_deduped_texts))

# Function to find semantic duplicates
def semantic_deduplicate(texts, vectors, threshold=0.85):
    """
    Removes semantically similar sentences based on cosine similarity.

    Texts are visited in order. A text is dropped when its cosine similarity
    to any already-kept text is strictly greater than `threshold`; otherwise
    it is kept. When several kept texts exceed the threshold, the earliest
    kept one is the match that gets reported.

    Args:
        texts: sequence of strings, indexable by position.
        vectors: one embedding per text (2-D array-like, one row per text),
            in the same order as `texts`.
        threshold: 0.85 means remove if similarity > 85%.

    Returns:
        (filtered_texts, removed_pairs) where `filtered_texts` is the list of
        kept texts in input order and `removed_pairs` is a list of dicts with
        keys 'removed_idx', 'kept_idx', 'similarity', 'removed_text' and
        'kept_text', one per dropped text.

    Raises:
        ValueError: if `texts` and `vectors` differ in length.
    """
    if len(texts) != len(vectors):
        raise ValueError(
            f"texts and vectors must have the same length "
            f"(got {len(texts)} texts and {len(vectors)} vectors)"
        )
    if len(texts) == 0:
        return [], []

    # Normalize every row once so cosine similarity reduces to a dot product.
    matrix = np.asarray(vectors, dtype=float)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # An all-zero row would divide by zero; leaving it as zeros gives it a
    # similarity of 0 to everything, so it is never treated as a duplicate.
    norms[norms == 0] = 1.0
    unit = matrix / norms

    keep_indices = []
    removed_pairs = []  # Track what was removed

    for i in range(len(unit)):
        match = None
        if keep_indices:
            # Similarity of the current text to every kept text, in kept order.
            sims = unit[keep_indices] @ unit[i]
            hits = np.flatnonzero(sims > threshold)
            if hits.size:
                match = hits[0]  # earliest kept text above the threshold

        if match is None:
            keep_indices.append(i)
            continue

        kept_idx = keep_indices[match]
        removed_pairs.append({
            'removed_idx': i,
            'kept_idx': kept_idx,
            'similarity': sims[match],
            'removed_text': texts[i],
            'kept_text': texts[kept_idx]
        })

    filtered_texts = [texts[i] for i in keep_indices]
    return filtered_texts, removed_pairs

# Run the embedding-based pass on the texts that survived hash deduplication.
semantic_deduped_texts, removed_pairs = semantic_deduplicate(
    hash_deduped_texts,
    embedded_vectors,
)

print(f"\n✅ After semantic deduplication: {len(semantic_deduped_texts)} records remain")
print(f"📉 Removed {len(removed_pairs)} similar pairs:\n")

# Show each dropped text next to the kept text it was matched against.
for entry in removed_pairs:
    score = entry['similarity']
    print(f"🗑️  Removed (similarity: {score:.4f}):")
    print(f"    '{entry['removed_text']}'")
    print(f"    (kept: '{entry['kept_text']}')\n")

print("📋 Final deduplicated texts:")
for text in semantic_deduped_texts:
    print(f"- {text}")



✅ After semantic deduplication: 9 records remain
📉 Removed 1 similar pairs:

🗑️  Removed (similarity: 0.8894):
    'The warranty includes coverage for accidental screen damage.'
    (kept: 'The warranty covers accidental screen damage and liquid damage.')

📋 Final deduplicated texts:
- The warranty covers accidental screen damage and liquid damage.
- Accidental damage and water exposure are covered under warranty.
- Battery replacements are provided free within one year.
- The warranty does not include intentional physical damage.
- Intentional damage or modification voids the warranty.
- Free battery replacement is available during the first year.
- Manufacturing defects are always covered under the warranty.
- Any manufacturing issue will be covered free of cost under the warranty.
- Water and screen damages are part of warranty protection plan.
