Exact Duplicate Detection (Simple)

In [1]:
# Sample pair of texts to compare
text1 = "The sky is blue."
text2 = "The sky is blue."

# Normalize once up front: surrounding whitespace is trimmed and case is
# folded to lowercase, so the "exact" comparison below ignores both.
normalized_text1 = text1.strip().lower()
normalized_text2 = text2.strip().lower()

# Report whether the normalized texts are identical
print(
    "Duplicate detected (Exact match)."
    if normalized_text1 == normalized_text2
    else "Not duplicate (Exact match)."
)


Duplicate detected (Exact match).


Duplicate Detection using Cosine Similarity + TF-IDF

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Example texts
texts = [
    "The sky is blue and clear today.",
    "Today the sky looks blue and clear.",
    "It might rain later today."
]

# TF-IDF Vectorization (default vectorizer settings): one row per text
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)

# Calculate Cosine Similarity between every pair of rows; entry [i, j] is the
# similarity of texts[i] and texts[j]
cosine_sim = cosine_similarity(tfidf_matrix)

# Optional: View the Cosine Similarity matrix
# The same 1-based labels are used for both rows and columns, so build them once.
text_labels = [f"Text {i}" for i in range(1, len(texts) + 1)]
print("\nCosine Similarity Matrix:")
cosine_sim_df = pd.DataFrame(cosine_sim, index=text_labels, columns=text_labels)
print(cosine_sim_df)

# Threshold for considering duplicates
threshold = 0.7

# The header is derived from `threshold` so it cannot drift from the value
# actually used in the comparison below.
print(f"\nPotential Duplicate Pairs (Cosine Similarity >= {threshold}):")
found = False
for i in range(len(texts)):
    # Start j at i + 1: skips self-comparison and visits each unordered pair once
    for j in range(i + 1, len(texts)):
        if cosine_sim[i, j] >= threshold:
            print(f"\n- Text {i+1}: {texts[i]}")
            print(f"- Text {j+1}: {texts[j]}")
            print(f"Similarity: {cosine_sim[i,j]:.2f}")
            found = True

if not found:
    print("No duplicates found above the threshold.")



Cosine Similarity Matrix:
          Text 1    Text 2    Text 3
Text 1  1.000000  0.764197  0.081227
Text 2  0.764197  1.000000  0.081227
Text 3  0.081227  0.081227  1.000000

Potential Duplicate Pairs (Cosine Similarity >= 0.7):

- Text 1: The sky is blue and clear today.
- Text 2: Today the sky looks blue and clear.
Similarity: 0.76


Duplicate Detection using Jaccard Similarity

In [4]:
def jaccard_similarity(list1, list2) -> float:
    """Return the Jaccard similarity of two collections of hashable items.

    Both inputs are converted to sets, so duplicate items and ordering are
    ignored. The result is ``len(intersection) / len(union)``.

    Args:
        list1: First iterable of hashable items (e.g. a list of tokens).
        list2: Second iterable of hashable items.

    Returns:
        A float in ``[0.0, 1.0]``. When both inputs are empty the union is
        empty and the ratio is undefined; 1.0 is returned in that case,
        treating two empty collections as identical instead of raising
        ``ZeroDivisionError``.
    """
    set1 = set(list1)
    set2 = set(list2)
    union = set1 | set2
    if not union:
        # Both inputs empty: avoid dividing by zero.
        return 1.0
    return len(set1 & set2) / len(union)

# Lowercase each sentence and split it on whitespace to get its tokens
tokens1, tokens2 = (
    sentence.lower().split()
    for sentence in (
        "The sky is blue and clear today",
        "Today the sky looks blue and clear",
    )
)

# Score the token overlap of the two sentences
similarity = jaccard_similarity(tokens1, tokens2)
print(f"Jaccard Similarity: {similarity:.2f}")

# A score strictly above 0.5 is reported as a likely duplicate
print(
    "Likely duplicate (Jaccard)."
    if similarity > 0.5
    else "Not duplicate (Jaccard)."
)


Jaccard Similarity: 0.75
Likely duplicate (Jaccard).
