In [1]:
# Question: Advanced Deduplication Using Machine Learning
# Description: Implement ML-based deduplication based on feature similarity.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Sample dataset with possible duplicates
# Five records, each an integer id paired with a person name; several names
# are spelling/format variants of one another.
data = {
    'id': list(range(1, 6)),
    'name': ['John Doe', 'Jon Doe', 'Jane Smith', 'Jane A. Smith', 'J. Smith'],
}

# One row per record, with columns 'id' and 'name'.
df = pd.DataFrame.from_dict(data)

# Step 2: Compute TF-IDF vectors for 'name' column
# NOTE: despite its name, `vectorizer` holds the TF-IDF matrix returned by
# fit_transform(), not the TfidfVectorizer object itself. The name is kept
# so that any later code referring to it keeps working.
vectorizer = TfidfVectorizer().fit_transform(df['name'])
# Square matrix: entry [i, j] is the cosine similarity of rows i and j.
similarity_matrix = cosine_similarity(vectorizer)

# Step 3: Mark duplicate pairs based on similarity threshold
threshold = 0.85
duplicates = set()

# Extract the ids once as plain Python ints. This avoids building a row
# Series per comparison and keeps NumPy scalar types out of the result
# tuples (NumPy 2 would print those as `np.int64(3)` instead of `3`).
record_ids = df['id'].tolist()

# Only positions with j > i are visited, so each unordered pair is reported
# once and a record is never compared with itself.
for i, first_id in enumerate(record_ids):
    for j in range(i + 1, len(record_ids)):
        if similarity_matrix[i, j] > threshold:
            duplicates.add((first_id, record_ids[j]))

# Step 4: Show results
# The header is derived from `threshold` so it cannot drift from the value
# actually used above; pairs are sorted for a deterministic report.
print(f"Potential duplicate ID pairs (similarity > {threshold}):")
for pair in sorted(duplicates):
    print(pair)



Potential duplicate ID pairs (similarity > 0.85):
(3, 4)
