# In [1]:
# Question: Advanced Deduplication Using Machine Learning
# Description: Implement ML-based deduplication based on feature similarity.
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Sample dataset with some potential duplicates
data = {
    'Name': ['John Doe', 'Jane Smith', 'John Doe', 'Alice Johnson', 'Jane Smith'],
    'Age': [28, 34, 28, 45, 34],
    'City': ['New York', 'Los Angeles', 'New York', 'Chicago', 'Los Angeles']
}

# Create a DataFrame
df = pd.DataFrame(data)

# Maximum Euclidean distance, measured in standardized feature space, at which
# two rows are still considered the same record. After standardization a
# mismatch in any one-hot column contributes a distance of at least 2 (a binary
# column's population std is at most 0.5), so with this value rows are merged
# only when all categorical fields match and 'Age' is nearly identical.
DUPLICATE_EPS = 0.5

# Step 1: Feature Engineering
# One-hot encode every categorical column. 'Name' is encoded as well (rather
# than dropped) so that two different people who happen to share an age and a
# city are not mistaken for duplicates of each other.
df_encoded = pd.get_dummies(df, columns=['Name', 'City'])

# Step 2: Normalize the data to avoid scale-based bias
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_encoded)

# Step 3: Group similar rows with DBSCAN
# KMeans requires the number of groups up front and always produces exactly
# that many, so a fixed n_clusters caps the result at that many rows and
# silently discards unique records. DBSCAN instead links rows that lie within
# DUPLICATE_EPS of each other, so the number of groups follows from the data.
# min_samples=1 makes every row a core point: a row with no near neighbour
# becomes its own single-row cluster instead of being labelled as noise (-1).
clusterer = DBSCAN(eps=DUPLICATE_EPS, min_samples=1)
df['Cluster'] = clusterer.fit_predict(df_scaled)

# Step 4: Identify duplicates based on clusters
# For each cluster, keep the first occurrence and remove the rest
duplicates_removed = df.drop_duplicates(subset='Cluster', keep='first')

# Display the deduplicated DataFrame
print(duplicates_removed[['Name', 'Age', 'City']])




# Output:
#             Name  Age         City
# 0       John Doe   28     New York
# 1     Jane Smith   34  Los Angeles
# 3  Alice Johnson   45      Chicago
