# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

# Demo: flag likely duplicate event registrations by clustering the encoded
# records with DBSCAN, then keep one representative per cluster.

# Step 1: Sample dataset simulating duplicate event registrations
data = {
    'Name': ['Amit', 'Ameet', 'Amith', 'Sonia', 'Sonya', 'Raj', 'Raju', 'Raghav', 'Ragav', 'Amit'],
    'Age': [25, 25, 26, 30, 30, 28, 29, 35, 35, 25],
    'Event': ['Hackathon', 'Hackathon', 'Hackathon', 'Workshop', 'Workshop', 'Meetup', 'Meetup', 'Seminar', 'Seminar', 'Hackathon']
}
df = pd.DataFrame(data)

# Step 2: Encode categorical values
# NOTE: one-hot encoding gives every distinct spelling its own column, so
# similar spellings ('Amit' / 'Ameet') get no credit for being similar; on the
# name features they are exactly as far apart as unrelated names. Catching
# near-duplicate spellings would need a string-similarity distance instead.
# dtype=float keeps the indicator columns numeric rather than boolean, so the
# frame handed to the scaler is purely numeric.
df_encoded = pd.get_dummies(df, columns=['Name', 'Event'], dtype=float)

# Normalize so Age and the indicator columns contribute on a comparable scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_encoded)

# Step 3: Apply DBSCAN
dbscan = DBSCAN(eps=1.5, min_samples=2)  # Tune eps as needed
df['cluster'] = dbscan.fit_predict(X_scaled)

# Step 4: Identify duplicates: entries in the same cluster
# DBSCAN labels points that belong to no cluster as -1 (noise); those are the
# records with no detected duplicate.
is_noise = df['cluster'] == -1
duplicates = df[~is_noise]

# Deduplicated set = every unclustered record plus the first record of each
# cluster. Building this from `duplicates` alone would silently discard all
# unique registrations. Row order of the original frame is preserved.
is_repeat_in_cluster = df.duplicated(subset='cluster', keep='first')
dedup_df = df[is_noise | ~is_repeat_in_cluster]

# Show results
print("Original Records with Cluster Labels:")
print(df[['Name', 'Age', 'Event', 'cluster']])
print("\nDetected Duplicates:")
print(duplicates[['Name', 'Age', 'Event', 'cluster']])
print("\nDeduplicated Records (unique per cluster):")
print(dedup_df[['Name', 'Age', 'Event']])

# Optional: Visualize clusters (colour = cluster label, -1 = noise)
plt.scatter(df.index, df['Age'], c=df['cluster'], cmap='rainbow')
plt.title("DBSCAN Clustering for Deduplication")
plt.xlabel("Index")
plt.ylabel("Age")
plt.grid(True)
plt.show()