# In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

# Step 1: Simulate duplicate employee records
# The names are deliberate near-duplicates (e.g. 'Alice'/'Alicia') so the
# clustering below has something to merge.
data = {
    'Name': ['Alice', 'Alicia', 'Bob', 'Robert', 'Rob', 'Charlie', 'Charli', 'David', 'Dave', 'Davi'],
    'Department': ['HR', 'HR', 'Finance', 'Finance', 'Finance', 'IT', 'IT', 'Sales', 'Sales', 'Sales'],
    'Age': [30, 31, 40, 39, 41, 35, 34, 28, 28, 27]
}
df = pd.DataFrame(data)

# Encode categorical features
# One-hot encoding 'Name' would give every distinct spelling its own column, so
# any two records would differ in exactly two name columns and the name would
# add the same distance to every pair -- no signal for spotting duplicates.
# Character-bigram counts let similar spellings share features instead.
name_bigrams = (
    df['Name']
    .fillna('')
    .str.lower()
    .apply(lambda name: [name[i:i + 2] for i in range(len(name) - 1)])
    .explode()  # one row per bigram, original row label repeated in the index
)
name_features = (
    pd.get_dummies(name_bigrams, prefix='Name', dtype=int)
    .groupby(level=0, sort=False)  # fold the bigram rows back into one row per record
    .sum()
    .reindex(df.index, fill_value=0)
)
# 'Department' holds a few exact labels, so plain one-hot encoding fits it.
department_features = pd.get_dummies(df['Department'], prefix='Department', dtype=int)
df_encoded = pd.concat([df[['Age']], name_features, department_features], axis=1)

# Step 2: Normalize data
# Fit the scaler on the encoded frame, then transform that same frame so all
# feature columns are on a comparable scale before distances are computed.
scaler = StandardScaler()
scaler.fit(df_encoded)
X_scaled = scaler.transform(df_encoded)

# Step 3: Dendrogram to visualize clusters
# Build the Ward linkage tree and draw it with each leaf labelled by the
# record's name.
linked = linkage(X_scaled, 'ward')
leaf_names = df['Name'].values
plt.figure(figsize=(10, 6))
dendrogram(
    linked,
    labels=leaf_names,
    distance_sort='descending',
    show_leaf_counts=True,
)
plt.title("Dendrogram for Employee Records")
plt.show()

# Step 4: Apply Agglomerative Clustering
# Fit the model, then attach the resulting label of each record to the frame.
clustering = AgglomerativeClustering(n_clusters=5)  # You can tune this value
clustering.fit(X_scaled)
df['cluster'] = clustering.labels_

# Step 5: Identify representative records per cluster (first entry per cluster)
# Keep only the rows whose cluster label has not appeared earlier in the frame.
is_first_in_cluster = ~df['cluster'].duplicated(keep='first')
dedup_df = df[is_first_in_cluster]

# Show output
# Print every record with its cluster label, then the one representative kept
# per cluster.
record_columns = ['Name', 'Department', 'Age']
print("Original Records with Cluster Labels:")
print(df[record_columns + ['cluster']])
print("\nDeduplicated Records:")
print(dedup_df[record_columns])