In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load the Seeds dataset.
# Column names are supplied explicitly via `names=`, so pandas treats every
# line of the file as data. The final column, "class", holds the actual class
# labels and is only kept for reference.
file_path = "seeds_dataset.txt"  # Update this if using a different path
column_names = [
    "area", "perimeter", "compactness", "length_of_kernel", "width_of_kernel",
    "asymmetry_coefficient", "length_of_kernel_groove", "class"
]

# Read the dataset, splitting on any run of whitespace.
# The separator is a raw string so the regex backslash reaches pandas verbatim;
# a plain "\s+" is an invalid string escape and triggers a warning on recent
# Python versions.
seeds_df = pd.read_csv(file_path, sep=r"\s+", names=column_names)

# Drop the actual class labels for unsupervised learning
X = seeds_df.drop(columns="class")

# Standardize the features before clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply KMeans clustering (3 clusters, fixed seed for repeatable runs)
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans_labels = kmeans.fit_predict(X_scaled)

# Add the cluster labels to the original dataframe.
# X was derived above, so this new column is not part of the feature matrix.
seeds_df["Cluster"] = kmeans_labels

# Evaluate the clustering with silhouette score on the standardized features
sil_score = silhouette_score(X_scaled, kmeans_labels)

# Output
print("Silhouette Score:", sil_score)
print(seeds_df.head())


Silhouette Score: 0.4024370101867174
    area  perimeter  compactness  length_of_kernel  width_of_kernel  \
0  15.26      14.84       0.8710             5.763            3.312   
1  14.88      14.57       0.8811             5.554            3.333   
2  14.29      14.09       0.9050             5.291            3.337   
3  13.84      13.94       0.8955             5.324            3.379   
4  16.14      14.99       0.9034             5.658            3.562   

   asymmetry_coefficient  length_of_kernel_groove  class  Cluster  
0                  2.221                    5.220      1        2  
1                  1.018                    4.956      1        2  
2                  2.699                    4.825      1        2  
3                  2.259                    4.805      1        2  
4                  1.355                    5.175      1        2  


In [2]:
from sklearn.cluster import AgglomerativeClustering, MeanShift, estimate_bandwidth

# Two further clusterings of the standardized features (X_scaled), each
# handled start to finish: fit, store labels on the dataframe, score.

# --- Hierarchical (agglomerative) clustering with 3 clusters ---
hierarchical = AgglomerativeClustering(n_clusters=3)
hierarchical_labels = hierarchical.fit_predict(X_scaled)
seeds_df["Hierarchical_Cluster"] = hierarchical_labels
sil_score_hierarchical = silhouette_score(X_scaled, hierarchical_labels)

# --- MeanShift clustering ---
# No cluster count is passed here; the bandwidth is estimated from the data
# (quantile 0.2, using 100 samples) and handed to the model.
bandwidth = estimate_bandwidth(X_scaled, quantile=0.2, n_samples=100)
meanshift = MeanShift(bandwidth=bandwidth, bin_seeding=True)
meanshift_labels = meanshift.fit_predict(X_scaled)
seeds_df["MeanShift_Cluster"] = meanshift_labels
sil_score_meanshift = silhouette_score(X_scaled, meanshift_labels)

# --- Report: scores, then the first rows of all three label columns ---
print("Silhouette Score (Hierarchical):", sil_score_hierarchical)
print("Silhouette Score (MeanShift):", sil_score_meanshift)
print(seeds_df.head()[["Cluster", "Hierarchical_Cluster", "MeanShift_Cluster"]])


Silhouette Score (Hierarchical): 0.3926339709101015
Silhouette Score (MeanShift): 0.4674286233538442
   Cluster  Hierarchical_Cluster  MeanShift_Cluster
0        2                     0                  0
1        2                     0                  0
2        2                     0                  0
3        2                     0                  0
4        2                     0                  0


In [3]:
# Report how many rows the dataframe contains
print("Number of rows:", len(seeds_df))


Number of rows: 210
