In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [3]:
data = pd.read_csv('Wine_clust.csv')
X = data[['Alcohol','Malic_Acid','Ash','Ash_Alcanity','Magnesium','Total_Phenols','Flavanoids','Nonflavanoid_Phenols','Proanthocyanins','Color_Intensity','Hue','OD280','Proline']]

# Standardize the features to have mean=0 and standard deviation=1
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [4]:
def find_optimal_clusters(model, data, range_n_clusters):
    best_score = -1
    optimal_n_clusters = None

    for n_clusters in range_n_clusters:
        cluster_model = model(n_clusters=n_clusters, random_state=42)
        cluster_labels = cluster_model.fit_predict(data)
        
        # Silhouette score (higher is better)
        silhouette_avg = silhouette_score(data, cluster_labels)
        
        # Calinski-Harabasz score (higher is better)
        calinski_score = calinski_harabasz_score(data, cluster_labels)
        
        # Davies-Bouldin score (lower is better)
        davies_bouldin_score_avg = davies_bouldin_score(data, cluster_labels)
        
        # You can also try other metrics to evaluate the clustering performance
        
        # Taking the average of the scores here, but you can use any other criterion.
        score_avg = (silhouette_avg + calinski_score + (1 / davies_bouldin_score_avg)) / 3
        
        if score_avg > best_score:
            best_score = score_avg
            optimal_n_clusters = n_clusters

    return optimal_n_clusters


In [8]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler

# ... (previous code)

def find_optimal_clusters(model, data, range_n_clusters):
    optimal_n_clusters = None

    for n_clusters in range_n_clusters:
        cluster_model = model(n_clusters=n_clusters)
        cluster_labels = cluster_model.fit_predict(data)

        # Silhouette score (higher is better)
        silhouette_avg = silhouette_score(data, cluster_labels)
        print(f"For n_clusters={n_clusters}, the silhouette score is {silhouette_avg}")

        if optimal_n_clusters is None or silhouette_avg > optimal_silhouette:
            optimal_n_clusters = n_clusters
            optimal_silhouette = silhouette_avg

    return optimal_n_clusters

# ... (previous code)

# Usage example for KMeans
optimal_kmeans_clusters = find_optimal_clusters(KMeans, X_scaled, range_n_clusters)
kmeans_model = KMeans(n_clusters=optimal_kmeans_clusters)
kmeans_labels = kmeans_model.fit_predict(X_scaled)

# Usage example for Agglomerative Clustering
optimal_agg_clusters = find_optimal_clusters(AgglomerativeClustering, X_scaled, range_n_clusters)
agg_model = AgglomerativeClustering(n_clusters=optimal_agg_clusters)
agg_labels = agg_model.fit_predict(X_scaled)


For n_clusters=2, the silhouette score is 0.26831340971052126
For n_clusters=3, the silhouette score is 0.2848589191898987
For n_clusters=4, the silhouette score is 0.2614352045273167
For n_clusters=5, the silhouette score is 0.25481042740970417
For n_clusters=6, the silhouette score is 0.1954279440862444
For n_clusters=7, the silhouette score is 0.19719778519132336
For n_clusters=8, the silhouette score is 0.14370559736785465
For n_clusters=9, the silhouette score is 0.15632097372774345
For n_clusters=10, the silhouette score is 0.14639725203312745
For n_clusters=2, the silhouette score is 0.2670131771272231
For n_clusters=3, the silhouette score is 0.2774439826952265
For n_clusters=4, the silhouette score is 0.225836659334758
For n_clusters=5, the silhouette score is 0.18674235566758707
For n_clusters=6, the silhouette score is 0.17966642854438503
For n_clusters=7, the silhouette score is 0.18685342560226942
For n_clusters=8, the silhouette score is 0.18834697102837825
For n_clusters

In [9]:
# Range of cluster numbers to explore
range_n_clusters = range(2, 11)

# K-means clustering
optimal_kmeans_clusters = find_optimal_clusters(KMeans, X_scaled, range_n_clusters)
kmeans_model = KMeans(n_clusters=optimal_kmeans_clusters, random_state=42)
kmeans_labels = kmeans_model.fit_predict(X_scaled)

# Agglomerative Hierarchical clustering
optimal_agg_clusters = find_optimal_clusters(AgglomerativeClustering, X_scaled, range_n_clusters)
agg_model = AgglomerativeClustering(n_clusters=optimal_agg_clusters)
agg_labels = agg_model.fit_predict(X_scaled)


For n_clusters=2, the silhouette score is 0.26831340971052126
For n_clusters=3, the silhouette score is 0.2848589191898987
For n_clusters=4, the silhouette score is 0.2480025291433857
For n_clusters=5, the silhouette score is 0.23517877055946282
For n_clusters=6, the silhouette score is 0.20710628876426948
For n_clusters=7, the silhouette score is 0.15664977680792785
For n_clusters=8, the silhouette score is 0.1989058923144989
For n_clusters=9, the silhouette score is 0.15213880073126265
For n_clusters=10, the silhouette score is 0.1378580956992455
For n_clusters=2, the silhouette score is 0.2670131771272231
For n_clusters=3, the silhouette score is 0.2774439826952265
For n_clusters=4, the silhouette score is 0.225836659334758
For n_clusters=5, the silhouette score is 0.18674235566758707
For n_clusters=6, the silhouette score is 0.17966642854438503
For n_clusters=7, the silhouette score is 0.18685342560226942
For n_clusters=8, the silhouette score is 0.18834697102837825
For n_clusters=

In [10]:
# DBSCAN clustering (no need to specify the number of clusters)
dbscan_model = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan_model.fit_predict(X_scaled)