In [None]:
# 1️⃣ K-Means seul


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, normalized_mutual_info_score
from scipy.stats import entropy

# Load the dataset and standardize it with StandardScaler.
# NOTE(review): every column of the CSV is scaled and later clustered,
# including any target/label column the file may contain — confirm intended.
df = pd.read_csv('/content/drive/MyDrive/project_dm/diabetes.csv')
scaler = StandardScaler()
scaler.fit(df)
data = scaler.transform(df)

# Model selection: scan the candidate cluster counts once, recording both the
# elbow criterion (WCSS / inertia) and the silhouette score for every k.
k_range = range(2, 15)
wcss = []
silhouette_scores = []
best_k = k_range[0]
best_score = -1

for k in k_range:
    # The fixed random_state makes each fit reproducible, so a single fit per
    # k provides both the inertia and the labels used by the silhouette score
    # (no need to refit the same model in a second loop).
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(data)
    wcss.append(kmeans.inertia_)

    score = silhouette_score(data, labels)
    silhouette_scores.append(score)

    # Strict ">" keeps the smallest k when several k tie for the best score.
    if score > best_score:
        best_score = score
        best_k = k

# Elbow plot: WCSS as a function of k
plt.figure(figsize=(8, 5))
plt.plot(k_range, wcss, marker='o', linestyle='-')
plt.xlabel('Nombre de clusters (k)')
plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.title('Méthode du coude pour déterminer k optimal')
plt.show()

# Report the k that maximizes the silhouette score
print(f"📌 Le nombre optimal de clusters selon le Silhouette Score est: {best_k}")

# Silhouette plot: score as a function of k (same k axis as the elbow plot)
plt.figure(figsize=(8, 5))
plt.plot(k_range, silhouette_scores, marker='o', linestyle='-')
plt.xlabel('Nombre de clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score pour différents k')
plt.show()

# 📌 4️⃣ Appliquer K-Means avec le k optimal
# Fit the final model with the selected k and attach each sample's cluster
# id to the DataFrame.
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10).fit(data)
kmeans_labels = kmeans.labels_
df['Cluster_KMeans'] = kmeans_labels

# Evaluation metrics for the final clustering: compute all scores first,
# then report them in a fixed order.

# Silhouette score of the final labeling
silhouette_avg = silhouette_score(data, kmeans_labels)

# Davies-Bouldin Index (DBI)
dbi = davies_bouldin_score(data, kmeans_labels)

# Normalized Mutual Information (NMI) of the K-Means labeling against itself.
# NOTE(review): comparing a labeling with itself gives 1.0 by construction,
# so this is only a sanity check; a meaningful NMI needs an external
# reference labeling.
nmi_score = normalized_mutual_info_score(kmeans_labels, kmeans_labels)

print(f"📌 Silhouette Score: {silhouette_avg:.4f}")
print(f"📌 Davies-Bouldin Index: {dbi:.4f}")
print(f"📌 Normalized Mutual Information (NMI): {nmi_score:.4f}")

#  Calcul de l'entropie des clusters
def calculate_entropy(labels) -> float:
    """Return the Shannon entropy (in bits) of a cluster-label assignment.

    The entropy is taken over the empirical label distribution, i.e. the
    fraction of samples carrying each distinct label. It is 0 when every
    sample has the same label and log2(k) when k labels are equally frequent.

    Args:
        labels: Array-like of cluster labels, one entry per sample.

    Returns:
        The entropy in bits; 0.0 when ``labels`` is empty.
    """
    # Only the cluster sizes matter here, not the label values themselves.
    _, counts = np.unique(labels, return_counts=True)
    if counts.size == 0:
        # No samples: skip the 0/0 normalization and report zero entropy.
        return 0.0
    return entropy(counts / counts.sum(), base=2)

# Entropy of the K-Means label distribution (how evenly samples are spread
# across the clusters), printed with four decimals.
entropy_score = calculate_entropy(labels=kmeans_labels)
print(f"📌 Entropie des clusters: {entropy_score:.4f}")

# 2-D view of the clusters: project the scaled data onto its first two
# principal components and color each point by its K-Means label.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(data)

fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(data_pca[:, 0], data_pca[:, 1], c=kmeans_labels, cmap='viridis', alpha=0.6)
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title(f'Clusters K-Means avec k={best_k} (2D)')
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()

# 3-D view of the clusters: same idea as the 2-D plot, using the first three
# principal components.
pca_3d = PCA(n_components=3)
data_pca_3d = pca_3d.fit_transform(data)

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(1, 1, 1, projection='3d')
# One row per principal component after transposing the (n_samples, 3) array.
pc1, pc2, pc3 = data_pca_3d.T
ax.scatter(pc1, pc2, pc3, c=kmeans_labels, cmap='viridis', alpha=0.6)
ax.set(
    xlabel='PC1',
    ylabel='PC2',
    zlabel='PC3',
    title=f'Clusters K-Means avec k={best_k} (3D)',
)
plt.show()

