In [None]:
# 2️⃣ K-Means amélioré par DBSCAN
#
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, normalized_mutual_info_score

# Load the dataset and standardize it. Every column of the CSV is passed to
# the scaler, so every column takes part in the clustering below.
# NOTE(review): if the CSV holds a target/label column, it is scaled and
# clustered on as well -- confirm that this is intended.
df = pd.read_csv('/content/drive/MyDrive/project_dm/diabetes.csv')
scaler = StandardScaler().fit(df)
data = scaler.transform(df)

# Elbow method: fit K-Means once for every k in 2..14 and record the
# within-cluster sum of squares (the model's inertia) for each fit.
k_range = range(2, 15)
wcss = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(data).inertia_
    for k in k_range
]

# Plot WCSS against k; the bend in the curve suggests a reasonable k.
plt.plot(k_range, wcss, marker='o')
plt.title('Elbow Method pour déterminer k optimal')
plt.xlabel('Nombre de clusters (k)')
plt.ylabel('WCSS')
plt.show()

# Number of clusters used for the final model; chosen by reading the elbow
# plot, so adjust it if the curve bends at a different k.
k_optimal = 5

# Fit the final K-Means model and keep each sample's cluster id, both as an
# array and as a new column of the DataFrame.
kmeans = KMeans(n_clusters=k_optimal, random_state=42, n_init=10).fit(data)
kmeans_labels = kmeans.labels_
df['Cluster_KMeans'] = kmeans_labels

# Run DBSCAN on the standardized data. Samples it cannot attach to any dense
# region are reported as noise with the label -1.
dbscan = DBSCAN(eps=1.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(data)

# Hand every noise sample over to its K-Means cluster instead of leaving it
# at -1. DBSCAN ids (0..m-1) and K-Means ids (0..k-1) are two unrelated
# numberings, so copying a raw K-Means id would silently merge the noise
# sample into whichever DBSCAN cluster happens to share that number. The
# K-Means ids are therefore shifted past the largest DBSCAN id, which keeps
# the two groups of labels disjoint. When DBSCAN finds no cluster at all
# (max is -1) the offset is 0 and the result is exactly the K-Means labels.
noise_mask = dbscan_labels == -1
kmeans_label_offset = dbscan_labels.max() + 1
dbscan_labels_fixed = np.where(
    noise_mask,
    kmeans_labels + kmeans_label_offset,
    dbscan_labels,
)

# Store the combined labels (DBSCAN clusters + re-labelled noise samples).
df['Cluster_DBSCAN'] = dbscan_labels_fixed

# Visualize the final clusters: project the standardized data onto its
# principal components and colour each sample by its 'Cluster_DBSCAN' label.

# 2-D view (first two principal components).
pca = PCA(n_components=2)
data_pca = pca.fit_transform(data)

# Transposing yields one row per component, unpacked as the x and y arguments.
plt.scatter(*data_pca.T, c=df['Cluster_DBSCAN'], cmap='plasma', alpha=0.6)
plt.title('Clusters après DBSCAN (2D)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.colorbar(label='Cluster')
plt.show()

# 3-D view (first three principal components), fitted separately.
pca_3d = PCA(n_components=3)
data_pca_3d = pca_3d.fit_transform(data)

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(*data_pca_3d.T, c=df['Cluster_DBSCAN'], cmap='plasma', alpha=0.6)
ax.set_title('Clusters après DBSCAN (3D)')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
plt.show()

# Evaluate the final partition stored in 'Cluster_DBSCAN':
#   - silhouette score and Davies-Bouldin index are computed from the
#     standardized data and the final labels;
#   - NMI compares the final labels with the plain K-Means labels.
silhouette_avg_dbscan = silhouette_score(data, df['Cluster_DBSCAN'])
dbi_dbscan = davies_bouldin_score(data, df['Cluster_DBSCAN'])
nmi_score = normalized_mutual_info_score(df['Cluster_KMeans'], df['Cluster_DBSCAN'])

# One metric per output line, each rounded to four decimals.
print(
    f"📌 Silhouette Score (DBSCAN): {silhouette_avg_dbscan:.4f}",
    f"📌 Davies-Bouldin Index (DBSCAN): {dbi_dbscan:.4f}",
    f"📌 NMI (K-Means vs DBSCAN): {nmi_score:.4f}",
    sep='\n',
)

