In [1]:
# 📘 05_unsupervised_learning.ipynb

# 📦 Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage

# 📥 Step 2: Load cleaned dataset
# Read the cleaned table from disk, then build the feature matrix by removing
# the 'target' column so that only the remaining columns are clustered.
df = pd.read_csv('../data/heart_cleaned.csv')
X = df.drop(columns=['target'])

# ==========================
# 📌 KMeans Clustering
# ==========================
# Sweep K over a small range and score each partition with the mean
# silhouette coefficient (higher is better); the best-scoring K is kept.
#
# n_init is pinned explicitly: scikit-learn changed the default from 10 to
# 'auto' in 1.4 (and emits a FutureWarning about it in 1.2/1.3), so leaving it
# unset makes the chosen K depend on the installed version.
#
# NOTE(review): KMeans is distance-based; this assumes the columns of
# heart_cleaned.csv were already scaled upstream — confirm.
# NOTE(review): the "[WinError 2]" traceback in the recorded output comes from
# joblib's physical-core detection on Windows, presumably triggered by the
# first fit here; the recorded run still completed after it.
KMEANS_N_INIT = 10
K_range = range(2, 8)
silhouette_scores = []
for k in K_range:
    km = KMeans(n_clusters=k, n_init=KMEANS_N_INIT, random_state=42)
    labels = km.fit_predict(X)
    score = silhouette_score(X, labels)
    silhouette_scores.append(score)

# Plot Silhouette Scores (explicit figure/axes so only this figure is closed)
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(K_range, silhouette_scores, marker='o')
ax.set_title("KMeans - Silhouette Scores vs. K")
ax.set_xlabel("Number of Clusters (K)")
ax.set_ylabel("Silhouette Score")
ax.grid(True)
fig.tight_layout()
fig.savefig('../data/kmeans_silhouette.png')
plt.close(fig)

# Final KMeans with optimal K.
# np.argmax returns the first maximum, so ties resolve to the smallest K;
# int() turns the NumPy index into a plain Python int before indexing the range.
k_optimal = K_range[int(np.argmax(silhouette_scores))]
kmeans = KMeans(n_clusters=k_optimal, n_init=KMEANS_N_INIT, random_state=42)
df['kmeans_cluster'] = kmeans.fit_predict(X)

# ==========================
# 📌 Hierarchical Clustering
# ==========================
# Ward linkage matrix over the feature rows; it is used only to draw the
# dendrogram below.
Z = linkage(X, method='ward')

# Dendrogram, truncated to the last 10 merged clusters, saved to disk and
# closed without being displayed.
plt.figure(figsize=(10, 5))
dendrogram(Z, p=10, truncate_mode='lastp')
plt.gca().set(
    title="Hierarchical Clustering - Dendrogram",
    xlabel="Sample Index or Cluster",
    ylabel="Distance",
)
plt.tight_layout()
plt.savefig('../data/hierarchical_dendrogram.png')
plt.close()

# Agglomerative clustering with the same number of clusters chosen for KMeans.
# 'ward' is scikit-learn's default linkage; it is spelled out here to make the
# match with the dendrogram's linkage method visible.
hc = AgglomerativeClustering(n_clusters=k_optimal, linkage='ward')
hc.fit(X)
df['hierarchical_cluster'] = hc.labels_

# ==========================
# 📊 PCA for Visualization (2D)
# ==========================
def _save_cluster_scatter(points_2d, cluster_labels, title, out_path):
    """Save a 2-D scatter plot of projected points coloured by cluster label.

    Parameters
    ----------
    points_2d : array-like indexed as [:, 0] and [:, 1]
        Coordinates to plot; column 0 goes on the x-axis, column 1 on the y-axis.
    cluster_labels : array-like
        One label per point, passed to matplotlib as the colour values.
    title : str
        Figure title.
    out_path : str
        File path the figure is written to.
    """
    fig, ax = plt.subplots(figsize=(6, 5))
    ax.scatter(points_2d[:, 0], points_2d[:, 1], c=cluster_labels, cmap='tab10', s=30)
    ax.set(title=title, xlabel="PC1", ylabel="PC2")
    fig.tight_layout()
    fig.savefig(out_path)
    # Close explicitly so figures do not accumulate in memory.
    plt.close(fig)


# Project the features onto the first two principal components (for plotting
# only; the cluster labels were computed on the full feature matrix).
# random_state only matters if a randomized SVD solver is selected; it is set
# so the projection is reproducible in that case too.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)

# Plot KMeans Clusters in 2D
_save_cluster_scatter(
    X_pca,
    df['kmeans_cluster'],
    "KMeans Clusters (PCA 2D)",
    '../data/kmeans_pca_plot.png',
)

# Plot Hierarchical Clusters in 2D
_save_cluster_scatter(
    X_pca,
    df['hierarchical_cluster'],
    "Hierarchical Clusters (PCA 2D)",
    '../data/hierarchical_pca_plot.png',
)

print(f"✅ Step 05 complete. Optimal K: {k_optimal}, PCA plots saved.")

[WinError 2] The system cannot find the file specified
  File "c:\Users\Eng.Basel\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\loky\backend\context.py", line 247, in _count_physical_cores
    cpu_count_physical = _count_physical_cores_win32()
  File "c:\Users\Eng.Basel\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\loky\backend\context.py", line 299, in _count_physical_cores_win32
    cpu_info = subprocess.run(
  File "c:\Users\Eng.Basel\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\Eng.Basel\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\Eng.Basel\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 1440, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


✅ Step 05 complete. Optimal K: 2, PCA plots saved.
