Lab 11: K-Means Clustering
This script demonstrates the K-Means clustering algorithm.

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs, load_iris
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score


In [None]:
def basic_kmeans_clustering():
    """Demonstrate basic K-Means clustering on synthetic blob data.

    Generates 300 two-dimensional points around 4 centres, fits a 4-cluster
    K-Means model, prints the fitted centres, inertia and iteration count,
    and saves a side-by-side plot of the true and predicted labels to
    'lab11_basic_kmeans.png'. Returns nothing.
    """
    print("=" * 50)
    print("Basic K-Means Clustering")
    print("=" * 50)

    # Generate synthetic data
    X, y_true = make_blobs(n_samples=300, centers=4, n_features=2,
                           cluster_std=0.60, random_state=42)

    print(f"\nDataset shape: {X.shape}")
    print("True number of clusters: 4")

    # Apply K-Means
    kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
    y_pred = kmeans.fit_predict(X)

    # Get cluster centers
    centers = kmeans.cluster_centers_

    print("\nCluster Centers:")
    for i, center in enumerate(centers):
        print(f"Cluster {i}: {center}")

    print(f"\nInertia (within-cluster sum of squares): {kmeans.inertia_:.2f}")
    print(f"Number of iterations: {kmeans.n_iter_}")

    # Visualize results
    plt.figure(figsize=(12, 5))

    # Plot 1: True labels
    plt.subplot(1, 2, 1)
    true_points = plt.scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis',
                              s=50, alpha=0.6)
    plt.title('True Labels')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.colorbar(true_points, label='Cluster')

    # Plot 2: K-Means clusters
    plt.subplot(1, 2, 2)
    cluster_points = plt.scatter(X[:, 0], X[:, 1], c=y_pred, cmap='viridis',
                                 s=50, alpha=0.6)
    plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.8,
                marker='X', edgecolors='black', linewidth=2, label='Centroids')
    plt.title('K-Means Clusters')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    # Name the mappable explicitly: a bare plt.colorbar() attaches to the most
    # recent scatter, which here is the uniformly red centroid markers and
    # carries no cluster-label values.
    plt.colorbar(cluster_points, label='Cluster')
    plt.legend()

    plt.tight_layout()
    plt.savefig('lab11_basic_kmeans.png')
    plt.close()
    print("\nBasic K-Means plot saved as 'lab11_basic_kmeans.png'")


In [None]:
def elbow_method():
    """Plot K-Means inertia against K to illustrate the elbow heuristic.

    Fits K-Means for K = 1..10 on a synthetic 4-blob dataset, prints the
    inertia of each fit, and saves the resulting curve (with K=4 marked) to
    'lab11_elbow_method.png'. Returns nothing.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("Elbow Method for Optimal K")
    print(rule)

    # Synthetic data; the true labels are not needed here
    X, _ = make_blobs(n_samples=300, centers=4, n_features=2,
                      cluster_std=0.60, random_state=42)

    # Fit one model per candidate K and record its inertia
    candidate_ks = range(1, 11)
    inertias = []

    print("\nTesting different K values:")
    for n_clusters in candidate_ks:
        model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(X)
        inertias.append(model.inertia_)
        print(f"K={n_clusters}: Inertia={model.inertia_:.2f}")

    # Elbow curve
    plt.figure(figsize=(10, 6))
    plt.plot(candidate_ks, inertias, 'bo-', linewidth=2, markersize=8)
    plt.title('Elbow Method for Optimal K')
    plt.xlabel('Number of Clusters (K)')
    plt.ylabel('Inertia (Within-cluster sum of squares)')
    plt.xticks(candidate_ks)
    plt.grid(True, alpha=0.3)

    # The elbow is marked at the number of blobs the data was generated with
    plt.axvline(x=4, color='r', linestyle='--', label='Optimal K=4')
    plt.legend()

    plt.tight_layout()
    plt.savefig('lab11_elbow_method.png')
    plt.close()
    print("\nElbow method plot saved as 'lab11_elbow_method.png'")


In [None]:
def silhouette_analysis():
    """Score K-Means fits for K = 2..10 with the silhouette coefficient.

    Prints the silhouette score of each fit on a synthetic 4-blob dataset,
    reports the K with the highest score, and saves the score curve to
    'lab11_silhouette_analysis.png'. Returns nothing.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("Silhouette Analysis")
    print(rule)

    # Synthetic data; the true labels are not needed here
    X, _ = make_blobs(n_samples=300, centers=4, n_features=2,
                      cluster_std=0.60, random_state=42)

    # The range starts at 2 here, unlike the elbow demo
    candidate_ks = range(2, 11)
    silhouette_scores = []

    print("\nSilhouette Scores for different K:")
    for n_clusters in candidate_ks:
        model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        assignments = model.fit_predict(X)
        score = silhouette_score(X, assignments)
        silhouette_scores.append(score)
        print(f"K={n_clusters}: Silhouette Score={score:.4f}")

    # Best K is the one whose fit scored highest
    best_index = int(np.argmax(silhouette_scores))
    optimal_k = candidate_ks[best_index]
    print(f"\nOptimal K (highest silhouette score): {optimal_k}")

    # Score curve with the best K highlighted
    plt.figure(figsize=(10, 6))
    plt.plot(candidate_ks, silhouette_scores, 'go-', linewidth=2, markersize=8)
    plt.axvline(x=optimal_k, color='r', linestyle='--',
                label=f'Optimal K={optimal_k}')
    plt.title('Silhouette Analysis')
    plt.xlabel('Number of Clusters (K)')
    plt.ylabel('Silhouette Score')
    plt.xticks(candidate_ks)
    plt.grid(True, alpha=0.3)
    plt.legend()

    plt.tight_layout()
    plt.savefig('lab11_silhouette_analysis.png')
    plt.close()
    print("\nSilhouette analysis plot saved as 'lab11_silhouette_analysis.png'")


In [None]:
def iris_clustering():
    """Apply K-Means to the standardized Iris dataset and plot the result.

    Fits a 3-cluster K-Means model on standardized Iris features, prints the
    silhouette score and Davies-Bouldin index of the clustering, and saves a
    three-panel figure (true species, clusters on features 0/1, clusters on
    features 2/3) to 'lab11_iris_clustering.png'. Returns nothing.
    """
    print("\n" + "=" * 50)
    print("K-Means on Iris Dataset")
    print("=" * 50)

    # Load Iris dataset
    iris = load_iris()
    X = iris.data
    y_true = iris.target

    print(f"\nDataset shape: {X.shape}")
    print(f"True number of species: {len(np.unique(y_true))}")

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Apply K-Means
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    y_pred = kmeans.fit_predict(X_scaled)

    # Evaluate
    silhouette = silhouette_score(X_scaled, y_pred)
    davies_bouldin = davies_bouldin_score(X_scaled, y_pred)

    print(f"\nSilhouette Score: {silhouette:.4f}")
    print(f"Davies-Bouldin Index: {davies_bouldin:.4f}")
    print("(Lower Davies-Bouldin is better)")

    # The model was fitted on scaled data; map its centres back to the
    # original units so they can be drawn over the unscaled points.
    centers_original = scaler.inverse_transform(kmeans.cluster_centers_)

    # Visualize using first 2 features
    plt.figure(figsize=(15, 5))

    # Plot 1: True labels
    plt.subplot(1, 3, 1)
    species_points = plt.scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis',
                                 s=50, alpha=0.6)
    plt.xlabel(iris.feature_names[0])
    plt.ylabel(iris.feature_names[1])
    plt.title('True Species Labels')
    plt.colorbar(species_points, label='Species')

    # Plot 2: K-Means clusters
    plt.subplot(1, 3, 2)
    cluster_points = plt.scatter(X[:, 0], X[:, 1], c=y_pred, cmap='viridis',
                                 s=50, alpha=0.6)
    plt.scatter(centers_original[:, 0], centers_original[:, 1],
                c='red', s=200, alpha=0.8, marker='X',
                edgecolors='black', linewidth=2, label='Centroids')
    plt.xlabel(iris.feature_names[0])
    plt.ylabel(iris.feature_names[1])
    plt.title('K-Means Clusters')
    # Name the mappable explicitly: a bare plt.colorbar() attaches to the most
    # recent scatter, which here is the uniformly red centroid markers and
    # carries no cluster-label values.
    plt.colorbar(cluster_points, label='Cluster')
    plt.legend()

    # Plot 3: Different features
    plt.subplot(1, 3, 3)
    cluster_points = plt.scatter(X[:, 2], X[:, 3], c=y_pred, cmap='viridis',
                                 s=50, alpha=0.6)
    plt.scatter(centers_original[:, 2], centers_original[:, 3],
                c='red', s=200, alpha=0.8, marker='X',
                edgecolors='black', linewidth=2, label='Centroids')
    plt.xlabel(iris.feature_names[2])
    plt.ylabel(iris.feature_names[3])
    plt.title('K-Means (Different Features)')
    # Same reason as in plot 2: bind the colourbar to the labelled points.
    plt.colorbar(cluster_points, label='Cluster')
    plt.legend()

    plt.tight_layout()
    plt.savefig('lab11_iris_clustering.png')
    plt.close()
    print("\nIris clustering plot saved as 'lab11_iris_clustering.png'")


In [None]:
def kmeans_convergence():
    """Demonstrate how K-Means centroids move as iterations are allowed.

    Picks three data points as fixed initial centroids, then draws six panels:
    the initial state (iteration 0) followed by K-Means runs capped at 1, 2,
    3, 5 and 10 iterations, all started from those same centroids. Prints the
    inertia for each panel and saves the figure to 'lab11_convergence.png'.
    Returns nothing.
    """
    print("\n" + "=" * 50)
    print("K-Means Convergence Process")
    print("=" * 50)

    # Generate simple data
    X, _ = make_blobs(n_samples=100, centers=3, n_features=2,
                      cluster_std=0.60, random_state=42)

    # Draw the starting centroids once, from a local generator so the global
    # NumPy random state is left untouched. Every panel starts from these
    # same points, which makes the panels comparable snapshots of one run.
    n_clusters = 3
    rng = np.random.default_rng(42)
    init_centers = X[rng.choice(len(X), size=n_clusters, replace=False)]

    print("\nShowing convergence iterations:")

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.ravel()

    for ax, max_iter in zip(axes, [0, 1, 2, 3, 5, 10]):
        if max_iter == 0:
            # scikit-learn rejects max_iter < 1, so the initial state is
            # built by hand: no point is assigned yet (all drawn in one
            # colour) and the inertia is the summed squared distance of each
            # point to its nearest initial centroid.
            labels = np.zeros(len(X), dtype=int)
            centers = init_centers
            squared_dists = (
                (X[:, np.newaxis, :] - centers[np.newaxis, :, :]) ** 2
            ).sum(axis=2)
            inertia = squared_dists.min(axis=1).sum()
        else:
            kmeans = KMeans(n_clusters=n_clusters, init=init_centers,
                            n_init=1, max_iter=max_iter, random_state=42)
            labels = kmeans.fit_predict(X)
            centers = kmeans.cluster_centers_
            inertia = kmeans.inertia_

        ax.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=50, alpha=0.6)
        ax.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.8,
                   marker='X', edgecolors='black', linewidth=2)
        ax.set_title(f'Iteration {max_iter}')
        ax.set_xlabel('Feature 1')
        ax.set_ylabel('Feature 2')

        print(f"Iteration {max_iter}: Inertia = {inertia:.2f}")

    plt.tight_layout()
    plt.savefig('lab11_convergence.png')
    plt.close(fig)
    print("\nConvergence plot saved as 'lab11_convergence.png'")


In [None]:
def compare_initializations():
    """Compare the 'k-means++' and 'random' initialization methods.

    Fits a 4-cluster K-Means model on a synthetic 4-blob dataset once per
    initialization method and prints the inertia and iteration count of each
    fit. Returns nothing.
    """
    print("\n" + "=" * 50)
    print("Comparing Initialization Methods")
    print("=" * 50)

    # Generate data
    X, _ = make_blobs(n_samples=300, centers=4, n_features=2,
                      cluster_std=0.60, random_state=42)

    # Test different initializations; each fit is reported as soon as it
    # finishes, so nothing needs to be accumulated across iterations.
    for init in ('k-means++', 'random'):
        kmeans = KMeans(n_clusters=4, init=init, n_init=10, random_state=42)
        kmeans.fit(X)

        print(f"\n{init}:")
        print(f"  Inertia: {kmeans.inertia_:.2f}")
        print(f"  Iterations: {kmeans.n_iter_}")


In [None]:
def cluster_analysis():
    """Summarize and plot per-cluster feature distributions for Iris.

    Fits a 3-cluster K-Means model on the raw (unscaled) Iris features,
    prints the per-cluster feature means and cluster sizes, and saves one
    overlaid histogram per feature to 'lab11_cluster_characteristics.png'.
    Returns nothing.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("Cluster Characteristics Analysis")
    print(rule)

    # Load the data and cluster it directly, without standardization
    iris = load_iris()
    X = iris.data
    model = KMeans(n_clusters=3, random_state=42, n_init=10)
    assignments = model.fit_predict(X)

    # Tabulate features alongside the assigned cluster id
    df = pd.DataFrame(X, columns=iris.feature_names)
    df['Cluster'] = assignments

    print("\nCluster Statistics:")
    print(df.groupby('Cluster').mean())

    print("\nCluster Sizes:")
    print(df['Cluster'].value_counts().sort_index())

    # One panel per feature, one histogram per cluster within each panel
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    panels = axes.ravel()

    for panel, feature in zip(panels, iris.feature_names):
        for cluster in range(3):
            in_cluster = df['Cluster'] == cluster
            panel.hist(df.loc[in_cluster, feature], alpha=0.5,
                       label=f'Cluster {cluster}', bins=15)

        panel.set_title(f'{feature} Distribution by Cluster')
        panel.set_xlabel(feature)
        panel.set_ylabel('Frequency')
        panel.legend()
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('lab11_cluster_characteristics.png')
    plt.close()
    print("\nCluster characteristics plot saved as 'lab11_cluster_characteristics.png'")


In [None]:
def main():
    """Run every Lab 11 demonstration in order, framed by banners."""
    rule = "=" * 50
    print("\n" + rule)
    print("Lab 11: K-Means Clustering")
    print(rule)

    # Demonstrations, in the order they are presented in the lab
    demos = (
        basic_kmeans_clustering,   # basic K-Means
        elbow_method,              # elbow method
        silhouette_analysis,       # silhouette analysis
        iris_clustering,           # Iris clustering
        kmeans_convergence,        # convergence process
        compare_initializations,   # initialization comparison
        cluster_analysis,          # cluster characteristics
    )
    for demo in demos:
        demo()

    print("\n" + rule)
    print("Lab 11 Complete!")
    print(rule)


In [None]:
# Run the full lab only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
