In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- older matplotlib needs this import to register the '3d' projection
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score, silhouette_samples, silhouette_score
# scikit-learn has no `adjusted_rand_index`; the Adjusted Rand Index is exposed
# as `adjusted_rand_score`. The alias keeps call sites written against the old
# name working -- prefer `adjusted_rand_score` in new code.
from sklearn.metrics import adjusted_rand_score as adjusted_rand_index
from sklearn.preprocessing import StandardScaler

In [None]:
# Part 1: Implementing Clustering

# Step 1: Load the Iris dataset from the provided file
print("Step 1: Loading the Iris dataset from file...")

# The file is read without a header row, so the column names are supplied here.
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']

# Load the data
iris_df = pd.read_csv('iris.data.txt', header=None, names=column_names)

# Mapping from species label to the integer code used as ground truth.
class_mapping = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2
}

# Create a numeric target column.
# Series.map() turns every label that is missing from class_mapping into NaN
# (and the whole column into floats). Later cells index lists with these codes
# and build boolean masks from them, so an unknown label is reported right
# here instead of surfacing as a confusing failure further down.
class_codes = iris_df['class'].map(class_mapping)
unmapped_rows = class_codes.isna()
if unmapped_rows.any():
    unknown_labels = sorted(iris_df.loc[unmapped_rows, 'class'].astype(str).unique())
    raise ValueError(
        f"{int(unmapped_rows.sum())} row(s) in 'iris.data.txt' have a class label "
        f"that is not in class_mapping: {unknown_labels}"
    )
iris_df['class_numeric'] = class_codes.astype('int64')

# Display information about the dataset
print(f"Dataset Shape: {iris_df.shape}")
print("\nFirst 5 rows of the dataset:")
print(iris_df.head())
print("\nClass distribution:")
print(iris_df['class'].value_counts())
print("\nSummary statistics:")
print(iris_df.describe())

In [None]:
# Step 2: Preprocess the data
print("\nStep 2: Preprocessing the data...")

# Pull the class codes and the first four columns (the measurements) out of
# the frame as plain arrays.
y = iris_df['class_numeric'].values
X = iris_df.iloc[:, 0:4].values

# Standardize the features: learn the scaling parameters from X, then apply
# them to X. The fitted scaler is kept for later reuse.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

print("Data has been standardized. Mean of each feature is now approximately 0, and standard deviation is 1.")

In [None]:
# Step 3: Implement K-means Clustering
print("\nStep 3: Implementing K-means Clustering...")

# Elbow method: fit K-means once per candidate k and record the inertia of
# each fit.
k_range = range(1, 11)
inertia = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Draw inertia against k and write the figure to disk.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_range, inertia, 'bo-')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia (Sum of Squared Distances)')
ax.set_title('Elbow Method for Optimal k')
ax.grid(True)
fig.savefig('elbow_method.png')
print("Elbow Method plot saved as 'elbow_method.png'")

In [None]:
# Calculate silhouette scores for different k values
# (a silhouette score needs at least 2 clusters, hence the range starts at 2)
silhouette_scores = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    cluster_labels = kmeans.fit(X_scaled).labels_
    silhouette_avg = silhouette_score(X_scaled, cluster_labels)
    print(f"For n_clusters = {k}, the silhouette score is {silhouette_avg:.3f}")
    silhouette_scores.append(silhouette_avg)

# Draw the score of every candidate k and write the figure to disk.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(2, 11), silhouette_scores, 'bo-')
ax.set(
    xlabel='Number of Clusters (k)',
    ylabel='Silhouette Score',
    title='Silhouette Score for Different k Values',
)
ax.grid(True)
fig.savefig('silhouette_scores.png')
print("Silhouette Score plot saved as 'silhouette_scores.png'")

In [None]:
# Choose optimal k based on analysis (k=3)
k_optimal = 3
print(f"\nChosen optimal number of clusters (k): {k_optimal}")

# Fit K-means with the chosen k and read the per-sample labels off the
# fitted model.
kmeans = KMeans(n_clusters=k_optimal, random_state=42, n_init=10).fit(X_scaled)
cluster_labels = kmeans.labels_

# Keep the assignment next to the original measurements.
iris_df['kmeans_cluster'] = cluster_labels

# Show how many samples ended up in each cluster.
print("\nCluster distribution:")
print(iris_df['kmeans_cluster'].value_counts())

In [None]:
# Step 4: Implement Hierarchical Clustering
print("\nStep 4: Implementing Hierarchical Clustering...")

# Build the linkage matrix from the standardized features (Ward method).
Z = linkage(X_scaled, method='ward')

# Draw the dendrogram on a fresh figure, mark a suggested cut-off height and
# write the result to disk.
plt.figure(figsize=(12, 8))
dendrogram(Z, leaf_font_size=8, leaf_rotation=90)
plt.axhline(y=5, linestyle='--', color='r')  # Suggest a cut-off
plt.gca().set(
    title='Hierarchical Clustering Dendrogram',
    xlabel='Sample index',
    ylabel='Distance',
)
plt.savefig('dendrogram.png')
print("Dendrogram saved as 'dendrogram.png'")

In [None]:
# Part 2: Debugging Issues
print("\n--- Part 2: Debugging Issues ---")

# Check if the data was properly scaled.
# The statistics are computed per feature (axis=0). Pooling them over the
# whole matrix can hide a problem: offsets of individual columns may cancel
# out in the pooled mean, so a badly scaled column could go unnoticed.
print("\nChecking for scaling issues...")
feature_means = X_scaled.mean(axis=0)
feature_stds = X_scaled.std(axis=0)
means_ok = np.all(np.abs(feature_means) < 0.01)
stds_ok = np.all(np.abs(feature_stds - 1) < 0.01)
if means_ok and stds_ok:
    print("Data is properly scaled: Mean is close to 0 and standard deviation is close to 1.")
else:
    print("WARNING: Data may not be properly scaled!")
    print(f"Mean: {feature_means}")
    print(f"Standard deviation: {feature_stds}")

# Check for outliers: a sample is flagged when any of its features lies more
# than 3 standard deviations away from that feature's mean.
print("\nChecking for outliers...")
z_scores = stats.zscore(X_scaled)
abs_z_scores = np.abs(z_scores)
outliers = (abs_z_scores > 3).any(axis=1)
print(f"Number of samples with outliers (z-score > 3): {outliers.sum()}")
if outliers.any():
    print("Outlier samples indices:")
    print(np.flatnonzero(outliers))

In [None]:
# Test different initialization methods for K-means
print("\nTesting different initialization methods for K-means...")
silhouette_init = {}
init_methods = ['k-means++', 'random']

# Fit one model per initialization strategy and score its labelling.
for init in init_methods:
    kmeans_init = KMeans(n_clusters=k_optimal, init=init, random_state=42, n_init=10)
    cluster_labels_init = kmeans_init.fit(X_scaled).labels_
    silhouette_init[init] = silhouette_score(X_scaled, cluster_labels_init)
    print(f"Initialization method: {init}, Silhouette Score: {silhouette_init[init]:.3f}")

# The highest score wins; on a tie the method that was evaluated first is kept.
best_init = max(silhouette_init, key=lambda method: silhouette_init[method])
print(f"Best initialization method: {best_init}")

# Refit with the winning strategy; this model and its labels are the ones
# stored under the *_best names.
kmeans_best = KMeans(n_clusters=k_optimal, init=best_init, random_state=42, n_init=10).fit(X_scaled)
cluster_labels_best = kmeans_best.labels_
iris_df['best_cluster'] = cluster_labels_best

In [None]:
# Part 3: Evaluating the Model
print("\n--- Part 3: Evaluating the Model ---")

# Reduce the standardized features to two principal components for plotting.
print("\nVisualizing clusters using PCA...")
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# One figure with four panels on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(ax_truth, ax_kmeans), (ax_sepal, ax_petal) = axes

# Top-left: PCA projection, one series per ground-truth class
for i in range(3):
    members = y == i
    ax_truth.scatter(X_pca[members, 0], X_pca[members, 1], label=f'Class {i} ({list(class_mapping.keys())[i]})')
ax_truth.set(
    title='PCA of Iris dataset - Ground Truth Classes',
    xlabel='First Principal Component',
    ylabel='Second Principal Component',
)
ax_truth.legend()

# Top-right: the same projection, one series per K-means cluster
for i in range(k_optimal):
    members = cluster_labels_best == i
    ax_kmeans.scatter(X_pca[members, 0], X_pca[members, 1], label=f'Cluster {i}')
ax_kmeans.set(
    title='PCA of Iris dataset - K-means Clusters',
    xlabel='First Principal Component',
    ylabel='Second Principal Component',
)
ax_kmeans.legend()

# Bottom-left: raw sepal length vs sepal width, by ground-truth class
for i in range(3):
    members = y == i
    ax_sepal.scatter(X[members, 0], X[members, 1], label=f'Class {i} ({list(class_mapping.keys())[i]})')
ax_sepal.set(
    title='Sepal Length vs Sepal Width - Ground Truth',
    xlabel='Sepal Length',
    ylabel='Sepal Width',
)
ax_sepal.legend()

# Bottom-right: raw petal length vs petal width, by ground-truth class
for i in range(3):
    members = y == i
    ax_petal.scatter(X[members, 2], X[members, 3], label=f'Class {i} ({list(class_mapping.keys())[i]})')
ax_petal.set(
    title='Petal Length vs Petal Width - Ground Truth',
    xlabel='Petal Length',
    ylabel='Petal Width',
)
ax_petal.legend()

fig.tight_layout()
fig.savefig('cluster_visualization.png')
print("Cluster visualization saved as 'cluster_visualization.png'")

In [None]:
# Evaluate clustering against ground truth
print("\nEvaluating clustering against ground truth...")
# scikit-learn exposes the Adjusted Rand Index as `adjusted_rand_score`
# (there is no `adjusted_rand_index` in sklearn.metrics).
ari = adjusted_rand_score(y, cluster_labels_best)
print(f"Adjusted Rand Index: {ari:.3f}")
print("Note: ARI is at most 1 (perfect agreement); values around 0 indicate random labeling, and negative values (as low as -0.5) indicate agreement worse than random.")

# Contingency table: rows are the true classes, columns the K-means clusters.
contingency_table = pd.crosstab(
    iris_df['class_numeric'],
    iris_df['best_cluster'],
    rownames=['Class'],
    colnames=['Cluster']
)
print("\nContingency table (Clusters vs. Classes):")
print(contingency_table)

# Map every cluster to the class that contributes most of its members.
# Two clusters may end up mapped to the same class.
cluster_to_class_mapping = {
    cluster: contingency_table[cluster].idxmax() for cluster in range(k_optimal)
}

# Resolve class names through the codes stored in class_mapping instead of
# through the position of the key in the dict, so the lookup stays correct
# whatever order the mapping was written in.
class_names_by_code = {code: name for name, code in class_mapping.items()}

print("\nMapping clusters to original classes:")
for cluster, class_num in cluster_to_class_mapping.items():
    class_name = class_names_by_code[class_num]
    print(f"Cluster {cluster} → Class {class_num} ({class_name})")

# Overall agreement: share of samples whose cluster maps to their true class.
mapped_labels = np.array([cluster_to_class_mapping[label] for label in cluster_labels_best])
agreement = np.mean(mapped_labels == y)
print(f"\nOverall agreement after mapping: {agreement:.3f} or {agreement*100:.1f}%")

In [None]:
# Create a 3D visualization of the clusters
print("\nCreating 3D visualization of clusters...")

# Column positions of the three measurements used as plot axes, and their names.
features_to_plot = [0, 2, 3]  # sepal_length, petal_length, petal_width
feature_names = [column_names[i] for i in features_to_plot]

fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')

# One scatter series per cluster, labelled with the class it was mapped to.
for cluster in range(k_optimal):
    indices = cluster_labels_best == cluster
    xs, ys, zs = (X[indices, column] for column in features_to_plot)
    ax.scatter(
        xs,
        ys,
        zs,
        label=f'Cluster {cluster} → {list(class_mapping.keys())[cluster_to_class_mapping[cluster]]}'
    )

x_name, y_name, z_name = feature_names
ax.set_xlabel(x_name)
ax.set_ylabel(y_name)
ax.set_zlabel(z_name)
ax.set_title('3D Visualization of Iris Clusters')
ax.legend()
fig.savefig('iris_clusters_3d.png')
print("3D visualization saved as 'iris_clusters_3d.png'")

In [None]:
# Create a pair plot for all features
print("\nCreating pair plot for feature relationships...")

# Work on a copy so the extra 'species' column (used as hue and therefore as
# the legend title) is not added to iris_df itself.
iris_df_with_clusters = iris_df.copy()
iris_df_with_clusters['species'] = iris_df_with_clusters['class']

# sns.pairplot builds and sizes its own figure, so no plt.figure() call is
# made beforehand: it would only leave an empty extra figure behind.
pair_plot = sns.pairplot(
    iris_df_with_clusters,
    vars=column_names[:4],
    hue='species',
    diag_kind='kde',
    plot_kws={'alpha': 0.6}
)
pair_plot.fig.suptitle('Pair Plot of Iris Features by Species', y=1.02, fontsize=16)

# The title is placed at y=1.02, i.e. slightly above the figure canvas.
# Saving the grid's own figure with bbox_inches='tight' expands the saved area
# so the title is part of the image instead of being cut off.
pair_plot.fig.savefig('iris_pair_plot.png', bbox_inches='tight')
print("Pair plot saved as 'iris_pair_plot.png'")

In [None]:
# Silhouette analysis visualization
# (silhouette_samples comes from the import cell at the top of the notebook)
print("\nPerforming silhouette analysis visualization...")

plt.figure(figsize=(10, 8))
silhouette_vals = silhouette_samples(X_scaled, cluster_labels_best)

# Clusters are stacked on the y axis: each one occupies the band
# [y_lower, y_upper), and the next band starts where the previous one ended.
y_lower = 0
yticks = []

for cluster in range(k_optimal):
    # Sorted ascending so every cluster is drawn as one smooth profile.
    cluster_silhouette_vals = np.sort(silhouette_vals[cluster_labels_best == cluster])
    y_upper = y_lower + len(cluster_silhouette_vals)
    color = plt.cm.nipy_spectral(float(cluster) / k_optimal)
    plt.barh(range(y_lower, y_upper), cluster_silhouette_vals, height=1.0,
             edgecolor='none', color=color)
    # The tick label goes to the middle of the cluster's band.
    yticks.append((y_lower + y_upper) / 2)
    y_lower = y_upper

# Vertical reference line at the mean silhouette over all samples.
silhouette_avg = silhouette_score(X_scaled, cluster_labels_best)
plt.axvline(x=silhouette_avg, color="red", linestyle="--")

plt.yticks(yticks, [f'Cluster {i}' for i in range(k_optimal)])
plt.xlabel('Silhouette Coefficient')
plt.ylabel('Cluster')
plt.title(f'Silhouette Analysis (Average Score: {silhouette_avg:.3f})')
plt.savefig('silhouette_analysis.png')
print("Silhouette analysis visualization saved as 'silhouette_analysis.png'")

In [None]:
# Feature importance based on cluster centers
print("\nAnalyzing feature importance based on cluster centers...")
cluster_centers = kmeans_best.cluster_centers_
feature_names = column_names[:4]

# Tabulate the centers: one row per cluster, one column per feature.
print("\nCluster Centers (in scaled space):")
centers_df = pd.DataFrame(
    cluster_centers,
    index=[f'Cluster {i}' for i in range(k_optimal)],
    columns=feature_names,
)
print(centers_df)

# Prediction function
print("\nCreating a function to predict clusters for new iris measurements...")
def predict_iris_cluster(sepal_length, sepal_width, petal_length, petal_width):
    """
    Predict the cluster for a new iris flower based on its measurements.

    The four values are put into a single-row array, transformed with the
    notebook-level ``scaler`` and passed to the notebook-level ``kmeans_best``
    model. The predicted cluster is translated to a class code through
    ``cluster_to_class_mapping`` and then to a species name through
    ``class_mapping``.

    Parameters
    ----------
    sepal_length, sepal_width, petal_length, petal_width : number
        Measurements of one flower, passed to the scaler in this order.

    Returns
    -------
    tuple
        ``(cluster, species)`` -- the first element of
        ``kmeans_best.predict(...)`` and the species name whose code in
        ``class_mapping`` equals the class mapped to that cluster.

    Raises
    ------
    KeyError
        If the predicted cluster is missing from ``cluster_to_class_mapping``
        or its class code is not a value of ``class_mapping``.
    """
    features = np.array([[sepal_length, sepal_width, petal_length, petal_width]])
    features_scaled = scaler.transform(features)
    cluster = kmeans_best.predict(features_scaled)[0]
    class_idx = cluster_to_class_mapping[cluster]
    # Look the species up by its code rather than by the position of the key
    # in class_mapping, so the result does not depend on the order in which
    # the mapping was written.
    species_by_code = {code: name for name, code in class_mapping.items()}
    species = species_by_code[class_idx]
    return cluster, species

# Example predictions
print("\nExample predictions:")

# Three hand-picked measurement sets, each annotated with the species it is
# expected to resemble.
sample1 = predict_iris_cluster(5.1, 3.5, 1.4, 0.2)  # Likely setosa
sample2 = predict_iris_cluster(6.3, 3.3, 6.0, 2.5)  # Likely virginica
sample3 = predict_iris_cluster(5.7, 2.8, 4.1, 1.3)  # Likely versicolor

# Each result is a (cluster, species) pair; report them one per line.
prediction_report = [
    f"Sample 1: Cluster {sample1[0]}, Likely species: {sample1[1]}",
    f"Sample 2: Cluster {sample2[0]}, Likely species: {sample2[1]}",
    f"Sample 3: Cluster {sample3[0]}, Likely species: {sample3[1]}",
]
print("\n".join(prediction_report))

print("\nClustering analysis complete!")