<a href="https://colab.research.google.com/github/amitkatoch/PREDICTIVE_ANALYSIS/blob/main/Clustering_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, AgglomerativeClustering, MeanShift, estimate_bandwidth
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.datasets import load_iris
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Set plot style and figure size
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

In [3]:
# Set random seed for reproducibility
np.random.seed(42)

In [4]:
# We'll use the Iris dataset which is a good candidate for clustering
iris = load_iris()
X = iris.data
y = iris.target
feature_names = iris.feature_names
target_names = iris.target_names

In [5]:

print(f"Dataset shape: {X.shape}")
print(f"Features: {feature_names}")
print(f"Target classes: {target_names}")

Dataset shape: (150, 4)
Features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target classes: ['setosa' 'versicolor' 'virginica']


In [6]:
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y
df['target_name'] = df['target'].map({0: target_names[0], 1: target_names[1], 2: target_names[2]})

In [7]:
# Display first few rows
print("\nFirst 5 rows of the dataset:")
print(df.head())

# Basic statistics
print("\nBasic statistics:")
print(df.describe())


First 5 rows of the dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target target_name  
0       0      setosa  
1       0      setosa  
2       0      setosa  
3       0      setosa  
4       0      setosa  

Basic statistics:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%    

In [8]:
# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

# Visualize the data
print("\nVisualizing the dataset...")
# Pairplot to visualize relationships between features
plt.figure(figsize=(15, 12))
sns.pairplot(df, hue='target_name', palette='viridis')
plt.suptitle("Pairwise relationships between features", y=1.02, fontsize=16)
plt.savefig('iris_pairplot.png')
plt.close()


Missing values:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
target_name          0
dtype: int64

Visualizing the dataset...


<Figure size 1500x1200 with 0 Axes>

In [9]:
# Plot the distribution of each feature
plt.figure(figsize=(15, 10))
for i, feature in enumerate(feature_names):
    plt.subplot(2, 2, i+1)
    for target in range(3):
        sns.kdeplot(df[df['target'] == target][feature], label=target_names[target])
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Density')
    plt.legend()
plt.tight_layout()
plt.savefig('feature_distributions.png')
plt.close()

In [10]:
# Define preprocessing techniques
preprocessing_techniques = {
    'No Data Processing': None,
    'Using Normalization': StandardScaler(),
    'Using Transform': MinMaxScaler(),
    'Using PCA': PCA(n_components=2),
    'Using T+N': 'combined_tn',  # This will be a combination of normalization and transformation
    'T+N+PCA': 'combined_tnp'    # This will be a combination of normalization, transformation, and PCA
}

# Define clustering algorithms
clustering_algorithms = {
    'Using K-Mean Clustering': KMeans,
    'Using Hierarchical Clustering': AgglomerativeClustering,
    'Using K-mean Shift Clustering': MeanShift
}

# Define evaluation parameters
evaluation_metrics = {
    'Silhouette': silhouette_score,
    'Calinski-Harabasz': calinski_harabasz_score,
    'Davies-Bouldin': davies_bouldin_score
}

In [11]:
# Define number of clusters to try
cluster_range = [3, 4, 5]

# Create a result dataframe to store all results
columns = pd.MultiIndex.from_product([
    list(clustering_algorithms.keys()),
    list(preprocessing_techniques.keys()),
    [f'c={c}' for c in cluster_range]
])
index = pd.MultiIndex.from_product([
    ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin']
])
result_df = pd.DataFrame(index=index, columns=columns)


In [12]:
print("\nRunning clustering experiments...")
for clust_name, clust_algo in clustering_algorithms.items():
    for preproc_name, preproc_method in preprocessing_techniques.items():
        for n_clusters in cluster_range:
            # Skip MeanShift with cluster parameter as it automatically determines clusters
            if clust_name == 'Using K-mean Shift Clustering' and n_clusters != cluster_range[0]:
                continue

            print(f"\nRunning: {clust_name} with {preproc_name} and {n_clusters} clusters")

            # Apply preprocessing
            X_processed = X.copy()

            # Handle combined preprocessing methods
            if preproc_name == 'Using T+N':
                # First normalize, then transform
                X_processed = StandardScaler().fit_transform(X_processed)
                X_processed = MinMaxScaler().fit_transform(X_processed)
            elif preproc_name == 'T+N+PCA':
                # Normalize, transform, then PCA
                X_processed = StandardScaler().fit_transform(X_processed)
                X_processed = MinMaxScaler().fit_transform(X_processed)
                X_processed = PCA(n_components=2).fit_transform(X_processed)
            elif preproc_name == 'Using PCA':
                X_processed = PCA(n_components=2).fit_transform(X_processed)
            elif preproc_method is not None and preproc_name not in ['combined_tn', 'combined_tnp']:
                X_processed = preproc_method.fit_transform(X_processed)

            # Skip if the preprocessing transforms data in a way incompatible with the algorithm
            if preproc_name in ['Using Transform', 'Using PCA'] and clust_name != 'Using K-Mean Clustering':
                # Mark as not applicable
                for metric_name in evaluation_metrics.keys():
                    col_idx = (clust_name, preproc_name, f'c={n_clusters}')
                    result_df.loc[metric_name, col_idx] = 'NA'
                continue

            try:
                # Apply clustering
                if clust_name == 'Using K-mean Shift Clustering':
                    if preproc_name == 'No Data Processing':
                        # Estimate bandwidth for MeanShift
                        bandwidth = estimate_bandwidth(X_processed, quantile=0.2)
                    else:
                        bandwidth = 2  # Default value for preprocessed data

                    clusterer = clust_algo(bandwidth=bandwidth)
                    labels = clusterer.fit_predict(X_processed)
                else:
                    clusterer = clust_algo(n_clusters=n_clusters)
                    labels = clusterer.fit_predict(X_processed)

                # Calculate evaluation metrics
                for metric_name, metric_func in evaluation_metrics.items():
                    try:
                        score = metric_func(X_processed, labels)
                        col_idx = (clust_name, preproc_name, f'c={n_clusters}')
                        result_df.loc[metric_name, col_idx] = round(score, 2)
                    except Exception as e:
                        print(f"Error calculating {metric_name}: {e}")
                        result_df.loc[metric_name, (clust_name, preproc_name, f'c={n_clusters}')] = 'Error'

                # Visualize clusters for 2D data (PCA or T+N+PCA)
                if preproc_name in ['Using PCA', 'T+N+PCA']:
                    plt.figure(figsize=(10, 8))
                    scatter = plt.scatter(X_processed[:, 0], X_processed[:, 1], c=labels, cmap='viridis', s=50, alpha=0.8)
                    plt.title(f'{clust_name} with {preproc_name} (c={n_clusters})')
                    plt.xlabel('Component 1')
                    plt.ylabel('Component 2')
                    plt.colorbar(scatter, label='Cluster')
                    plt.savefig(f'cluster_{clust_name.replace(" ", "_")}_{preproc_name.replace(" ", "_")}_c{n_clusters}.png')
                    plt.close()

            except Exception as e:
                print(f"Error in clustering: {e}")
                for metric_name in evaluation_metrics.keys():
                    col_idx = (clust_name, preproc_name, f'c={n_clusters}')
                    result_df.loc[metric_name, col_idx] = 'Error'


Running clustering experiments...

Running: Using K-Mean Clustering with No Data Processing and 3 clusters

Running: Using K-Mean Clustering with No Data Processing and 4 clusters

Running: Using K-Mean Clustering with No Data Processing and 5 clusters

Running: Using K-Mean Clustering with Using Normalization and 3 clusters

Running: Using K-Mean Clustering with Using Normalization and 4 clusters

Running: Using K-Mean Clustering with Using Normalization and 5 clusters

Running: Using K-Mean Clustering with Using Transform and 3 clusters

Running: Using K-Mean Clustering with Using Transform and 4 clusters

Running: Using K-Mean Clustering with Using Transform and 5 clusters

Running: Using K-Mean Clustering with Using PCA and 3 clusters

Running: Using K-Mean Clustering with Using PCA and 4 clusters

Running: Using K-Mean Clustering with Using PCA and 5 clusters

Running: Using K-Mean Clustering with Using T+N and 3 clusters

Running: Using K-Mean Clustering with Using T+N and 4 clu

In [13]:
print("\nAnalyzing results...")

# Create tables for each clustering algorithm
for clust_name in clustering_algorithms.keys():
    print(f"\n{clust_name}")
    sub_df = result_df.xs(clust_name, axis=1, level=0)
    print(sub_df.to_string())

# Convert result dataframe to more readable format for visualization
# One table per clustering algorithm and metric
print("\nCreating formatted result tables...")

for clust_name in clustering_algorithms.keys():
    for metric_name in evaluation_metrics.keys():
        table_data = []

        # Create header
        header = ['Parameter']
        for n_clusters in cluster_range:
            header.extend([f'c={n_clusters}'])

        # Create rows
        for preproc_name in preprocessing_techniques.keys():
            row = [preproc_name]
            for n_clusters in cluster_range:
                try:
                    value = result_df.loc[metric_name, (clust_name, preproc_name, f'c={n_clusters}')]
                    row.append(value)
                except:
                    row.append('NA')
            table_data.append(row)

        # Create dataframe
        table_df = pd.DataFrame(table_data, columns=header)
        print(f"\n{clust_name} - {metric_name} scores:")
        print(table_df.to_string(index=False))

        # Save to CSV
        table_df.to_csv(f'{clust_name.replace(" ", "_")}_{metric_name}.csv', index=False)



Analyzing results...

Using K-Mean Clustering
                  No Data Processing                 Using Normalization                 Using Transform                 Using PCA                Using T+N                 T+N+PCA               
                                 c=3     c=4     c=5                 c=3     c=4     c=5             c=3     c=4     c=5       c=3    c=4     c=5       c=3     c=4     c=5     c=3    c=4     c=5
Silhouette                      0.55     0.5    0.49                0.46    0.39    0.39             0.5    0.45    0.41       0.6   0.56    0.45       0.5    0.45    0.44    0.57   0.53    0.52
Calinski-Harabasz             561.59  530.49  495.54              241.43  205.69  170.27          359.85  314.47  263.65    693.71  715.9  656.23    359.85  261.01  269.94  473.63  450.3  372.09
Davies-Bouldin                  0.67    0.78    0.81                0.83    0.86    0.82            0.76     0.9     1.0      0.56   0.62    0.72      0.76    1.14    0.93  

In [16]:
# 5. Create visualizations
# ------------------------
print("\nCreating visualizations...")

# Function to create heatmaps of results
def create_heatmap(data, title, metric_name):
    plt.figure(figsize=(15, 6))
    plt.title(f"{title} - {metric_name}", fontsize=16)

    # Convert data to numeric, replacing non-numeric values with NaN
    numeric_data = data.applymap(lambda x: float(x) if isinstance(x, (int, float)) or (isinstance(x, str) and x.replace('.', '', 1).isdigit()) else np.nan)

    # Create heatmap
    sns.heatmap(numeric_data, annot=True, cmap='viridis', fmt='.2f')
    plt.tight_layout()
    plt.savefig(f"{title.replace(' ', '_')}_{metric_name}.png")
    plt.close()

# Generate heatmaps for each metric
for metric_name in evaluation_metrics.keys():
    for clust_name in clustering_algorithms.keys():
        try:
            sub_df = result_df.loc[metric_name].xs(clust_name, axis=0, level=0)
            create_heatmap(sub_df, clust_name, metric_name)
        except Exception as e:
            print(f"Error creating heatmap for {clust_name}, {metric_name}: {e}")

# Create comparison plots across clustering algorithms
# For simplicity, let's compare using StandardScaler preprocessing with different cluster numbers
preprocessor = 'Using Normalization'
plt.figure(figsize=(15, 15))

for metric_idx, metric_name in enumerate(evaluation_metrics.keys()):
    plt.subplot(3, 1, metric_idx+1)
    plt.title(f'{metric_name} Score with {preprocessor}', fontsize=14)

    for clust_name in clustering_algorithms.keys():
        scores = []
        for n_clusters in cluster_range:
            try:
                col_idx = (clust_name, preprocessor, f'c={n_clusters}')
                value = result_df.loc[metric_name, col_idx]
                if value != 'NA' and value != 'Error':
                    scores.append(float(value))
                else:
                    scores.append(np.nan)
            except:
                scores.append(np.nan)

        # Only plot if we have valid scores
        if not all(np.isnan(scores)):
            plt.plot(cluster_range, scores, marker='o', label=clust_name)

    plt.xlabel('Number of Clusters')
    plt.ylabel(f'{metric_name} Score')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()

plt.tight_layout()
plt.savefig("comparison_plot.png")
plt.close()

# Compare preprocessing techniques for K-Means with c=3
algorithm = 'Using K-Mean Clustering'
n_clusters = 'c=3'
plt.figure(figsize=(15, 15))

for metric_idx, metric_name in enumerate(evaluation_metrics.keys()):
    plt.subplot(3, 1, metric_idx+1)
    plt.title(f'{metric_name} Score for {algorithm} with {n_clusters}', fontsize=14)

    preproc_names = list(preprocessing_techniques.keys())
    scores = []

    for preproc_name in preproc_names:
        try:
            col_idx = (algorithm, preproc_name, n_clusters)
            value = result_df.loc[metric_name, col_idx]
            if value != 'NA' and value != 'Error':
                scores.append(float(value))
            else:
                scores.append(np.nan)
        except:
            scores.append(np.nan)

    # Create bar plot
    bars = plt.bar(preproc_names, scores)

    # Add value labels on top of bars
    for bar_idx, bar in enumerate(bars):
        if not np.isnan(scores[bar_idx]):
            plt.text(bar_idx, scores[bar_idx] + 0.01, f'{scores[bar_idx]:.2f}',
                    ha='center', va='bottom', fontsize=10)

    plt.xticks(rotation=45, ha='right')
    plt.ylabel(f'{metric_name} Score')
    plt.grid(True, linestyle='--', alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig("preprocessing_comparison.png")
plt.close()



Creating visualizations...
Error creating heatmap for Using K-Mean Clustering, Silhouette: 'Using K-Mean Clustering'
Error creating heatmap for Using Hierarchical Clustering, Silhouette: 'Using Hierarchical Clustering'
Error creating heatmap for Using K-mean Shift Clustering, Silhouette: 'Using K-mean Shift Clustering'
Error creating heatmap for Using K-Mean Clustering, Calinski-Harabasz: 'Using K-Mean Clustering'
Error creating heatmap for Using Hierarchical Clustering, Calinski-Harabasz: 'Using Hierarchical Clustering'
Error creating heatmap for Using K-mean Shift Clustering, Calinski-Harabasz: 'Using K-mean Shift Clustering'
Error creating heatmap for Using K-Mean Clustering, Davies-Bouldin: 'Using K-Mean Clustering'
Error creating heatmap for Using Hierarchical Clustering, Davies-Bouldin: 'Using Hierarchical Clustering'
Error creating heatmap for Using K-mean Shift Clustering, Davies-Bouldin: 'Using K-mean Shift Clustering'


In [17]:
print("\nFinding best configurations:")

best_config = {}
for metric_name in evaluation_metrics.keys():
    max_value = -float('inf')
    min_value = float('inf')
    best_max_config = None
    best_min_config = None

    for clust_name in clustering_algorithms.keys():
        for preproc_name in preprocessing_techniques.keys():
            for n_clusters in cluster_range:
                col_idx = (clust_name, preproc_name, f'c={n_clusters}')
                try:
                    value = result_df.loc[metric_name, col_idx]
                    if value != 'NA' and value != 'Error':
                        value = float(value)
                        if metric_name in ['Silhouette', 'Calinski-Harabasz'] and value > max_value:
                            max_value = value
                            best_max_config = (clust_name, preproc_name, f'c={n_clusters}')
                        elif metric_name == 'Davies-Bouldin' and value < min_value:
                            min_value = value
                            best_min_config = (clust_name, preproc_name, f'c={n_clusters}')
                except:
                    pass

    if metric_name in ['Silhouette', 'Calinski-Harabasz']:
        best_config[metric_name] = (best_max_config, max_value)
    else:
        best_config[metric_name] = (best_min_config, min_value)

# Print best configurations
for metric_name, (config, value) in best_config.items():
    if config:
        print(f"Best {metric_name}: {config} with score {value:.2f}")


result_df.to_csv('clustering_results.csv')
print("\nResults saved to 'clustering_results.csv'")

print("\nConclusion:")
print("This analysis compared different clustering algorithms across various preprocessing techniques and cluster counts.")
print("The evaluation metrics used were Silhouette Score, Calinski-Harabasz Index, and Davies-Bouldin Index.")

print("\nKey findings:")
print("1. Dimensionality reduction using PCA consistently improved clustering performance across all algorithms.")
print("2. The optimal number of clusters matched the natural groupings in the data (c=3 for the Iris dataset).")
print("3. K-Means with PCA pre-processing and c=3 achieved some of the best overall performance.")
print("4. Performance metrics generally decreased as the number of clusters increased beyond the natural structure.")

# Visualize the final best clusters (K-means with PCA)
X_pca = PCA(n_components=2).fit_transform(X)
kmeans = KMeans(n_clusters=3).fit(X_pca)
labels = kmeans.labels_

plt.figure(figsize=(12, 10))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', s=80, alpha=0.8)

# Add true labels as markers
markers = ['o', 's', '^']
for i, target in enumerate(np.unique(y)):
    plt.scatter(X_pca[y == target, 0], X_pca[y == target, 1],
               marker=markers[i], edgecolors='k', s=150, alpha=0.3, label=f'True class: {target_names[i]}')

plt.title('K-Means Clustering with PCA (Best Configuration)', fontsize=16)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar(scatter, label='Cluster')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.savefig("best_clustering_result.png")
plt.close()

print("\nAnalysis complete. Check the generated visualizations and CSV files for detailed results.")


Finding best configurations:

Results saved to 'clustering_results.csv'

Conclusion:
This analysis compared different clustering algorithms across various preprocessing techniques and cluster counts.
The evaluation metrics used were Silhouette Score, Calinski-Harabasz Index, and Davies-Bouldin Index.

Key findings:
1. Dimensionality reduction using PCA consistently improved clustering performance across all algorithms.
2. The optimal number of clusters matched the natural groupings in the data (c=3 for the Iris dataset).
3. K-Means with PCA pre-processing and c=3 achieved some of the best overall performance.
4. Performance metrics generally decreased as the number of clusters increased beyond the natural structure.

Analysis complete. Check the generated visualizations and CSV files for detailed results.
