In [1]:
import os
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
import torchvision
import tifffile as tiff
import matplotlib.pyplot as plt
import time
from tqdm import tqdm
from copy import deepcopy
import random
from sklearn.preprocessing import normalize

from collections import Counter
from scipy.stats import mode

In [2]:
# Set seed for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Set seed
set_seed(42)

In [3]:
# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
class SimCLR(nn.Module):
    def __init__(self, hidden_dim, lr, temperature, weight_decay,max_epochs):
        super().__init__()
        self.temperature = temperature
        
        # Load the pretrained ResNet-18 model
        self.convnet = torchvision.models.resnet18(weights='ResNet18_Weights.DEFAULT')
        
        # Modify the fully connected layer
        self.convnet.fc = nn.Sequential(
            nn.Linear(self.convnet.fc.in_features, 4 * hidden_dim),  # Linear layer with 4*hidden_dim output
            nn.ReLU(inplace=True),
            nn.Linear(4 * hidden_dim, 20)  # Output layer with hidden_dim output
        )

        self.optimizer = optim.AdamW(self.parameters(), lr=lr, weight_decay=weight_decay)
        self.lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=max_epochs, eta_min=lr / 50)

    def forward(self, x):
        return self.convnet(x)

    def info_nce_loss(self, imgs1, imgs2, device):

        imgs = torch.cat((imgs1, imgs2), dim=0)  # Concatenate along the batch dimension
        imgs = imgs.to(device)  # Move images to the device

        # Encode all images
        feats = self.forward(imgs)
    
        # Calculate cosine similarity
        cos_sim = nn.functional.cosine_similarity(feats[:, None, :], feats[None, :, :], dim=-1)
    
        # Mask out cosine similarity to itself
        self_mask = torch.eye(cos_sim.shape[0], dtype=torch.bool, device=cos_sim.device)
        cos_sim.masked_fill_(self_mask, -9e15)
    
        # Find positive example -> batch_size//2 away from the original example
        pos_mask = self_mask.roll(shifts=cos_sim.shape[0] // 2, dims=0)
    
        # Normalize similarity scores by temperature
        cos_sim = cos_sim / self.temperature

        # InfoNCE loss
        nll = -cos_sim[pos_mask] + torch.logsumexp(cos_sim, dim=-1)
        nll = nll.mean()

        # Accuracy calculations
        # Create a combination of positive and negative similarities for ranking
        comb_sim = torch.cat([cos_sim[pos_mask][:, None],  # Positive example in first position
                          cos_sim.masked_fill(pos_mask, -9e15)], dim=-1)
    
        # Sort and get the ranking position of the positive example
        sim_argsort = comb_sim.argsort(dim=-1, descending=True).argmin(dim=-1)
    
        # Compute accuracy metrics
        top1_acc = (sim_argsort == 0).float().mean()  # Top-1 accuracy
        top5_acc = (sim_argsort < 5).float().mean()   # Top-5 accuracy
        mean_pos = 1 + sim_argsort.float().mean()     # Mean position of the positive example

        return nll, top1_acc, top5_acc, mean_pos

    def train_epoch(self, train_loader, device):
        self.train()
        total_loss = 0.0
        total_top1_acc = 0.0
        total_top5_acc = 0.0
        total_mean_pos = 0.0

        for batch in tqdm(train_loader, desc="Training", leave=False):
            imgs1, imgs2, _ = batch
            imgs1, imgs2 = imgs1.to(device), imgs2.to(device)  # Move data to device
        
            self.optimizer.zero_grad()

            # Calculate loss and accuracy metrics
            loss, top1_acc, top5_acc, mean_pos = self.info_nce_loss(imgs1, imgs2, device)

            loss.backward()
            self.optimizer.step()
            #self.lr_scheduler.step()

            # Accumulate metrics
            total_loss += loss.item()
            total_top1_acc += top1_acc.item()
            total_top5_acc += top5_acc.item()
            total_mean_pos += mean_pos.item()

        avg_loss = total_loss / len(train_loader)
        avg_top1_acc = total_top1_acc / len(train_loader)
        avg_top5_acc = total_top5_acc / len(train_loader)
        avg_mean_pos = total_mean_pos / len(train_loader)

        return avg_loss, avg_top1_acc, avg_top5_acc, avg_mean_pos

    def validate_epoch(self, val_loader, device):
        self.eval()
        total_loss = 0.0
        total_top1_acc = 0.0
        total_top5_acc = 0.0
        total_mean_pos = 0.0

        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validating", leave=False):
                imgs1, imgs2, _ = batch
                imgs1, imgs2 = imgs1.to(device), imgs2.to(device)  # Move data to device

                # Calculate loss and accuracy metrics
                loss, top1_acc, top5_acc, mean_pos = self.info_nce_loss(imgs1, imgs2, device)

                # Accumulate metrics
                total_loss += loss.item()
                total_top1_acc += top1_acc.item()
                total_top5_acc += top5_acc.item()
                total_mean_pos += mean_pos.item()

        avg_loss = total_loss / len(val_loader)
        avg_top1_acc = total_top1_acc / len(val_loader)
        avg_top5_acc = total_top5_acc / len(val_loader)
        avg_mean_pos = total_mean_pos / len(val_loader)

        return avg_loss, avg_top1_acc, avg_top5_acc, avg_mean_pos
    

    def inference_epoch(self, inference_loader, device):
        self.eval()
        total_loss = 0.0
        total_top1_acc = 0.0
        total_top5_acc = 0.0
        total_mean_pos = 0.0

        with torch.no_grad():
            for batch in tqdm(inference_loader, desc="Inferencing", leave=False):
                imgs1, imgs2, _ = batch
                imgs1, imgs2 = imgs1.to(device), imgs2.to(device)  # Move data to device

                # Calculate loss and accuracy metrics
                loss, top1_acc, top5_acc, mean_pos = self.info_nce_loss(imgs1, imgs2, device)

                # Accumulate metrics
                total_loss += loss.item()
                total_top1_acc += top1_acc.item()
                total_top5_acc += top5_acc.item()
                total_mean_pos += mean_pos.item()

        avg_loss = total_loss / len(inference_loader)
        avg_top1_acc = total_top1_acc / len(inference_loader)
        avg_top5_acc = total_top5_acc / len(inference_loader)
        avg_mean_pos = total_mean_pos / len(inference_loader)

        return avg_loss, avg_top1_acc, avg_top5_acc, avg_mean_pos

In [5]:
full_model_path =  r'C:\Users\k54739\saved_model\simclr_strongcrop_245.pth' 
simclr_model = SimCLR(hidden_dim=128, lr=5e-4, temperature=0.07, weight_decay=1e-4,max_epochs=245)
simclr_model.load_state_dict(torch.load(full_model_path))
simclr_model.to(device)
simclr_model.eval()

  simclr_model.load_state_dict(torch.load(full_model_path))


SimCLR(
  (convnet): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_runn

# Clustering

In [6]:
class LabeledImageDataset(Dataset):
    def __init__(self, image_files, labels, transform=None):
        self.image_files = image_files
        self.labels = labels
        #self.transform = transform
        self.resize_transform = transforms.Resize((96, 96))

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_path = self.image_files[idx]
        image = tiff.imread(img_path)

        # Ensure the image has 3 layers (channels)
        if image.shape[0] != 3:
            raise ValueError(f"Image {img_path} does not have exactly 3 layers.")

        # Normalize the 16-bit image to [0, 1]
        image = image.astype(np.float32) / 65535.0

        # Convert to a torch tensor and add channel dimension
        image = torch.tensor(image, dtype=torch.float32)
        
        # Apply resize transform
        image = self.resize_transform(image)

        label = self.labels[idx]

        return image, label

def load_data(root_dir):

    classes = ['cond7_all', 'sd_only','ex'] #full dataset
    #classes = ['cond7_curated', 'sd_only','ex'] # curated full dataset


    #classes = ['uncure_cond7_40', 'sdonly_40','ex'] # 40 subset # 97.5
    #classes = ['cure_cond7_40', 'sdonly_40','ex'] # curated 40 subset

    #inference
    #classes = ['cond7_all', 'sd_plus_dsclose','ex'] #full dataset
    #classes = ['cond7_curated', 'sd_plus_dsclose','ex'] # curated full dataset



    #classes = ['uncure_cond7_40', 'sd_plus_dsclose_40','ex'] # curated 40 subset inference
    #classes = ['cure_cond7_40', 'sd_plus_dsclose_40','ex'] # curated 40 subset inference

    image_files = []
    labels = []

    for idx, class_name in enumerate(classes):
        class_dir = os.path.join(root_dir, class_name)
        files = [os.path.join(class_dir, file) for file in os.listdir(class_dir) if file.endswith(('.tiff', '.tif'))]
        image_files.extend(files)
        labels.extend([idx] * len(files))
    
    # Check if the labels correctly reflect the classes
    print("Label distribution:", {classes[i]: labels.count(i) for i in range(len(classes))})

    return image_files, labels


# Directories for labeled data
image_dir = r"C:\Users\k54739\Bibi_new_thesis\thesis\classification"

# Load data
image_files, labels = load_data(image_dir)

# Create the labeled datasets
labeled_dataset = LabeledImageDataset(image_files, labels)

# Define DataLoaders
batch_size = 16
loader_labeled = DataLoader(labeled_dataset, batch_size=batch_size, shuffle=True, drop_last=False, pin_memory=True, num_workers=0)

Label distribution: {'cond7_all': 472, 'sd_only': 103, 'ex': 40}


In [7]:
for anchor,label in loader_labeled:
    print(anchor.shape, label.shape, anchor.dtype)
    print(label)
    break

torch.Size([16, 3, 96, 96]) torch.Size([16]) torch.float32
tensor([0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2])


In [8]:
@torch.no_grad()
def prepare_data_features(model, dataloader):
    # Prepare model
    network = deepcopy(model.convnet)
    #network.fc = nn.Identity()  # Removing projection head g(.)
    network.eval()
    network.to(device)

    # Encode all images
    feats, labels = [], []
    for batch_imgs, batch_labels in tqdm(dataloader):
        batch_imgs = batch_imgs.to(device)
        batch_feats = network(batch_imgs)
        print(f"Batch features shape: {batch_feats.shape}")
        print(f"Batch labels shape: {batch_labels.shape}")
        
        feats.append(batch_feats.detach().cpu())
        labels.append(batch_labels)

    feats = torch.cat(feats, dim=0)
    labels = torch.cat(labels, dim=0)
    
    print(f"Features shape after concatenation: {feats.shape}")
    print(f"Labels shape after concatenation: {labels.shape}")

    return torch.utils.data.TensorDataset(feats, labels)

In [9]:
# Extract features for train and test datasets
feats_simclr = prepare_data_features(simclr_model, loader_labeled)

  3%|▎         | 1/39 [00:00<00:34,  1.11it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


  5%|▌         | 2/39 [00:01<00:30,  1.22it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


  8%|▊         | 3/39 [00:02<00:28,  1.25it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 10%|█         | 4/39 [00:03<00:27,  1.29it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 13%|█▎        | 5/39 [00:03<00:26,  1.31it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 15%|█▌        | 6/39 [00:04<00:24,  1.33it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 18%|█▊        | 7/39 [00:05<00:23,  1.34it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 21%|██        | 8/39 [00:06<00:23,  1.32it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 23%|██▎       | 9/39 [00:06<00:22,  1.32it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 26%|██▌       | 10/39 [00:07<00:22,  1.30it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 28%|██▊       | 11/39 [00:08<00:22,  1.27it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 31%|███       | 12/39 [00:09<00:21,  1.25it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 33%|███▎      | 13/39 [00:10<00:21,  1.20it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 36%|███▌      | 14/39 [00:11<00:20,  1.23it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 38%|███▊      | 15/39 [00:11<00:19,  1.25it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 41%|████      | 16/39 [00:12<00:17,  1.29it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 44%|████▎     | 17/39 [00:13<00:17,  1.29it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 46%|████▌     | 18/39 [00:14<00:16,  1.30it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 49%|████▊     | 19/39 [00:14<00:15,  1.29it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 51%|█████▏    | 20/39 [00:15<00:14,  1.31it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 54%|█████▍    | 21/39 [00:16<00:14,  1.28it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 56%|█████▋    | 22/39 [00:17<00:13,  1.27it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 59%|█████▉    | 23/39 [00:18<00:12,  1.25it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 62%|██████▏   | 24/39 [00:18<00:12,  1.25it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 64%|██████▍   | 25/39 [00:19<00:11,  1.24it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 67%|██████▋   | 26/39 [00:20<00:10,  1.25it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 69%|██████▉   | 27/39 [00:21<00:09,  1.24it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 72%|███████▏  | 28/39 [00:22<00:08,  1.23it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 74%|███████▍  | 29/39 [00:22<00:08,  1.24it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 77%|███████▋  | 30/39 [00:23<00:07,  1.25it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 79%|███████▉  | 31/39 [00:24<00:06,  1.25it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 82%|████████▏ | 32/39 [00:25<00:05,  1.24it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 85%|████████▍ | 33/39 [00:26<00:04,  1.25it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 87%|████████▋ | 34/39 [00:26<00:03,  1.25it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 90%|████████▉ | 35/39 [00:27<00:03,  1.25it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 92%|█████████▏| 36/39 [00:28<00:02,  1.25it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 95%|█████████▍| 37/39 [00:29<00:01,  1.27it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


 97%|█████████▋| 38/39 [00:30<00:00,  1.27it/s]

Batch features shape: torch.Size([16, 20])
Batch labels shape: torch.Size([16])


100%|██████████| 39/39 [00:30<00:00,  1.28it/s]

Batch features shape: torch.Size([7, 20])
Batch labels shape: torch.Size([7])
Features shape after concatenation: torch.Size([615, 20])
Labels shape after concatenation: torch.Size([615])





In [10]:
# Convert features and labels to NumPy arrays
feats_np = feats_simclr.tensors[0].numpy()  # Features in shape (60, 512)
feats_np_norm = normalize(feats_np, axis=1)
labels_np = feats_simclr.tensors[1].numpy()  # Corresponding labels

# Check the shapes
print("Shape of features (for K-Means):", feats_np.shape)
print("Shape of labels:", labels_np.shape)

Shape of features (for K-Means): (615, 20)
Shape of labels: (615,)


In [11]:
from collections import Counter
import numpy as np
from sklearn.preprocessing import normalize

# Functions: K-Means initialization, assignment, and computation
def kMeans_init_centroids(X, K):
    """Initialize centroids randomly from the dataset."""
    randidx = np.random.permutation(X.shape[0])
    centroids = X[randidx[:K]]
    return centroids

def find_closest_centroids(X, centroids):
    """Assign data points to closest centroids using cosine similarity."""
    idx = np.zeros(X.shape[0], dtype=int)
    for i in range(X.shape[0]):
        similarities = np.dot(centroids, X[i])  # Cosine similarity
        idx[i] = np.argmax(similarities)  # Assign to the most similar centroid
    return idx

def compute_centroids(X, idx, K):
    """Compute new centroids based on assigned clusters."""
    centroids = np.zeros((K, X.shape[1]))
    for k in range(K):
        points = X[idx == k]
        if len(points) > 0:
            centroids[k] = np.mean(points, axis=0)
    centroids = normalize(centroids, axis=1)  # Normalize centroids
    return centroids

def run_kMeans(X, initial_centroids, max_iters=10):
    """Run the K-Means algorithm."""
    K = initial_centroids.shape[0]
    centroids = initial_centroids
    idx = np.zeros(X.shape[0], dtype=int)
    for _ in range(max_iters):
        idx = find_closest_centroids(X, centroids)
        centroids = compute_centroids(X, idx, K)
    return centroids, idx

# Accuracy calculation and main experiment loop
def calculate_accuracy(idx, labels_np, K=3):
    """Calculate accuracy for a given set of cluster assignments."""
    total_samples = len(labels_np)
    cluster_label_counts = {}
    label_to_cluster = {}
    correctly_classified = 0

    # Count true labels in each cluster
    for cluster in range(K):
        cluster_indices = np.where(idx == cluster)[0]
        cluster_labels = labels_np[cluster_indices]
        label_counts = Counter(cluster_labels)
        cluster_label_counts[cluster] = label_counts
    
    # Assign each label to the cluster where it is most common
    for label in range(3):  # Assuming 3 classes: 0, 1, 2
        max_count = 0
        assigned_cluster = None
        for cluster, label_counts in cluster_label_counts.items():
            if label_counts[label] > max_count:
                max_count = label_counts[label]
                assigned_cluster = cluster
        if assigned_cluster is not None:
            label_to_cluster[label] = assigned_cluster
            correctly_classified += max_count

    accuracy = correctly_classified / total_samples
    return accuracy, cluster_label_counts 

'''# Accuracy calculation and main experiment loop
def calculate_accuracy(idx, labels_np, K=3):
    """Calculate accuracy for a given set of cluster assignments."""
    total_samples = len(labels_np)
    cluster_label_counts = {}
    label_to_cluster = {}
    correctly_classified = 0

    # Count true labels in each cluster
    for cluster in range(K):
        cluster_indices = np.where(idx == cluster)[0]
        cluster_labels = labels_np[cluster_indices]
        label_counts = Counter(cluster_labels)
        cluster_label_counts[cluster] = label_counts
    
    # Track clusters already assigned to labels
    assigned_clusters = set()

    # Assign each label to the cluster where it is most common, respecting the rule
    for label in range(3):  # Assuming 3 classes: 0, 1, 2
        max_count = 0
        assigned_cluster = None
        for cluster, label_counts in cluster_label_counts.items():
            # Skip clusters already assigned to another label
            if cluster in assigned_clusters:
                #continue
            if label_counts[label] > max_count:
                max_count = label_counts[label]
                assigned_cluster = cluster
        # Assign this label to the cluster and mark the cluster as used
        if assigned_cluster is not None:
            label_to_cluster[label] = assigned_cluster
            assigned_clusters.add(assigned_cluster)
            correctly_classified += max_count

    accuracy = correctly_classified / total_samples
    return accuracy, cluster_label_counts'''

def main_kMeans_experiment(X, labels_np, K=3, num_runs=100, max_iters=50):
    accuracies = []
    best_accuracy = 0
    best_cluster_counts = None
    best_centroids = None
    best_idx = None

    for run in range(num_runs):
        # Initialize centroids randomly
        initial_centroids = kMeans_init_centroids(X, K)
        
        # Run K-Means
        centroids, idx = run_kMeans(X, initial_centroids, max_iters)
        
        # Calculate accuracy
        accuracy, cluster_counts = calculate_accuracy(idx, labels_np, K)
        accuracies.append(accuracy)
        
        # Track the best run
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_cluster_counts = cluster_counts
            best_centroids = centroids
            best_idx = idx
        
        print(f"Run {run + 1}/{num_runs}, Accuracy: {accuracy:.4f}")

    # Print final results
    print("\n=== Final Results ===")
    print(f"Max Accuracy: {best_accuracy:.4f}")
    print("Cluster Descriptions:")
    for cluster, counts in best_cluster_counts.items():
        print(f"Cluster {cluster} has true labels: {counts}")
    
    # Return the best centroids and idx along with accuracy
    return best_accuracy, best_centroids, best_idx, best_cluster_counts


'''
# Main loop for running K-Means multiple times
def main_kMeans_experiment(X, labels_np, K=3, num_runs=100, max_iters=50):
    accuracies = []
    best_accuracy = 0
    best_cluster_counts = None

    for run in range(num_runs):
        # Initialize centroids randomly
        initial_centroids = kMeans_init_centroids(X, K)
        
        # Run K-Means
        centroids, idx = run_kMeans(X, initial_centroids, max_iters)
        
        # Calculate accuracy
        accuracy, cluster_counts = calculate_accuracy(idx, labels_np, K)
        accuracies.append(accuracy)
        
        # Track the best run
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_cluster_counts = cluster_counts
        
        print(f"Run {run + 1}/{num_runs}, Accuracy: {accuracy:.4f}")

    # Print final results
    print("\n=== Final Results ===")
    print(f"Max Accuracy: {best_accuracy:.4f}")
    print("Cluster Descriptions:")
    for cluster, counts in best_cluster_counts.items():
        print(f"Cluster {cluster} has true labels: {counts}")'''

    

# Example usage
# Assuming `feats_np_norm` is your normalized feature matrix and `labels_np` contains true labels
K = 3  # Number of clusters
num_runs = 400  # Number of K-Means runs
max_iters = 50  # Maximum iterations per run

best_accuracy, best_centroids, best_idx, best_cluster_counts = main_kMeans_experiment(feats_np_norm, labels_np, K, num_runs, max_iters)


Run 1/400, Accuracy: 0.7073
Run 2/400, Accuracy: 0.7171
Run 3/400, Accuracy: 0.6211
Run 4/400, Accuracy: 0.6033
Run 5/400, Accuracy: 0.6423
Run 6/400, Accuracy: 0.6211
Run 7/400, Accuracy: 0.6179
Run 8/400, Accuracy: 0.6423
Run 9/400, Accuracy: 0.6455
Run 10/400, Accuracy: 0.6244
Run 11/400, Accuracy: 0.6407
Run 12/400, Accuracy: 0.5854
Run 13/400, Accuracy: 0.6520
Run 14/400, Accuracy: 0.6016
Run 15/400, Accuracy: 0.5984
Run 16/400, Accuracy: 0.5854
Run 17/400, Accuracy: 0.6423
Run 18/400, Accuracy: 0.5447
Run 19/400, Accuracy: 0.6049
Run 20/400, Accuracy: 0.6439
Run 21/400, Accuracy: 0.5447
Run 22/400, Accuracy: 0.7187
Run 23/400, Accuracy: 0.4407
Run 24/400, Accuracy: 0.6472
Run 25/400, Accuracy: 0.5919
Run 26/400, Accuracy: 0.6423
Run 27/400, Accuracy: 0.7252
Run 28/400, Accuracy: 0.5967
Run 29/400, Accuracy: 0.5447
Run 30/400, Accuracy: 0.4667
Run 31/400, Accuracy: 0.6244
Run 32/400, Accuracy: 0.5854
Run 33/400, Accuracy: 0.5382
Run 34/400, Accuracy: 0.5447
Run 35/400, Accuracy: 0

In [12]:
374+40

414

In [13]:
414/615

0.6731707317073171

In [14]:
447/615

0.7268292682926829

In [15]:
21+31+37

89

In [16]:
89/120

0.7416666666666667

In [17]:
216+69

285

93+104+33

93+132+6

231/423

In [18]:
from collections import Counter
import numpy as np
from sklearn.preprocessing import normalize

# Functions: K-Means initialization, assignment, and computation
def kMeans_init_centroids(X, K):
    """Initialize centroids randomly from the dataset."""
    randidx = np.random.permutation(X.shape[0])
    centroids = X[randidx[:K]]
    return centroids

def find_closest_centroids(X, centroids):
    """Assign data points to closest centroids using cosine similarity."""
    idx = np.zeros(X.shape[0], dtype=int)
    for i in range(X.shape[0]):
        similarities = np.dot(centroids, X[i])  # Cosine similarity
        idx[i] = np.argmax(similarities)  # Assign to the most similar centroid
    return idx

def compute_centroids(X, idx, K):
    """Compute new centroids based on assigned clusters."""
    centroids = np.zeros((K, X.shape[1]))
    for k in range(K):
        points = X[idx == k]
        if len(points) > 0:
            centroids[k] = np.mean(points, axis=0)
    centroids = normalize(centroids, axis=1)  # Normalize centroids
    return centroids

def run_kMeans(X, initial_centroids, max_iters=10):
    """Run the K-Means algorithm."""
    K = initial_centroids.shape[0]
    centroids = initial_centroids
    idx = np.zeros(X.shape[0], dtype=int)
    for _ in range(max_iters):
        idx = find_closest_centroids(X, centroids)
        centroids = compute_centroids(X, idx, K)
    return centroids, idx

'''# Accuracy calculation and main experiment loop
def calculate_accuracy(idx, labels_np, K=3):
    """Calculate accuracy for a given set of cluster assignments."""
    total_samples = len(labels_np)
    cluster_label_counts = {}
    label_to_cluster = {}
    correctly_classified = 0

    # Count true labels in each cluster
    for cluster in range(K):
        cluster_indices = np.where(idx == cluster)[0]
        cluster_labels = labels_np[cluster_indices]
        label_counts = Counter(cluster_labels)
        cluster_label_counts[cluster] = label_counts
    
    # Assign each label to the cluster where it is most common
    for label in range(3):  # Assuming 3 classes: 0, 1, 2
        max_count = 0
        assigned_cluster = None
        for cluster, label_counts in cluster_label_counts.items():
            if label_counts[label] > max_count:
                max_count = label_counts[label]
                assigned_cluster = cluster
        if assigned_cluster is not None:
            label_to_cluster[label] = assigned_cluster
            correctly_classified += max_count

    accuracy = correctly_classified / total_samples
    return accuracy, cluster_label_counts '''

# Accuracy calculation and main experiment loop
def calculate_accuracy(idx, labels_np, K=3):
    """Calculate accuracy for a given set of cluster assignments."""
    total_samples = len(labels_np)
    cluster_label_counts = {}
    label_to_cluster = {}
    correctly_classified = 0

    # Count true labels in each cluster
    for cluster in range(K):
        cluster_indices = np.where(idx == cluster)[0]
        cluster_labels = labels_np[cluster_indices]
        label_counts = Counter(cluster_labels)
        cluster_label_counts[cluster] = label_counts
    
    # Track clusters already assigned to labels
    assigned_clusters = set()

    # Assign each label to the cluster where it is most common, respecting the rule
    for label in range(3):  # Assuming 3 classes: 0, 1, 2
        max_count = 0
        assigned_cluster = None
        for cluster, label_counts in cluster_label_counts.items():
            # Skip clusters already assigned to another label
            if cluster in assigned_clusters:
                continue
            if label_counts[label] > max_count:
                max_count = label_counts[label]
                assigned_cluster = cluster
        # Assign this label to the cluster and mark the cluster as used
        if assigned_cluster is not None:
            label_to_cluster[label] = assigned_cluster
            assigned_clusters.add(assigned_cluster)
            correctly_classified += max_count

    accuracy = correctly_classified / total_samples
    return accuracy, cluster_label_counts

def main_kMeans_experiment(X, labels_np, K=3, num_runs=100, max_iters=50):
    accuracies = []
    best_accuracy = 0
    best_cluster_counts = None
    best_centroids = None
    best_idx = None

    for run in range(num_runs):
        # Initialize centroids randomly
        initial_centroids = kMeans_init_centroids(X, K)
        
        # Run K-Means
        centroids, idx = run_kMeans(X, initial_centroids, max_iters)
        
        # Calculate accuracy
        accuracy, cluster_counts = calculate_accuracy(idx, labels_np, K)
        accuracies.append(accuracy)
        
        # Track the best run
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_cluster_counts = cluster_counts
            best_centroids = centroids
            best_idx = idx
        
        print(f"Run {run + 1}/{num_runs}, Accuracy: {accuracy:.4f}")

    # Print final results
    print("\n=== Final Results ===")
    print(f"Max Accuracy: {best_accuracy:.4f}")
    print("Cluster Descriptions:")
    for cluster, counts in best_cluster_counts.items():
        print(f"Cluster {cluster} has true labels: {counts}")
    
    # Return the best centroids and idx along with accuracy
    return best_accuracy, best_centroids, best_idx, best_cluster_counts


'''
# Main loop for running K-Means multiple times
def main_kMeans_experiment(X, labels_np, K=3, num_runs=100, max_iters=50):
    accuracies = []
    best_accuracy = 0
    best_cluster_counts = None

    for run in range(num_runs):
        # Initialize centroids randomly
        initial_centroids = kMeans_init_centroids(X, K)
        
        # Run K-Means
        centroids, idx = run_kMeans(X, initial_centroids, max_iters)
        
        # Calculate accuracy
        accuracy, cluster_counts = calculate_accuracy(idx, labels_np, K)
        accuracies.append(accuracy)
        
        # Track the best run
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_cluster_counts = cluster_counts
        
        print(f"Run {run + 1}/{num_runs}, Accuracy: {accuracy:.4f}")

    # Print final results
    print("\n=== Final Results ===")
    print(f"Max Accuracy: {best_accuracy:.4f}")
    print("Cluster Descriptions:")
    for cluster, counts in best_cluster_counts.items():
        print(f"Cluster {cluster} has true labels: {counts}")'''

    

# Example usage
# Assuming `feats_np_norm` is your normalized feature matrix and `labels_np` contains true labels
K = 3  # Number of clusters
num_runs = 400  # Number of K-Means runs
max_iters = 50  # Maximum iterations per run

best_accuracy, best_centroids, best_idx, best_cluster_counts = main_kMeans_experiment(feats_np_norm, labels_np, K, num_runs, max_iters)


Run 1/400, Accuracy: 0.6423
Run 2/400, Accuracy: 0.5919
Run 3/400, Accuracy: 0.6244
Run 4/400, Accuracy: 0.5886
Run 5/400, Accuracy: 0.6098
Run 6/400, Accuracy: 0.6098
Run 7/400, Accuracy: 0.6000
Run 8/400, Accuracy: 0.7252
Run 9/400, Accuracy: 0.3837
Run 10/400, Accuracy: 0.5171
Run 11/400, Accuracy: 0.6423
Run 12/400, Accuracy: 0.6602
Run 13/400, Accuracy: 0.5317
Run 14/400, Accuracy: 0.5171
Run 15/400, Accuracy: 0.5122
Run 16/400, Accuracy: 0.6098
Run 17/400, Accuracy: 0.5854
Run 18/400, Accuracy: 0.4602
Run 19/400, Accuracy: 0.6423
Run 20/400, Accuracy: 0.5268
Run 21/400, Accuracy: 0.5171
Run 22/400, Accuracy: 0.5902
Run 23/400, Accuracy: 0.5854
Run 24/400, Accuracy: 0.5935
Run 25/400, Accuracy: 0.5967
Run 26/400, Accuracy: 0.6423
Run 27/400, Accuracy: 0.6033
Run 28/400, Accuracy: 0.7089
Run 29/400, Accuracy: 0.6000
Run 30/400, Accuracy: 0.5187
Run 31/400, Accuracy: 0.5919
Run 32/400, Accuracy: 0.5122
Run 33/400, Accuracy: 0.5854
Run 34/400, Accuracy: 0.6016
Run 35/400, Accuracy: 0

In [19]:
103+373+40


516

In [20]:
516/615

0.8390243902439024

In [21]:
298/615

0.4845528455284553

In [22]:
ss

NameError: name 'ss' is not defined

In [None]:
85+137+11

In [None]:
233/423

In [None]:
284/615

In [None]:
425+125+40

In [None]:
590/637

In [None]:
124+406+40

In [None]:
570/637

In [None]:
from collections import Counter
import numpy as np

# Number of clusters
K = len(np.unique(best_idx))

# Total number of samples
total_samples = len(labels_np)

# Dictionary to track the cluster assigned to each label
label_to_cluster = {}

# Variable to count correctly classified samples
correctly_classified = 0

# Iterate over each cluster and count the true labels in that cluster
cluster_label_counts = {}  # To store the counts for each cluster
for cluster in range(K):
    # Find the indices of images assigned to the current cluster
    cluster_indices = np.where(best_idx == cluster)[0]
    
    # Get the true labels for the images in this cluster
    cluster_labels = labels_np[cluster_indices]
    
    # Use Counter to count occurrences of each label in the cluster
    label_counts = Counter(cluster_labels)
    
    # Save label counts for this cluster
    cluster_label_counts[cluster] = label_counts

# Track clusters already assigned to labels
assigned_clusters = set()

# Assign each label to the cluster where it is most common, respecting the rule
for label in range(3):  # Assuming 3 classes: 0, 1, 2
    max_count = 0
    assigned_cluster = None
    for cluster, label_counts in cluster_label_counts.items():
        # Skip clusters already assigned to another label
        if cluster in assigned_clusters:
            continue
        if label_counts[label] > max_count:
            max_count = label_counts[label]
            assigned_cluster = cluster
    # Assign this label to the cluster and mark the cluster as used
    if assigned_cluster is not None:
        label_to_cluster[label] = assigned_cluster
        assigned_clusters.add(assigned_cluster)
        correctly_classified += max_count

# Calculate accuracy
accuracy = correctly_classified / total_samples

print(f"Accuracy: {accuracy:.4f}")


In [None]:
# Track clusters already assigned to labels
assigned_clusters = set()

if cluster in assigned_clusters:
            continue

assigned_clusters.add(assigned_cluster)

SKLEARN

In [150]:
from sklearn.metrics.pairwise import cosine_similarity

In [151]:
# Define KMeansCosine class
class KMeansCosine:
    def __init__(self, n_clusters=2, max_iter=300, random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state

    def fit(self, X):
        # Normalize the data to unit vectors
        X_normalized = X / np.linalg.norm(X, axis=1)[:, np.newaxis]

        # Initialize centroids randomly from the data points
        np.random.seed(self.random_state)
        initial_indices = np.random.choice(X_normalized.shape[0], self.n_clusters, replace=False)
        centroids = X_normalized[initial_indices]

        # Track the minimum cost and corresponding labels/centroids
        best_cost = float('inf')
        best_labels = None
        best_centroids = None


        for _ in range(self.max_iter):
            # Compute the cosine similarity and distance
            similarities = cosine_similarity(X_normalized, centroids)
            cosine_dist = 1 - similarities
            cost = np.sum(cosine_dist)
            #print(f"Iteration cost: {cost}")

            # Update best cost and corresponding labels/centroids if the current cost is lower
            if cost < best_cost:
                best_cost = cost
                best_labels = np.argmax(similarities, axis=1)
                best_centroids = centroids.copy()

            # Assign clusters based on the highest similarity (lowest distance)
            labels = np.argmax(similarities, axis=1)

            # Update centroids by taking the mean of the points in each cluster
            new_centroids = np.array([X_normalized[labels == i].mean(axis=0) for i in range(self.n_clusters)])
            new_centroids /= np.linalg.norm(new_centroids, axis=1)[:, np.newaxis]

            # Check for convergence (if centroids do not change)
            if np.allclose(centroids, new_centroids, atol=1e-6):  # Use np.allclose for numerical stability
                break

            centroids = new_centroids

        # Store final centroids and labels
        self.labels_ = labels
        self.centroids_ = centroids
        self.best_labels_ = best_labels
        self.best_centroids_ = best_centroids
        self.best_cost_ = best_cost

In [152]:
kmeans_cosine = KMeansCosine(n_clusters=3, max_iter=100, random_state=11)
kmeans_cosine.fit(feats_np)

#print("Final Cluster Labels:", kmeans_cosine.labels_)
#print("Final Centroids:", kmeans_cosine.centroids_)
#print("Best Cluster Labels with Lowest Cost:", kmeans_cosine.best_labels_)
#print("Best Centroids with Lowest Cost:", kmeans_cosine.best_centroids_)
#print("Lowest Cost:", kmeans_cosine.best_cost_)

In [None]:
idx = kmeans_cosine.labels_
idx

In [None]:
# `idx` contains the cluster assignments from KMeans
#  `labels` contains the true labels

K = len(np.unique(idx))  # Number of clusters

# Iterate over each cluster and count the true labels in that cluster
for cluster in range(K):
    # Find the indices of images assigned to the current cluster
    cluster_indices = np.where(idx == cluster)[0]
    
    # Get the true labels for the images in this cluster
    cluster_labels = labels_np[cluster_indices]
    
    # Use Counter to count occurrences of each label in the cluster
    label_counts = Counter(cluster_labels)
    
    # Print the result
    print(f"Cluster {cluster} has true labels: {label_counts}")


In [155]:
num = 49+ 164

In [None]:
num/615

full Dataset
1 = 58.7
2 = 55.60
3 = 

In [157]:
# Function to reorder K-Means labels to match true labels
def reorder_labels(true_labels, predicted_labels):
    reordered_labels = np.zeros_like(predicted_labels)

    for cluster in np.unique(predicted_labels):
        mask = (predicted_labels == cluster)
        # Use mode and handle cases where mode() returns a scalar
        most_common_label = mode(true_labels[mask], axis=None).mode  # Get the mode for the current cluster
        if isinstance(most_common_label, np.ndarray):
            most_common_label = most_common_label[0]  # Safely extract the mode value if it's an array
        
        reordered_labels[mask] = most_common_label

    return reordered_labels

In [None]:
# Apply this after running K-Means
reordered_idx = reorder_labels(labels_np, idx)

# Now you can compare `reordered_idx` with `labels` to evaluate accuracy
accuracy = np.sum(reordered_idx == labels_np) / len(labels_np) * 100
print(f"Accuracy: {accuracy:.2f}%")

From scratch

In [159]:
# K-Means initialization function
def kMeans_init_centroids(X, K):
    randidx = np.random.permutation(X.shape[0])
    centroids = X[randidx[:K]]
    return centroids

# Function to find the closest centroids using Cosine Similarity
def find_closest_centroids(X, centroids):
    # Normalize both the data points and centroids to ensure we compute cosine similarity
    #X_norm = normalize(X, axis=1)
    #centroids_norm = normalize(centroids, axis=1)
    
    # Assign data points to closest centroids based on cosine similarity
    idx = np.zeros(X.shape[0], dtype=int)
    for i in range(X.shape[0]):
        # Compute cosine similarity
        similarities = np.dot(centroids, X[i])  # Dot product gives cosine similarity
        idx[i] = np.argmax(similarities)  # We want the most similar (highest value)
    return idx

# Function to compute new centroids
def compute_centroids(X, idx, K):
    centroids = np.zeros((K, X.shape[1]))  # Use shape[1] for features
    for k in range(K):
        points = X[idx == k]
        if len(points) > 0:
            centroids[k] = np.mean(points, axis=0)   # noralised vectors mean maynot be normalised. hence we normalise before calculating mean.https://chatgpt.com/share/671b97a7-ec2c-8010-af33-af106df0a25c
            centroids_norm = normalize(centroids, axis=1)
    return centroids_norm

# Function to run K-Means algorithm with cost tracking (using Cosine Similarity)
def run_kMeans(X, initial_centroids, max_iters=10):
    K = initial_centroids.shape[0]
    centroids = initial_centroids
    idx = np.zeros(X.shape[0])
    


    for i in range(max_iters):
        #print(f"K-Means iteration {i}/{max_iters - 1}")
        
        # Assign each data point to the closest centroid using cosine similarity
        idx = find_closest_centroids(X, centroids)
        
        # Compute new centroids
        centroids = compute_centroids(X, idx, K)

        # Calculate cost function for the current centroids using cosine distance

        # 1. Compute cosine similarity
        sim = np.dot(X, centroids.T)
        #print(sim.shape)
        #print(sim)
        # 2. Calculate cosine distance
        cosine_dist = 1 - sim
        #print(cosine_dist.shape)
        #print(cosine_dist)
        # 3. Find maximum cosine distance for each data point
        #max_cosine_dist = cosine_dist.max(axis=1)
        #print(max_cosine_dist.shape)
        #print(max_cosine_dist)
        cost = np.sum(cosine_dist)

        # 4. Sum of all maximum distances
        #cost = np.sum(max_cosine_dist)

        #cost = np.sum(1 - np.dot(X_norm, centroids_norm.T).max(axis=1))  # Cosine distance = 1 - cosine similarity  
        #print(f"Cost function value: {cost}")  # Print the cost function value

        

   # Indicate which iteration was chosen
    return centroids,idx  # Return the best centroids


In [None]:
# Main function to run the K-Means algorithm

K = 3                     # Set number of clusters
initial_centroids = kMeans_init_centroids(feats_np_norm, K)  # Step 3: Initialize centroids
max_iters = 50                # Step 4: Number of iterations
centroids, idx = run_kMeans(feats_np_norm, initial_centroids, max_iters)  # Step 5: Run K-Means
#print("Final centroids:", centroids)  # Output the final centroids

from collections import Counter
import numpy as np

# Assuming `idx` contains the cluster assignments from KMeans
# and `labels` contains the true labels

K = len(np.unique(idx))  # Number of clusters

# Iterate over each cluster and count the true labels in that cluster
for cluster in range(K):
    # Find the indices of images assigned to the current cluster
    cluster_indices = np.where(idx == cluster)[0]
    
    # Get the true labels for the images in this cluster
    cluster_labels = labels_np[cluster_indices]
    
    # Use Counter to count occurrences of each label in the cluster
    label_counts = Counter(cluster_labels)
    
    # Print the result
    print(f"Cluster {cluster} has true labels: {label_counts}")

In [None]:
num = 63+191
num/(280+40+103)

In [None]:
after full 
56.26
60.04

In [None]:
after curated full 
48.22
46.80
49.17
53.12




In [None]:
# Apply this after running K-Means
reordered_idx = reorder_labels(labels_np, idx)

# Now you can compare `reordered_idx` with `labels` to evaluate accuracy
accuracy = np.sum(reordered_idx == labels_np) / len(labels_np) * 100
print(f"Accuracy: {accuracy:.2f}%")

In [None]:
after uncure 40
75.83

In [None]:
num = 103 + 413 + 40
num/615

In [None]:
full Dataset
1 = 50.57
2= 55.
3. 58
4= 51.86
5 = 54.95
6 = 53.33
7 = 90.4

In [None]:
strong curated full

one = 62.41,
two = 62.41,
three = 68.08,
four= 85.58,
5 = 62.17,
6 = 68.08,
7 = 62.88,
8 = 68.08
9 = 62.17
10 = 62.17
11 = 100


In [344]:
from scipy.stats import mode
import numpy as np

# Function to reorder K-Means labels to match true labels
def reorder_labels(true_labels, predicted_labels):
    reordered_labels = np.zeros_like(predicted_labels)

    for cluster in np.unique(predicted_labels):
        mask = (predicted_labels == cluster)
        # Use mode and handle cases where mode() returns a scalar
        most_common_label = mode(true_labels[mask], axis=None).mode  # Get the mode for the current cluster
        if isinstance(most_common_label, np.ndarray):
            most_common_label = most_common_label[0]  # Safely extract the mode value if it's an array
        
        reordered_labels[mask] = most_common_label

    return reordered_labels




In [None]:
# Apply this after running K-Means
reordered_idx = reorder_labels(labels_np, idx)

# Now you can compare `reordered_idx` with `labels` to evaluate accuracy
accuracy = np.sum(reordered_idx == labels_np) / len(labels_np) * 100
print(f"Accuracy: {accuracy:.2f}%")

In [None]:
after strong cure 40
3 = 70.83


In [347]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [149]:
# Function to reduce the dimensionality of data points and centroids to 2D using PCA
def reduce_to_2D_pca(X, centroids):
    # Initialize PCA with 2 components
    pca = PCA(n_components=2)
    
    # Apply PCA to the data points (X) and centroids
    X_2D = pca.fit_transform(X)  # Reducing original data points
    centroids_2D = pca.transform(centroids)  # Reducing centroids

    return X_2D, centroids_2D

# Function to plot 2D visualization of clustered data points and centroids with true labels
def plot_2D_clusters_with_labels(X_2D, centroids_2D, idx, labels, K):
    plt.figure(figsize=(10, 7))
    
    # Define color map for clusters
    colors = plt.cm.rainbow(np.linspace(0, 1, K))
    
    # Plot data points colored by their cluster assignment
    for k in range(K):
        cluster_points = X_2D[idx == k]
        cluster_labels = labels[idx == k]  # Get true labels for the current cluster
        
        # Scatter plot for each cluster
        plt.scatter(cluster_points[:, 0], cluster_points[:, 1], 
                    c=[colors[k]], label=f"Cluster {k+1}", alpha=0.6)
        
        # Annotate each point with its true label
        for i in range(cluster_points.shape[0]):
            plt.annotate(str(cluster_labels[i]), 
                         (cluster_points[i, 0], cluster_points[i, 1]), 
                         fontsize=8, alpha=0.75)

    # Plot centroids as larger markers
    plt.scatter(centroids_2D[:, 0], centroids_2D[:, 1], 
                c='k', marker='x', s=200, label='Centroids')

    plt.title("Strong before full dataset")
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.legend()
    plt.grid()
    plt.show()


In [None]:
# Apply PCA after K-Means clustering for 2D visualization
X_2D_pca, centroids_2D_pca = reduce_to_2D_pca(feats_np_norm, centroids)

# Plot the 2D clusters with centroids and true labels
plot_2D_clusters_with_labels(X_2D_pca, centroids_2D_pca, idx, labels_np, K)
