### Assignment 4: Clustering Analysis


In [1]:
import os

# Root directory that contains the class sub-folders.
path = "Cropped/"

# os.listdir() returns entries in arbitrary order and also lists plain files
# (for example hidden OS metadata files). Sorting makes the folder order --
# and therefore anything derived from a folder's position -- reproducible,
# and the isdir filter keeps non-directories out of the class list.
folders = sorted(
    entry_path
    for entry_path in (os.path.join(path, entry) for entry in os.listdir(path))
    if os.path.isdir(entry_path)
)
folders

['Cropped/n02085620-Chihuahua',
 'Cropped/n02089078-black-and-tan_coonhound',
 'Cropped/n02091134-whippet',
 'Cropped/n02108915-French_bulldog']

In [3]:
import torchvision.transforms as transforms
# Per-channel statistics handed to Normalize.
# NOTE(review): presumably the ImageNet mean/std expected by the pretrained
# ResNet used below -- confirm.
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]

# Preprocessing steps applied in order: resize to 224x224, convert to a
# tensor, then normalize each channel.
_preprocess_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_channel_mean, std=_channel_std),
]
transform = transforms.Compose(_preprocess_steps)

### Creating the model

In [5]:
import torchvision.models as models
import warnings
warnings.filterwarnings("ignore")

### Feature extraction with forward hooks (reference: https://kozodoi.me/blog/20210527/extracting-features)

In [6]:
from PIL import Image
import torch

# NOTE(review): newer torchvision releases deprecate `pretrained=True` in
# favour of the `weights=` argument; kept as-is because the installed
# version is not visible here.
res_model = models.resnet18(pretrained=True)
res_model.eval()  # switch the network to evaluation mode before inference
last_layer = res_model._modules.get('layer4')

layer_features = []  # one layer4 activation (NumPy array) per processed image
labels = []          # integer class id for the image at the same position
class_labels = {x: i for i, x in enumerate(folders)}

# Scratch buffer filled by the hook during a single forward pass. Features
# are copied into `layer_features` only after the whole image succeeded, so
# `layer_features` and `labels` can never get out of step.
captured = []


## forward hook
def get_features(module, input, output):
    """Forward hook: stash the hooked layer's output as a NumPy array.

    The output is squeezed (dropping size-1 dimensions such as the batch
    dimension of a single image) and detached from the autograd graph.
    """
    captured.append(output.squeeze().detach().numpy())


hook = last_layer.register_forward_hook(get_features)

for folder, label in class_labels.items():
    # sorted(): os.listdir() order is arbitrary; a fixed order keeps the
    # sample order (and seeded downstream steps) reproducible.
    for img_name in sorted(os.listdir(folder)):
        img_path = os.path.join(folder, img_name)
        captured.clear()
        try:
            # Context manager closes the underlying file handle promptly.
            with Image.open(img_path) as raw_image:
                image = raw_image.convert('RGB')
            input_tensor = transform(image).unsqueeze(0)  # add batch dim
            with torch.no_grad():
                _ = res_model(input_tensor)
        except Exception as e:
            # Best effort: report the unreadable image and move on.
            print(f"Error processing image {img_name}: {e}")
            continue
        # Record feature and label together, only for fully processed images.
        layer_features.append(captured[-1])
        labels.append(label)

In [7]:
# Detach the forward hook registered on layer4 so that later forward passes
# of res_model no longer trigger get_features.
hook.remove()

### PCA 

In [9]:
from sklearn.decomposition import PCA
import numpy as np
# Stack the per-image activations and the class ids into arrays.
f = np.array(layer_features)
l = np.array(labels)

# Every feature row must have exactly one ground-truth label, otherwise the
# label-based evaluation further down would compare misaligned samples.
assert f.shape[0] == l.shape[0], "features and labels are out of sync"

# Flatten each activation to a single 1-D feature vector per image.
f = f.reshape(f.shape[0], -1)

# Project to 2 dimensions. For a large matrix with few components, PCA's
# default solver selection can pick the randomized SVD, so seed it (same
# seed as the clustering models) to make the projection reproducible.
pca_model = PCA(n_components=2, random_state=42)
f_2d = pca_model.fit_transform(f)

### Clustering Techniques 

#### https://scikit-learn.org/stable/modules/clustering.html

In [13]:
from sklearn.cluster import KMeans,BisectingKMeans, SpectralClustering, DBSCAN, AgglomerativeClustering

# Registry of cluster assignments, keyed by a display name for each method.
preds = dict()


In [18]:
# K-Means with randomly chosen initial centroids (seeded), four clusters.
kmeans_random = KMeans(init='random', n_clusters=4, random_state=42)
# Fit on the 2-D projection and register the assignments in one step.
preds['K-Means (Random)'] = kmeans_labels = kmeans_random.fit_predict(f_2d)

In [20]:
# K-Means with k-means++ seeding of the initial centroids, four clusters.
kmeans_plus = KMeans(
    init='k-means++',
    n_clusters=4,
    random_state=42,
)
kmeans_pl = kmeans_plus.fit_predict(f_2d)
preds.update({'(k-means++)': kmeans_pl})

In [22]:
# Bisecting K-Means with random initialisation (seeded), four clusters.
bisecting_kmeans = BisectingKMeans(random_state=42, init='random', n_clusters=4)
# Cluster the 2-D projection and store the result under its display name.
preds['Bisecting K-Means'] = bi_l = bisecting_kmeans.fit_predict(f_2d)

In [24]:
# Spectral clustering into four groups, seeded for repeatability.
spectral = SpectralClustering(random_state=42, n_clusters=4)
spec_l = spectral.fit_predict(f_2d)
preds.update({'Spectral Clustering': spec_l})

In [25]:
# Agglomerative clustering with Ward linkage, cut at four clusters.
ward = AgglomerativeClustering(linkage='ward', n_clusters=4)
# Plain string key: the original f-string had no placeholders.
preds['Agglomerative (ward)'] = wardlabel = ward.fit_predict(f_2d)

In [26]:
# Agglomerative clustering with complete linkage, cut at four clusters.
complete = AgglomerativeClustering(
    linkage='complete',
    n_clusters=4,
)
completelabel = complete.fit_predict(f_2d)
# Plain string key: the original f-string had no placeholders.
preds.update({'Agglomerative (complete)': completelabel})

In [27]:
# Agglomerative clustering with average linkage, cut at four clusters.
average = AgglomerativeClustering(linkage='average', n_clusters=4)
# Plain string key: the original f-string had no placeholders.
preds['Agglomerative (average)'] = averagelabel = average.fit_predict(f_2d)

In [28]:
# Agglomerative clustering with single linkage, cut at four clusters.
single = AgglomerativeClustering(
    linkage='single',
    n_clusters=4,
)
singlelabel = single.fit_predict(f_2d)
# Plain string key: the original f-string had no placeholders.
preds.update({'Agglomerative (single)': singlelabel})

In [29]:
# Density-based clustering of the 2-D projection.
dbscan = DBSCAN(min_samples=7, eps=3.1)
dbscan.fit(f_2d)
pred = dbscan.labels_

# Count distinct labels, leaving out -1 when present (the label DBSCAN
# assigns to noise points, per the scikit-learn documentation).
has_noise_label = -1 in pred
n = len(set(pred)) - int(has_noise_label)
print(f'num of clusters: {n}')

preds['dbscan'] = pred

num of clusters: 4


In [30]:
from sklearn.metrics import fowlkes_mallows_score, silhouette_score

In [31]:
# Two independent {method name: score} tables, one per evaluation metric.
fowlkes_mallows_scores, silhouette_scores = {}, {}

In [32]:
# Score every clustering: Fowlkes-Mallows compares the assignments with the
# ground-truth labels, silhouette is computed from the 2-D points alone.
# NOTE(review): the DBSCAN assignments can contain the label -1; both metrics
# receive it like any other label -- confirm that is intended.
for model, predicts in preds.items():
    # Compute both scores first so a failure stores neither of them.
    fm, sil = (
        fowlkes_mallows_score(labels, predicts),
        silhouette_score(f_2d, predicts),
    )
    fowlkes_mallows_scores[model], silhouette_scores[model] = fm, sil

In [42]:
# Map each metric's display name to its {method: score} table.
metrics = {}
metrics["Fowlkes-Mallows Index"] = fowlkes_mallows_scores
metrics["Silhouette Coefficient"] = silhouette_scores

for metric_name, score_dict in metrics.items():
    # Highest score first; ties keep their insertion order.
    ranking = sorted(score_dict.items(), key=lambda pair: pair[1], reverse=True)
    report_lines = [f"{method}: {score}" for method, score in ranking]
    # Header (preceded by a blank line) followed by one line per method.
    print(f"\n{metric_name} Rankings:", "\n".join(report_lines), sep="\n")



Fowlkes-Mallows Index Rankings:
Agglomerative (average): 0.7913646155149442
Agglomerative (ward): 0.7173432849135426
K-Means (Random): 0.7012661782389416
Bisecting K-Means: 0.6978067138546405
Agglomerative (complete): 0.6919126835646533
(k-means++): 0.6807228213757737
Spectral Clustering: 0.506013017358337
Agglomerative (single): 0.5043698200953882
dbscan: 0.49227632117430625

Silhouette Coefficient Rankings:
Agglomerative (average): 0.5522032976150513
Bisecting K-Means: 0.5289257764816284
(k-means++): 0.5061092972755432
K-Means (Random): 0.5030260682106018
Agglomerative (complete): 0.4893679618835449
Agglomerative (ward): 0.47830480337142944
Spectral Clustering: -0.021852178499102592
dbscan: -0.19005639851093292
Agglomerative (single): -0.2430742084980011
