# Cropping Dataset To 224x224

In [48]:
from itertools import chain
import glob
from PIL import Image 
import os 
classes = [
    'n02111889-Samoyed',
    'n02098286-West_Highland_white_terrier',
    'n02085782-Japanese_spaniel',
    'n02088466-bloodhound'
]

dog_images = list(chain.from_iterable([glob.glob(f'./Images/{cls}/*') for cls in classes]))
annotations = list(chain.from_iterable([glob.glob(f'./Annotation/{cls}/*') for cls in classes]))

dog_images = list(map(os.path.abspath, dog_images))
annotations = list(map(os.path.abspath, annotations))

print('length of dog images: ', len(dog_images))
print('length of annotations: ', len(annotations))


# Source: https://www.kaggle.com/code/espriella/stanford-dogs-transfer-crop-stack/notebook
import xml.etree.ElementTree as ET
from pathlib import Path

def get_image(annot):
    img_filename = annot.replace('Annotation', 'Images')
    return img_filename + '.jpg'

def get_bounding_boxes(annot):
    xml = annot
    tree = ET.parse(xml)
    root = tree.getroot()
    objects = root.findall('object')
    bbox = []
    for o in objects:
        bndbox = o.find('bndbox')
        xmin = int(bndbox.find('xmin').text)
        ymin = int(bndbox.find('ymin').text)
        xmax = int(bndbox.find('xmax').text)
        ymax = int(bndbox.find('ymax').text)
        bbox.append((xmin,ymin,xmax,ymax))
    return bbox

if not os.path.exists('./Cropped/'):
    for i in range(len(dog_images)):
        bbox = get_bounding_boxes(annotations[i])
        dog = get_image(annotations[i])
        im = Image.open(dog)
        for j in range(len(bbox)):
            im2 = im.crop(bbox[j])
            im2 = im2.resize((224,224), Image.ANTIALIAS)
            new_path = dog.replace('Images','Cropped')
            new_path = new_path.replace('.jpg','-' + str(j) + '.jpg')
            im2=im2.convert('RGB')
            head, tail = os.path.split(new_path)
            Path(head).mkdir(parents=True, exist_ok=True)
            im2.save(new_path)

cropped_images = list(chain.from_iterable([glob.glob(f'./Cropped/{cls}/*') for cls in classes]))
cropped_images = list(map(os.path.abspath, cropped_images))

length of dog images:  759
length of annotations:  759


# Feature Extraction
Source: https://kozodoi.me/blog/20210527/extracting-features

In [49]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import timm

import albumentations as A
from albumentations.pytorch import ToTensorV2

import cv2
import os
from pathlib import PurePath

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class ImageData(Dataset):
    def __init__(self, filenames, transform):
        self.filenames = filenames
        self.transform = transform
        self.classes = [
            'n02111889-Samoyed',
            'n02098286-West_Highland_white_terrier',
            'n02085782-Japanese_spaniel',
            'n02088466-bloodhound'
        ]
    def __len__(self):
        return len(self.filenames)
    
    def __getitem__(self, idx):
        filename = self.filenames[idx]
        image = cv2.imread(filename)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = self.transform(image = image)['image']
        label = PurePath(filename).parent.name
        label_enc = self.classes.index(label)
        return image, label_enc

transforms = A.Compose([A.Normalize(),
                        ToTensorV2()])

data_set = ImageData(cropped_images, transforms)

data_loader = DataLoader(data_set, 
                         batch_size  = 32, 
                         shuffle     = False, 
                         num_workers = 0)

model    = timm.create_model(model_name = 'resnet18', pretrained = True)
model.fc = nn.Linear(512, 2)
model.to(device)

##### HELPER FUNCTION FOR FEATURE EXTRACTION
def get_features(name):
    def hook(model, input, output):
        features[name] = output.detach()
    return hook

##### REGISTER HOOK
model.global_pool.register_forward_hook(get_features('feats'))

##### FEATURE EXTRACTION LOOP

# placeholders
PREDS = []
LABELS = []
FEATS = []

# placeholder for batch features
features = {}

# loop through batches
for idx, inputs in enumerate(data_loader):
    labels = inputs[1]
    inputs = inputs[0]
    # move to device
    inputs = inputs.to(device)
       
    # forward pass [with feature extraction]
    preds = model(inputs)
    
    # add feats and preds to lists
    PREDS.append(preds.detach().cpu().numpy())
    FEATS.append(features['feats'].cpu().numpy())
    LABELS.append(labels.cpu().numpy())

    # early stop
    if idx == 9:
        break

##### INSPECT FEATURES

PREDS = np.concatenate(PREDS)
LABELS = np.concatenate(LABELS)
FEATS = np.concatenate(FEATS)

print('- preds shape:', PREDS.shape)
print('- feats shape:', FEATS.shape)

- preds shape: (320, 2)
- feats shape: (320, 512)


# 2. Dimensionality Reduction

In [50]:
from sklearn.decomposition import PCA

feats_2d = PCA(n_components=2).fit_transform(FEATS)

# 3. Clustering Alogrithms

In [73]:
from sklearn import cluster

n_clusters = 4

eps = 0.3
min_samples = 5

methods = {
    'K-Means Random' : cluster.KMeans(n_clusters, init='random', n_init='auto').fit_predict(feats_2d),
    'K-Means++' : cluster.KMeans(n_clusters, init='k-means++', n_init='auto').fit_predict(feats_2d),
    'Bisecting K-Means' : cluster.BisectingKMeans(n_clusters, init='random').fit_predict(feats_2d),
    'Spectral Clustering' : cluster.SpectralClustering(n_clusters).fit_predict(feats_2d),
    'DBSCAN' : cluster.DBSCAN(eps=eps, min_samples=min_samples).fit_predict(feats_2d),
    'Agglomerative (Single Link)' : cluster.AgglomerativeClustering(n_clusters, linkage='single').fit_predict(feats_2d),
    'Agglomerative (Complete Link)' : cluster.AgglomerativeClustering(n_clusters, linkage='complete').fit_predict(feats_2d),
    'Agglomerative (Group Average)' : cluster.AgglomerativeClustering(n_clusters, linkage='average').fit_predict(feats_2d),
    'Agglomerative (Ward)' : cluster.AgglomerativeClustering(n_clusters, linkage='ward').fit_predict(feats_2d),
}

n_clusters_dbscan =  len(set(methods['DBSCAN']))
print('# DBSCAN #')
print("Number of clusters:", n_clusters_dbscan)
print("eps:", eps)
print("min_samples:", min_samples)

# DBSCAN #
Number of clusters: 4
eps: 0.3
min_samples: 5


# 4. Clustering Evaluation

In [81]:
from sklearn import metrics

class Clustering:
    def __init__(self, name, fowlkesMallows, sihlouette):
        self.name = name
        self.fowlkesMallows = fowlkesMallows
        self.sihlouette = sihlouette
    def __str__(self):
        return self.name

stats = []

for name, clustering in methods.items():
    fm = metrics.fowlkes_mallows_score(LABELS, clustering)
    s = metrics.silhouette_score(feats_2d, clustering)
    stats.append(Clustering(name, fm, s))
    print(f"### {name} ###")
    print('\tFowlkes-Mallows Index:', fm)
    print('\tSihlouette Score:', s)
    print()

sorted_fowlkesMallows = reversed(sorted(stats, key=lambda c : c.fowlkesMallows))
sorted_sihlouette = reversed(sorted(stats, key=lambda c : c.sihlouette))

print("Sorted by Fowlkes-Mallows Index:")
for c in sorted_fowlkesMallows:
    print('\t'+str(c))
print("Sorted by Sihlouette Score:")
for c in sorted_sihlouette:
    print('\t'+str(c))



### K-Means Random ###
	Fowlkes-Mallows Index: 0.42992487758249914
	Sihlouette Score: 0.37478018

### K-Means++ ###
	Fowlkes-Mallows Index: 0.4373898918268066
	Sihlouette Score: 0.3784739

### Bisecting K-Means ###
	Fowlkes-Mallows Index: 0.4383142604635883
	Sihlouette Score: 0.36086938

### Spectral Clustering ###
	Fowlkes-Mallows Index: 0.5254860411137812
	Sihlouette Score: 0.3380731

### DBSCAN ###
	Fowlkes-Mallows Index: 0.569283420893119
	Sihlouette Score: 0.14869535

### Agglomerative (Single Link) ###
	Fowlkes-Mallows Index: 0.7814160508895514
	Sihlouette Score: 0.20766315

### Agglomerative (Complete Link) ###
	Fowlkes-Mallows Index: 0.4405827511778126
	Sihlouette Score: 0.27606922

### Agglomerative (Group Average) ###
	Fowlkes-Mallows Index: 0.6444704032836164
	Sihlouette Score: 0.25265154

### Agglomerative (Ward) ###
	Fowlkes-Mallows Index: 0.4723955833984253
	Sihlouette Score: 0.30179957

Sorted by Fowlkes-Mallows Index:
	Agglomerative (Single Link)
	Agglomerative (Group A