In [3]:
from PIL import Image
import glob
import os
import timm
import torch
from torchvision import datasets, transforms
from torch.utils import data
import numpy as np
import pandas as pd
import itertools
import shutil
from pathlib import Path

def prep_dataset(folder_path):
    """Create an ``ImageFolder`` dataset and a sequential ``DataLoader`` for it.

    Every image is resized to 224x224, converted to a tensor and normalized
    per channel before being batched.

    :param folder_path: Root directory handed to ``datasets.ImageFolder``.
    :return: ``(dataloader, image_dataset)``; the loader yields batches of 8
        in dataset order (no shuffling).
    """
    # Per-channel statistics; these are the values commonly used with
    # ImageNet-pretrained backbones.
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    preprocessing_steps = [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]
    preprocessing = transforms.Compose(preprocessing_steps)

    image_dataset = datasets.ImageFolder(root=folder_path, transform=preprocessing)

    # shuffle=False keeps batch order aligned with image_dataset.imgs.
    dataloader = data.DataLoader(dataset=image_dataset, shuffle=False, batch_size=8)

    return dataloader, image_dataset

def save_embeddings(output_path, embeddings, filelist):
    """Persist embeddings and their source file paths to an ``.npz`` archive.

    :param output_path: Destination passed to ``np.savez``. NumPy appends
        ``.npz`` when a string path does not already end with it.
    :param embeddings: Array of embeddings, stored under the key ``embedding``.
    :param filelist: Sequence of file paths, stored under the key ``filelist``.
    """
    # Honour the caller-supplied destination instead of a hard-coded file name.
    np.savez(output_path, embedding=embeddings, filelist=filelist)


def calculate_embeddings(model, dataloader):
    """Run ``model`` over every batch of ``dataloader`` and stack the outputs.

    The forward passes run under ``torch.no_grad()`` with the model switched to
    evaluation mode, so layers such as BatchNorm/Dropout behave
    deterministically and no running statistics are updated. The model's
    original train/eval mode is restored before returning.

    :param model: Callable mapping an image batch to a tensor; when it is a
        ``torch.nn.Module`` its mode is managed as described above.
    :param dataloader: Iterable yielding ``(image_batch, label_batch)`` pairs.
    :return: NumPy array with one row per sample (rows in loader order).
    :raises ValueError: if the dataloader yields no samples.
    """
    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is cosmetic; run without it when tqdm is missing.
        def tqdm(iterable):
            return iterable

    is_module = isinstance(model, torch.nn.Module)
    was_training = is_module and model.training
    if is_module:
        model.eval()

    feature_embeddings = []
    try:
        with torch.no_grad():
            for image, _label in tqdm(dataloader):
                embedding = model(image)
                # .cpu() is a no-op for CPU tensors and makes .numpy() valid
                # for outputs living on another device.
                feature_embeddings.extend(embedding.cpu().numpy())
    finally:
        # Leave the caller's model in the mode it was handed to us in.
        if was_training:
            model.train()

    if not feature_embeddings:
        raise ValueError("dataloader yielded no samples; nothing to embed")

    np_embeddings = np.vstack(feature_embeddings)
    print(np_embeddings.shape)
    return np_embeddings

def calculate_pca(embeddings, k = 16):
    """Project ``embeddings`` onto their first ``k`` principal components.

    :param embeddings: 2-D array-like accepted by ``PCA.fit_transform``.
    :param k: Number of components to keep.
    :return: The transformed array produced by ``PCA.fit_transform``.
    """
    from sklearn.decomposition import PCA

    reducer = PCA(n_components=k)
    return reducer.fit_transform(embeddings)

def find_clusters(pca_embeddings, k = 10):
    """Cluster the rows of ``pca_embeddings`` with SciPy's ``kmeans2``.

    Initial centroids are drawn from the data points (``minit='points'``).

    :param pca_embeddings: Observations to cluster, one row per sample.
    :param k: Number of clusters requested.
    :return: ``(centroid, label)`` exactly as returned by ``kmeans2``.
    """
    from scipy.cluster.vq import kmeans2

    centroids, assignments = kmeans2(pca_embeddings, k, minit='points')
    return centroids, assignments

def copy_to_clusters(label, filelist, output_root='./output'):
    """Copy every file into a sub-directory named after its cluster label.

    ``label[i]`` is the cluster assigned to ``filelist[i]``; the file is copied
    (with metadata, via ``shutil.copy2``) to ``<output_root>/<label>/<basename>``.
    Files sharing a basename within the same cluster overwrite each other.

    :param label: Sequence/array of integer cluster ids, aligned with ``filelist``.
    :param filelist: Sequence of source file paths.
    :param output_root: Directory under which the per-cluster folders are created.
    """
    root = Path(output_root)
    prepared_dirs = set()

    # Walk files and labels together so that every cluster is handled, not
    # only the last one.
    for img_path, cluster_id in zip(filelist, label):
        cluster_dir = root / str(int(cluster_id))
        if cluster_dir not in prepared_dirs:
            # Create each cluster directory once rather than once per file.
            cluster_dir.mkdir(parents=True, exist_ok=True)
            prepared_dirs.add(cluster_dir)
        # Path.name extracts the basename portably (no manual "/" splitting).
        shutil.copy2(img_path, cluster_dir / Path(img_path).name)





def read_images_from_directory(image_directory: str) -> list :
    """
    Collect the paths of the image files located directly inside a directory.

    Files are matched by the patterns ``*.gif``, ``*.png``, ``*.jpg`` and
    ``*.jpeg`` (in that order); sub-directories are not searched. The number of
    matches is printed.

    :param image_directory: The directory where the images are stored
    :return: A list of image file paths
    """
    patterns = ("*.gif", "*.png", "*.jpg", "*.jpeg")
    list_of_images = [
        match
        for pattern in patterns
        for match in glob.glob(os.path.join(image_directory, pattern))
    ]
    print(f"images found : {len(list_of_images)}")
    return list_of_images

def read_with_pil(list_of_images: list, resize = False) -> list :
    """Load image files as RGB PIL images.

    :param list_of_images: Paths of the images to open.
    :param resize: When true, each image is shrunk in place with
        ``Image.thumbnail`` to fit within 512x512.
    :return: List of RGB ``PIL.Image`` objects, in input order.
    """
    pil_images = []
    for img_path in list_of_images:
        # Close the file handle deterministically: convert() yields a separate
        # image object, so the opened source is not needed afterwards.
        with Image.open(img_path) as source:
            img = source.convert("RGB")
        if resize:
            img.thumbnail((512, 512))
        pil_images.append(img)
    return pil_images

def main():
    """Embed the images under ``./root``, cluster them and copy them per cluster.

    Steps: create a pretrained ``resnet34`` via timm, build the dataset/loader
    with ``prep_dataset``, compute model outputs for every image, save them with
    ``save_embeddings``, reduce them with ``calculate_pca``, cluster with
    ``find_clusters`` and finally copy the files with ``copy_to_clusters``.
    Progress markers are printed along the way.
    """
    CLUSTER_RANGE = 16  # number of clusters requested from find_clusters
    print(torch.cuda.is_available())

    print('model')

    model = timm.create_model('resnet34', pretrained=True)
    # Inference only: switch to evaluation mode so normalization/dropout layers
    # do not use training-time behaviour while embeddings are computed.
    model.eval()
    print('model2')

    dataloader, image_dataset = prep_dataset('./root')

    # Paths in dataset order; the loader is unshuffled, so they line up with
    # the rows of the embedding matrix.
    filelist = [path for path, label in image_dataset.imgs]

    print('ceyhun1')
    feature_embeddings = calculate_embeddings(model, dataloader)
    print('ceyhun')

    save_embeddings('./embeddings.npz', feature_embeddings, filelist)

    pca_embeddings = calculate_pca(feature_embeddings)
    cluster, label = find_clusters(pca_embeddings, k= CLUSTER_RANGE)
    print('avant final')
    copy_to_clusters(label, filelist)
    print('final')

In [4]:
# Run the whole pipeline defined by main() in the previous cell.
main()

False
model
model2
ceyhun1


100%|██████████| 256/256 [01:02<00:00,  4.10it/s]


(2046, 1000)
ceyhun
avant final
final
