In [1]:
from pathlib import Path
from collections import defaultdict
import shutil

from tqdm import tqdm
import numpy as np
import torch

In [17]:
def dbscan_gpu(feature, max_neighbor_dist):
    """
    Cluster samples with cuML's DBSCAN and return the resulting labels as a NumPy array.

    :param feature: of shape num_samples x num_features
    :param max_neighbor_dist: The maximum distance between two samples for one to be considered as in the neighborhood of the other
    :return: the fitted model's ``labels_.values`` converted with ``cupy.asnumpy``
        (presumably one cluster id per sample -- confirm against the cuML docs)
    """
    # Imported inside the function, so cupy / cuml are only required when this is called.
    import cupy
    from cuml.cluster import DBSCAN

    # min_samples is fixed to 1 -- presumably so that every sample ends up in some
    # cluster instead of being flagged as noise; TODO confirm.
    model = DBSCAN(eps=max_neighbor_dist, min_samples=1)
    fitted = model.fit(feature)
    return cupy.asnumpy(fitted.labels_.values)


def _group_embedding_by_sku(embedding):
    groups = defaultdict(lambda: defaultdict(list))
    for i, f in zip(embedding["indices"], embedding["features"]):
        sku_name = str(Path(i).parent)
        groups[sku_name]["indices"].append(i)
        groups[sku_name]["features"].append(f)
    for sku_name in groups:
        groups[sku_name]["features"] = np.stack(groups[sku_name]["features"])
    return groups


def _read_embedding(path):
    embedding = torch.load(path)
    return {"indices": embedding["indices"], "features": embedding["features"].numpy()}


def euc_dist(x, y):
    """
    Euclidean distance between ``x`` and ``y``.

    The squared differences are summed over every element (no axis is given),
    so the result is a single scalar regardless of the inputs' dimensionality.

    :param x: array-like supporting ``x - y``
    :param y: array-like supporting ``x - y``
    :return: square root of the sum of squared element-wise differences
    """
    diff = x - y
    sum_of_squares = np.square(diff).sum()
    return np.sqrt(sum_of_squares)

In [3]:
# Load the saved embedding file as a dict with "indices" and NumPy "features".
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
ebd = _read_embedding("/data2/datasets/clobotics/ccth/embeddings/versions/train20200129_val20200117_test20191122/val_20200117_flip.pth")

In [4]:
# Bucket the embeddings per SKU, keyed by the parent directory of each image path.
groups = _group_embedding_by_sku(ebd)

In [35]:
# --- experiment configuration -------------------------------------------------
sku = "1037551#RB#620ML#Leo#Lager#Beer"
max_neighbor_dist = 7.0  # DBSCAN eps passed to dbscan_gpu
img_root_dir = Path("/data2/datasets/clobotics/ccth/images/cropped/versions/train20200129_val20200117_test20191122/val")
save_dir = Path("/tmp/clust-exp/1037551")

# --- cluster the embeddings of the selected SKU -------------------------------
sku_group = groups[sku]
features = sku_group["features"]
indices = np.array(sku_group["indices"])  # ndarray so it can be boolean-masked by label
labels = dbscan_gpu(features, max_neighbor_dist=max_neighbor_dist)

# Prints "<number of samples> -> <largest label id>".
# NOTE(review): if labels start at 0 the cluster count is labels.max() + 1 -- confirm.
print(f"{len(indices)} -> {labels.max()}")
print(indices[:4])

2814 -> 1185
['1037551#RB#620ML#Leo#Lager#Beer/0044f3e7063ef5a7354faa210022b52e_26_1037551.jpg'
 '1037551#RB#620ML#Leo#Lager#Beer/0044f3e7063ef5a7354faa210022b52e_27_1037551.jpg'
 '1037551#RB#620ML#Leo#Lager#Beer/006f7f53812056aee34db8c28479c127_3_1037551.jpg'
 '1037551#RB#620ML#Leo#Lager#Beer/006f7f53812056aee34db8c28479c127_4_1037551.jpg']


In [36]:
# Copy every image of the SKU into save_dir/<cluster id>/ so each cluster can be
# inspected visually.
# Cluster ids are assumed to run from 0 to labels.max() inclusive, so the range
# must end at labels.max() + 1; otherwise the highest-numbered cluster is never exported.
for clust_id in range(labels.max() + 1):
    for ind in indices[labels == clust_id]:
        # ind looks like "<sku dir>/<file name>"; keep only the file name.
        fn = ind.split("/")[-1]
        _save_path = save_dir / str(clust_id) / fn
        _save_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(img_root_dir / ind, _save_path)

In [8]:
# Display the destination of the last image copied by the export loop (quick sanity check).
_save_path

PosixPath('/tmp/clust-exp/1037551/303/ffd404ded63ff376f89d5597e153d3bf_0_1037551.jpg')

#### Deep dive: distance between two sample embeddings

In [25]:
# ia = [i for i, p in enumerate(indices) if "f20ea80bffeb43f0c7eef72fe5029f20_3_1037551" in p]
def _positions_containing(needle):
    """Return the positions in ``indices`` whose path contains ``needle``."""
    return [pos for pos, img_path in enumerate(indices) if needle in img_path]


ia = _positions_containing(r"0df03b20b7b29009abb414ed9b18c4dc_10_1037551")  # 2
ib = _positions_containing(r"0aa589d6695378737a7db00a4f11b7f0_4_1037551")  # 2
print(ia, ib)
# Euclidean distance between the embeddings of the two selected images.
euc_dist(features[ia], features[ib])

[144] [107]


11.278919