# Facial image clustering

In [65]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pprint import pprint
from glob import glob
from tqdm import tqdm
from PIL import Image
from colorhash import ColorHash

from facenet_pytorch import InceptionResnetV1, MTCNN

from torchvision import transforms, datasets
from torch.utils.data import DataLoader, Dataset

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN

## Compute embedding

In [None]:
# Compute one face embedding per image and store them all in a CSV file.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Running on device: {device}')

# load pretrained models: MTCNN detects and crops the face,
# InceptionResnetV1 maps the cropped face to an embedding vector
mtcnn = MTCNN(device=device)
resnet = InceptionResnetV1(pretrained='vggface2', device=device).eval()

# calculate embeddings; the two lists stay aligned (same index = same image)
embeddings = []
image_paths = []

images = glob("../../datasets/fr/**/*.jpg", recursive=True)
images.extend(glob("../../datasets/in/**/*.jpg", recursive=True))

# inference only: disable gradient tracking once for the whole loop
with torch.no_grad():
    for img_path in tqdm(images):
        try:
            # close the file handle as soon as the pixel data is loaded
            with Image.open(img_path) as img_file:
                img_pil = img_file.convert("RGB")

            img_cropped = mtcnn(img_pil)
            if img_cropped is None:
                # MTCNN returns None when it finds no face in the image
                print(f"\t[ERROR] {img_path} (no face detected)")
                continue

            # the cropped face must live on the same device as the model,
            # otherwise the forward pass fails when running on CUDA
            img_embedding = resnet(img_cropped.unsqueeze(0).to(device))
            embedding = img_embedding.squeeze().cpu().tolist()
        except Exception as err:
            # best effort: report the failing image and keep going, but let
            # KeyboardInterrupt / SystemExit stop the loop
            print(f"\t[ERROR] {img_path} ({err!r})")
            continue

        embeddings.append(embedding)
        image_paths.append(img_path)

# clean memory
del mtcnn
del resnet

# store computed embeddings: a 'path' column followed by one column per
# embedding dimension
df = pd.concat([pd.DataFrame({'path': image_paths}), pd.DataFrame(embeddings)], axis=1)
df.to_csv('../../annotations/fr-in-embedings.csv', index=False)

## Load saved embeddings

In [3]:
# Reload the embeddings table from disk so the analysis below can run
# without recomputing the embeddings.
embeddings_csv = '../../annotations/fr-in-embedings.csv'
df = pd.read_csv(embeddings_csv)

## Calculate mean embedding vector for each person

In [57]:
# Label every image with its person: the three directory components directly
# above the image file (".../a/b/c/img.jpg" -> "a/b/c").
df["person"] = df["path"].map(lambda x: "/".join(x.split("/")[-4:-1]))

# One row per person, indexed by the person label, holding the column-wise
# mean of that person's embeddings. Grouping directly avoids the throw-away
# per-person count column and works whether the embedding columns are
# labelled with strings (after read_csv) or ints (fresh from computation).
mean_embeddings = df.drop(columns="path").groupby("person").mean()


In [142]:
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist, squareform

# Row labels of mean_embeddings, in row order, used as dendrogram leaf labels.
labels = mean_embeddings.index.tolist()

# Condensed pairwise cosine distances between the rows of mean_embeddings.
# (Euclidean and cosine distances are basically the same for normalized data.)
person_vectors = mean_embeddings.to_numpy()
dist = pdist(person_vectors, metric='cosine')

# Single-linkage hierarchical clustering on the precomputed distances.
Z = linkage(dist, method='single')

In [153]:
# Draw the single-linkage dendrogram of the mean embeddings, save it as a PDF
# and display it.
with plt.rc_context({'lines.linewidth': 0.5}):
    # Create the figure directly: calling plt.clf() first, with no figure
    # open, creates and displays a stray empty default-size figure.
    plt.figure(figsize=(10, 50))
    plt.title("Single linkage HC of avg. embeddings")
    # Z is the linkage matrix and labels the leaf labels from the cell above.
    dendrogram(Z, labels=labels, orientation='right')
    # Dotted vertical reference lines at fixed distances.
    for distance in (0.05, 0.10, 0.15, 0.20):
        plt.axvline(distance, ls=':', lw=0.8, c='r', alpha=0.5)
    plt.grid(False)
    plt.xticks(fontsize=6, rotation=0)
    # Lay out after tick styling so the final tick size is taken into account.
    plt.tight_layout()
    plt.savefig("../../annotations/fr-in-clustering.pdf")
    plt.show()

<Figure size 432x288 with 0 Axes>