In [47]:
import os
import sys
import json
from pathlib import Path
sys.path.append(os.path.abspath('..'))

# ----------- local imports ----------- 
from constants import FACE_ID_TRAIN_PATH, DATA_DIR
from Facenet.face_id_dataset import get_train_val_set, get_embedding_from_path

import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from PIL import Image
import pandas as pd
import numpy as np


from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity


In [48]:
# Read the stored training embeddings and carve out the train / validation frames.
# unknown_cnt is forwarded untouched to get_train_val_set; presumably it is the number
# of identities withheld from training to act as unknown faces — confirm in
# Facenet.face_id_dataset.
df_train_embeddings = get_embedding_from_path(
    "embeddings/train_embeddings.csv"
)
train_df, val_df = get_train_val_set(
    df_train_embeddings,
    unknown_cnt=5,
)

In [49]:
# Number of distinct identities in each split, shown as (train, validation).
train_df['person'].nunique(dropna=False), val_df['person'].nunique(dropna=False)

(120, 125)

In [50]:
# Stack the per-row embedding vectors into 2-D feature matrices.
X_train = np.vstack(train_df['embeddings'].to_numpy())
X_val = np.vstack(val_df['embeddings'].to_numpy())

# Training targets are the identities in 'person'; validation is scored against
# 'gt', the ground truth that also includes the "doesn't_exist" label.
y_train = train_df['person'].values
y_val = val_df['gt'].values

# Integer-encode the training identities.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)

In [51]:
# Use one cluster per distinct training identity (the encoder holds one class per identity).
num_clusters = len(label_encoder.classes_)

# Fixed seed and 10 initialisations, so the fit is repeatable.
kmeans = KMeans(
    n_clusters=num_clusters,
    n_init=10,
    random_state=42,
)
kmeans.fit(X_train)

In [52]:
# Keep the fitted centroids for the similarity check, and tag every training
# row with the cluster K-Means placed it in.
cluster_centers = kmeans.cluster_centers_
train_df['cluster'] = kmeans.labels_

In [53]:
# Label each cluster with the identity that appears most often among its training
# members. mode() returns the most frequent value(s); [0] takes the first one.
cluster_to_person = {
    cid: train_df.loc[train_df['cluster'] == cid, 'person'].mode()[0]
    for cid in range(num_clusters)
}

# Validation

In [54]:
# Score the validation split: assign every embedding to its nearest K-Means
# centroid, then reject weak matches as unknown identities.
# X_val is the matrix already stacked from val_df['embeddings'] earlier in the notebook.
cluster_assignments = kmeans.predict(X_val)

# Cosine similarity between each embedding and the centroid it was assigned to.
# A single vectorised call against all centroids followed by a per-row column pick
# replaces one scikit-learn call per sample.
centroid_similarities = cosine_similarity(X_val, cluster_centers)
similarities = centroid_similarities[np.arange(len(X_val)), cluster_assignments]

# Embeddings whose similarity to their centroid is below this value are
# labelled "doesn't_exist" instead of the cluster's identity.
SIMILARITY_THRESHOLD = 0.6

# Final Predictions
final_predictions = [
    "doesn't_exist" if sim < SIMILARITY_THRESHOLD else cluster_to_person[cluster]
    for sim, cluster in zip(similarities, cluster_assignments)
]

# Evaluate Accuracy against the ground truth, which includes "doesn't_exist"
accuracy = accuracy_score(val_df['gt'], final_predictions)
print(f"Validation Accuracy using K-Means + Distance Filtering: {accuracy:.4f}")

Validation Accuracy using K-Means + Distance Filtering: 0.9158


# Testing

In [55]:
# Refit on the complete df_train_embeddings frame (not just the train split)
# before predicting the test set.
X = np.vstack(df_train_embeddings['embeddings'].to_numpy())
y = df_train_embeddings['person'].values

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# One cluster per distinct identity in the full labelled set.
num_clusters = len(label_encoder.classes_)
kmeans = KMeans(
    n_clusters=num_clusters,
    n_init=10,
    random_state=42,
).fit(X)

cluster_centers = kmeans.cluster_centers_

In [56]:
# Label every refitted cluster with the identity that occurs most often among
# the rows K-Means assigned to it. mode() returns the most frequent value(s);
# [0] takes the first one.
cluster_to_person = {
    cid: df_train_embeddings.loc[kmeans.labels_ == cid, 'person'].mode()[0]
    for cid in range(num_clusters)
}

In [57]:
def get_test_embedding_from_path(embedding_path = "embeddings/test_embeddings.csv"):
    """Load the test-set embeddings CSV and decode it into usable columns.

    Parameters
    ----------
    embedding_path : str or path-like
        CSV file with an 'embeddings' column, where each vector is serialised
        as a bracketed string such as "[0.1 0.2 ...]", and an 'identity'
        column holding the image path.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with 'embeddings' converted to float32 NumPy arrays
        and an added 'img' column containing the file name part of 'identity'.
    """
    def _parse_embedding(text):
        # Strip surrounding whitespace before removing the brackets, so a padded
        # string does not lose a digit or keep a bracket. Commas are treated as
        # separators too, so both "[1 2 3]" and "[1, 2, 3]" decode.
        tokens = text.strip().strip('[]').replace(',', ' ').split()
        return np.array(tokens, dtype=np.float32)

    df_test_embeddings = pd.read_csv(embedding_path)

    # Serialised vector string -> float32 array
    df_test_embeddings['embeddings'] = df_test_embeddings['embeddings'].apply(_parse_embedding)

    # Keep only the file name of each image path
    df_test_embeddings['img'] = df_test_embeddings['identity'].apply(lambda x: Path(x).name)

    return df_test_embeddings
# Load the test embeddings and stack them into a 2-D matrix for prediction.
df_test_embeddings = get_test_embedding_from_path(
    "embeddings/test_embeddings.csv"
)
X_test = np.vstack(df_test_embeddings['embeddings'].to_numpy())

In [61]:
# Assign every test embedding to its nearest centroid of the refitted model.
cluster_assignments = kmeans.predict(X_test)

# Cosine similarity (not a distance) between each test embedding and the centroid
# it was assigned to. A single vectorised call against all centroids followed by
# a per-row column pick replaces one scikit-learn call per sample.
centroid_similarities = cosine_similarity(X_test, cluster_centers)
similarities = centroid_similarities[np.arange(len(X_test)), cluster_assignments]

# Embeddings whose similarity is below this value are labelled "doesn't_exist".
# NOTE(review): this is 0.1, while the validation cell above was scored with 0.6 —
# confirm which value is intended for the test predictions.
SIMILARITY_THRESHOLD = 0.1

# Final Predictions
final_predictions = [
    "doesn't_exist" if sim < SIMILARITY_THRESHOLD else cluster_to_person[cluster]
    for sim, cluster in zip(similarities, cluster_assignments)
]

In [62]:
# Display the predicted label for every test embedding, in X_test row order.
final_predictions


['person_62',
 'person_21',
 'person_26',
 'person_57',
 'person_21',
 'person_21',
 'person_21',
 'person_26',
 'person_21',
 'person_62',
 'person_110',
 'person_108',
 'person_21',
 'person_21',
 'person_57',
 'person_21',
 'person_26',
 "doesn't_exist",
 'person_112',
 'person_59',
 'person_57',
 'person_110',
 "doesn't_exist",
 'person_57',
 'person_57',
 'person_57',
 "doesn't_exist",
 'person_119',
 'person_57',
 'person_59',
 'person_57',
 'person_41',
 'person_110',
 'person_59',
 'person_59',
 'person_59',
 'person_57',
 'person_52',
 'person_52',
 'person_49',
 'person_59',
 'person_52',
 'person_21',
 'person_110',
 "doesn't_exist",
 'person_52',
 'person_57',
 "doesn't_exist",
 "doesn't_exist",
 'person_57',
 'person_57',
 'person_57',
 'person_88',
 'person_57',
 'person_99',
 'person_59',
 'person_39',
 'person_112',
 'person_112',
 'person_52',
 'person_52',
 'person_57',
 "doesn't_exist",
 'person_57',
 'person_59',
 'person_57',
 'person_52',
 'person_41',
 'person_57