In [66]:
import os
import sys
import json
from pathlib import Path
sys.path.append(os.path.abspath('..'))

# ----------- local imports ----------- 
# from constants import FACE_ID_test_PATH, DATA_DIR
from Facenet.face_id_dataset import load_faces_in_batch

import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from PIL import Image
import pandas as pd
import numpy as np

In [101]:
import ast

# Load the baseline submission. The `objects` column arrives as text, so each
# cell is parsed back into the Python literal it represents.
df_submission = pd.read_csv('../../submissions/first.csv')
df_submission['objects'] = df_submission['objects'].apply(ast.literal_eval)

# Sanity check that parsing produced real objects rather than leaving strings.
first_objects = df_submission['objects'].iloc[0]
assert isinstance(first_objects, list), "The first 'objects' entry is not a list!"

# For face_reid rows, keep the file name of the referenced image as a lookup key.
# Rows with any other objective are absent from the right-hand side, so index
# alignment leaves their `img_name` as NaN.
is_face_reid = df_submission['objective'] == 'face_reid'
df_submission['img_name'] = df_submission.loc[is_face_reid, 'objects'].apply(
    lambda entry: Path(entry['image']).name
)

In [102]:
# Inspect the parsed submission; `img_name` is only filled in for face_reid rows.
df_submission

Unnamed: 0,ID,frame,objects,objective,img_name
0,0,1.0,"[{'tracked_id': 1, 'x': 757.8729248046875, 'y'...",tracking,
1,1,2.0,"[{'tracked_id': 7, 'x': 657.6649169921875, 'y'...",tracking,
2,2,3.0,"[{'tracked_id': 5, 'x': 1467.3447265625, 'y': ...",tracking,
3,3,4.0,"[{'tracked_id': 7, 'x': 660.7446899414062, 'y'...",tracking,
4,4,5.0,"[{'tracked_id': 7, 'x': 658.93408203125, 'y': ...",tracking,
...,...,...,...,...,...
5158,5158,-1.0,"{'gt': 'doesn't_exist', 'image': 'test_set/998...",face_reid,9987.jpg
5159,5159,-1.0,"{'gt': 'doesn't_exist', 'image': 'test_set/998...",face_reid,9988.jpg
5160,5160,-1.0,"{'gt': 'doesn't_exist', 'image': 'test_set/998...",face_reid,9989.jpg
5161,5161,-1.0,"{'gt': 'doesn't_exist', 'image': 'test_set/999...",face_reid,9990.jpg


In [103]:
# Load the train embeddings and derive the columns used for similarity lookups:
#   embeddings -> float32 array parsed from the stored text (first and last
#                 character dropped, remainder split on whitespace)
#   person     -> name of the directory that contains the image
#   img        -> image file name
# The full-path `identity` column is dropped once both names are extracted.
df_train_embeddings = pd.read_csv("embeddings/train_embeddings.csv")
df_train_embeddings = df_train_embeddings.assign(
    embeddings=df_train_embeddings['embeddings'].apply(
        lambda text: np.array(text[1:-1].split(), dtype=np.float32)
    ),
    person=df_train_embeddings['identity'].apply(lambda path: Path(path).parent.name),
    img=df_train_embeddings['identity'].apply(lambda path: Path(path).name),
).drop(columns=['identity'])

In [104]:
# Load the test embeddings and apply the same column preparation in one chain:
# parse the embedding text into float32 arrays, split `identity` into the
# parent directory name (`person`) and the file name (`img`), then drop the
# full-path `identity` column.
df_test_embeddings = (
    pd.read_csv("embeddings/test_embeddings.csv")
    .assign(
        embeddings=lambda frame: frame['embeddings'].apply(
            lambda text: np.array(text[1:-1].split(), dtype=np.float32)
        ),
        person=lambda frame: frame['identity'].apply(lambda path: Path(path).parent.name),
        img=lambda frame: frame['identity'].apply(lambda path: Path(path).name),
    )
    .drop(columns=['identity'])
)

In [105]:
def calculate_cosine_similarity(input_embeddings, df=None, batch_size=64):
    """Score every row of ``df`` by cosine similarity to a single embedding.

    Parameters
    ----------
    input_embeddings : array-like
        Query embedding. It is flattened to one vector and is never modified;
        normalisation happens on a separate array.
    df : pandas.DataFrame, optional
        Reference frame with an ``embeddings`` column holding one vector per
        row. When omitted, the notebook-level ``df_train_embeddings`` is used.
        It is looked up at call time, so re-running the cell that builds it is
        picked up without redefining this function.
    batch_size : int, default 64
        Number of reference rows stacked and scored per step; bounds the size
        of the temporary matrix.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an added ``cosine_similarity`` column. ``df``
        itself is left untouched.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if df is None:
        # Late binding: resolve the default when called, not when defined.
        df = df_train_embeddings
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # reshape() returns a view of the caller's array, so normalise with `/`
    # (which allocates a new array) instead of the in-place `/=`.
    query = np.asarray(input_embeddings).reshape(-1, 1)  # embedding_size x 1
    query = query / np.linalg.norm(query)

    cosine_similarities = np.zeros(len(df))
    for start in range(0, len(df), batch_size):
        stop = start + batch_size
        batch = df.iloc[start:stop]

        batch_embeddings = np.vstack(batch['embeddings'].to_list())  # rows x embedding_size
        batch_embeddings = batch_embeddings / np.linalg.norm(
            batch_embeddings, axis=1, keepdims=True
        )

        # Both sides are unit length, so the dot product is the cosine.
        cosine_similarities[start:stop] = np.dot(batch_embeddings, query).reshape(-1)

    results_df = df.copy()
    results_df['cosine_similarity'] = cosine_similarities
    return results_df

In [106]:
# Inspect the preprocessed test embeddings (`identity` replaced by `person` and `img`).
df_test_embeddings

Unnamed: 0,embeddings,person,img
0,"[-0.31785646, 0.4978342, -1.4526781, 1.2883636...",test,10000.jpg
1,"[0.033985883, 0.031320494, -1.1945478, 1.37073...",test,10001.jpg
2,"[-0.37857288, 0.9527224, -0.3485023, 0.4493107...",test,10002.jpg
3,"[-0.30105162, 1.1882895, -1.0206807, 0.5523659...",test,10003.jpg
4,"[-0.9108023, 0.82393706, -2.0247228, 1.6173184...",test,10004.jpg
...,...,...,...
4729,"[-0.560698, 0.98703194, -0.60655105, 0.527464,...",test,9987.jpg
4730,"[-0.5150113, 0.90361494, -0.8422032, 1.362708,...",test,9988.jpg
4731,"[-1.0629109, 0.6154371, -0.59645724, 1.324135,...",test,9989.jpg
4732,"[-1.524463, 1.8849525, -0.82803065, 1.0890384,...",test,9990.jpg


In [107]:
th = 0.65  # minimum cosine similarity needed to accept the closest train identity

# For each test embedding, find the most similar train embedding and write the
# prediction into the matching face_reid row(s) of the submission. Submission
# rows whose image has no test embedding are not touched by this loop.
test_pairs = zip(df_test_embeddings['img'], df_test_embeddings['embeddings'])
for img_name, test_embedding in tqdm(test_pairs, total=len(df_test_embeddings)):
    scored = calculate_cosine_similarity(test_embedding, batch_size=512)
    top_similar = scored.nlargest(1, 'cosine_similarity').iloc[0]

    # Below the threshold the face is treated as an unknown identity.
    pred = top_similar['person'] if top_similar['cosine_similarity'] >= th else "doesn't_exist"

    index_ = df_submission['img_name'] == img_name

    # Build the replacement through apply() so each matching row receives its
    # own dict, aligned on the row index.
    df_submission.loc[index_, 'objects'] = df_submission.loc[index_, 'objects'].apply(
        lambda _previous: {"gt": pred, "image": f"test_set/{img_name}"}
    )


  0%|          | 0/4734 [00:00<?, ?it/s]

In [108]:
# Write the updated submission; `img_name` was only a lookup helper, so it is left out.
df_submission.drop(columns=['img_name']).to_csv('../../submissions/fourth.csv', index=False)

In [109]:
# Read the written file back to check what was actually saved.
pd.read_csv('../../submissions/fourth.csv')

Unnamed: 0,ID,frame,objects,objective
0,0,1.0,"[{'tracked_id': 1, 'x': 757.8729248046875, 'y'...",tracking
1,1,2.0,"[{'tracked_id': 7, 'x': 657.6649169921875, 'y'...",tracking
2,2,3.0,"[{'tracked_id': 5, 'x': 1467.3447265625, 'y': ...",tracking
3,3,4.0,"[{'tracked_id': 7, 'x': 660.7446899414062, 'y'...",tracking
4,4,5.0,"[{'tracked_id': 7, 'x': 658.93408203125, 'y': ...",tracking
...,...,...,...,...
5158,5158,-1.0,"{'gt': ""doesn't_exist"", 'image': 'test_set/998...",face_reid
5159,5159,-1.0,"{'gt': ""doesn't_exist"", 'image': 'test_set/998...",face_reid
5160,5160,-1.0,"{'gt': ""doesn't_exist"", 'image': 'test_set/998...",face_reid
5161,5161,-1.0,"{'gt': ""doesn't_exist"", 'image': 'test_set/999...",face_reid
