In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [3]:
import torch
#rom transformers import AutoImageProcessor, AutoModel
from PIL import Image
import faiss
import numpy as np
import os
from tqdm import tqdm
import pandas as pd
from collections import defaultdict
import json

In [None]:
# src = str(Path.cwd().resolve().parents[0]/ 'src')
# sys.path.append(src)
# from config.paths import DATA, EXPERIMENTS, METADATA

In [4]:
EXPERIMENTS = '/content/drive/MyDrive/dataHN/experiments'
DATA = '/content/drive/MyDrive/dataHN/data'

In [14]:
experiment_dir = os.path.join(EXPERIMENTS, 'faiss_similarity')
os.makedirs(experiment_dir, exist_ok=True)

results_dir = os.path.join(experiment_dir, "results")
os.makedirs(results_dir, exist_ok=True)

# Similarité inter et intra corpus

In [15]:
obj_embeddings_dir = os.path.join(DATA, 'embeddings', 'dino3_obj_crops_gray')
obj_embeddings_list = [os.path.join(obj_embeddings_dir, e) for e in os.listdir(obj_embeddings_dir)]
obj_img_dir = os.path.join(DATA,'processed', 'g-dino_obj_crops')

In [16]:
doc_embeddings_dir = os.path.join(DATA, 'embeddings', 'dino3_doc_crops_gray')
doc_embeddings_list = [os.path.join(doc_embeddings_dir, e) for e in os.listdir(doc_embeddings_dir)]
doc_img_dir = os.path.join(DATA,'processed','yolo_doc_crops')

In [None]:
embedding = torch.load(obj_embeddings_list[0])
print(embedding.shape)
embedding = torch.load(doc_embeddings_list[0])
print(embedding.shape)

torch.Size([768])
torch.Size([768])


In [None]:
liste_embeddings = obj_embeddings_list + doc_embeddings_list

In [None]:
obj_id_to_path = {i: os.path.splitext(os.path.basename(obj_embeddings_list[i]))[0] for i in range(len(obj_embeddings_list))}
doc_id_to_path = {i: os.path.splitext(os.path.basename(doc_embeddings_list[i]))[0] for i in range(len(doc_embeddings_list))}

In [None]:
with open(os.path.join(results_dir, 'obj_index_file_map.json'), "w") as f:
    json.dump(obj_id_to_path, f)

with open(os.path.join(results_dir, 'doc_index_file_map.json'), "w") as f:
    json.dump(doc_id_to_path, f)

In [None]:
len(obj_embeddings_list)

2127

In [None]:
# Objets stack embeddings
stacked_path = os.path.join(DATA, 'embeddings', 'dino3_stacked', 'obj_embeddings.pt')

if os.path.exists(stacked_path):
    obj_embeddings = torch.load(stacked_path)  # tensor (N, 768)
else:
    all_embeddings = []
    for path in tqdm(obj_embeddings_list):
        emb = torch.load(path)                  # [768]
        all_embeddings.append(emb.unsqueeze(0)) # [1, 768]

    obj_embeddings = torch.cat(all_embeddings, dim=0)  # tensor (N, 768)
    torch.save(obj_embeddings, stacked_path)

100%|██████████| 2127/2127 [00:09<00:00, 231.43it/s]


In [None]:
# Docs stack path
stacked_path = os.path.join(DATA, 'embeddings', 'dino3_stacked', 'doc_embeddings.pt')

if os.path.exists(stacked_path):
    doc_embeddings = torch.load(stacked_path)  # tensor (N, 768)
else:
    all_embeddings = []
    for path in tqdm(doc_embeddings_list):
        emb = torch.load(path)                  # [768]
        all_embeddings.append(emb.unsqueeze(0)) # [1, 768]

    doc_embeddings = torch.cat(all_embeddings, dim=0)  # tensor (N, 768)
    torch.save(doc_embeddings, stacked_path)

100%|██████████| 7705/7705 [09:39<00:00, 13.30it/s]


## Index Faiss

In [None]:
print(obj_embeddings.shape)     # torch.Size([N, 768])
print(doc_embeddings.shape)     # torch.Size([N, 768])

torch.Size([2127, 768])
torch.Size([7705, 768])


In [None]:
dim = 768
k = 20

In [None]:
if isinstance(doc_embeddings, torch.Tensor):
    doc_embeddings = doc_embeddings.cpu().numpy().astype(np.float32)
if isinstance(obj_embeddings, torch.Tensor):
    obj_embeddings = obj_embeddings.cpu().numpy().astype(np.float32)


faiss.normalize_L2(doc_embeddings)
faiss.normalize_L2(obj_embeddings)

index_docs = faiss.IndexFlatIP(dim)
index_objs = faiss.IndexFlatIP(dim)

index_docs.add(doc_embeddings)
index_objs.add(obj_embeddings)

print(f"Nombre de vecteurs dans index docs : {index_docs.ntotal}")
print(f"Nombre de vecteurs dans index obj: {index_objs.ntotal}")

Nombre de vecteurs dans index docs : 7705
Nombre de vecteurs dans index obj: 2127


In [None]:
faiss.write_index(index_objs, os.path.join(experiment_dir, "index_objs_cosine.faiss"))
faiss.write_index(index_docs, os.path.join(experiment_dir, "index_docs_cosine.faiss"))

In [None]:
# Intra-corpus
D_docs, I_docs = index_docs.search(doc_embeddings, k)
D_objs, I_objs = index_objs.search(obj_embeddings, k)

# Inter-corpus
D_doc2obj, I_doc2obj = index_objs.search(doc_embeddings, k)
D_obj2doc, I_obj2doc = index_docs.search(obj_embeddings, k)

In [None]:
obj_filenames = [os.path.basename(filepath).split('.')[0] for filepath in obj_embeddings_list]
doc_filenames = [os.path.basename(filepath).split('.')[0] for filepath in doc_embeddings_list]
paths_objs = [os.path.join(obj_img_dir, f'{filename}.jpg') for filename in obj_filenames]
paths_docs = [os.path.join(doc_img_dir, f'{filename}.jpg') for filename in doc_filenames]
print(paths_objs[0])


/content/drive/MyDrive/dataHN/data/processed/g-dino_obj_crops/GazetteDrouot_22972746_11_02.jpg


In [None]:

def results_to_df(I, D, query_paths, corpus_paths, prefix="top"):
    rows = []
    for q_idx, query_path in enumerate(query_paths):
        row = {"query": os.path.basename(query_path)}  # store filename only for query
        for rank, (idx, dist) in enumerate(zip(I[q_idx], D[q_idx])):
            # store filename only
            row[f"{prefix}{rank}_path"] = os.path.basename(corpus_paths[idx])
            row[f"{prefix}{rank}_score"] = float(dist)
        rows.append(row)
    return pd.DataFrame(rows)


In [None]:
df_docs_docs = results_to_df(I_docs, D_docs, paths_docs, paths_docs)
df_objs_objs = results_to_df(I_objs, D_objs, paths_objs, paths_objs)
df_doc2obj = results_to_df(I_doc2obj, D_doc2obj, paths_docs, paths_objs)
df_obj2doc = results_to_df(I_obj2doc, D_obj2doc, paths_objs, paths_docs)

In [None]:
df_docs_docs.to_csv(os.path.join(results_dir, "docs_within_docs.csv"), index=False)
df_objs_objs.to_csv(os.path.join(results_dir,"objs_within_objs.csv"), index=False)
df_doc2obj.to_csv(os.path.join(results_dir,"docs_to_objs.csv"), index=False)
df_obj2doc.to_csv(os.path.join(results_dir,"objs_to_docs.csv"), index=False)

# Similarité cluster test

In [None]:
cluster_json =  os.path.join(EXPERIMENTS, 'resnet_masks_cluster', 'masks_clusters.json')
with open (cluster_json, 'r') as f:
    dico_clusters = json.load(f)


In [28]:
dico_clusters.keys()

dict_keys(['5', '1', '6', '2', '3', '9', '0', '4', '8', '7'])

In [None]:
cluster7= dico_clusters['7'] #le cluster de masques sam2 sélectionné pour essai.

In [41]:
obj_embeddings_cluster7 = [
    file for file in obj_embeddings_list
    if os.path.splitext(os.path.basename(file))[0] in cluster7
]
doc_embeddings_cluster7 = [
    file for file in doc_embeddings_list
    if os.path.splitext(os.path.basename(file))[0] in cluster7
]

In [42]:
liste_embeddings_cluster7 = obj_embeddings_cluster7 + doc_embeddings_cluster7

In [45]:
# Images cluster stack path
stacked_path = os.path.join(DATA, 'embeddings', 'dino3_stacked', 'cluster7_embeddings.pt')

all_embeddings = []
for path in tqdm(liste_embeddings_cluster7):
    emb = torch.load(path)
    all_embeddings.append(emb.unsqueeze(0))

cluster7_embeddings = torch.cat(all_embeddings, dim=0)  # (N, 768)
torch.save(cluster7_embeddings, stacked_path)

100%|██████████| 881/881 [00:04<00:00, 211.14it/s]


In [None]:
print(cluster7_embeddings.shape)     #([N, 768])


torch.Size([881, 768])


In [48]:
if isinstance(cluster7_embeddings, torch.Tensor):
    cluster7_embeddings = cluster7_embeddings.cpu().numpy().astype(np.float32)

In [49]:
k = 20
dim = 768

faiss.normalize_L2(cluster7_embeddings)

index_cluster7 = faiss.IndexFlatIP(dim)

index_cluster7.add(cluster7_embeddings)

faiss.write_index(index_cluster7, os.path.join(experiment_dir, "index_cluster7_cosine.faiss"))

In [51]:
D_docs, I_docs = index_cluster7.search(cluster7_embeddings, k)

In [None]:
cluster7_filenames = []   
paths_cluster7 = []       

for emb_path in liste_embeddings_cluster7:
    fname = os.path.splitext(os.path.basename(emb_path))[0]
    cluster7_filenames.append(fname)

    obj_path = os.path.join(obj_img_dir, f"{fname}.jpg")
    doc_path = os.path.join(doc_img_dir, f"{fname}.jpg")

    if os.path.exists(obj_path):
        paths_cluster7.append(obj_path)
    elif os.path.exists(doc_path):
        paths_cluster7.append(doc_path)



In [56]:
df_cluster7 = results_to_df(I_docs, D_docs, paths_cluster7, paths_cluster7)

In [58]:
df_cluster7.to_csv(os.path.join(results_dir, "c7_within_c7.csv"), index=False)

# Rercherche de doublons

In [None]:
# def index_map_tensor(output_dir, embeddings, filenames):

#     d = embeddings.shape[1]  
#     index = faiss.IndexFlatL2(d)

#     all_vectors = embeddings.cpu().numpy().astype("float32")

#     faiss.normalize_L2(all_vectors)

#     index.add(all_vectors)

#     index_to_path = {i: os.path.basename(fname) for i, fname in enumerate(filenames)}

#     faiss.write_index(index, os.path.join(output_dir, "objets_vector.index"))

#     with open(os.path.join(output_dir, "index_map_path.json"), "w") as f:
#         json.dump(index_to_path, f)

#     return index, index_to_path

In [None]:
# Rechercher à partir d'un index pré-calculé
index = faiss.read_index(os.path.join(experiment_dir, "objets_vector.index"))


In [None]:
all_vectors = np.stack(index.reconstruct_n(0, index.ntotal)).astype("float32")

k = 2 
D, I = index.search(all_vectors, k)

duplicates = []
threshold = 1e-7  # ajuster

for i in range(len(all_vectors)):
    neighbor_idx = I[i, 1]  # premier vecteur le plus proche
    dist = D[i, 1]
    if dist < threshold:
        duplicates.append((i, neighbor_idx, dist))


In [None]:
# Retrouver les noms de fichiers avec un mapping json pré-calculé
with open(os.path.join(experiment_dir, "index_map_path.json"), 'r') as f:
    index_to_path = json.load(f)

duplicates_data = []

for i, j, dist in duplicates:
    file_i = os.path.basename(index_to_path[str(i)])
    file_j = os.path.basename(index_to_path[str(j)])
    duplicates_data.append({
        "idx1": i,
        "idx2": j,
        "file1": file_i,
        "file2": file_j,
        "distance": dist
    })

df_duplicates = pd.DataFrame(duplicates_data)

df_duplicates.to_csv(os.path.join(experiment_dir, "duplicates.csv"), index=False)

df_duplicates.head()

Unnamed: 0,idx1,idx2,file1,file2,distance
0,68,1981,GazetteDrouot_5424025_05_01.pt,GazetteDrouot_5424025_04_01.pt,0.0
1,95,1013,GazetteDrouot_15511693_01_01.pt,GazetteDrouot_16465688_01_01.pt,0.0
2,206,440,GazetteDrouot_21001152_03_01.pt,GazetteDrouot_17968923_03_01.pt,0.0
3,306,872,Christies_2021-09-30_296_01_02.pt,Christies_2021-09-30_296_03_02.pt,0.0
4,409,1077,GazetteDrouot_18793404_01_01.pt,GazetteDrouot_17582345_01_01.pt,0.0
