In [None]:
import os
from glob import glob
from tqdm import tqdm
from abraia.inference.clip import Clip
from abraia.inference.ops import cosine_similarity
from abraia.utils import dataset, load_image, image_base64

clip_model = Clip()

output_dir = 'dataset'
search_query = "vineyard leaves"
dataset.download(search_query, limit=200, output_dir=output_dir)

image_paths = glob(os.path.join(output_dir, '*.jpg'))
image_embeddings = [clip_model.get_image_embeddings([load_image(image_path)])[0] for image_path in tqdm(image_paths)]

[0;93m2025-03-24 20:33:56.799486 [W:onnxruntime:, coreml_execution_provider.cc:107 GetCapability] CoreMLExecutionProvider::GetCapability, number of partitions supported by CoreML: 6 number of nodes in the graph: 889 number of nodes supported by CoreML: 30[m
[0;93m2025-03-24 20:33:58.404728 [W:onnxruntime:, helper.cc:82 IsInputSupported] CoreML does not support input dim > 16384. Input:token_embedding.weight, shape: {49408,512}[m
[0;93m2025-03-24 20:33:58.404937 [W:onnxruntime:, coreml_execution_provider.cc:107 GetCapability] CoreMLExecutionProvider::GetCapability, number of partitions supported by CoreML: 4 number of nodes in the graph: 910 number of nodes supported by CoreML: 18[m
100%|██████████| 176/176 [00:08<00:00, 20.03it/s]


In [13]:
import pandas as pd
from IPython.display import display, HTML

def compare_pairwise_similarity(image_paths, image_embeddings):
    data = []
    for i, (path1, embeddings1) in enumerate(zip(image_paths, image_embeddings)):
        for j, (path2, embeddings2) in enumerate(zip(image_paths, image_embeddings)):
            if i < j:  # To avoid duplicate pairs and self-comparison
                similarity = cosine_similarity(embeddings1, embeddings2)
                data.append({"path1": path1, "path2": path2, "similarity": similarity})
    return data

def add_image_cell(image_path):
    try:
        src = image_base64(load_image(image_path))
        return f'<img src="{src}" width="100">'
    except:
        return ''

def display_df(df):
    display(HTML(df.to_html(escape=False)))

similarity_data = compare_pairwise_similarity(image_paths, image_embeddings)
similarity_df = pd.DataFrame(similarity_data)

for image_path in similarity_df[similarity_df['similarity'] > 0.98]['path2']:
    if image_path in image_paths:
        idx = image_paths.index(image_path)
        print(idx, image_path)
        image_paths.pop(idx)
        image_embeddings.pop(idx)
        if os.path.exists(image_path):
            os.remove(image_path)

threshold = 0.96
similar_images = similarity_df[similarity_df['similarity'] > threshold].copy()
print(len(similar_images))

# Apply the function to the dataframe
df_similar = similar_images.head(50).copy()
df_similar['image1'] = df_similar['path1'].apply(add_image_cell)
df_similar['image2'] = df_similar['path2'].apply(add_image_cell)
display_df(df_similar)

6


Unnamed: 0,path1,path2,similarity,image1,image2
1272,dataset/Image_151.jpg,dataset/Image_136.jpg,0.961597,,
6635,dataset/Image_76.jpg,dataset/Image_48.jpg,0.962674,,
6694,dataset/Image_76.jpg,dataset/Image_100.jpg,0.974849,,
7084,dataset/Image_63.jpg,dataset/Image_100.jpg,0.965824,,
8437,dataset/Image_48.jpg,dataset/Image_100.jpg,0.961129,,
8558,dataset/Image_9.jpg,dataset/Image_4.jpg,0.964188,,


In [14]:
data = [{"path": path, "vector": vector} for path, vector  in tqdm(zip(image_paths, image_embeddings))]
df = pd.DataFrame(data)

texts = [search_query]
text_embeddings = clip_model.get_text_embeddings(texts)[0]

df['score'] = df['vector'].apply(lambda x: cosine_similarity(x, text_embeddings))

df_filter = df[df['score'] < 0.25].sort_values('score', ascending=False)
df_filter['image'] = df_filter['path'].apply(add_image_cell)
display_df(df_filter[['path', 'score', 'image']])

176it [00:00, 443361.86it/s]


Unnamed: 0,path,score,image


In [11]:
for image_path in tqdm(df_filter['path']):
    if os.path.exists(image_path):
        os.remove(image_path)

0it [00:00, ?it/s]
