In [1]:
#! pip install pymilvus

In [2]:
from sentence_transformers import SentenceTransformer
import glob
from PIL import Image

import numpy as np
import time
from pathlib import Path

from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)

fmt = "\n=== {:30} ===\n"
search_latency_fmt = "search latency = {:.4f}s"
num_entities, dim = 3000, 8

In [3]:
#connections.connect("default", host="34.168.23.74", port="19530")
connections.connect("default", host="milvus", port="32179")

In [4]:
collection_name = "fsdl_ip"

In [5]:
has = utility.has_collection(collection_name)
print(f"Does the photo embedding collection exist in milvus: {has}")

Does the photo embedding collection exist in milvus: True


In [6]:
dim = 512

In [7]:
if not has:
    fields = [
        FieldSchema(name="img_name", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100),
        FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim)
    ]

    schema = CollectionSchema(fields, "FDSL project embedding database")
    fsdl_collection = Collection(collection_name, schema, consistency_level="Strong")
else:
    fsdl_collection = Collection(collection_name)

In [8]:
fsdl_collection

<Collection>:
-------------
<name>: fsdl_ip
<partitions>: [{"name": "_default", "collection_name": "fsdl_ip", "description": ""}]
<description>: FDSL project embedding database
<schema>: {
  auto_id: False
  description: FDSL project embedding database
  fields: [{
    name: img_name
    description: 
    type: 21
    params: {'max_length': 100}
    is_primary: True
    auto_id: False
  }, {
    name: embeddings
    description: 
    type: 101
    params: {'dim': 512}
  }]
}

In [9]:
img_names = list(glob.glob('abo/images/**/*.jpg', recursive=True))
len(img_names)

398210

In [10]:
#num_images = 10
#num_images = 4096
num_images = len(img_names)
chunk_size = 1024
#if the embedding load fails, note the latest processed
checkpoint = 0

In [11]:
import torch
img_model = SentenceTransformer('clip-ViT-B-32', device='cpu')

def encode_images_cosine(img_model, img_names):
    print("Images:", len(img_names))
    img_emb=torch.empty(0)
    #split in chunks
    processed = 0
    chunks = [img_names[x:x+chunk_size] for x in range(0, len(img_names), chunk_size)]
    #list comprehension reaches the limit of open files; TODO - how to do this in parallel
    for chunk in chunks:
        if processed < checkpoint:
            processed = processed + len(chunk)
            continue
        start_time = time.time()
        emb_tensor = img_model.encode([Image.open(filepath) for filepath in chunk],
                                        batch_size=128, convert_to_tensor=True, 
                                        show_progress_bar=False)
        end_time = time.time()
        encoding_duration = end_time - start_time
        #print(emb_tensor.shape)
        #img_emb = torch.cat((img_emb, emb_tensor),0)
        fnames = [filename.split("/")[-1].split(".")[0] for filename in chunk]
        #print(len(fnames))
        #print(fnames[:10])
        #instead of cat, normalize and insert
        #place all points on the unit sphere
        emb_tensor = emb_tensor / np.linalg.norm(emb_tensor, axis=1, keepdims=True)
        processed = processed + emb_tensor.shape[0]
        #print(processed)
        entities = [
            list(fnames),
            emb_tensor.detach().numpy()
        ]
        start_time = time.time()
        insert_result = fsdl_collection.insert(entities)
        end_time = time.time()
        insert_duration = end_time - start_time
        print(f"{time.ctime()} Processed: {processed} Encoding: {encoding_duration}; Insert: {insert_duration}; Entities: {fsdl_collection.num_entities}")
        #insert
    #print(img_emb)
    #print(img_emb.shape)
    #place all points on the unit sphere
    #img_emb = img_emb / np.linalg.norm(img_emb, axis=1, keepdims=True)
    #return img_emb, img_names

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


In [12]:
#img_emb, img_names = encode_images_cosine(img_model, img_names[:num_images])
encode_images_cosine(img_model, img_names[:num_images])

Images: 398210
Fri May 19 20:13:46 2023 Processed: 1024 Encoding: 41.55492329597473; Insert: 0.8703515529632568; Entities: 1024
Fri May 19 20:14:35 2023 Processed: 2048 Encoding: 42.66278052330017; Insert: 0.8307642936706543; Entities: 2048
Fri May 19 20:15:21 2023 Processed: 3072 Encoding: 41.42938446998596; Insert: 0.8334188461303711; Entities: 3072
Fri May 19 20:16:07 2023 Processed: 4096 Encoding: 41.05150532722473; Insert: 0.9433002471923828; Entities: 4096
Fri May 19 20:16:52 2023 Processed: 5120 Encoding: 40.148574113845825; Insert: 0.8241574764251709; Entities: 5120
Fri May 19 20:17:32 2023 Processed: 6144 Encoding: 34.446739196777344; Insert: 0.8616952896118164; Entities: 6144
Fri May 19 20:18:13 2023 Processed: 7168 Encoding: 35.20821285247803; Insert: 0.933013916015625; Entities: 7168
Fri May 19 20:18:52 2023 Processed: 8192 Encoding: 35.8684024810791; Insert: 0.793999433517456; Entities: 8192
Fri May 19 20:19:33 2023 Processed: 9216 Encoding: 36.2226927280426; Insert: 0.824

Fri May 19 21:07:50 2023 Processed: 74752 Encoding: 38.13424491882324; Insert: 1.1046323776245117; Entities: 74752
Fri May 19 21:08:36 2023 Processed: 75776 Encoding: 38.2521755695343; Insert: 0.8451840877532959; Entities: 75776
Fri May 19 21:09:23 2023 Processed: 76800 Encoding: 38.73236322402954; Insert: 3.9149723052978516; Entities: 76800
Fri May 19 21:10:11 2023 Processed: 77824 Encoding: 38.569202184677124; Insert: 4.3128485679626465; Entities: 77824
Fri May 19 21:10:59 2023 Processed: 78848 Encoding: 38.03066086769104; Insert: 3.7337894439697266; Entities: 78848
Fri May 19 21:11:46 2023 Processed: 79872 Encoding: 37.79614591598511; Insert: 0.9909248352050781; Entities: 79872
Fri May 19 21:12:32 2023 Processed: 80896 Encoding: 38.563965797424316; Insert: 3.2756123542785645; Entities: 80896
Fri May 19 21:13:21 2023 Processed: 81920 Encoding: 37.76081919670105; Insert: 1.9863719940185547; Entities: 81920
Fri May 19 21:14:11 2023 Processed: 82944 Encoding: 37.95154666900635; Insert: 

Fri May 19 22:03:14 2023 Processed: 147456 Encoding: 36.759061098098755; Insert: 6.215966701507568; Entities: 147456
Fri May 19 22:04:05 2023 Processed: 148480 Encoding: 35.85289406776428; Insert: 3.8931479454040527; Entities: 148480
Fri May 19 22:05:02 2023 Processed: 149504 Encoding: 36.08091187477112; Insert: 11.651243925094604; Entities: 149504
Fri May 19 22:06:01 2023 Processed: 150528 Encoding: 35.416277170181274; Insert: 8.634471416473389; Entities: 150528
Fri May 19 22:06:44 2023 Processed: 151552 Encoding: 35.77549338340759; Insert: 1.9782185554504395; Entities: 151552
Fri May 19 22:07:34 2023 Processed: 152576 Encoding: 36.40388369560242; Insert: 6.705650091171265; Entities: 152576
Fri May 19 22:08:33 2023 Processed: 153600 Encoding: 35.39077639579773; Insert: 7.581112861633301; Entities: 153600
Fri May 19 22:09:25 2023 Processed: 154624 Encoding: 35.68463373184204; Insert: 8.682779312133789; Entities: 154624
Fri May 19 22:10:15 2023 Processed: 155648 Encoding: 35.68401622772

Fri May 19 23:01:05 2023 Processed: 220160 Encoding: 36.069321632385254; Insert: 5.320835113525391; Entities: 220160
Fri May 19 23:01:52 2023 Processed: 221184 Encoding: 36.279327392578125; Insert: 4.173388242721558; Entities: 221184
Fri May 19 23:02:38 2023 Processed: 222208 Encoding: 35.854745864868164; Insert: 2.754195213317871; Entities: 222208
Fri May 19 23:03:24 2023 Processed: 223232 Encoding: 36.68797779083252; Insert: 5.3698999881744385; Entities: 223232
Fri May 19 23:04:13 2023 Processed: 224256 Encoding: 36.460036516189575; Insert: 6.578221321105957; Entities: 224256
Fri May 19 23:05:01 2023 Processed: 225280 Encoding: 36.130685329437256; Insert: 6.588853120803833; Entities: 225280
Fri May 19 23:05:53 2023 Processed: 226304 Encoding: 35.96863913536072; Insert: 7.667951583862305; Entities: 226304
Fri May 19 23:06:40 2023 Processed: 227328 Encoding: 35.75880551338196; Insert: 4.144246816635132; Entities: 227328
Fri May 19 23:07:40 2023 Processed: 228352 Encoding: 35.0484304428

Fri May 19 23:52:11 2023 Processed: 292864 Encoding: 37.66360926628113; Insert: 0.9734258651733398; Entities: 292864
Fri May 19 23:52:53 2023 Processed: 293888 Encoding: 37.46112775802612; Insert: 0.9046320915222168; Entities: 293888
Fri May 19 23:53:35 2023 Processed: 294912 Encoding: 37.48320484161377; Insert: 0.848383903503418; Entities: 294912
Fri May 19 23:54:17 2023 Processed: 295936 Encoding: 37.579830169677734; Insert: 0.8306748867034912; Entities: 295936
Fri May 19 23:55:00 2023 Processed: 296960 Encoding: 37.57612466812134; Insert: 0.966691255569458; Entities: 296960
Fri May 19 23:55:42 2023 Processed: 297984 Encoding: 37.94548320770264; Insert: 0.8981075286865234; Entities: 297984
Fri May 19 23:56:24 2023 Processed: 299008 Encoding: 37.888339042663574; Insert: 0.834942102432251; Entities: 299008
Fri May 19 23:57:11 2023 Processed: 300032 Encoding: 37.78278851509094; Insert: 4.8607542514801025; Entities: 300032
Fri May 19 23:57:51 2023 Processed: 301056 Encoding: 37.261452436

Sat May 20 00:41:31 2023 Processed: 364544 Encoding: 37.306480407714844; Insert: 0.9062275886535645; Entities: 364544
Sat May 20 00:42:13 2023 Processed: 365568 Encoding: 37.42672538757324; Insert: 0.7260265350341797; Entities: 365568
Sat May 20 00:42:57 2023 Processed: 366592 Encoding: 37.607197999954224; Insert: 0.7540163993835449; Entities: 366592
Sat May 20 00:43:39 2023 Processed: 367616 Encoding: 37.42505669593811; Insert: 0.7177696228027344; Entities: 367616
Sat May 20 00:44:21 2023 Processed: 368640 Encoding: 37.40157890319824; Insert: 0.7529234886169434; Entities: 368640
Sat May 20 00:45:03 2023 Processed: 369664 Encoding: 37.48956322669983; Insert: 0.7693252563476562; Entities: 369664
Sat May 20 00:45:46 2023 Processed: 370688 Encoding: 37.53352189064026; Insert: 0.8294329643249512; Entities: 370688
Sat May 20 00:46:28 2023 Processed: 371712 Encoding: 37.51129627227783; Insert: 0.7244632244110107; Entities: 371712
Sat May 20 00:47:10 2023 Processed: 372736 Encoding: 37.652517

In [13]:
#create an innner product metric for cosine similarity
#inner product metric measures orientation
index = {
    "index_type": "IVF_FLAT",
    "metric_type": "IP",
    "params": {"nlist": 128}
}
fsdl_collection.create_index("embeddings", index)

Status(code=0, message='')

In [14]:
#np.save("../data/abo-cosine.npy", img_emb_np)

In [15]:
test_images = img_names[:2]

In [16]:
emb_tensor = img_model.encode([Image.open(filepath) for filepath in test_images],
                                        batch_size=128, convert_to_tensor=True, 
                                        show_progress_bar=False)

In [17]:
entities = [
            test_images,
            emb_tensor.detach().numpy()
        ]

In [18]:
fsdl_collection.load()
fsdl_collection

<Collection>:
-------------
<name>: fsdl_ip
<partitions>: [{"name": "_default", "collection_name": "fsdl_ip", "description": ""}]
<description>: FDSL project embedding database
<schema>: {
  auto_id: False
  description: FDSL project embedding database
  fields: [{
    name: img_name
    description: 
    type: 21
    params: {'max_length': 100}
    is_primary: True
    auto_id: False
  }, {
    name: embeddings
    description: 
    type: 101
    params: {'dim': 512}
  }]
}

In [19]:
len(entities[-1][-2:])

2

In [20]:
vectors_to_search = entities[-1][-2:]
search_params = {
    "metric_type": "IP",
    "params": {"nprobe": 10},
}

start_time = time.time()
result = fsdl_collection.search(vectors_to_search, "embeddings", 
                                search_params, limit=3, output_fields=["img_name"])
end_time = time.time()

for hits in result:
    for hit in hits:
        print(f"hit: {hit}, img_name: {hit.entity.get('img_name')}")
print(search_latency_fmt.format(end_time-start_time))

hit: (distance: 10.326423645019531, id: 401a5f0f), img_name: 401a5f0f
hit: (distance: 10.002557754516602, id: 885653a3), img_name: 885653a3
hit: (distance: 9.985244750976562, id: 35fe9f29), img_name: 35fe9f29
hit: (distance: 10.690556526184082, id: 40443b02), img_name: 40443b02
hit: (distance: 10.654426574707031, id: b8747b16), img_name: b8747b16
hit: (distance: 9.944278717041016, id: 9ad835ca), img_name: 9ad835ca
search latency = 0.5041s


In [21]:
import pandas as pd
df = pd.DataFrame(['https://storage.googleapis.com/fsdl_images/semsearch/88aad2c1.jpg', 
                   'https://storage.googleapis.com/fsdl_images/semsearch/ce7b4545.jpg',
                   'https://storage.googleapis.com/fsdl_images/semsearch/ef9286da.jpg',
                   'https://storage.googleapis.com/fsdl_images/semsearch/6f03bc7b.jpg', 
                   'https://storage.googleapis.com/fsdl_images/semsearch/dc90f943.jpg',
                   'https://storage.googleapis.com/fsdl_images/semsearch/43bd06ef.jpg'])
df.columns=["images"]

def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val,val)

df.style.format(make_clickable)

Unnamed: 0,images
0,https://storage.googleapis.com/fsdl_images/semsearch/88aad2c1.jpg
1,https://storage.googleapis.com/fsdl_images/semsearch/ce7b4545.jpg
2,https://storage.googleapis.com/fsdl_images/semsearch/ef9286da.jpg
3,https://storage.googleapis.com/fsdl_images/semsearch/6f03bc7b.jpg
4,https://storage.googleapis.com/fsdl_images/semsearch/dc90f943.jpg
5,https://storage.googleapis.com/fsdl_images/semsearch/43bd06ef.jpg


In [22]:
from IPython.display import Image, HTML

def path_to_image_html(path):
    '''
     This function essentially convert the image url to 
     '<img src="'+ path + '"/>' format. And one can put any
     formatting adjustments to control the height, aspect ratio, size etc.
     within as in the below example. 
    '''

    return '<img src="'+ path + '" style=max-height:124px;"/>'

HTML(df.to_html(escape=False ,formatters=dict(images=path_to_image_html)))

Unnamed: 0,images
0,
1,
2,
3,
4,
5,


In [23]:
import semsearch
from semsearch.encoding import clip_encoding


ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


In [24]:
query_text =  "black shoes"
query_embed = clip_encoding.get_text_embeddings(query_text).detach().numpy()
len(query_embed[0])

512

In [25]:
vectors_to_search = [query_embed[0]]
search_params = {
    "metric_type": "IP",
    "params": {"nprobe": 10},
}

start_time = time.time()
result = fsdl_collection.search(vectors_to_search, "embeddings", 
                                search_params, limit=3, output_fields=["img_name"])
end_time = time.time()

for hits in result:
    for hit in hits:
        print(f"hit: {hit}, img_name: {hit.entity.get('img_name')}")
print(search_latency_fmt.format(end_time-start_time))

hit: (distance: 0.3150938153266907, id: 2cb595b3), img_name: 2cb595b3
hit: (distance: 0.31228405237197876, id: 820ee923), img_name: 820ee923
hit: (distance: 0.3102591037750244, id: 616383f4), img_name: 616383f4
search latency = 0.4699s


In [26]:
df = pd.DataFrame(['https://storage.googleapis.com/fsdl_images/semsearch/269892fb.jpg', 
                   'https://storage.googleapis.com/fsdl_images/semsearch/a564aec1.jpg',
                   'https://storage.googleapis.com/fsdl_images/semsearch/340dd50c.jpg',
                  ])
df.columns=["images"]

def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val,val)

df.style.format(make_clickable)

Unnamed: 0,images
0,https://storage.googleapis.com/fsdl_images/semsearch/269892fb.jpg
1,https://storage.googleapis.com/fsdl_images/semsearch/a564aec1.jpg
2,https://storage.googleapis.com/fsdl_images/semsearch/340dd50c.jpg


In [27]:
HTML(df.to_html(escape=False ,formatters=dict(images=path_to_image_html)))

Unnamed: 0,images
0,
1,
2,
