# Vector DB Test

벡터디비 활용을 통해 임베딩 데이터 저장 및 검색 테스트<br>
>임베딩 추출 시간 절약, 데이터 저장 및 검색 효율성 증대 목적

## milvus test

오픈소스 벡터디비 milvus의 경량화 버전 milvus-lite 구동 테스트<br>
> 파이썬 연결 지원, 파일 기반 데이터 제어, 로컬 환경 구축

In [2]:
import fiftyone as fo

# Display the names of the FiftyOne datasets available in this environment.
fo.list_datasets()

['noise_composite_current',
 'noise_composite_merged',
 'noise_composite_pred',
 'tunelpdr_merged_all']

In [3]:
# Load an existing dataset by name and keep its first sample for inspection.
dataset = fo.load_dataset("noise_composite_merged")
sample = dataset.first()

# Display the sample's field schema (field name -> FiftyOne field object).
sample.get_field_schema()

OrderedDict([('id', <fiftyone.core.fields.ObjectIdField at 0x7fe0c58db440>),
             ('filepath',
              <fiftyone.core.fields.StringField at 0x7fe0c64c3a40>),
             ('tags', <fiftyone.core.fields.ListField at 0x7fe0c62a4680>),
             ('metadata',
              <fiftyone.core.fields.EmbeddedDocumentField at 0x7fe0c60149e0>),
             ('created_at',
              <fiftyone.core.fields.DateTimeField at 0x7fe0c6215340>),
             ('last_modified_at',
              <fiftyone.core.fields.DateTimeField at 0x7fe0c63fc0e0>),
             ('ground_truth',
              <fiftyone.core.fields.EmbeddedDocumentField at 0x7fe0c58db590>),
             ('source', <fiftyone.core.fields.StringField at 0x7fe0cc957860>),
             ('clip_embeddings',
              <fiftyone.core.fields.ListField at 0x7fe0c60d7680>)])

In [4]:
# Show the sample id together with its length and Python type, one per line.
print(sample.id, len(sample.id), type(sample.id), sep="\n")

67c91348df3013a998789607
24
<class 'str'>


In [5]:
from pymilvus import MilvusClient
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

# Connect to the file-backed database used for this test.
db_client = MilvusClient("../db/DAE_data.db")

# Primary keys are supplied by the caller (auto_id=False); dynamic fields are enabled.
schema = db_client.create_schema(auto_id=False, enable_dynamic_field=True)

# Primary key: string id, capped at 24 characters.
schema.add_field(
    field_name="sample_id",
    datatype=DataType.VARCHAR,
    is_primary=True,
    max_length=24,
)
# Vector field: 512-dimensional float embedding.
schema.add_field(
    field_name="embedding",
    datatype=DataType.FLOAT_VECTOR,
    dim=512,
)

db_client.create_collection(collection_name="test_collection", schema=schema)

# Report the load state of the collection right after creation.
res = db_client.get_load_state(collection_name="test_collection")
print(res)

{'state': <LoadState: NotLoad>}


In [13]:
# Display the names of the collections known to this client.
db_client.list_collections()

['test_collection']

In [14]:
# Load the collection, then print its load state to confirm the result.
db_client.load_collection(collection_name="test_collection")
print(db_client.get_load_state(collection_name="test_collection"))

{'state': <LoadState: Loaded>}


In [6]:
# Show the sample's type, its stored embedding, and the embedding's container type,
# one per line.
print(type(sample), sample.clip_embeddings, type(sample.clip_embeddings), sep="\n")

<class 'fiftyone.core.sample.Sample'>
[-0.16796875, -0.5458984375, 0.01401519775390625, 0.0887451171875, -0.2265625, -0.35107421875, 0.1907958984375, 0.7919921875, 0.2259521484375, -0.1925048828125, 0.310791015625, -0.01157379150390625, -0.06573486328125, 0.284912109375, -0.361328125, -0.305908203125, -0.50341796875, -0.046356201171875, 0.2213134765625, -0.1453857421875, 0.035858154296875, 0.344482421875, -0.00182342529296875, 0.1048583984375, -0.08221435546875, -0.11590576171875, -0.1595458984375, -0.404052734375, 0.440185546875, -0.08111572265625, 0.3642578125, -0.02398681640625, -0.08966064453125, -0.3193359375, -0.349853515625, -0.1917724609375, 0.3037109375, 0.11138916015625, 0.80322265625, 0.92431640625, -0.0288238525390625, -0.2108154296875, -0.51806640625, -0.06768798828125, 0.10906982421875, -0.0323486328125, 0.319091796875, 0.414794921875, 0.35498046875, -0.310546875, 0.12548828125, 0.06976318359375, 0.08294677734375, -0.20703125, -0.298095703125, -0.394775390625, 0.576660156

In [7]:
from tqdm import tqdm

# Build one insert row per sample: the sample id (used as the primary key)
# paired with the sample's `clip_embeddings` vector.
# A comprehension is used instead of an append loop so that the notebook-level
# `sample` variable is not rebound by a loop variable and no temporary dict
# is left behind in the namespace.
data = [
    {"sample_id": row.id, "embedding": row.clip_embeddings}
    for row in tqdm(dataset)
]

print(len(data))

100%|██████████| 38376/38376 [00:38<00:00, 1008.40it/s]

38376





In [16]:
# Show the type and value of the dataset name, one per line.
print(type(dataset.name), dataset.name, sep="\n")

<class 'str'>
noise_composite_merged


In [17]:
# Insert all prepared rows into the collection and print the insert summary.
#
# NOTE(review): this call previously passed `parition_name=dataset.name`
# (misspelled). The insert succeeded even though no partition with that name
# is created anywhere in this notebook, so the misspelled keyword appears to
# have been silently ignored and the rows went to the default partition.
# The dead argument is dropped here. Before reintroducing a correctly spelled
# `partition_name`, confirm that the backend supports partitions (Milvus
# Lite's documented limits appear to exclude them) and create the partition
# first.
res = db_client.insert(
    collection_name="test_collection",
    data=data,
)
print(res)

{'insert_count': 38376, 'ids': ['67c91348df3013a998789607', '67c91348df3013a998789614', '67c91348df3013a998789615', '67c91348df3013a998789616', '67c91348df3013a998789617', '67c91348df3013a998789618', '67c91348df3013a998789619', '67c91348df3013a99878961a', '67c91348df3013a99878961b', '67c91348df3013a998789671', '67c91348df3013a998789672', '67c91348df3013a998789673', '67c91348df3013a998789674', '67c91348df3013a998789675', '67c91348df3013a998789676', '67c91348df3013a998789677', '67c91348df3013a998789678', '67c91348df3013a998789679', '67c91348df3013a99878967a', '67c91348df3013a99878967b', '67c91348df3013a99878967c', '67c91348df3013a99878967d', '67c91348df3013a99878967e', '67c91348df3013a99878967f', '67c91348df3013a998789680', '67c91348df3013a998789681', '67c91348df3013a998789682', '67c91348df3013a998789683', '67c91348df3013a998789684', '67c91348df3013a998789685', '67c91348df3013a998789686', '67c91348df3013a998789687', '67c91348df3013a998789688', '67c91348df3013a998789689', '67c91348df3013a

In [28]:
# Fetch the stored row for the dataset's first sample by primary key.
#
# NOTE(review): this call previously passed `partition_names=dataset.name`,
# i.e. a bare string where a list of partition names is expected, and no
# partition with that name is created anywhere in this notebook. The argument
# is dropped so the lookup targets the whole collection, which is where the
# inserted rows live. Confirm partition support before reintroducing it as
# `partition_names=[dataset.name]`.
res = db_client.get(
    collection_name="test_collection",
    ids=dataset.first().id,
)
print(res)

data: ["{'sample_id': '67c91348df3013a998789607', 'embedding': [np.float32(-0.16796875), np.float32(-0.54589844), np.float32(0.014015198), np.float32(0.08874512), np.float32(-0.2265625), np.float32(-0.35107422), np.float32(0.1907959), np.float32(0.7919922), np.float32(0.22595215), np.float32(-0.19250488), np.float32(0.31079102), np.float32(-0.0115737915), np.float32(-0.06573486), np.float32(0.2849121), np.float32(-0.36132812), np.float32(-0.3059082), np.float32(-0.50341797), np.float32(-0.0463562), np.float32(0.22131348), np.float32(-0.14538574), np.float32(0.035858154), np.float32(0.34448242), np.float32(-0.0018234253), np.float32(0.1048584), np.float32(-0.082214355), np.float32(-0.11590576), np.float32(-0.1595459), np.float32(-0.40405273), np.float32(0.44018555), np.float32(-0.08111572), np.float32(0.3642578), np.float32(-0.023986816), np.float32(-0.089660645), np.float32(-0.31933594), np.float32(-0.34985352), np.float32(-0.19177246), np.float32(0.30371094), np.float32(0.11138916), n

In [31]:
# Print the first three components of the first sample's source embedding,
# one per line, for comparison with the values returned from the vector DB.
# The sample is fetched once and sliced instead of calling dataset.first()
# three times.
print(*dataset.first().clip_embeddings[:3], sep="\n")

-0.16796875
-0.5458984375
0.01401519775390625
