In [6]:
import weaviate, os
import weaviate.classes as wvc
from tqdm.auto import tqdm
import embed_anything
from embed_anything import EmbedData
from embed_anything.vectordb import Adapter
from embed_anything import EmbeddingModel, TextEmbedConfig, WhichModel


## Create Weaviate Adapter

In [90]:
from typing import List


class WeaviateAdapter(Adapter):
    """Adapter that stores embed_anything embeddings in a Weaviate Cloud cluster.

    Collections are created without a server-side vectorizer, so every object
    inserted through this adapter carries the vector supplied by the caller.
    """

    def __init__(self, api_key, url):
        """Connect to the Weaviate Cloud cluster.

        Args:
            api_key: API key used both for the base ``Adapter`` and for
                authenticating against the cluster.
            url: URL of the Weaviate Cloud cluster.
        """
        super().__init__(api_key)
        self.client = weaviate.connect_to_weaviate_cloud(
            cluster_url=url, auth_credentials=wvc.init.Auth.api_key(api_key)
        )
        if self.client.is_ready():
            print("Weaviate is ready")

    def create_index(self, index_name: str):
        """Create a collection named ``index_name`` and remember it as the target.

        The collection is configured with no vectorizer. Its name and handle are
        kept on ``self.index_name`` / ``self.collection`` for later calls.

        Returns:
            The collection object returned by the Weaviate client.
        """
        self.index_name = index_name
        self.collection = self.client.collections.create(
            index_name, vectorizer_config=wvc.config.Configure.Vectorizer.none()
        )
        return self.collection

    def convert(self, embeddings: List[EmbedData]):
        """Turn ``EmbedData`` items into Weaviate ``DataObject`` instances.

        Each object's properties are the item's metadata plus a ``"text"`` key
        holding the embedded text; the object's vector is the item's embedding.

        Returns:
            A list of ``DataObject`` in the same order as ``embeddings``.
        """
        data = []
        for embedding in embeddings:
            # Copy instead of aliasing: the caller's metadata must not gain a
            # "text" key as a side effect, and a missing (None) metadata must
            # not crash the item assignment below.
            properties = dict(embedding.metadata or {})
            properties["text"] = embedding.text
            data.append(
                wvc.data.DataObject(properties=properties, vector=embedding.embedding)
            )
        return data

    def upsert(self, data_: List[EmbedData]):
        """Convert ``data_`` and insert it into the collection set by ``create_index``.

        Returns:
            Whatever the Weaviate client's ``insert_many`` returns.
        """
        objects = self.convert(data_)

        return self.client.collections.get(self.index_name).data.insert_many(objects)

    def delete_index(self, index_name: str):
        """Delete the collection named ``index_name`` from the cluster."""
        self.client.collections.delete(index_name)

    def close(self):
        """Close the underlying Weaviate client connection.

        Call this once the adapter is no longer needed so the connection opened
        in ``__init__`` is released.
        """
        self.client.close()

In [91]:
# Read the connection settings from the environment so real credentials never
# have to be typed into (and saved with) the notebook. The placeholder strings
# remain as fallbacks for readers who prefer to edit them in place.
URL = os.environ.get("WEAVIATE_URL", "Put your URL here")
API_KEY = os.environ.get("WEAVIATE_API_KEY", "Put your API key here")
weaviate_adapter = WeaviateAdapter(API_KEY, URL)

Weaviate is ready


In [92]:
index_name = "Test_index"
# Drop any leftover collection first so this cell can be re-run from a fresh
# kernel without the create call failing on an existing name.
if weaviate_adapter.client.collections.exists(index_name):
    weaviate_adapter.delete_index(index_name)
# Reuse the variable so the collection that is checked/deleted and the one that
# is created can never drift apart.
weaviate_adapter.create_index(index_name)

<weaviate.collections.collection.sync.Collection at 0x19871876e90>

In [93]:
# Parse the PDF, embed its chunks with a MiniLM sentence-transformer model, and
# hand the Weaviate adapter to embed_file so the chunks are stored in Weaviate.
model = EmbeddingModel.from_pretrained_hf(
    WhichModel.Bert,
    model_id="sentence-transformers/all-MiniLM-L6-v2",
)
config = TextEmbedConfig(chunk_size=20, batch_size=32)
data = embed_anything.embed_file(
    "test_files/attention.pdf",
    embeder=model,
    config=config,
    adapter=weaviate_adapter,
)

## Query Search

In [94]:
# Embed the question with the same model and config used for the document, then
# keep only the vector of the single query that was passed in.
query_embeddings = embed_anything.embed_query(
    ["What is self attention"],
    embeder=model,
    config=config,
)
query_vector = query_embeddings[0].embedding

In [95]:
# Ask the collection for the two stored objects nearest to the query vector,
# requesting the certainty value as part of each hit's metadata.
certainty_metadata = wvc.query.MetadataQuery(certainty=True)
response = weaviate_adapter.collection.query.near_vector(
    near_vector=query_vector,
    limit=2,
    return_metadata=certainty_metadata,
)

In [96]:
import textwrap

# Print the text of every hit, wrapped at 120 columns, with a blank line
# between results.
for hit in response.objects:
    wrapped_text = textwrap.fill(hit.properties["text"], width=120)
    print(wrapped_text, end="\n\n")

Self-attention, sometimes called intra-attention is an attention mechanism relating different positions

self-attention and discuss its advantages over models such as [17, 18] and [9]

