## Prepare

In [None]:
!pip install -r lancedb-dev/requirements.txt

## Using a predefined embedding function (EF), for example Ollama

In [None]:
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

# Look up the "ollama" embedding function in the registry and create an
# instance of it without passing any overrides.
registry = get_registry()
ollama = registry.get("ollama").create()

# Override the default host field (presumably the Ollama server endpoint).
# NOTE(review): this address looks specific to a Docker setup — adjust as needed.
ollama.host = "http://host.docker.internal:30491"

# Table schema: `text` is declared as the source field and `vector` as the
# vector field of the ollama embedding function; the vector width comes from
# ollama.ndims(), which is evaluated when this class body runs.
class TinyStory(LanceModel):
    vector: Vector(ollama.ndims()) = ollama.VectorField()
    text: str = ollama.SourceField()

## Add table data

In [None]:
import lancedb
import json

# Connect to the local database directory and (re)create the table from the
# TinyStory schema; mode='overwrite' replaces any existing table of that name.
db = lancedb.connect("data/sampledb")
table = db.create_table("tiny_stories", schema=TinyStory, mode='overwrite')

# Load the text of the first 100 stories. The encoding is given explicitly so
# the JSON file is decoded the same way regardless of the platform's locale.
with open("datasets/tinystories-00.json", 'r', encoding="utf-8") as ifile:
    story_text = [s['story'] for s in json.load(ifile)[:100]]

# Only the source text is supplied here; presumably the embedding function
# bound to the schema fills in `vector` on insert.
# NOTE: the lancedb ollama embedding interface needs improvement.
table.add([{"text": u} for u in story_text])

## Query table (embeddings) semantically

In [None]:
# Search the table with a plain-text query (presumably embedded by the
# schema's embedding function) and keep the 10 closest rows as a DataFrame.
results = (
    table.search("dog")
        .limit(10)
        .to_pandas()
)
# Make the DataFrame the cell's last expression so the notebook displays it;
# previously the result was computed but never shown.
results

## User-defined embedding functions

Here we define a new embedding function (EF) backed by sentence-transformers.

In [62]:
from lancedb.embeddings.registry import register
from lancedb.embeddings.base import TextEmbeddingFunction
from lancedb.util import attempt_import_or_raise


# Loaded models keyed by model name, so that each model is constructed at most
# once instead of on every embedding call.
_MODEL_CACHE = {}


def _load_sentence_transformer(name):
    """Return the SentenceTransformer for ``name``, loading it on first use."""
    if name not in _MODEL_CACHE:
        # Imported lazily so the dependency is only required when a model is used.
        from sentence_transformers import SentenceTransformer
        _MODEL_CACHE[name] = SentenceTransformer(name)
    return _MODEL_CACHE[name]


# NOTE(review): lancedb may already ship an embedding function under this
# alias; presumably this registration replaces it — confirm.
@register("sentence-transformers")
class SentenceTransformerEmbeddings(TextEmbeddingFunction):
    """Text embedding function backed by a sentence-transformers model.

    ``name`` is the model identifier passed to ``SentenceTransformer``.
    """

    name: str = "sentence-transformers/all-MiniLM-L6-v2"
    # set more default instance vars like device, etc.

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Embedding width; computed lazily on the first ndims() call.
        self._ndims = None

    def generate_embeddings(self, texts):
        """Encode ``texts`` and return one embedding (list of floats) per text.

        A single string is treated as one text rather than being split into
        its characters by ``list()``.
        """
        if isinstance(texts, str):
            texts = [texts]
        return self._embedding_model().encode(list(texts)).tolist()

    def ndims(self):
        """Return the embedding width, probing the model once and caching it."""
        if self._ndims is None:
            # Embed a single probe string (as a one-element list) and measure it.
            self._ndims = len(self.generate_embeddings(["foo"])[0])
        return self._ndims

    def _embedding_model(self):
        """Return the (cached) SentenceTransformer for ``self.name``."""
        return _load_sentence_transformer(self.name)

In [63]:
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import EmbeddingFunctionRegistry
import pandas as pd

# Fetch the registry instance and build the embedding function registered
# under the "sentence-transformers" alias.
registry = EmbeddingFunctionRegistry.get_instance()
stransformer = registry.get("sentence-transformers").create()


class TextModelSchema(LanceModel):
    # `vector` is the embedding field, sized by stransformer.ndims();
    # `text` is the source field it is derived from.
    vector: Vector(stransformer.ndims()) = stransformer.VectorField()
    text: str = stransformer.SourceField()


# (Re)create the demo table from the schema above.
db = lancedb.connect("data/sampledb")
tbl = db.create_table("table", schema=TextModelSchema, mode='overwrite')

# Insert three short texts; only the `text` column is provided.
sample_rows = pd.DataFrame({"text": ["halo", "world", "你好"]})
tbl.add(sample_rows)

# Query with a plain string and print up to five rows of the result.
query = tbl.search("hello").limit(5)
result = query.to_pandas()
print(result)

                                              vector   text  _distance
0  [-0.030238196, 0.03164673, -0.06337431, -0.013...  world   1.309270
1  [0.008483132, 0.01367103, -0.050205357, 0.0202...   halo   1.415382
2  [0.004036582, 0.015092085, 0.080479845, 0.0011...     你好   1.491913
