In [1]:
from langchain.indexes import SQLRecordManager, index
from langchain_core.documents import Document
from langchain_elasticsearch import ElasticsearchStore
from langchain_chroma import Chroma
from langchain_community.embeddings.cloudflare_workersai import CloudflareWorkersAIEmbeddings
from dotenv import load_dotenv
import os
from langchain_text_splitters import CharacterTextSplitter

In [2]:
collection_name = "test_index"

# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Fail fast with a readable message: os.getenv() would silently yield None for a
# missing variable and the problem would only surface later inside the client.
_missing_env = [var for var in ("CF_ACCOUNT_ID", "CF_API_TOKEN") if not os.getenv(var)]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env)
        + ". Define them in the environment or in a .env file."
    )

# Embedding client for Cloudflare Workers AI, authenticated from the environment.
cf_embedding = CloudflareWorkersAIEmbeddings(
    account_id=os.environ["CF_ACCOUNT_ID"],
    api_token=os.environ["CF_API_TOKEN"],
    model_name="@cf/baai/bge-small-en-v1.5",
)

# Chroma collection persisted under ./.chroma-data, embedding with the client above.
vectorstore = Chroma(persist_directory="./.chroma-data",
                     collection_name=collection_name,
                     embedding_function=cf_embedding)


In [3]:
# The namespace combines the vector store kind and the collection name, so the
# records kept in the SQLite file are scoped to this particular collection.
namespace = f"chromadb/{collection_name}"
record_manager = SQLRecordManager(namespace, db_url="sqlite:///record_manager_cache.sql")

In [4]:
# Set up the record manager's schema in its SQLite database before indexing.
# NOTE(review): presumably safe to re-run against an existing database — confirm.
record_manager.create_schema()

In [5]:
# Two tiny sample documents; the "source" metadata entry is the key that the
# indexing call uses as its source id.
doc1 = Document(
    page_content="kitty",
    metadata={"source": "kitty.txt"},
)
doc2 = Document(
    page_content="doggy",
    metadata={"source": "doggy.txt"},
)

In [6]:
# Submit the same document five times in a single batch to demonstrate
# de-duplication: the recorded run reports only one document added.
index(
    [doc1] * 5,
    record_manager,
    vectorstore,
    cleanup=None,
    source_id_key="source",
)

{'num_added': 1, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [8]:
# Sanity check: fetch the single closest match for "kitty" from the vector store.
vectorstore.similarity_search(query="kitty", k=1)

[Document(metadata={'source': 'kitty.txt'}, page_content='kitty')]