In [None]:
from langchain.indexes import SQLRecordManager, index
from langchain_core.documents import Document
from langchain_elasticsearch import ElasticsearchStore
from langchain_chroma import Chroma
from langchain_community.embeddings.cloudflare_workersai import CloudflareWorkersAIEmbeddings
from dotenv import load_dotenv
import os
from langchain_text_splitters import CharacterTextSplitter

In [ ]:
# Single source of truth for the Elasticsearch index name; the record-manager
# namespace is derived from the same value, so the two must never diverge.
collection_name = "test_index"

# Populate the process environment via python-dotenv; CF_ACCOUNT_ID and
# CF_API_TOKEN are read from it just below.
load_dotenv()

# Embedding client for Cloudflare Workers AI.
cf_embedding = CloudflareWorkersAIEmbeddings(
    account_id=os.getenv('CF_ACCOUNT_ID'),
    api_token=os.getenv('CF_API_TOKEN'),
    model_name="@cf/baai/bge-small-en-v1.5",
)
# Use collection_name rather than repeating the literal so that renaming the
# collection updates the index and the record-manager namespace together.
vectorstore = ElasticsearchStore(
    es_url="http://localhost:9200", index_name=collection_name, embedding=cf_embedding
)

In [ ]:
# The namespace combines the backend name with the collection name.
namespace = f"elasticsearch/{collection_name}"
# Record state is persisted in a local SQLite database file.
record_manager = SQLRecordManager(
    namespace,
    db_url="sqlite:///record_manager_cache.sql",
)

In [ ]:
# Create the schema before using the record manager.
record_manager.create_schema()

In [ ]:
# Two small sample documents; `source` metadata is the key later passed to
# index() as source_id_key.
doc1 = Document(
    page_content="kitty",
    metadata={"source": "kitty.txt"},
)
doc2 = Document(
    page_content="doggy",
    metadata={"source": "doggy.txt"},
)

In [ ]:
# Indexing into an empty vector store:
def _clear():
    """Hacky helper that clears previously indexed content.

    It indexes an empty document list with ``cleanup="full"``. See the
    "full" deletion mode section of this notebook to understand why that
    removes existing content.
    """
    index(
        [],
        record_manager,
        vectorstore,
        cleanup="full",
        source_id_key="source",
    )

In [ ]:
# Submit the very same document object five times in one batch, with
# cleanup=None; the returned summary is displayed as the cell output.
index(
    [doc1] * 5,
    record_manager,
    vectorstore,
    cleanup=None,
    source_id_key="source",
)

In [ ]:
# Reset state before the next example by running the `_clear` helper
# (an empty index() call with cleanup="full").
_clear()

In [ ]:
# First pass: index both sample documents with cleanup=None.
index(
    [doc1, doc2],
    record_manager,
    vectorstore,
    cleanup=None,
    source_id_key="source",
)

In [ ]:
# The second time around, everything will be skipped:
index(
    [doc1, doc2],
    record_manager,
    vectorstore,
    cleanup=None,
    source_id_key="source",
)

In [ ]:
# "incremental" deletion mode

In [ ]:
# Reset state before the incremental-mode example by running the `_clear`
# helper (an empty index() call with cleanup="full").
_clear()

In [ ]:
# Index both sample documents using cleanup="incremental".
index(
    [doc1, doc2], record_manager, vectorstore,
    cleanup="incremental", source_id_key="source",
)

In [ ]:
# Index an empty batch in incremental mode.
# NOTE(review): presumably nothing changes in this mode (unlike
# cleanup="full") — confirm against the LangChain indexing docs.
index(
    [],
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

In [ ]:
# If we mutate a document, the new version will be written and all old
# versions sharing the same source will be deleted.
changed_doc_2 = Document(
    page_content="puppy",
    metadata={"source": "doggy.txt"},
)

In [ ]:
# Re-index only the changed document (same `source` as doc2) in
# incremental mode.
index(
    [changed_doc_2], record_manager, vectorstore,
    cleanup="incremental", source_id_key="source",
)

In [ ]:
# "full" deletion mode
# In full mode, the user should pass the full universe of content that
# should be indexed into the indexing function.
# Any documents that are not passed into the indexing function and are
# present in the vector store will be deleted!
# This behavior is useful for handling deletions of source documents.

_clear()

all_docs = [doc1, doc2]

index(
    all_docs,
    record_manager,
    vectorstore,
    cleanup="full",
    source_id_key="source",
)

In [ ]:
# Remove the first entry (doc1) from the list that was just indexed.
# NOTE(review): no index(all_docs, ..., cleanup="full") call follows in this
# view, so the removal is not propagated to the vector store here — confirm
# whether a full-mode re-index was intended after this step.
del all_docs[0]

In [ ]:
# Source

In [ ]:
# Rebind doc1/doc2 to longer texts with the same `source` values; these are
# the inputs handed to the text splitter in this notebook.
doc1 = Document(
    page_content="kitty kitty kitty kitty kitty",
    metadata={"source": "kitty.txt"},
)
doc2 = Document(
    page_content="doggy doggy the doggy",
    metadata={"source": "doggy.txt"},
)

In [ ]:
# Split both documents on the character "t" (separator kept) with
# chunk_size=12 and chunk_overlap=2, then display the resulting documents.
new_docs = CharacterTextSplitter(
    separator="t",
    keep_separator=True,
    chunk_size=12,
    chunk_overlap=2,
).split_documents([doc1, doc2])
new_docs

In [ ]:
# Query the vector store for content similar to "dog", requesting k=30
# results; the result is displayed as the cell output.
vectorstore.similarity_search("dog", k=30)