In [7]:
import dotenv

# Read environment variables from the shared .env file; the call's return
# value is what the cell displays.
# NOTE(review): presumably this provides the OpenAI credentials used by the
# embedding cells below -- confirm against the .env contents.
dotenv.load_dotenv(dotenv_path="../config/.env")

True

In [8]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
from uuid import uuid4

# --- Sample documents -------------------------------------------------------
# `Document.id` is a string field, so the ids are written as strings.
# These ids are informational only: the store keys each entry by the explicit
# `ids=` passed to `add_documents` below.
document_1 = Document(
    page_content="AKShare 是基于 Python 的财经数据接口库，目的是实现对股票、期货、期权、基金、外汇、债券、指数、加密货币等金融产品的基本面数据、实时和历史行情数据、衍生数据从数据采集、数据清洗到数据落地的一套工具，主要用于学术研究目的。",
    metadata={"source": "introduce"},
    id="1",
)

document_2 = Document(
    page_content="国家统计局-国家数据",
    # Alternative source value: "https://data.stats.gov.cn/"
    metadata={"source": "news"},
    id="2",
)

document_3 = Document(
    page_content="东方财富-经济数据一览-中国-企业商品价格指数, 数据区间从 20050101-至今。单次返回所有历史数据。",
    # Alternative source value: "https://data.eastmoney.com/cjsj/qyspjg.html"
    metadata={"source": "introduce"},
)

documents = [
    document_1,
    document_2,
    document_3,
]

# One random UUID per document; later cells address stored entries through
# this list (e.g. uuids[1] for the second document).
uuids = [str(uuid4()) for _ in documents]

# --- Vector store -----------------------------------------------------------
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vector_store = Chroma(
    collection_name="collection_name",
    embedding_function=embeddings,
    # Storage options: in-memory, local directory, local server, or cloud.
    # A persist directory keeps the collection on local disk.
    persist_directory="./chroma_chain_db",
)

# NOTE: the ids are freshly random and the store is persisted, so re-running
# this cell appends another copy of the three documents to the collection.
vector_store.add_documents(documents=documents, ids=uuids)

# The LangChain wrapper has no `get_or_create_collection` method (that lives
# on the chromadb client); the underlying collection it created is available
# through its `_collection` handle.
collection = vector_store._collection

KeyboardInterrupt: 

In [None]:
# Replacement content and metadata for the entry stored under uuids[1].
update_document_2 = Document(
    id=2,
    metadata={"source": "https://data.stats.gov.cn/"},
    page_content="这是国家统计局-国家数据的网站，能够获取一些公开的宏观统计数据。",
)

# Several documents can also be updated in one call:
#   vector_store.update_documents(ids=uuids[:2], documents=[update_document_1, update_document_2])
vector_store.update_document(uuids[1], update_document_2)

In [None]:
# vector_store.delete(ids=uuids[-1])

In [None]:
# Plain similarity search: request the two nearest documents whose metadata
# "source" equals "introduce".
simple_results = vector_store.similarity_search(
    query="获取统计宏观数据的网站", k=2, filter={"source": "introduce"}
)

# One bullet per hit: the page content followed by its metadata dict.
for doc in simple_results:
    print(f"* {doc.page_content} [{doc.metadata}]")

In [None]:
# Similarity search that also returns a score for each hit, restricted to
# documents whose metadata "source" equals "introduce".
results_with_score = vector_store.similarity_search_with_score(
    "获取统计宏观数据的网站",
    k=1,
    filter={"source": "introduce"},
)

# NOTE(review): Chroma reports a distance here (lower means closer), despite
# the "SIM" label in the output -- confirm against the library docs.
for res, score in results_with_score:
    # `.3f` prints the score with three decimals; the document attribute is
    # `metadata`.
    print(f"* [SIM={score:.3f}] {res.page_content} [{res.metadata}]")

In [None]:
# Embed the query text ourselves, then search by the resulting vector.
query_vector = embeddings.embed_query("能够统计宏观数据的网站有哪些？")

# The parameter is named `embedding` (singular); passing `embeddings=` leaves
# the required argument unset and the call fails with a TypeError.
results_by_vector = vector_store.similarity_search_by_vector(
    embedding=query_vector,
    k=2,
)

for res in results_by_vector:
    print(f"* {res.page_content} [{res.metadata}]")

In [None]:
# Expose the store as a retriever using maximal marginal relevance (MMR):
# fetch 3 candidates, return the single selected document.
mmr_options = {"k": 1, "fetch_k": 3}
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs=mmr_options)

# NOTE(review): extra keyword arguments such as `filter` are only forwarded to
# the underlying search by recent langchain-core releases -- confirm for the
# installed version.
retriever.invoke("获取统计宏观数据的网站", filter={"source": "introduce"})