# Chroma
本筆記本介紹如何開始使用Chroma向量儲存。
- https://python.langchain.com/docs/integrations/vectorstores/chroma/

In [1]:
# 安裝套件
!uv pip install -qU "langchain-chroma"

In [2]:
# llm模型設定
# https://build.nvidia.com/deepseek-ai/deepseek-r1
# nvapi-xxx
import getpass
import os
if not os.environ.get("NVIDIA_API_KEY"):
  os.environ["NVIDIA_API_KEY"] = getpass.getpass("Enter API key for NVIDIA: ")

from langchain.chat_models import init_chat_model
llm = init_chat_model("meta/llama-4-maverick-17b-128e-instruct", model_provider="nvidia")

Enter API key for NVIDIA:  ········




In [3]:
# https://jina.ai/
# jina_xxx
import getpass
import os
if not os.environ.get("JINA_API_KEY"):
  os.environ["JINA_API_KEY"] = getpass.getpass("Enter API key for Voyage AI: ")

from langchain_community.embeddings import JinaEmbeddings
embeddings = JinaEmbeddings(
    jina_api_key=os.environ["JINA_API_KEY"], model_name="jina-embeddings-v3"
)

Enter API key for Voyage AI:  ········


In [4]:
from langchain_chroma import Chroma

vector_store = Chroma(
    #collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db30",  # Where to save data locally, remove if not necessary
)

In [5]:
from uuid import uuid4

from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
    id=2,
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
    id=3,
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
    id=4,
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
    id=5,
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
    id=6,
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
    id=7,
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
    id=8,
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
    id=9,
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
    id=10,
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
uuids = [str(uuid4()) for _ in range(len(documents))]

vector_store.add_documents(documents=documents, ids=uuids)

['1aa3d78d-c1c4-4f8f-b9c9-f83d4d5c9884',
 'f5030079-91cc-4652-9243-e1910da2ae6c',
 '13268674-6721-46a6-ae7d-b409c71f6b55',
 '7d929b87-a1c1-4de6-9584-844f09986730',
 'a3789186-b526-4cad-b320-dd81309ff9c7',
 '2b69b396-2195-44ec-bd11-c23f1d8affae',
 'd93f6e7f-e17c-4953-bdba-a906d6c11afe',
 'bd4ff150-933c-4601-8444-7732de415660',
 'd99c3215-94fb-4e33-9831-3cde708b739e',
 '2dabb0f6-8cba-4906-8bb2-f30f6e8eb952']

In [6]:
# 輸出所有儲存的 document（包括 id, page_content, metadata）
all_data = vector_store._collection.get()

# 印出每筆紀錄的內容
for i in range(len(all_data["ids"])):
    print(f"ID: {all_data['ids'][i]}")
    print(f"Document: {all_data['documents'][i]}")
    print(f"Metadata: {all_data['metadatas'][i]}")
    print("="*40)

ID: 1aa3d78d-c1c4-4f8f-b9c9-f83d4d5c9884
Document: I had chocolate chip pancakes and scrambled eggs for breakfast this morning.
Metadata: {'source': 'tweet'}
ID: f5030079-91cc-4652-9243-e1910da2ae6c
Document: The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.
Metadata: {'source': 'news'}
ID: 13268674-6721-46a6-ae7d-b409c71f6b55
Document: Building an exciting new project with LangChain - come check it out!
Metadata: {'source': 'tweet'}
ID: 7d929b87-a1c1-4de6-9584-844f09986730
Document: Robbers broke into the city bank and stole $1 million in cash.
Metadata: {'source': 'news'}
ID: a3789186-b526-4cad-b320-dd81309ff9c7
Document: Wow! That was an amazing movie. I can't wait to see it again.
Metadata: {'source': 'tweet'}
ID: 2b69b396-2195-44ec-bd11-c23f1d8affae
Document: Is the new iPhone worth the price? Read this review to find out.
Metadata: {'source': 'website'}
ID: d93f6e7f-e17c-4953-bdba-a906d6c11afe
Document: The top 10 soccer players in the world rig

In [14]:
# 更新
updated_document_1 = Document(
    page_content="I had chocolate chip pancakes and fried eggs for breakfast this morning.",
    metadata={"source": "tweet2"},
    id=1,
)

vector_store.update_document(document_id="d149b42a-724d-44b8-9464-aa9d79bc69a8", document=updated_document_1)

In [None]:
# 刪除
vector_store.delete(ids=[
    "d149b42a-724d-44b8-9464-aa9d79bc69a8",
    "5bdc786d-7737-43a6-aa42-4f80320de6bd"
])

In [7]:
# 搜尋一 cosine distance
results = vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    k=3,
    filter={"source": "tweet"},
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* Building an exciting new project with LangChain - come check it out! [{'source': 'tweet'}]
* LangGraph is the best framework for building stateful, agentic applications! [{'source': 'tweet'}]
* I had chocolate chip pancakes and scrambled eggs for breakfast this morning. [{'source': 'tweet'}]


In [8]:
# 搜尋二 cosine distance
results = vector_store.similarity_search_with_score(
    "LangChain provides abstractions to make working with LLMs easy",
    k=3,
    filter={"source": "tweet"},
)
for res, score in results:
    print(f"* [SIM={score:3f}] {res.page_content} [{res.metadata}]")

* [SIM=0.528160] Building an exciting new project with LangChain - come check it out! [{'source': 'tweet'}]
* [SIM=0.679338] LangGraph is the best framework for building stateful, agentic applications! [{'source': 'tweet'}]
* [SIM=1.425552] I had chocolate chip pancakes and scrambled eggs for breakfast this morning. [{'source': 'tweet'}]


In [9]:
# 搜尋三
embedding_vector=embeddings.embed_query("I love green eggs and ham!")
results = vector_store.similarity_search_by_vector(
    embedding=embedding_vector, k=1
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

* I had chocolate chip pancakes and scrambled eggs for breakfast this morning. [{'source': 'tweet'}]


In [10]:
# Query by turning into retriever

retriever = vector_store.as_retriever(
    search_type="mmr", search_kwargs={"k": 2, "fetch_k": 5}
)
retriever.invoke("Stealing from the bank is a crime", filter={"source": "news"})

[Document(id='7d929b87-a1c1-4de6-9584-844f09986730', metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.'),
 Document(id='f5030079-91cc-4652-9243-e1910da2ae6c', metadata={'source': 'news'}, page_content='The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.')]

In [13]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate



system_prompt = (
    "Use the given context to answer the question. "
    "If you don't know the answer, say you don't know. "
    "Use three sentence maximum and keep the answer concise. "
    "Context: {context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)
query = "Stealing from the bank is a crime"
chain.invoke({"input": query}, filter={"source": "news"})


{'input': 'Stealing from the bank is a crime',
 'context': [Document(id='7d929b87-a1c1-4de6-9584-844f09986730', metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.'),
  Document(id='2dabb0f6-8cba-4906-8bb2-f30f6e8eb952', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :(')],
 'answer': 'The robbers committed a serious crime by breaking into the city bank and stealing $1 million. Their actions are considered theft and are punishable by law. The robbers will likely face charges.'}

##   作業

**將 Chroma 與 LLM 結合**

Chroma 可以與 LLM 結合，創建更強大的應用程式。

**作業：**

1.  **設計一個問答系統：**
    * 該系統使用 Chroma 或其他向量資料庫 (可選擇) 來檢索相關文檔，然後使用 LLM 來回答用戶的問題。
2.  **選擇文檔來源：**
    * 選擇一個適合問答系統的文檔來源，例如：
        * 網頁文章
        * 產品說明書
        * 法律文件
        * 學術論文
    * 請在報告中說明你選擇的文檔來源及其適用情境。
3.  **實作向量資料庫操作：**
    * 實作以下向量資料庫操作：
        * **新增文檔：** 將選擇的文檔轉換為 embedding 並存入向量資料庫。
        * **相似度搜尋：** 根據使用者查詢，從向量資料庫中檢索相關文檔。
        * **結果排序：** 根據相關性對檢索結果進行排序。
4.  **回答使用者問題：**
    * 系統能夠根據檢索到的文檔，回答使用者提出的問題。

**評估標準：**

* 系統是否能夠正確回答使用者提出的問題？ (50%)
* 系統是否能夠找到所有相關的文檔？ (50%)
