In [None]:
import os

from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file; override=True lets values from
# .env replace any that are already set in the process environment.
load_dotenv(find_dotenv(), override=True)

# Show only whether a non-empty key is configured. Displaying the key itself
# as the cell result would store the secret in the saved notebook output.
bool(os.environ.get('OPENAI_API_KEY'))

In [None]:
from langchain_openai.chat_models import ChatOpenAI

In [None]:
# Chat model client. The endpoint and the API key are both read from
# environment variables; a missing variable raises KeyError on this cell.
chat = ChatOpenAI(
    openai_api_base=os.environ["CHATGPT_API_ENDPOINT"],
    openai_api_key=os.environ["OPENAI_API_KEY"]
)

4.2 Chroma向量數據庫相似度搜索

In [None]:
from langchain.document_loaders import PyPDFLoader

In [None]:
# One PDF loader per source file under ./data.
loaders = [
    PyPDFLoader(pdf_path)
    for pdf_path in (
        "./data/01.pdf",
        "./data/02.pdf",
        "./data/03.pdf",
        "./data/04.pdf",
    )
]

In [None]:
# Run every loader and flatten the results into a single list of documents,
# keeping loader order.
docs = [document for loader in loaders for document in loader.load()]

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Split the loaded documents into chunks: chunk size 1000 with an overlap of
# 100, lengths measured with len(), trying the separators in the given order.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=len,
    chunk_size=1000,
    chunk_overlap=100,
)

splits = text_splitter.split_documents(docs)

# Number of chunks produced.
len(splits)

In [None]:
from langchain_openai import OpenAIEmbeddings

In [None]:
# Embedding client used later to vectorise the chunks; its base URL is read
# from the EMBEDDINGS_BASE_URL environment variable (KeyError if unset).
embeddings = OpenAIEmbeddings(
    base_url=os.environ["EMBEDDINGS_BASE_URL"]
)

In [None]:
from langchain.vectorstores import Chroma

# Directory handed to Chroma as `persist_directory` when the store is built.
persist_directory = "./db"

In [None]:
# Start from an empty vector store by removing the database left over from a
# previous run (the Linux shell equivalent would be `rm -rf ./db`).

import os
import shutil

# Use `persist_directory` rather than a second hard-coded path, so the
# directory that is wiped is always the one the store is rebuilt in.
if os.path.exists(persist_directory):
    # Remove the directory together with everything inside it.
    shutil.rmtree(persist_directory)
    print("目錄已刪除")
else:
    print("目錄不存在")

In [None]:
# Build the vector database from the chunks, embedding them with `embeddings`
# and persisting the result under `persist_directory`.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory=persist_directory
)

# Print the entry count of the underlying collection (read through the
# private `_collection` attribute).
print(vectordb._collection.count())

In [None]:
# Retrieve the chunks that best match the question, in two different ways.
question = "有什麼西式美食推薦?"

docs_ss = vectordb.similarity_search(question, k=3)   # plain similarity search, 3 results requested
docs_nmr = vectordb.max_marginal_relevance_search(question, k=2, fetch_k=3)   # MMR: fetch 3 candidates, return 2 of them selected for relevance and diversity

In [None]:
# Number of documents returned by the plain similarity search.
len(docs_ss)

In [None]:
# Number of documents returned by the max-marginal-relevance search.
len(docs_nmr)

In [None]:
# Preview the first 200 characters of the first similarity-search result.
docs_ss[0].page_content[:200]

In [None]:
# Preview the first 200 characters of the first max-marginal-relevance result.
docs_nmr[0].page_content[:200]

In [None]:
# Similarity search restricted by a metadata filter to chunks whose `source`
# is ./data/03.pdf.
question = "有什麼景色優美的景點可以推薦?"

docs_ss = vectordb.similarity_search(question, k=3, filter={"source": "./data/03.pdf"})

# Print each hit's metadata to confirm which file it came from.
for hit in docs_ss:
    print(hit.metadata)

In [1]:
from langchain.llms import Ollama

In [2]:
# Switch to a local model through Ollama. This rebinds `chat`, so every later
# cell that uses `chat` gets this model instead of the ChatOpenAI client.
chat = Ollama(model="openchat:latest")

  chat = Ollama(model="openchat:latest")


In [None]:
from langchain.retrievers.self_query.base import SelfQueryRetriever 
from langchain.chains.query_constructor.base import AttributeInfo

In [None]:
# Describe the metadata fields the self-query retriever is allowed to filter
# on. The descriptions are part of the prompt sent to the LLM, so they must
# match the data: four PDF files are loaded, hence "四個" (four), not three.
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="搜索的訊息來源於以下四個PDF文檔，他們分別是`./data/01.pdf`, `./data/02.pdf`, `./data/03.pdf`,`./data/04.pdf`",
        type="string"
    ),
    AttributeInfo(
        name="page",
        description="訊息來源的頁面",
        type="integer"
    )
]

In [None]:
# Natural-language description of what the stored documents contain; it is
# passed to the retriever below as `document_contents`.
document_content_description = "這裡存放的是關於香港特色的旅遊勝地以及美食和特有文化紀錄"

# Build the self-query retriever from the LLM, the vector store, the content
# description and the metadata field descriptions.
retriever = SelfQueryRetriever.from_llm(
    llm=chat,
    vectorstore=vectordb,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info
)

In [None]:
# Run the self-query retrieval and list where each hit came from.
question = "介紹一下香港特色美食?"

# A `k=` keyword given to `invoke()` is not forwarded to the vector-store
# search by this retriever, so it would be silently ignored. The number of
# results is configured on the retriever's own search arguments instead.
retriever.search_kwargs["k"] = 5

# Stored under its own name so the `docs` list holding the loaded PDF pages is
# not overwritten (re-running the splitting cell would otherwise split these
# search results instead of the source documents).
docs_sq = retriever.invoke(question)

for d in docs_sq:
    print(d.metadata)

4.4 如何使用LLM摘要總結Chroma檢索訊息

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [None]:
# Compressor built from the LLM; used below as the retriever's
# `base_compressor`.
compressor = LLMChainExtractor.from_llm(chat)

# Compression retriever on top of the vector store's default retriever
# (described by the original note as plain similarity search).
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectordb.as_retriever(),
    base_compressor=compressor
)

In [None]:
# Ask a question through the compression retriever and keep the results.
question = "介紹一下西貢的優美景色"

compressed_docs = compression_retriever.invoke(question)

def pretty_print_docs(docs):
    """Print the page_content of each document under a numbered heading.

    Consecutive entries are separated by a blank line and a rule of 60
    dashes. `docs` is any iterable of objects with a `page_content` string.
    """
    divider = "\n\n" + "-" * 60
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"\n\n第{position}個檢索:\n\n" + doc.page_content)
    print(divider.join(sections))

# Print the compressed results retrieved for the question above.
pretty_print_docs(compressed_docs)

In [None]:
# Same compression setup, but the base retriever uses MMR search
# (search_type="mmr"), intended to avoid returning near-identical chunks.
# This rebinds `compression_retriever`.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type="mmr")
)

In [None]:
# Query through the MMR-based compression retriever and print the results.
question = "香港哪裡有最好吃的蛋塔? 如果有，請提供該店鋪的地址"

compressed_docs = compression_retriever.invoke(question)
pretty_print_docs(compressed_docs)