In [None]:
# Advanced RAG Implementation on Custom Data Using Hybrid Search, Embed Caching And Mistral-AI
https://medium.aiplanet.com/advanced-rag-implementation-on-custom-data-using-hybrid-search-embed-caching-and-mistral-ai-ce78fdae4ef6

In [1]:
# RUN: python3 ingest.py

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 

# Directory that create_vector_db() scans for source PDF files.
DATA_PATH = 'data/'
# Location where create_vector_db() saves the FAISS index.
DB_FAISS_PATH = 'vectorstore/db_faiss'

# Create vector database
def create_vector_db(data_path=None,
                     db_path=None,
                     model_name='sentence-transformers/all-MiniLM-L6-v2',
                     chunk_size=500,
                     chunk_overlap=50):
    """Build a FAISS index from the PDFs in a directory and save it to disk.

    Called with no arguments, this behaves as before: PDFs are read from
    DATA_PATH, split into chunks of 500 characters with an overlap of 50,
    embedded on the CPU and saved to DB_FAISS_PATH.

    Args:
        data_path: Directory searched for ``*.pdf`` files. Defaults to DATA_PATH.
        db_path: Destination of the saved index. Defaults to DB_FAISS_PATH.
        model_name: Name of the HuggingFace embedding model to load.
        chunk_size: ``chunk_size`` passed to RecursiveCharacterTextSplitter.
        chunk_overlap: ``chunk_overlap`` passed to RecursiveCharacterTextSplitter.

    Raises:
        ValueError: If no text chunks could be produced from ``data_path``.
    """
    # Resolve the defaults at call time so the module-level constants are
    # looked up when the function runs, not when it is defined.
    if data_path is None:
        data_path = DATA_PATH
    if db_path is None:
        db_path = DB_FAISS_PATH

    loader = DirectoryLoader(data_path,
                             glob='*.pdf',
                             loader_cls=PyPDFLoader)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)

    # Fail early with a clear message instead of letting the index build
    # break on an empty corpus.
    if not texts:
        raise ValueError(f"No PDF content found under {data_path!r}; nothing to index.")

    embeddings = HuggingFaceEmbeddings(model_name=model_name,
                                       model_kwargs={'device': 'cpu'})

    db = FAISS.from_documents(texts, embeddings)
    db.save_local(db_path)

# Build and save the index with the default paths when this cell/script runs.
create_vector_db()

## 實作堆疊：
- 嵌入器： sentence-transformers/all-MiniLM-L6-v2（原教學使用 BAAI 通用嵌入 bge-small-en-v1.5）
- 檢索： FAISS Vectorstore
- 生成： Mistral-7B-Instruct GPTQ 模型
- 基礎架構： Google Colab、A100 GPU
- 數據：財務文件

In [None]:
# 安裝所需的軟體包
!pip install -q langchain Faiss-gpu tiktoken sentence-transformers
!pip install -q trl Py7zr auto-gptq optimum
!pip install -q rank_bm25
!pip install -q PyPdf

In [1]:
# 導入必要的套件
import langchain
from langchain.embeddings import CacheBackedEmbeddings,HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.retrievers import BM25Retriever,EnsembleRetriever
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.llms import HuggingFacePipeline
from langchain.cache import InMemoryCache
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import prompt
from langchain.chains import RetrievalQA
from langchain.callbacks import StdOutCallbackHandler
from langchain import PromptTemplate
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [None]:
# Load every PDF found in the data directory, using PyPDFLoader for each file.
# NOTE(review): absolute, machine-specific path — confirm or make configurable
# before running this notebook on another machine.
dir_loader = DirectoryLoader(
    "/home/g00cjz00/github/pdf_chatbot_llama2_vectorstore_chainlit/data",
    glob="*.pdf",
    loader_cls=PyPDFLoader,
)
docs = dir_loader.load()

# Report how many documents were loaded.
print(f"len of documents in :{len(docs)}")

In [None]:
# Split the loaded documents into manageable, overlapping text chunks
# with RecursiveCharacterTextSplitter.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=200,
)

esops_documents = text_splitter.transform_documents(docs)

# NOTE(review): the message mentions "barbie documents", which looks like a
# leftover from another example — confirm the intended wording.
print(f"number of chunks in barbie documents : {len(esops_documents)}")

### 建立向量存儲
- 在這裡，我們將利用CacheBackedEmbeddings來防止我們一遍又一遍地重新嵌入類似的查詢。
- 結構化文件將轉換為有用的格式，以便在 LLM（大型語言模型）應用中查詢、檢索和使用
- 這裡我們將使用 FAISS（Facebook AI 相似性搜尋）作為向量儲存

In [None]:
# File-backed store under ./cache/ that backs the cached embedder below.
store = LocalFileStore("./cache/")

# Embedding model in use; 'BAAI/bge-small-en-v1.5' is the alternative
# that was previously configured here.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
core_embeddings_model = HuggingFaceEmbeddings(model_name=embed_model_id)

# Wrap the base model with the cache store; the model id is the cache namespace.
embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model,
    store,
    namespace=embed_model_id,
)

# Create the vector store from the chunked documents.
vectorstore = FAISS.from_documents(esops_documents, embedder)

# Save the index to disk so that it can be reloaded without re-embedding.
DB_FAISS_PATH = 'vectorstore/db_faiss'
vectorstore.save_local(DB_FAISS_PATH)

In [4]:
# Reload the saved FAISS index from disk with a freshly created embedding model.
# Note: core_embeddings_model is bound to this plain HuggingFaceEmbeddings
# instance here, not to a CacheBackedEmbeddings wrapper.
core_embeddings_model = embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

DB_FAISS_PATH = 'vectorstore/db_faiss'
vectorstore = FAISS.load_local(DB_FAISS_PATH, embeddings)

In [None]:
# Retrieve the passages in the vector store that are most similar to the query.
query = "What is Acupuncture?"

# Embed the query and show the dimensionality of the resulting vector.
embedding_vector = core_embeddings_model.embed_query(query)
print(len(embedding_vector))

# Look up the 5 nearest chunks by vector similarity.
docs_resp = vectorstore.similarity_search_by_vector(embedding_vector, k=5)

# Print each hit followed by a separator line.
for page in docs_resp:
    print(page.page_content, "------------\n", sep="\n")


### 檢查 CacheBackedEmbeddings 模式為我們節省了多少時間

In [None]:
%%timeit -n 1 -r 1
query = "What is Acupuncture?"
#
embedding_vector = core_embeddings_model.embed_query(query)
docs_resp = vectorstore.similarity_search_by_vector(embedding_vector,k=5)

In [11]:
# Build the sparse (BM25) retriever from the chunked documents.
# Requires esops_documents from the splitting cell to be defined first.
bm25_retriever = BM25Retriever.from_documents(esops_documents)
# Number of results setting on the retriever, matching the k=5 used for FAISS.
bm25_retriever.k=5

NameError: name 'esops_documents' is not defined

In [9]:
# Set up the ensemble retriever (hybrid search): BM25 and FAISS, equally weighted.
faiss_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.5, 0.5],
)

NameError: name 'bm25_retriever' is not defined

In [None]:
# Download the quantized GPTQ model and its tokenizer.

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"

# `revision` selects the repository branch to load; change it to use another
# variant, for example revision="gptq-4bit-32g-actorder_True".
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    trust_remote_code=False,
    revision="gptq-8bit-32g-actorder_True",
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

In [None]:
# Create the text-generation pipeline around the loaded model and tokenizer.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    # Generation settings: up to 512 new tokens, sampling enabled with the
    # temperature / top-p / top-k values below and a repetition penalty.
    max_new_tokens=512,
    do_sample=True,
    temperature=0.1,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1,
)

In [None]:
# Initialize the LangChain LLM from the pipeline built on the quantized GPTQ model.
from langchain.llms import HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
# Set up caching: use an in-memory cache as LangChain's LLM cache.
langchain.llm_cache = InMemoryCache()

In [None]:
# Define the prompt template used by the retrieval chains.

# Placeholders that the chain fills in: retrieved context and the user question.
input_variables = ['context', 'question']

# NOTE(review): the persona is a financial advisor, while the example queries in
# this notebook ask about acupuncture — confirm the prompt matches the data.
PROMPT_TEMPLATE = '''
You are my financial advisor. You are great at providing tips on investments, savings and on financial markets with your knowledge in finances.
With the information being provided try to answer the question. 
If you cant answer the question based on the information either say you cant find an answer or unable to find an answer.
So try to understand in depth about the context and answer only based on the information provided. Dont generate irrelevant answers

Context: {context}
Question: {question}
Do provide only helpful answers

Helpful answer:
'''

custom_prompt = PromptTemplate(
    input_variables=input_variables,
    template=PROMPT_TEMPLATE,
)

## 設定檢索鏈——無混合搜索

In [None]:
# Retrieval QA chain that uses only the FAISS retriever (no hybrid search).
handler = StdOutCallbackHandler()

qa_with_sources_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    verbose=True,
    callbacks=[handler],
    chain_type_kwargs={"prompt": custom_prompt},
    # Also return the retrieved documents so they can be printed with the answer.
    return_source_documents=True,
)

### 處理用戶查詢1
CPU 時間：使用者 19.3 秒，系統：690 毫秒，總計：20 秒
實際時間（Wall time）：20 秒

In [None]:
%%time
query = "What is Acupuncture?"
response = qa_with_sources_chain({"query":query})
print(f"Response generated : \n {response['result']}")
print(f"Source Documents : \n {response['source_documents']}")

### 處理用戶查詢2
CPU 時間：使用者 19.3 秒，系統：690 毫秒，總計：20 秒
實際時間（Wall time）：20 秒

In [None]:
%%time
query = "What is Acupuncture?"
response = qa_with_sources_chain({"query":query})
print(f"Response generated : \n {response['result']}")
print(f"Source Documents : \n {response['source_documents']}")



### 處理用戶查詢3
CPU 時間：使用者 19.3 秒，系統：690 毫秒，總計：20 秒
實際時間（Wall time）：20 秒

In [None]:
%%time
query = "What is Acupuncture?"
response = qa_with_sources_chain({"query":query})
print(f"Response generated : \n {response['result']}")
print(f"Source Documents : \n {response['source_documents']}")
print(f"Number of of Documents returned : {len(response['source_documents'])}")


### 處理用戶查詢4
CPU 時間：使用者 19.3 秒，系統：690 毫秒，總計：20 秒
實際時間（Wall time）：20 秒

In [None]:
%%time
query = "What is Acupuncture?"
response = qa_with_sources_chain({"query":query})
print(f"Response generated : \n {response['result']}")
print(f"Source Documents : \n {response['source_documents']}")
print(f"Number of Documents returned : {len(response['source_documents'])}")


## Setup Retrieval chain — with Hybrid Search


In [None]:
# Retrieval QA chain backed by the ensemble (BM25 + FAISS) retriever.
# Requires ensemble_retriever to have been built in the hybrid-search cell.
handler = StdOutCallbackHandler()

qa_with_sources_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=ensemble_retriever,
    callbacks=[handler],
    chain_type_kwargs={"prompt": custom_prompt},
    # Also return the retrieved documents so they can be printed with the answer.
    return_source_documents=True,
)

### 處理用戶查詢1
CPU 時間：使用者 6.7 秒，系統：267 毫秒，總計：6.97 秒
運行時間：6.94 秒

In [None]:
%%time
query = "What is Acupuncture?"
response = qa_with_sources_chain({"query":query})
print(f"Response generated : \n {response['result']}")
print(f"Source Documents : \n {response['source_documents']}")


### 處理用戶查詢2
CPU 時間：使用者 6.7 秒，系統：267 毫秒，總計：6.97 秒
運行時間：6.94 秒

In [None]:
%%time
query = "What is Acupuncture?"
response = qa_with_sources_chain({"query":query})
print(f"Response generated : \n {response['result']}")
print(f"Source Documents : \n {response['source_documents']}")


### 處理用戶查詢3
CPU 時間：使用者 6.7 秒，系統：267 毫秒，總計：6.97 秒
運行時間：6.94 秒

In [None]:
%%time
query = "What is Acupuncture?"
response = qa_with_sources_chain({"query":query})
print(f"Response generated : \n {response['result']}")
print(f"Source Documents : \n {response['source_documents']}")
print(f"Number of of Documents returned : {len(response['source_documents'])}")


### 處理用戶查詢4
CPU 時間：使用者 6.7 秒，系統：267 毫秒，總計：6.97 秒
運行時間：6.94 秒

In [None]:
%%time
query = "What is Acupuncture?"
response = qa_with_sources_chain({"query":query})
print(f"Response generated : \n {response['result']}")
print(f"Source Documents : \n {response['source_documents']}")


### 處理用戶查詢5
CPU 時間：使用者 6.7 秒，系統：267 毫秒，總計：6.97 秒
運行時間：6.94 秒

In [None]:
%%time
query ="What is the document about?"
response = qa_with_sources_chain({"query":query})
print(f"Response generated : \n {response['result']}")
print(f"Source Documents : \n {response['source_documents']}")


### 處理用戶查詢6
CPU 時間：使用者 6.7 秒，系統：267 毫秒，總計：6.97 秒
運行時間：6.94 秒

In [None]:
%%time
query ="tell me about this document?"
response = qa_with_sources_chain({"query":query})
print(f"Response generated : \n {response['result']}")
print(f"Source Documents : \n {response['source_documents']}")
print(f"Number of Documents returned : {len(response['source_documents'])}")


### 結論：
我們可以看到，使用 EnsembleRetriever 的混合搜尋為生成式 AI 模型提供了更好的上下文，從而可以制定更好的回應。快取回應和查詢還可以減少推理時間並降低計算成本。快取查詢嵌入還有助於避免重新計算它們。