In [1]:
from google.colab import userdata
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install PyMuPDF
!pip install langchain_openai
!pip install pinecone
!pip install langchain_pinecone
!pip install langchain_community

In [4]:
#讀取檔案
from pprint import pprint
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = PyMuPDFLoader("/content/drive/MyDrive/rag_pdf/FD.pdf")
documents = loader.load_and_split(
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=50
    )
)

In [None]:
!pip install --force-reinstall -v openai==1.55.3

In [None]:
#寫入向量資料庫
from langchain_community.document_loaders import PyMuPDFLoader
# from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from pinecone import Pinecone
import pinecone
embedding = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY
)

vector_store = PineconeVectorStore(
    index=Pinecone(api_key=PINECONE_API_KEY).Index("rag"),
    embedding=embedding
)
vector_store.add_documents(loader.load())

In [7]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.memory import ConversationSummaryBufferMemory
from langchain_core.output_parsers import BaseOutputParser
from typing import List
import logging
from langchain_core.prompts import PromptTemplate

# 定義 LineListOutputParser 來解析多行文本
class LineListOutputParser(BaseOutputParser[List[str]]):
    """Output parser for a list of lines."""

    def parse(self, text: str) -> List[str]:
        lines = text.strip().split("\n")
        return list(filter(None, lines))  # Remove empty lines

# 設置日誌級別，顯示生成的查詢
logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)
# 使用 ChatOpenAI 作為 LLM 來生成查詢
llm = ChatOpenAI(
    api_key=OPENAI_API_KEY,
    model_name="gpt-4o"
)

# 生成自定義 PromptTemplate
CUSTOM_QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""你是一個 AI 語言模型助手。你的任務是根據用戶輸入的問題生成三個不同版本的查詢，以繁體中文進行檢索。請將這些查詢分行顯示。
    原始問題: {question}"""
)
# 建立 MultiQueryRetriever，並將 Pinecone 索引作為檢索器
llm_chain = CUSTOM_QUERY_PROMPT | llm | LineListOutputParser()
retriever_from_llm = MultiQueryRetriever(
    retriever=vector_store.as_retriever(),
    llm_chain=llm_chain
)
question = "指標"
unique_docs = retriever_from_llm.invoke(question)
unique_docs

INFO:langchain.retrievers.multi_query:Generated queries: ['1. 什麼是指標的定義和用途？', '2. 常見的指標類型有哪些？', '3. 如何有效地使用指標進行數據分析？']


[Document(id='64798b3f-81b2-4f7e-be39-e3635c6e791c', metadata={'author': 'A000000', 'creationDate': "D:20240417093840+08'00'", 'creator': 'Microsoft® Word 2019', 'file_path': '/content/drive/MyDrive/LineBot/FD.pdf', 'format': 'PDF 1.7', 'keywords': '', 'modDate': "D:20240417093840+08'00'", 'page': 41.0, 'producer': 'Microsoft® Word 2019', 'source': '/content/drive/MyDrive/LineBot/FD.pdf', 'subject': '', 'title': '「西醫基層與醫院之財務風險分擔監控指標」會議議程', 'total_pages': 42.0, 'trapped': ''}, page_content='42 \n \n治療指引 \n參考指標 \n分子定義 \n分母定義 \n操作型定義 \n \nABC 達標率 \n \n \n \n\uf0acStark, C.S., Fradkin, J.E., \nSaydah, S.H., Rust, K.F., \nCowie, C.C. (2013).The \nprevalence of meeting A1C, \nblood pressure, and LDL goals \namong people with diabetes, \n1988-2010. Diabetes \nCare,36(8),2271-9. doi: \n10.2337/dc12-2258.  \n\uf0acAli, M.K., Bullard, K.M., \nGregg, E.W., Del, R.C. (2014). \nA cascade of care for diabetes \nin the United States: \nvisualizing the gaps. Ann \nIntern Med, 161(10),681-9. \ndoi: 10.

In [8]:
def qa(question: str, k: int = 1):
    documents = vector_store.similarity_search(question, k=k)
    prompt = ChatPromptTemplate.from_template(
    """
    原始問題: {question}
    取得的參考資料: {data}
    "你是專業客服人員,請根據上下文來回答問題,"
    "你不知道答案就說你不知道, 不要試圖編造答案。\n"
    請用繁體中文回答
    """
    )
    llm = ChatOpenAI(
        api_key=OPENAI_API_KEY,
        model_name="gpt-4o"
    )
    chain = prompt | llm

    result = chain.invoke({"question": question, "data": documents[0].page_content})
    return str(result.content)

In [9]:
qa("糖尿病指標")

'根據提供的資料，糖尿病指標的治療指引包括以下幾個達標率的參考指標：\n\n1. **醣化血紅素 (HbA1C) 達標率**：糖尿病病人在當年度最後一次檢測中，HbA1C<7%的人數。糖尿病病人至少每三個月檢查一次HbA1C，且當年度最後一次HbA1C檢查值<7%者被視為控制良好。\n\n2. **血壓 (BP) 達標率**：糖尿病病人在全年紀錄中的門診血壓次數至少每三個月一次，其中有1/2(含)以上的BP<140/90 mmHg者被視為控制良好。\n\n3. **低密度膽固醇 (LDL cholesterol) 達標率**：糖尿病病人在全年檢驗中的LDL-C次數至少每六個月一次，其中有1/2(含)以上的LDL<100 mg/dl者被視為控制良好。\n\n4. **ABC 三項指標全部達標率**：同時符合HbA1C<7%、BP<140/90 mmHg及LDL<100 mg/dl的人數。\n\n這些指標是用來衡量糖尿病患者的健康管理效果，確保患者的血糖、血壓和膽固醇水平都處於良好控制的狀態。\n\n如果有其他問題或需要更詳細的解釋，請告訴我。'