In [1]:

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# override=True lets values from the file replace variables that are already set.
load_dotenv(override=True)

True

# Step 1: Indexing (Load)

## 1.1 Import the libraries and configure the loader

In [24]:
import bs4
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Keep only the listed tags (and everything nested inside them) while parsing.
# SoupStrainer compares these strings against tag *names*; it does not
# understand CSS selectors, so the former "span.pre" entry could never match
# any tag and has been removed. <span class="pre"> elements that sit inside a
# kept tag (for example a <p>) are still retained as part of that subtree.
tag_strainer = bs4.SoupStrainer(
    ("p", "h1", "h2", "h3", "h4", "h5", "h6", "table", "pre")
)

# NOTE: the "#collections.Counter" fragment does not restrict what is loaded;
# the whole page is parsed. The fragment is kept so the "source" metadata of
# the resulting documents stays unchanged.
loader = WebBaseLoader(
    web_paths=(r"https://docs.python.org/zh-tw/3/library/collections.html#collections.Counter",),
    bs_kwargs={"parse_only": tag_strainer},
)

## 1.2 Load website content

In [26]:
# Fetch the configured page and parse it into a list of Document objects.
docs = loader.load()

# Show the loaded documents as the cell output.
docs

[Document(page_content='目錄上個主題calendar --- 日曆相關函式下個主題collections.abc --- 容器的抽象基类此頁面瀏覽collections --- 容器資料型態¶原始碼：Lib/collections/__init__.py這個模組實作了一些特別的容器資料型態，用來替代 Python 一般內建的容器，例如 dict（字典）、list（串列）、set（集合）和 tuple（元組）。\n\n\n\n\n\nnamedtuple()\n用來建立具名欄位的 tuple 子類別的工廠函式\n\ndeque\n一個類似 list 的容器，可以快速的在頭尾加入 (append) 元素與移除 (pop) 元素\n\nChainMap\n一個類似 dict 的類別，用來為多個對映 (mapping) 建立單一的視圖 (view)\n\nCounter\ndict 的子類別，用來計算可雜湊物件的數量\n\nOrderedDict\ndict 的子類別，會記錄物件被加入的順序\n\ndefaultdict\ndict 的子類別，當值不存在 dict 中時會呼叫一個提供預設值的工廠函式\n\nUserDict\ndict 物件的包裝器 (wrapper)，簡化了 dict 的子類別化過程\n\nUserList\nlist 物件的包裝器，簡化了 list 的子類別化過程\n\nUserString\n字串物件的包裝器，簡化了字串的子類別化過程\n\n\nChainMap 物件¶在 3.3 版本新加入.ChainMap（對映鏈結）類別的目的是快速將數個對映連結在一起，讓它們可以被當作一個單元來處理。它通常會比建立一個新的字典並多次呼叫 update() 來得更快。這個類別可用於模擬巢狀作用域 (nested scopes)，且在模板化 (templating) 時能派上用場。一個 ChainMap 將多個 dict 或其他對映組合在一起，建立一個獨立、可更新的視圖。如果沒有指定 maps，預設會提供一個空字典讓每個新鏈結都至少有一個對映。底層的對映儲存於一個 list 中，這個 list 是公開的且可透過 maps 屬性存取或更新，沒有其他狀態 (state)。檢索 (lookup) 陸續查詢底層對映，直到鍵被找到，然而讀取、更新和刪除就

# Step 2: Indexing (Split)

## 2.1 Split the website content with `RecursiveCharacterTextSplitter`

In [27]:
# Split the loaded documents into chunks of up to 1000 characters, with a
# 200-character overlap between neighbouring chunks so context is not cut off
# at chunk boundaries. add_start_index=True records each chunk's offset in the
# source text under the "start_index" metadata key.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

In [28]:
# Peek at the text of the first chunk to sanity-check the split.
all_splits[0].page_content

'目錄上個主題calendar --- 日曆相關函式下個主題collections.abc --- 容器的抽象基类此頁面瀏覽collections --- 容器資料型態¶原始碼：Lib/collections/__init__.py這個模組實作了一些特別的容器資料型態，用來替代 Python 一般內建的容器，例如 dict（字典）、list（串列）、set（集合）和 tuple（元組）。\n\n\n\n\n\nnamedtuple()\n用來建立具名欄位的 tuple 子類別的工廠函式\n\ndeque\n一個類似 list 的容器，可以快速的在頭尾加入 (append) 元素與移除 (pop) 元素\n\nChainMap\n一個類似 dict 的類別，用來為多個對映 (mapping) 建立單一的視圖 (view)\n\nCounter\ndict 的子類別，用來計算可雜湊物件的數量\n\nOrderedDict\ndict 的子類別，會記錄物件被加入的順序\n\ndefaultdict\ndict 的子類別，當值不存在 dict 中時會呼叫一個提供預設值的工廠函式\n\nUserDict\ndict 物件的包裝器 (wrapper)，簡化了 dict 的子類別化過程\n\nUserList\nlist 物件的包裝器，簡化了 list 的子類別化過程\n\nUserString\n字串物件的包裝器，簡化了字串的子類別化過程'

# Step 3: Indexing (Store)

## 3.1 Initialize the `vectorstore` using Chroma

In [29]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Embed every chunk with OpenAIEmbeddings and index the vectors in a Chroma
# vector store.
# NOTE(review): no collection name or persist directory is passed, so this
# presumably relies on Chroma's defaults; re-running this cell in the same
# kernel may add the same chunks a second time — confirm before re-running.
vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

# Step 4: Retrieval

## 4.1 Create `Retriever` from `vectorstore`

In [30]:
# Wrap the vector store in a retriever that performs a similarity search and
# returns the 6 best-matching chunks for each query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

## 4.2 Test the retriever

In [33]:
# Smoke-test retrieval: an English query against the Chinese-language page.
retrieved_docs = retriever.invoke("how can i find most common element")

In [34]:
# Show the retrieved chunks together with their metadata.
retrieved_docs

[Document(page_content="回傳一個 list，包含出現最多次的 n 個元素及其出現次數，並按照出現次數排序。如果 n 被省略或者為 None，most_common() 會回傳所有 counter 中的元素。出現次數相同的元素會按照首次出現的時間先後來排列：>>> Counter('abracadabra').most_common(3)\n[('a', 5), ('b', 2), ('r', 2)]\n減去自一個 iterable 或另一個對映（或 Counter）中的計數元素，行為類似 dict.update() 但是是為了減去計數而非取代其值。輸入和輸出都可以是 0 或是負數。>>> c = Counter(a=4, b=2, c=0, d=-2)\n>>> d = Counter(a=1, b=2, c=3, d=4)\n>>> c.subtract(d)\n>>> c\nCounter({'a': 3, 'b': 0, 'c': -3, 'd': -6})\n在 3.2 版本新加入.計算總計數值。>>> c = Counter(a=10, b=5, c=0)\n>>> c.total()\n15", metadata={'source': 'https://docs.python.org/zh-tw/3/library/collections.html#collections.Counter', 'start_index': 6244}),
 Document(page_content=">>> # Find the ten most common words in Hamlet\n>>> import re\n>>> words = re.findall(r'\\w+', open('hamlet.txt').read().lower())\n>>> Counter(words).most_common(10)\n[('the', 1143), ('and', 966), ('to', 762), ('of', 669), ('i', 631),\n ('you', 554),  ('a', 546), ('my', 514), ('hamlet', 471), ('in', 451)]\nCounter 是 dict 的子類別，用

# Step 5: Generate

In [45]:
from langchain_openai import ChatOpenAI
from langchain.callbacks import OpenAICallbackHandler
# Callback handler attached to the chat model below.
# NOTE(review): `cb` is never read in the visible cells — presumably it is
# meant for inspecting token usage/cost after a call; confirm or remove.
cb = OpenAICallbackHandler()
# Chat model used to generate the final answer.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-1106", temperature=0.3, callbacks=[cb])

In [43]:
from langchain import hub

# Pull the shared "rlm/rag-prompt" template from the LangChain Hub; it expects
# the input variables "context" and "question".
prompt = hub.pull("rlm/rag-prompt")

In [37]:
# Display the pulled prompt template to inspect its wording and variables.
prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [38]:
# Render the prompt with placeholder values to see exactly which messages the
# chat model will receive.
example_messages = prompt.invoke(
    {"context": "filler context", "question": "filler question"}
).to_messages()
example_messages

[HumanMessage(content="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: filler question \nContext: filler context \nAnswer:")]

In [46]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line; an empty string when ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)


# Assemble the RAG pipeline. The incoming question is (1) sent to the retriever,
# whose documents are joined into one string by format_docs and supplied as
# "context", and (2) passed through unchanged as "question". The filled prompt
# is then sent to the chat model and its reply is parsed into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [50]:
# Stream the answer and print each chunk as soon as it arrives; end="" keeps
# the chunks on one line and flush=True shows them immediately.
for chunk in rag_chain.stream("compare difference between namedtuple and dict"):
    print(chunk, end="", flush=True)

A namedtuple is a factory function for creating tuple subclasses with named fields, while a dictionary is a collection of key-value pairs. Namedtuples are more lightweight and memory-efficient compared to dictionaries. Namedtuples also support additional methods and properties, such as _make, _asdict, and _replace.