In [None]:
import os

from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file into the process environment.
load_dotenv(find_dotenv(), override=True)
# Report only whether the key is set; displaying its value would save the
# secret in the notebook output.
"OPENAI_API_KEY" in os.environ

In [None]:
from langchain_openai.chat_models import ChatOpenAI

In [None]:
# Chat model client; the endpoint and API key are read from environment
# variables (a missing variable raises KeyError here).
chat = ChatOpenAI(
    openai_api_base=os.environ["CHATGPT_API_ENDPOINT"],
    openai_api_key=os.environ["OPENAI_API_KEY"]
)

3.1 如何加載PDF和搜索網頁信息

In [None]:
from langchain.document_loaders import PyPDFLoader

In [None]:
# Read the PDF data
loader = PyPDFLoader("./data/ML-guide.pdf")

In [None]:
# Load the documents from the PDF loader.
pages = loader.load()

In [None]:
# Check the number of pages
len(pages)

In [None]:
# Print the first 500 characters of page 1
page = pages[0]
print(page.page_content[:500])

In [None]:
# Inspect the metadata to see where the data came from
page.metadata

In [None]:
from langchain.document_loaders import WebBaseLoader

In [None]:
# Read data from a web page
loader = WebBaseLoader("http://google.com")

In [None]:
# Load the web page into documents.
docs = loader.load()

In [None]:
# Show the text of the first loaded document.
print(docs[0].page_content)

In [None]:
# Search for news with Serper and print the raw response
import json
import os

import requests

url = "https://google.serper.dev/news"

payload = json.dumps({
    "q": "apple inc",
    "hl": "zh-tw",
})
headers = {
    # Read the key from the environment (for example the .env file) instead of
    # committing it to the notebook; a missing key raises KeyError.
    "X-API-KEY": os.environ["SERPER_API_KEY"],
    "Content-Type": "application/json",
}

# A timeout stops the cell from hanging if the service does not answer.
response = requests.post(url, headers=headers, data=payload, timeout=30)
# Surface HTTP errors (bad key, exhausted quota, ...) instead of printing an
# error body as if it were a search result.
response.raise_for_status()

print(response.text)

3.2 Text Splitter文本分割器

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=9,      # maximum chunk length; counted in characters because no length_function is given
    chunk_overlap=3    # characters shared by neighbouring chunks; kept below chunk_size so chunks are not near-duplicates
)

In [None]:
# Exactly 9 characters.
text1 = "123456789"

In [None]:
text_splitter.split_text(text1)
# Exactly 9 characters, so the text is not split

In [None]:
# 18 characters: the 9-character string repeated twice.
text2 = "123456789123456789"

In [None]:
# Split the 18-character string with the same splitter.
text_splitter.split_text(text2)

In [None]:
# A sentence-style sample that contains spaces.
text3 = "This is a sample text to split. It has multiple sentences"

In [None]:
# Split the sentence sample with the same splitter.
text_splitter.split_text(text3)

In [None]:
from langchain.document_loaders import PyPDFLoader

In [None]:
# Loader for the PDF file; the bare name on the last line displays the loader object.
loader = PyPDFLoader("./data/ML-guide.pdf")
loader

In [None]:
# Load the PDF and show the first 500 characters of the first document.
pages = loader.load()
pages[0].page_content[:500]

In [None]:
# Splitter for the PDF pages; sizes are measured with len, i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=150,
    length_function=len,
    separators=["\n\n", "\n", " ", ""]   # split rules (tried in priority order)

)

In [None]:
# Number of loaded page documents before splitting.
len(pages)

In [None]:
# Split the loaded pages into chunk documents.
docs = text_splitter.split_documents(pages)

In [None]:
# Total number of chunks after splitting
len(docs)

In [None]:
# Content of the 1st chunk
docs[0].page_content

In [None]:
# Content of the 2nd chunk
docs[1].page_content

3.4 Chunking分塊大小怎麼決定

In [None]:
from langchain.document_loaders import ReadTheDocsLoader

In [None]:
# Read HTML files; the HTML markup is filtered out
loader = ReadTheDocsLoader(
    "htmldocs",   # reads every HTML document under this directory
    encoding="utf-8",  # make sure the encoding matches the files
    errors="ignore"    # ignore characters that cannot be decoded
)
docs = loader.load()

In [None]:
# Number of documents loaded from the HTML directory.
len(docs)

In [None]:
# Print the first 500 characters of the first loaded document.
print(docs[0].page_content[:500])

In [None]:
# gpt-3.5-turbo: 4096-token limit.
# That limit has to cover the input (instruction + query + context) plus the output.
#     If the number of chunks used as context is 5:
#         Chunk size = 2000 / 5 = 400      *assuming instruction + query use 2000 tokens
#                                           (presumably leaving about 2000 tokens to share among the chunks)

# So chunk size <= 400

# Too small: chunks are not meaningful
# Too big: not efficient

In [None]:
import tiktoken

In [None]:
# Encoding that tiktoken associates with the gpt-3.5-turbo model; the bare
# name on the last line displays it.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
tokenizer

In [None]:
def token_count(text):
    """Return the number of tokens the module-level `tokenizer` produces for `text`.

    `disallowed_special=()` passes an empty set of disallowed special tokens,
    so (per tiktoken's documentation) text that looks like a special token is
    encoded as ordinary text rather than raising an error.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [None]:
# Token count of every loaded document, in document order.
tokens = list(map(token_count, (doc.page_content for doc in docs)))
tokens

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Splitter whose sizes are measured with token_count, so chunk_size and
# chunk_overlap are in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=token_count,
    separators=["\n\n", "\n", " ", ""]
)

In [None]:
# Split the first document's text with the token-based splitter and count the chunks.
chunks = text_splitter.split_text(docs[0].page_content)
len(chunks)

In [None]:
# Token counts of the first five chunks; slicing avoids an IndexError when
# the text produced fewer than five chunks.
tuple(token_count(chunk) for chunk in chunks[:5])

In [None]:
# Display every chunk.
chunks

4.1 Embedding與Chroma向量數據庫的創建

In [None]:
from langchain_openai import OpenAIEmbeddings

In [None]:
# Embedding client; the base URL is read from an environment variable.
embeddings = OpenAIEmbeddings(
    base_url=os.environ["EMBEDDINGS_BASE_URL"]    # a proxy (relay) URL needs "v1" appended
)

In [None]:
# Three sample sentences to embed and compare.
sentence1 = "I like cats."
sentence2 = "I like dogs."
sentence3 = "The weather is ugly outside."

In [None]:
# Embed each sentence with the embedding client.
embedding1 = embeddings.embed_query(sentence1)
embedding2 = embeddings.embed_query(sentence2)
embedding3 = embeddings.embed_query(sentence3)

In [None]:
import numpy as np

In [None]:
# Dot product of the "cats" and "dogs" sentence embeddings.
np.dot(embedding1, embedding2)

In [None]:
# Dot product of the "cats" and "weather" sentence embeddings.
np.dot(embedding1, embedding3)

In [None]:
# Dot product of the "dogs" and "weather" sentence embeddings.
np.dot(embedding2, embedding3)

In [None]:
from langchain.vectorstores import Chroma

In [None]:
# Directory handed to Chroma as persist_directory.
persist_directory = "./db"

In [None]:
# Remove any existing database directory so the store is rebuilt from scratch.
# shutil works on every platform (the `rm` shell command does not), and
# ignore_errors=True keeps a missing directory from raising.
import shutil

shutil.rmtree(persist_directory, ignore_errors=True)

In [None]:
# Convert the chunks to Document format
doc_chunks = text_splitter.create_documents(chunks)

In [None]:
# Build the vector database (from Document objects)
vectordb = Chroma.from_documents(
    documents=doc_chunks, 
    embedding=embeddings,
    persist_directory=persist_directory
)

In [None]:
# Build the vector database (from plain text).
# A separate collection_name keeps these entries apart from the collection that
# Chroma.from_documents filled in the same persist_directory; without it the
# same chunks would be stored a second time in that collection.
vectordb = Chroma.from_texts(
    texts=chunks,
    embedding=embeddings,
    persist_directory=persist_directory,
    collection_name="chunks_from_texts",
)

In [None]:
# Number of entries in the store's underlying collection
# (`_collection` is a private attribute of the vector store object).
print(vectordb._collection.count())