## 初始化

In [None]:
import os

# Placeholder for the OpenAI API key; fill it in before running the cells
# that talk to OpenAI. Note that this assignment overwrites any value the
# variable already had in the process environment.
os.environ["OPENAI_API_KEY"] = ""

# Working directories, all resolved relative to the current working directory:
# html sources, extracted plain text, and the persisted vector database.
root_dir = os.getcwd()
html_dir, txt_dir, db_dir = (
    os.path.join(root_dir, folder) for folder in ("html", "txt", "db")
)


## 从 html 中提取 txt

### 函数声明

In [None]:
from langchain.document_loaders import UnstructuredHTMLLoader


def getFiles(directory, ext):
    """Return the names, without extension, of entries in *directory* ending in *ext*.

    Args:
        directory: Path of the directory to list (not recursive).
        ext: Extension to match, including the leading dot (e.g. ``".html"``).

    Returns:
        A list of entry names with only the final extension removed, in the
        order reported by ``os.listdir``.

    Raises:
        OSError: If *directory* cannot be listed.
    """
    # splitext strips only the last extension, so a name such as
    # "report.v2.html" yields "report.v2" rather than being cut at the first
    # dot; callers can rebuild the real file name as f"{name}{ext}".
    return [
        stem
        for stem, suffix in map(os.path.splitext, os.listdir(directory))
        if suffix == ext
    ]


def convHtmlToTxt(src, dest, files):
    """Extract the text of HTML files in *src* and write it as ``.txt`` files in *dest*.

    For every name in *files*, ``<src>/<name>.html`` is loaded with
    ``UnstructuredHTMLLoader`` and the ``page_content`` of the first loaded
    document is written to ``<dest>/<name>.txt``.

    Args:
        src: Directory containing the ``.html`` files.
        dest: Directory receiving the ``.txt`` files; created if missing.
        files: Iterable of file names without extension.
    """
    # Create the output directory up front so the first write cannot fail
    # with FileNotFoundError on a fresh checkout.
    os.makedirs(dest, exist_ok=True)

    for file in files:
        loader = UnstructuredHTMLLoader(os.path.join(src, f"{file}.html"))
        data = loader.load()

        cut_file = os.path.join(dest, f"{file}.txt")
        # Explicit UTF-8 so non-ASCII text is written the same way regardless
        # of the platform's default locale encoding.
        with open(cut_file, "w", encoding="utf-8") as f:
            f.write(data[0].page_content)


### 执行转换操作

In [None]:
# Collect the names (without extension) of the .html files in html_dir, then
# write one extracted .txt file per name into txt_dir.
files = getFiles(html_dir, ".html")
convHtmlToTxt(html_dir, txt_dir, files)


## 提取向量数据

### 函数声明

In [None]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter


def getDocumentsOnCharacterTextSplitter(src, chunk_size):
    """Load the text files under *src* and split them into chunks.

    Files are selected with the glob ``**/*.txt`` and split by a
    ``CharacterTextSplitter`` built via ``from_tiktoken_encoder`` with no
    overlap between consecutive chunks.

    Args:
        src: Directory handed to ``DirectoryLoader``.
        chunk_size: Chunk size passed to the splitter (presumably measured in
            tiktoken tokens -- confirm against the splitter documentation).

    Returns:
        Whatever ``split_documents`` returns for the loaded documents.
    """
    loaded_docs = DirectoryLoader(src, glob="**/*.txt").load()
    return CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=0
    ).split_documents(loaded_docs)


### 提取向量数据

In [None]:
# Original note (translated): this step consumes OpenAI API tokens.
# NOTE(review): the call below only loads and splits the text files; whether it
# actually reaches the OpenAI API is not evident from this file -- confirm.
# Load the .txt files under txt_dir and split them with chunk_size=512.
texts = getDocumentsOnCharacterTextSplitter(txt_dir, 512)


## 持久化向量数据

### 函数声明

In [None]:
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Module-level embedding client used by createDb below. The API key is read
# from the OPENAI_API_KEY environment variable (KeyError if it is unset).
embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])


def createDb(documents, persist_directory):
    """Build a Chroma store from *documents* and persist it.

    The store is created with the module-level ``embeddings`` object and
    written to *persist_directory*. Nothing is returned.

    Args:
        documents: Documents passed to ``Chroma.from_documents``.
        persist_directory: Directory in which Chroma persists its data.
    """
    store = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    store.persist()


### 数据持久化：此步骤需要消耗 token，请确保网络可以访问 OpenAI 的 API 且账户余额充足；大约花费 1.5 美元，耗时 2 分钟左右

In [None]:
# Build the Chroma store from the split documents and persist it under db_dir.
createDb(texts, db_dir)


## 查询

### 函数声明

In [None]:
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Embedding client used by getDb below; the API key is read from the
# OPENAI_API_KEY environment variable (KeyError if it is unset).
# NOTE(review): this repeats the definition from the persistence section,
# presumably so the query cells can run on their own -- confirm.
embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])


def getDb(persist_directory):
    """Return a Chroma store bound to *persist_directory*.

    The store uses the module-level ``embeddings`` object as its embedding
    function.

    Args:
        persist_directory: Directory passed to Chroma as its persistence path.
    """
    store = Chroma(
        embedding_function=embeddings,
        persist_directory=persist_directory,
    )
    return store


### 加载向量数据库

In [None]:
# Open the Chroma store whose persistence directory is db_dir.
db = getDb(db_dir)


### 查询向量数据

In [None]:
# Question text; reused further down when the QA chain is invoked.
query = "如何优化数据库"
# Ask the vector store for the k=4 entries most similar to the question.
docs = db.similarity_search(query, k=4)


### 显示查询结果

In [None]:
import json

# Print the text of every retrieved document as an indented JSON array.
# ensure_ascii=False keeps non-ASCII characters readable instead of \u escapes.
doc_list = [doc.page_content for doc in docs]
print(json.dumps(doc_list, ensure_ascii=False, indent=4))


### 函数声明

In [None]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chat_models import ChatOpenAI


def getChain(temperature, chain_type):
    """Create a question-answering-with-sources chain backed by ``gpt-3.5-turbo``.

    Args:
        temperature: Sampling temperature passed to ``ChatOpenAI``.
        chain_type: Chain type passed to ``load_qa_with_sources_chain``.

    Returns:
        The chain object produced by ``load_qa_with_sources_chain``.
    """
    llm = ChatOpenAI(
        model_name="gpt-3.5-turbo",
        temperature=temperature,
        streaming=True,
    )
    return load_qa_with_sources_chain(llm, chain_type=chain_type)


### 初始化 chain

In [None]:
# Build the QA chain with temperature 0 and the "refine" chain type.
chain = getChain(0, "refine")
# For an explanation of the parameters, see https://www.youtube.com/watch?v=f9_BWhCI4Zo


### 调用 ChatGPT 查询：鉴于 ChatGPT 模型的特点，每次查询的结果可能会有很大的区别；如果一次查询结果不理想，可以多查询几次

In [None]:
import time

# Remember when the call started so the elapsed time can be reported.
start_time = time.time()
print(f"开始回答 {query}")
# Invoke the chain on the retrieved documents. The question is the query
# wrapped in an instruction asking for an answer in Chinese. The result is
# stored in `chains` and is indexed by "input_documents" and "output_text"
# in the display cell further down.
chains = chain(
    {"input_documents": docs, "question": f'用中文回答:"{query}"'},
    return_only_outputs=False,
)
# Report the elapsed time in seconds with two decimals.
print(f"回答完成,用时 {time.time()-start_time:.2f} 秒")


### 显示查询结果

In [None]:
# Source documents that were fed to the chain, reduced to their text.
inputs = chains["input_documents"]
input_documents = [source_doc.page_content for source_doc in inputs]

# Answer text produced by the chain.
output_text = chains["output_text"]

# Show both as indented JSON, keeping non-ASCII characters unescaped.
print(json.dumps(input_documents, ensure_ascii=False, indent=4))
print(json.dumps(output_text, ensure_ascii=False, indent=4))
