In [None]:
# Install python-dotenv into the kernel's environment; it supplies load_dotenv/find_dotenv used in the next cell.
%pip install python-dotenv

## 环境变量

In [None]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load it; the return value is discarded by binding it to `_`.
_ = load_dotenv(find_dotenv()) # read local .env file
# print(os.environ)

# Generation settings passed to the AzureOpenAI constructor in load_azureLLM.
AI_TEMPERATURE=0.1
AI_MAX_TOKENS=2048

## LLM 与 Embedding 加载

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import AzureOpenAI

def load_azureLLM():
    """Build the Azure OpenAI completion model and the embedding client.

    Deployment names come from the ``DEPLOYMENT_NAME_LLM`` and
    ``DEPLOYMENT_NAME_EMBEDDING`` environment variables; an unset variable
    yields ``None``, which is handed to the constructor unchanged.

    Returns:
        tuple: ``(llm, embedding)``.
    """
    llm_deployment = os.environ.get('DEPLOYMENT_NAME_LLM')
    llm = AzureOpenAI(
        deployment_name=llm_deployment,
        model_name="text-davinci-003",
        temperature=AI_TEMPERATURE,
        max_tokens=AI_MAX_TOKENS,
    )

    embedding_deployment = os.environ.get('DEPLOYMENT_NAME_EMBEDDING')
    # NOTE(review): chunk_size=1 presumably limits each embedding request to a
    # single text to suit the Azure deployment — confirm before raising it.
    embedding = OpenAIEmbeddings(deployment=embedding_deployment, chunk_size=1)

    return llm, embedding

# Create the LLM and embedding clients once; later cells reference these two globals.
llm,embedding = load_azureLLM()

## 文档解析

In [None]:
import os
import tempfile

# Splitter settings passed to RecursiveCharacterTextSplitter.from_tiktoken_encoder in extract_file_content.
# NOTE(review): presumably measured in tokens rather than characters — confirm against the splitter docs.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

from langchain.document_loaders import (
    PyPDFLoader, TextLoader, CSVLoader,
    UnstructuredEPubLoader, UnstructuredWordDocumentLoader,
    UnstructuredMarkdownLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter

def extract_pdf_content(file_path):
    """Load the PDF at ``file_path`` with PyPDFLoader and return ``load_and_split()``'s result."""
    return PyPDFLoader(file_path).load_and_split()


def extract_word_content(file_path):
    """Load the Word document at ``file_path`` in "elements" mode and return ``load_and_split()``'s result."""
    return UnstructuredWordDocumentLoader(file_path, mode="elements").load_and_split()


def extract_csv_content(file_path):
    """Load the CSV file at ``file_path`` with CSVLoader and return ``load_and_split()``'s result."""
    return CSVLoader(file_path).load_and_split()


def extract_epub_content(file_path):
    """Load the EPUB at ``file_path`` in "elements" mode and return ``load_and_split()``'s result."""
    return UnstructuredEPubLoader(file_path, mode="elements").load_and_split()


def extract_md_content(file_path):
    """Load the Markdown file at ``file_path`` in "elements" mode and return ``load_and_split()``'s result."""
    return UnstructuredMarkdownLoader(file_path, mode="elements").load_and_split()


def extract_txt_content(file_path):
    """Load the text file at ``file_path`` (read as UTF-8) and return ``load_and_split()``'s result."""
    return TextLoader(file_path, encoding="utf8").load_and_split()

def extract_file_content(file):
    """Load ``file`` with the loader registered for its extension and split it into chunks.

    The file is copied to a temporary file that keeps the original extension,
    and the loader reads that copy. The copy is always deleted, even when the
    loader or the splitter raises.

    Args:
        file: Path of the document to ingest.

    Returns:
        list: Chunks produced by the text splitter, or an empty list when the
        extension has no registered loader.

    Raises:
        OSError: If ``file`` cannot be read or the temporary copy cannot be written.
        Exception: Whatever the selected loader or the splitter raises.
    """
    loaders = {
        ".pdf": extract_pdf_content,
        # NOTE(review): .xls/.xlsx are routed to the CSV loader — presumably
        # this fails on real Excel workbooks; confirm and swap in a
        # spreadsheet-aware loader if so.
        ".xls": extract_csv_content,
        ".xlsx": extract_csv_content,
        ".csv": extract_csv_content,
        ".docx": extract_word_content,
        ".epub": extract_epub_content,
        ".md": extract_md_content,
        ".txt": extract_txt_content
    }

    # Match case-insensitively so "REPORT.PDF" is treated like "report.pdf".
    file_extension = os.path.splitext(file)[1].lower()
    loader = loaders.get(file_extension)
    if loader is None:
        # Unsupported type: nothing to load, so skip the copy entirely.
        return []

    with open(file, mode='rb') as f:
        file_content = f.read()

    # Keep the extension on the temporary copy in case a loader inspects the file name.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_extension)
    temp_file_name = temp_file.name
    try:
        # Close the handle before loading so the loader can reopen the path.
        with temp_file:
            temp_file.write(file_content)

        documents = loader(temp_file_name)

        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
        return text_splitter.split_documents(documents)
    finally:
        # Runs on success and on failure, so a bad document never leaves a stray temp file.
        os.remove(temp_file_name)

## 添加文档
> 遍历目录下所有文件，并分割。

In [None]:
root_dir = './data'

docs = []
docs_id =  []
# Walk every file beneath root_dir, split it into chunks, and collect the results.
for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in filenames:
        try:
            print(file)
            # Record only the current file. Extending with the whole `filenames`
            # list here re-added every name in the directory once per file.
            docs_id.append(file)
            content = extract_file_content(os.path.join(dirpath, file))
            docs.extend(content)
        except Exception as e:
            # Best-effort ingestion: report the failure and move on to the next file.
            print("ERROR: ",e)


In [None]:
# Show the split documents (uncomment `docs`) or the file names collected during the walk.
# docs
docs_id


## 文档向量化存储

In [None]:
# Install the CPU build of FAISS into the kernel's environment; the FAISS vector store imported below needs it.
%pip install faiss-cpu
from langchain.vectorstores import FAISS

# %pip install chromadb
# from langchain.vectorstores import Chroma

# Folder name for on-disk vector stores; rebound to an absolute path a few lines below.
DB_DIR = 'db'
# Chroma collection name; only referenced by the commented-out Chroma code in this cell.
DB_VECTOR_NAME = "langchain_db_chroma"
# Path handed to FAISS save_local/load_local for the persisted index.
DB_VECTOR_NAME_FAISS = "langchain_db_faiss"

# Anchor DB_DIR at the current working directory.
ABS_PATH = os.path.abspath(os.getcwd())
# ABS_PATH = os.path.dirname(os.path.abspath(__file__))
DB_DIR = os.path.join(ABS_PATH, DB_DIR)

def load_vectorDB():
    """Load the persisted FAISS index and return it as a vector store.

    Creates ``DB_DIR`` first when it does not exist, then reads the index from
    ``DB_VECTOR_NAME_FAISS`` using the module-level ``embedding`` client.

    Returns:
        The FAISS vector store returned by ``FAISS.load_local``.
    """
    db_dir_missing = not os.path.exists(DB_DIR)
    if db_dir_missing:
        os.mkdir(DB_DIR)

    # NOTE(review): the index is read from DB_VECTOR_NAME_FAISS as given, not
    # from inside DB_DIR — confirm whether DB_DIR was meant to be its parent.
    return FAISS.load_local(DB_VECTOR_NAME_FAISS, embedding)

def save_vectorDB(docs, embedding):
    """Embed ``docs`` into a new FAISS index, persist it, and return it.

    Args:
        docs: Documents to index.
        embedding: Embedding client handed to ``FAISS.from_documents``.

    Returns:
        The freshly built FAISS vector store.
    """
    db_dir_missing = not os.path.exists(DB_DIR)
    if db_dir_missing:
        os.mkdir(DB_DIR)

    # NOTE(review): the index is written to DB_VECTOR_NAME_FAISS as given, not
    # inside DB_DIR — confirm whether DB_DIR was meant to be its parent.
    index = FAISS.from_documents(docs, embedding)
    index.save_local(DB_VECTOR_NAME_FAISS)
    return index

### 加载向量数据库

In [None]:
# Load the FAISS index saved by an earlier run.
# NOTE(review): presumably this fails on a first run until save_vectorDB has written an index — confirm.
db = load_vectorDB()

### 保存文档到向量数据库

In [None]:
#db = save_vectorDB(docs,embedding)

### 对话链构建

In [None]:
from langchain.chains import ConversationalRetrievalChain

# Wrap the loaded vector store in a retriever and set its search options in one step.
retriever = db.as_retriever()

# NOTE(review): 'distance_metric' and 'maximal_marginal_relevance' look like
# options for a different vector store; presumably the FAISS store ignores
# them — confirm, and consider as_retriever(search_type="mmr") if MMR is wanted.
retriever.search_kwargs.update({
    'distance_metric': 'cos',
    'fetch_k': 100,
    'maximal_marginal_relevance': True,
    'k': 4,
})

# Conversational chain that answers questions from the retrieved chunks.
qa = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)

In [None]:
# questions = ["书名叫什么？", "出版社是哪家？", "作者是谁？", "作者致谢了多少人？", "作者致谢了谁？"]
# chat_history = []

# for question in questions:  
#     result = qa({"question": question, "chat_history": chat_history})
#     chat_history.append((question, result['answer']))
#     print(f"-> **Question**: {question} \n")
#     print(f"**Answer**: {result['answer']} \n")

## 与文档对话

In [None]:

# Running transcript of (question, answer) pairs, passed back to the chain on every turn.
chat_history = []


def ask_question(user_input):
    """Send one question to the ``qa`` chain, record the exchange, and print it.

    Args:
        user_input: The question text.

    Side effects:
        Appends ``(user_input, answer)`` to the module-level ``chat_history``
        and prints the question and the answer.
    """
    payload = {"question": user_input, "chat_history": chat_history}
    answer = qa(payload)['answer']
    chat_history.append((user_input, answer))
    print(f"-> **Question**: {user_input} \n")
    print(f"**Answer**: {answer} \n")



# Simple console loop: read a question, answer it, repeat until the user types "exit".
print("欢迎使用问答系统，请输入您要提问的问题，按回车键确认。")
while True:
    user_input = input("> ").strip()  # read the user's question, trimmed

    if user_input == "exit":
        break  # the user asked to quit

    if not user_input:
        # Blank input: nothing to send to the chain.
        print("对不起，我暂时无法回答这个问题，请尝试其它问题。")
        continue

    ask_question(user_input)
