# ChatDoc智能文档助手

读取 PDF、Excel、Word（docx）三种常见的文档格式
根据文档内容，智能抽取内容并输出相应格式

## 1. 加载docx

In [None]:
from langchain_community.document_loaders import Docx2txtLoader

# Define ChatDoc (docx-only variant)
class ChatDoc():
    """Minimal helper that loads the bundled example ``.docx`` file."""

    @staticmethod
    def getFile():
        """Load ``resource/example/fake.docx`` and return the loader's result.

        Declared as a static method because it takes no ``self``: this keeps
        ``ChatDoc.getFile()`` working and also allows ``ChatDoc().getFile()``.

        Returns:
            Whatever ``Docx2txtLoader.load()`` returns for the example file.
        """
        # Read the file
        loader = Docx2txtLoader("resource/example/fake.docx")
        return loader.load()

# Call the docx loader on the bundled example document.
ChatDoc.getFile()

## 2. 加载pdf

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Define ChatDoc (pdf-only variant)
class ChatDoc():
    """Minimal helper that loads the bundled example PDF file."""

    @staticmethod
    def getFile():
        """Load and split ``resource/example/fake.pdf``.

        Declared as a static method because it takes no ``self``: this keeps
        ``ChatDoc.getFile()`` working and also allows ``ChatDoc().getFile()``.

        Returns:
            The result of ``PyPDFLoader.load_and_split()``, or ``None`` when
            loading fails (the error is printed, not raised).
        """
        try:
            # Read the file
            loader = PyPDFLoader("resource/example/fake.pdf")
            return loader.load_and_split()
        except Exception as e:
            # Best-effort demo code: report the problem and return None.
            print(f"Error loading files:{e}")
            return None
# Call the PDF loader on the bundled example document.
ChatDoc.getFile()

## 3. 加载excel

In [None]:
from langchain_community.document_loaders import UnstructuredExcelLoader



# Define ChatDoc (excel-only variant)
class ChatDoc():
    """Minimal helper that loads the bundled example Excel workbook."""

    @staticmethod
    def getFile():
        """Load ``resource/example/fake.xlsx`` with ``mode="elements"``.

        Declared as a static method because it takes no ``self``: this keeps
        ``ChatDoc.getFile()`` working and also allows ``ChatDoc().getFile()``.

        Returns:
            The result of ``UnstructuredExcelLoader.load()``, or ``None`` when
            loading fails (the error is printed, not raised).
        """
        try:
            # Read the file
            loader = UnstructuredExcelLoader(
                "resource/example/fake.xlsx",
                mode="elements",
            )
            return loader.load()
        except Exception as e:
            # Best-effort demo code: report the problem and return None.
            print(f"Error loading files:{e}")
            return None
# Call the Excel loader on the bundled example workbook.
ChatDoc.getFile()

# 综合处理

1. 加载文档
2. 分割文档
3. 向量化
4. 向量存储
5. 提问并找到相关的文本块

In [1]:
from langchain_community.document_loaders import UnstructuredExcelLoader,Docx2txtLoader,PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
import os
from dotenv import load_dotenv
from langchain.retrievers.multi_query import MultiQueryRetriever

# Load settings via python-dotenv; OPENAI_API_KEY and OPENAI_BASE_URL are
# later read with os.getenv in ChatDoc.askAndFindFiles.
load_dotenv()

# Shared embedding model, used by ChatDoc.embeddingAndVectorDB
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


class ChatDoc():
    """Load a document, split it into chunks, embed them and retrieve by question.

    Typical use: assign a file path to ``doc``, call ``splitSentences()``,
    then call ``askAndFindFiles(question)``.
    """

    def __init__(self):
        self.doc = None  # path of the document to load
        self.splitText = []  # chunks produced by splitSentences()

    def getFile(self):
        """Load ``self.doc`` with the loader that matches its file extension.

        Supported extensions (matched case-insensitively): ``docx``, ``pdf``
        and ``xlsx``.

        Returns:
            The loader's ``load()`` result, or ``None`` when no document is
            set, the extension is unsupported, or loading fails. In each of
            those cases a message is printed instead of raising.
        """
        doc = self.doc
        if not doc:
            # Previously this crashed with AttributeError on ``None.split``.
            print("No document specified: set `doc` to a file path first")
            return None

        loaders = {
            "docx": Docx2txtLoader,
            "pdf": PyPDFLoader,
            "xlsx": UnstructuredExcelLoader,
        }
        # splitext only inspects the final path component, so dots in
        # directory names are ignored; lower() lets "FAKE.DOCX" match too.
        file_extension = os.path.splitext(doc)[1].lstrip(".").lower()
        loader_class = loaders.get(file_extension)
        if loader_class is None:
            print(f"Unsupported file extension: {file_extension}")
            return None

        try:
            return loader_class(doc).load()
        except Exception as e:
            # Best-effort: report the failure and let callers see None.
            print(f"Error loading {file_extension} files:{e}")
            return None

    # Document processing
    def splitSentences(self):
        """Load the document and store its chunks in ``self.splitText``.

        When the document cannot be loaded, ``self.splitText`` is left
        unchanged.
        """
        full_text = self.getFile()  # fetch the document content
        if full_text is None:
            return
        # Split the loaded documents into chunks
        text_split = CharacterTextSplitter(
            chunk_size=150,
            chunk_overlap=20,
        )
        self.splitText = text_split.split_documents(full_text)

    # Embedding and vector storage
    def embeddingAndVectorDB(self):
        """Build a Chroma vector store from ``self.splitText``.

        Returns:
            The store returned by ``Chroma.from_documents`` using the
            module-level ``embedding_model``.
        """
        # NOTE(review): a new store is built on every call, so every question
        # re-embeds all chunks; consider caching if this becomes a problem.
        db = Chroma.from_documents(
            documents=self.splitText,
            embedding=embedding_model,
        )
        return db

    # Ask a question and find the relevant text chunks
    def askAndFindFiles(self, question):
        """Retrieve the chunks relevant to ``question`` via a multi-query retriever.

        Args:
            question: The question to search the document chunks for.

        Returns:
            The retriever's result for ``question``, or an empty list when
            there are no chunks to search (a message is printed).
        """
        if not self.splitText:
            # Nothing to index: avoid building a vector store from no documents.
            print("No document chunks available: call splitSentences() on a supported file first")
            return []

        db = self.embeddingAndVectorDB()
        llm = ChatOpenAI(
            model="gpt-3.5-turbo:free",
            temperature=0.7,
            api_key=os.getenv("OPENAI_API_KEY"),
            base_url=os.getenv("OPENAI_BASE_URL"),
        )
        retriever_from_llm = MultiQueryRetriever.from_llm(
            retriever=db.as_retriever(),
            llm=llm,
        )
        # ``invoke`` is the supported retriever entry point;
        # ``get_relevant_documents`` is deprecated.
        return retriever_from_llm.invoke(question)

# Point ChatDoc at the example docx and split it into chunks.
chat_doc = ChatDoc()
chat_doc.doc = "resource/example/fake.docx"
chat_doc.splitSentences()
# Turn on DEBUG output for the multi-query retriever's logger
# (presumably to show the generated query variants; confirm against the library).
# NOTE(review): the recorded cell output reports
# "No module named 'langchain.retrievers'"; if the retriever import moves to
# another package, this logger name likely needs to change with it.
import logging
logging.basicConfig(level=logging.INFO)
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.DEBUG)
# Ask a question and print the retrieved chunks.
unique_doc = chat_doc.askAndFindFiles("公司名称是什么?")
print(unique_doc)

ModuleNotFoundError: No module named 'langchain.retrievers'