# In [None]:
from langchain.document_loaders.pdf_loader import PDFLoader
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.vectorstores import FAISS
import gradio as gr
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

from langchain.text_splitter import CharacterTextSplitter,MarkdownTextSplitter
#from langchain.document_loaders import UnstructuredFileLoader,UnstructuredMarkdownLoader
from langchain.document_loaders import UnstructuredPDFLoader
#from langchain.document_loaders import UnstructuredImageLoader
from rapidocr_onnxruntime import RapidOCR

import os
from dotenv import load_dotenv

# Default `chunk_size` handed to the text splitter by load_pdf_splitter.
# NOTE(review): despite the name, CharacterTextSplitter presumably measures
# this in characters rather than tokens — confirm.
max_token = 8000
# Number of leading characters of the first loaded document that
# load_pdf_file prints as a preview.
split_doc_size = 1000
# Overlap between consecutive chunks, forwarded to the text splitter.
chunk_overlap = 50
# PDF to index; load_pdf_file joins this onto work_dir.
pdf_file = 'data/IRM_Help.pdf'
# Base directory against which load_pdf_file resolves pdf_file.
# NOTE(review): machine-specific absolute path — adjust per environment.
work_dir = '/Users/I069899/Documents/study/AI/ai_anna'
# Location where initialize_data saves the FAISS index (relative path).
db_path = "data/vectordb"

def load_pdf_splitter(pdf_file, chunk_size=max_token, chunk_overlap=chunk_overlap):
    """Load a PDF and split its content into chunks.

    Args:
        pdf_file: Path of the PDF; it is resolved by ``load_pdf_file``.
        chunk_size: Maximum chunk size given to ``CharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        The list of split documents. The list may be empty when nothing
        could be split out of the PDF.
    """
    docs = load_pdf_file(pdf_file)
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_docs = text_splitter.split_documents(docs)
    # Show the first chunk as a preview. Guarded because indexing [0] on an
    # empty result (e.g. a PDF with no extractable text) raises IndexError.
    if split_docs:
        print('split_docs[0]: ', split_docs[0])
    else:
        print('split_docs is empty, nothing could be split from: ', pdf_file)
    return split_docs

def load_pdf_file(pdf_file):
    """Load a PDF located under ``work_dir`` with ``UnstructuredPDFLoader``.

    Prints the first ``split_doc_size`` characters of the first loaded
    document as a preview, then returns everything the loader produced.
    """
    full_path = os.path.join(work_dir, pdf_file)
    docs = UnstructuredPDFLoader(full_path).load()
    preview = docs[0].page_content[:split_doc_size]
    print('pdf:\n', preview)
    return docs

def load_pdf_by_pdfloader(pdf_file):
    """Load a PDF through ``PDFLoader`` and return the result of ``load()``.

    NOTE(review): unlike ``load_pdf_file``, the path is used exactly as
    given and is not joined onto ``work_dir`` — confirm this is intended.
    """
    return PDFLoader(file_path=pdf_file).load()

# Try load_pdf_splitter: load and split the PDF, then print every chunk.
# NOTE(review): these statements run at module level, i.e. before the
# `if __name__ == "__main__"` guard further down is reached.
split_docs = load_pdf_splitter(pdf_file, max_token, chunk_overlap)
print(" pdf data load by UnstructuredPDFLoader is ", split_docs)
# Load the same file a second time through PDFLoader and print the result.
# NOTE(review): pdf_file is passed as-is here (not joined with work_dir) —
# confirm the current working directory makes this relative path resolve.
docs = load_pdf_by_pdfloader(pdf_file)
print(" pdf data load by PDFLoader is ", docs)


def initialize_data(pdf_path: str = pdf_file):
    """Index a PDF in FAISS and build the retrieval QA chain used by ``chat``.

    The PDF is split with ``load_pdf_splitter``, embedded with
    ``AzureOpenAIEmbeddings``, stored in a FAISS index that is also saved
    to ``db_path``, and wrapped in a ``RetrievalQA`` chain that is published
    through the module-level ``AMAZON_REVIEW_BOT`` global.

    Args:
        pdf_path: PDF to index. Defaults to the module-level ``pdf_file``
            (the previous default referenced an undefined name, which made
            the ``def`` statement itself raise NameError).

    Returns:
        The ``RetrievalQA`` chain, configured to return source documents.

    Raises:
        ValueError: If no text chunks could be produced from ``pdf_path``.
    """
    global AMAZON_REVIEW_BOT

    # Imported here because the module-level imports do not provide these names.
    from langchain.chains import RetrievalQA
    from langchain.chat_models import AzureChatOpenAI

    # Honour the caller's path instead of silently using the global pdf_file.
    split_docs = load_pdf_splitter(pdf_path, max_token, chunk_overlap)
    if not split_docs:
        # Building an index from zero documents fails with an obscure error;
        # report the real cause instead.
        raise ValueError(f"No text chunks could be extracted from {pdf_path!r}")

    db = FAISS.from_documents(split_docs, AzureOpenAIEmbeddings())
    # Persist the index. The retriever below uses the in-memory `db`; the
    # earlier reload from disk produced an object that was never used.
    db.save_local(db_path)

    llm = AzureChatOpenAI(model_name="gpt-35-turbo", temperature=0.5)

    retriever = db.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"score_threshold": 0.7},
    )
    AMAZON_REVIEW_BOT = RetrievalQA.from_chain_type(llm, retriever=retriever)
    # chat() reads ans["source_documents"], so the chain must return them.
    AMAZON_REVIEW_BOT.return_source_documents = True

    return AMAZON_REVIEW_BOT

def chat(message, history):
    """Answer one chat message through the ``AMAZON_REVIEW_BOT`` chain.

    Args:
        message: The user's question; sent to the chain as ``query``.
        history: Conversation history; only logged, not sent to the chain.

    Returns:
        The chain's ``result`` text. The fallback ``"I don't know."`` is
        returned only when no source documents came back and
        ``enable_chat`` is switched off.
    """
    print(f"[message]{message}")
    print(f"[history]{history}")
    # When True, the chain's answer is returned even without source documents.
    enable_chat = True

    response = AMAZON_REVIEW_BOT({"query": message})
    if not response["source_documents"] and not enable_chat:
        return "I don't know."

    print(f"[result]{response['result']}")
    print(f"[source_documents]{response['source_documents']}")
    return response["result"]
    

def launch_ui():
    """Build the Gradio chat interface around ``chat`` and launch it."""
    chat_window = gr.Chatbot(height=600)
    demo = gr.ChatInterface(fn=chat, title="Amazon Food Review", chatbot=chat_window)
    # Launch with sharing enabled and the server bound to 0.0.0.0.
    demo.launch(share=True, server_name="0.0.0.0")

if __name__ == "__main__":
    # Azure OpenAI connection settings, set before the .env file is loaded.
    os.environ["OPENAI_API_TYPE"] = "azure"
    os.environ["OPENAI_API_VERSION"] = "2023-05-15"
    os.environ["OPENAI_API_BASE"] = "https://pvg-azure-openai-uk-south.openai.azure.com/openai"
    # expanduser("~") still resolves the home directory when HOME is unset,
    # whereas os.getenv("HOME") + "..." raised TypeError in that case.
    env_path = os.path.join(os.path.expanduser("~"), "Documents", "src", "openai", ".env")
    load_dotenv(dotenv_path=env_path, verbose=True)

    # Build the index and QA chain first; launch_ui's chat callback needs it.
    initialize_data()
    launch_ui()
    
    
   
def process_pdf(pdf_path):
    """
    Process a PDF document and produce its document representation.

    Args:
        pdf_path: Path to the PDF document.

    Returns:
        The document representation. Until the processing steps listed in
        the body are implemented, this is the output of ``PDFLoader.load()``
        returned unchanged.
    """

    # Load the PDF document
    loader = PDFLoader(file_path=pdf_path)
    document = loader.load()

    # TODO: split the document with a sliding window or semantic segmentation
    # TODO: generate text embeddings
    # TODO: use an attention mechanism to focus on the important semantic
    #       information in the document

    # The function used to return a name that was never assigned, so every
    # call raised NameError. Return the loaded content as the interim
    # representation until the steps above exist.
    document_representation = document
    return document_representation


def save_embeddings(embeddings, vector_store_dir):
    """
    Save embeddings into a vector store (placeholder, nothing is stored yet).

    Args:
        embeddings: Document embeddings.
        vector_store_dir: Directory of the vector store.

    Returns:
        None
    """
    # TODO: create the vector database index
    # TODO: insert the data into the vector database
    return None