# In [None]:
import os

import gradio as gr
import pandas as pd
# NOTE(review): `langchain.document_loaders.pdf_loader.PDFLoader` does not look
# like a module/class shipped by LangChain for Python — confirm it resolves in
# the installed version (PyPDFLoader may be what was intended).
from langchain.document_loaders.pdf_loader import PDFLoader
from langchain.document_loaders import UnstructuredPDFLoader
#from langchain.document_loaders import UnstructuredFileLoader,UnstructuredMarkdownLoader
#from langchain.document_loaders import UnstructuredImageLoader
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter,MarkdownTextSplitter
from langchain.vectorstores import FAISS
from rapidocr_onnxruntime import RapidOCR
from sklearn.metrics.pairwise import cosine_similarity

# --- Splitting parameters -------------------------------------------------
# Passed to the text splitter as `chunk_size` (and used as its default).
# NOTE(review): despite the name, CharacterTextSplitter presumably measures
# this in characters rather than model tokens — confirm.
max_token: int = 8_000
# Overlap handed to the text splitter between consecutive chunks.
chunk_overlap: int = 50
# How many leading characters of the first loaded document are printed as a
# preview when a PDF is loaded.
split_doc_size: int = 1_000

# --- Input location -------------------------------------------------------
# Base directory that `pdf_file` is joined onto when loading.
work_dir: str = "/Users/I069899/Documents/study/AI/ai_anna"
# PDF to index, relative to `work_dir`.
pdf_file: str = "data/IRM_Help.pdf"



# Split a PDF file into chunks.
def load_pdf_splitter(pdf_file, chunk_size=max_token, chunk_overlap=chunk_overlap):
    """Load a PDF and split its documents into overlapping chunks.

    Args:
        pdf_file: Path of the PDF, forwarded unchanged to ``load_pdf_file``.
        chunk_size: ``chunk_size`` given to ``CharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to ``CharacterTextSplitter``.

    Returns:
        The list produced by ``CharacterTextSplitter.split_documents``; it may
        be empty when the loader extracted no text.
    """
    docs = load_pdf_file(pdf_file)
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_docs = text_splitter.split_documents(docs)
    # Show the first chunk as a quick sanity check. Guarded because indexing
    # an empty result would raise IndexError before the caller sees the list.
    if split_docs:
        print('split_docs[0]: ', split_docs[0])
    return split_docs

# Load a PDF file.
def load_pdf_file(pdf_file):
    """Load a PDF with ``UnstructuredPDFLoader`` and return its documents.

    Args:
        pdf_file: Path of the PDF. It is joined onto the module-level
            ``work_dir``; an absolute path is used as-is by ``os.path.join``.

    Returns:
        The list returned by the loader's ``load()``.
    """
    loader = UnstructuredPDFLoader(os.path.join(work_dir, pdf_file))
    docs = loader.load()
    # Preview the first `split_doc_size` characters of the first document.
    # Guarded so an empty load returns [] instead of raising IndexError.
    if docs:
        print('pdf:\n',docs[0].page_content[:split_doc_size])
    return docs

def load_pdf_by_pdfloader(pdf_file):
    """Load ``pdf_file`` through ``PDFLoader`` and return what ``load()`` yields.

    The path is handed to the loader unchanged; unlike ``load_pdf_file`` it is
    not joined onto ``work_dir``.
    """
    return PDFLoader(file_path=pdf_file).load()

# Try both loading paths on the configured PDF and dump what each returns.
split_docs = load_pdf_splitter(
    pdf_file,
    chunk_size=max_token,
    chunk_overlap=chunk_overlap,
)
print(" pdf data load by UnstructuredPDFLoader is ", split_docs)

docs = load_pdf_by_pdfloader(pdf_file)
print(" pdf data load by PDFLoader is ", docs)


# The PDF has already been cut into chunks of at most `max_token` by
# load_pdf_splitter (result held in `split_docs`), so reuse those chunks
# instead of calling an undefined `tokenize` helper on an out-of-scope `data`.
# `tokens` keeps the plain text of every chunk, in the same order as
# `split_docs`.
tokens = [doc.page_content for doc in split_docs]

# Embed every chunk with the embedding model and store the vectors in a
# FAISS vector store.
embedder = AzureOpenAIEmbeddings()
db = FAISS.from_documents(split_docs, embedder)

# Persist the vector store locally.
db_path = 'your_db_path'
db.save_local(db_path)

# Reload the vector store from disk.
# NOTE(review): newer LangChain releases reportedly require
# `allow_dangerous_deserialization=True` here — confirm against the
# installed version.
new_db = FAISS.load_local(db_path, embedder)

# Gradio user interface: lets the user enter a query and run a semantic search.

def search_reviews(product_description, n=3):
    """Return the ``n`` stored chunks most similar to ``product_description``.

    Args:
        product_description: Free-text query typed by the user.
        n: Maximum number of chunks to return.

    Returns:
        The matching chunk texts joined by newlines, with the ``"Title: "``
        prefix removed and ``"; Content:"`` replaced by ``": "``. An empty
        string is returned for an empty/blank query or a non-positive ``n``.
    """
    # Nothing to search for: avoid sending an empty string to the embedding
    # service and avoid asking the store for zero results.
    if not (product_description or "").strip() or n <= 0:
        return ""

    # Let the vector store embed the query and rank the stored chunks.
    # NOTE(review): FAISS ranks by its configured distance (presumably L2 by
    # default); for unit-length embeddings that ordering matches cosine
    # similarity — confirm for the embedding model in use.
    matches = new_db.similarity_search(product_description, k=n)

    results = [
        doc.page_content.replace("Title: ", "").replace("; Content:", ": ")
        for doc in matches
    ]
    return "\n".join(results)

# Single text box in, single text box out, backed by search_reviews.
iface = gr.Interface(
    fn=search_reviews,
    inputs="text",
    outputs="text",
)
iface.launch()