In [2]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
import os
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain.chains import RetrievalQA

# --- Tunable values and file locations for the PDF question-answering demo ---

# Token budget; not referenced by any code visible in this section.
max_token = 8000

# Intended chunk size / overlap for text splitting. The splitter function in
# this file uses its own values rather than reading these two names.
split_doc_size = 1000
chunk_overlap = 50

# Absolute project root, and the PDF's location relative to that root.
work_dir = '/Users/I069899/Documents/study/AI/ai_anna/'
pdf_file_name = 'data/IRM_Help.pdf'

# Folder the FAISS index is saved to (relative, so it resolves against the
# current working directory, not against work_dir).
db_path = "data/vectordb/"

def load_pdf_splitter(pdf_path=None, chunk_size=1000, chunk_overlap=150):
    """Load a PDF and split its pages into overlapping text chunks.

    Args:
        pdf_path: Path of the PDF to load. When omitted, ``pdf_file_name``
            joined onto ``work_dir`` is used.
        chunk_size: Maximum chunk size handed to ``CharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks handed to
            ``CharacterTextSplitter``.

    Returns:
        The chunk documents produced by ``split_documents`` for the loaded
        pages.
    """
    if pdf_path is None:
        pdf_path = os.path.join(work_dir, pdf_file_name)

    pages = PyPDFLoader(pdf_path).load()

    # NOTE(review): the defaults keep the previously hard-coded 1000 / 150.
    # The module-level ``chunk_overlap`` constant says 50 -- confirm which
    # value is actually intended before wiring the constants in.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(pages)
  
# Load and split the PDF once when this cell/module runs.
# NOTE(review): initialize_data() calls load_pdf_splitter() again on its own,
# so this module-level result is not used by the code visible here -- confirm
# whether it is still needed.
split_docs = load_pdf_splitter()
# Reached only if loading and splitting finished without raising.
print("**** PDF load complete")


def initialize_data():
    """Build the FAISS index for the PDF and return a RetrievalQA chain over it.

    Side effects:
        * Saves the index to ``db_path`` via ``save_local``.
        * Binds the chain to the module-level name ``AMAZON_REVIEW_BOT``.

    Returns:
        The ``RetrievalQA`` chain, configured to return its source documents
        alongside the answer.
    """
    global AMAZON_REVIEW_BOT

    split_docs = load_pdf_splitter()
    db = FAISS.from_documents(split_docs, AzureOpenAIEmbeddings())

    # Persist the index for later sessions. The retriever below uses the
    # in-memory index directly: reloading the file that was just written
    # yields the same data, needs a second embeddings client, and newer
    # langchain_community releases refuse FAISS.load_local unless pickle
    # deserialization is explicitly allowed.
    db.save_local(db_path)

    llm = AzureChatOpenAI(model_name="gpt-35-turbo", temperature=0.5)

    # Only chunks whose relevance score reaches the threshold are retrieved.
    retriever = db.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"score_threshold": 0.5},
    )

    # Configure source-document return at construction time instead of
    # mutating the chain afterwards.
    AMAZON_REVIEW_BOT = RetrievalQA.from_chain_type(
        llm,
        retriever=retriever,
        return_source_documents=True,
    )
    return AMAZON_REVIEW_BOT

# Azure OpenAI connection settings. They are set before load_dotenv() runs;
# python-dotenv does not override variables that already exist unless asked
# to, so these values take precedence over the same keys in the .env file.
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = "2023-05-15"
os.environ["OPENAI_API_BASE"] = "https://pvg-azure-openai-uk-south.openai.azure.com/openai"

# Resolve the home directory with expanduser() rather than
# os.getenv("HOME") + "...", which raises TypeError when HOME is not set
# (for example on Windows).
env_path = os.path.join(
    os.path.expanduser("~"), "Documents", "src", "openai", ".env"
)
load_dotenv(dotenv_path=env_path, verbose=True)

# Build the index and the question-answering chain.
searchResult = initialize_data()

# Use invoke(): calling the chain object directly is deprecated in
# LangChain 0.1 and later. The returned mapping has the same keys.
ans = searchResult.invoke({"query": "Regional Settings"})
if ans["source_documents"]:
    # At least one chunk was retrieved, so report it and the model's answer.
    source_document = ans['source_documents']
    print("00000000000 ", source_document)
    result = ans["result"]
    print("111111111111 ", result)
else:
    # Nothing was retrieved: discard the model's answer and use a fixed
    # fallback instead.
    result = "I don't know."
    print("2222222", result)

**** PDF load complete
00000000000  [Document(page_content='Settings \n Setup\n.\n2. On the Regions tab, select Regions Con\x00guration, choose Edit, and do any of the following:\ue05c Note\nRegion IDs are generated by the system, and each new ID self-increments by one.', metadata={'source': '/Users/I069899/Documents/study/AI/ai_anna/data/IRM_Help.pdf', 'page': 5}), Document(page_content='2/26/2024\n6 This is custom documentation. For more information, please visit the SAP Help PortalCon\x00gure regions.\nParent topic: Global Settings \nRelated Information\nCon\x00guring Regional Settings\nDe\x00ning the Company Pro\x00le\nBrands\nManaging Return Reasons\nManaging Payment Methods\nManaging Workstations\nManaging Countries of Origin\nManaging Carriers\nManaging Regions\nManage regions.\nProcedure\n1. In the navigation tree, choose \nSettings \n Setup\n.\n2. On the Regions tab, select Regions, and do any of the following:\nAdd one or more regions.\nSelect an entry, choose Edit, and then 