# Libraries

In [40]:
import os # operating system for files
from langchain_community.vectorstores import Chroma # vector db
from langchain.document_loaders import PyPDFLoader # pdf loader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter # text chunking
from langchain.embeddings import OpenAIEmbeddings # embedding text
from langchain.llms import OpenAI # LLM
from langchain.chains import RetrievalQA # chain

# Vector database processor

In [39]:
def text_processor(documents, *, chunk_size=1000, chunk_overlap=200,
                   persist_directory='./data'):
    """Split documents into chunks and index them in a Chroma vector store.

    Args:
        documents: Loaded documents to index; they are handed to
            ``CharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` passed to the text splitter.
        chunk_overlap: ``chunk_overlap`` passed to the text splitter.
        persist_directory: Directory passed to Chroma as its
            ``persist_directory``.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    # creating text splitter
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # splitting the loaded documents into chunks
    chunks = text_splitter.split_documents(documents)

    # initializing vector db using Chroma; embeddings come from OpenAI
    return Chroma.from_documents(
        chunks,
        embedding=OpenAIEmbeddings(),
        persist_directory=persist_directory,
    )


# LLM function

In [1]:
def run_model(db, query, *, k=7):
    """Answer a query with a retrieval-QA chain built over a vector store.

    A new ``OpenAI`` LLM and ``RetrievalQA`` chain are created on every call.

    Args:
        db: Vector store; must provide ``as_retriever``.
        query: The question text, sent to the chain under the ``'query'`` key.
        k: Value passed to the retriever as ``search_kwargs={'k': k}``.

    Returns:
        Whatever ``qa_chain.invoke`` returns. The chain is built with
        ``return_source_documents=True``; the driver reads the ``'result'``
        key of the returned value.
    """
    # running openAI
    llm = OpenAI()

    # initialize qa chain (rebuilt each time the function is called)
    qa_chain = RetrievalQA.from_chain_type(
        llm,  # call LLM
        retriever=db.as_retriever(search_kwargs={'k': k}),
        return_source_documents=True,
    )

    # returning response
    return qa_chain.invoke({'query': query})

# Driver code

In [42]:
# setting environment key
# The key is never stored in source: it is read from the environment and,
# when absent, prompted for without echoing. Any key that was previously
# committed in this file should be treated as compromised and rotated.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass('OpenAI API key: ')

# uploading my document (we could extend this to parse over a list of docs)
pdf = '/Users/benstager/Desktop/business_report.pdf'

# running it through loader
loader = PyPDFLoader(pdf)
documents = loader.load()

# returning our pertinent vector database
db = text_processor(documents)
db.persist()

# user end input
query = input('Please ask the bot a question: ')

# Loop until the user types 'quit' or 'q'. The previous condition
# (`!= 'quit' or != 'q'`) was always true, so the loop could never end.
while query not in ('quit', 'q'):
    result = run_model(db, query)
    print(result['result'])
    query = input('Please ask the bot a question: ')