In [14]:
#import libraries
import os
import openai
import sys

from langchain.document_loaders import PyPDFLoader, TextLoader, PyPDFDirectoryLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI

from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory


In [4]:
# Load OPENAI_API_KEY from a local .env file and hand it to the openai client.
sys.path.append('../..')
from dotenv import find_dotenv, load_dotenv

dotenv_path = find_dotenv()  # path of the .env file located by python-dotenv
_ = load_dotenv(dotenv_path)  # read local .env file
# Indexing os.environ raises KeyError when the variable is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

In [35]:
# Load the PDFs under data/benefits/ with PyPDFDirectoryLoader.
# Each returned item carries `source` and `page` metadata (see the printed
# metadata in the next cell), so `pages` holds per-page entries.
loader = PyPDFDirectoryLoader("data/benefits/")
pages = loader.load()
# Last expression of the cell: displays how many entries were loaded.
len(pages)

52

In [38]:
# Show the metadata dict of (at most) the first ten loaded pages.
for idx in range(min(10, len(pages))):
    print(pages[idx].metadata)

{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 0}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 1}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 2}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 3}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 4}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 5}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 6}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 7}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 8}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 9}


In [40]:
# Tag every page with a `company` metadata field ('CS' or 'UBS') derived from
# the name of the PDF it was loaded from.
for p in pages:
    # Look only at the file name: testing the whole path would let a directory
    # component containing "CS" mis-tag a page. Both '\\' and '/' separators are
    # handled because the recorded sources use Windows-style paths.
    file_name = p.metadata['source'].replace('\\', '/').rsplit('/', 1)[-1]
    # Break the name into alphanumeric tokens so "CS" has to appear as its own
    # word; a plain substring test would also match names such as "DOCS".
    name_tokens = ''.join(ch if ch.isalnum() else ' ' for ch in file_name).split()
    # Anything that is not marked as CS is treated as UBS.
    company = 'CS' if 'CS' in name_tokens else 'UBS'
    p.metadata.update({'company': company})

In [41]:
# Print the metadata of the first ten pages again, now that the `company`
# field has been added to each page.
preview = pages[:10]
for page in preview:
    print(page.metadata)

{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 0, 'company': 'CS'}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 1, 'company': 'CS'}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 2, 'company': 'CS'}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 3, 'company': 'CS'}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 4, 'company': 'CS'}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 5, 'company': 'CS'}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 6, 'company': 'CS'}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 7, 'company': 'CS'}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 8, 'company': 'CS'}
{'source': 'data\\benefits\\CS US Benefits Guide February 2023.pdf', 'page': 9, 'company': 'CS'}


In [42]:
#split pages
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)
docs = text_splitter.split_documents(pages)

#crete vector db and save the data
persist_directory = 'metadata/chroma/'
embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embedding,
    persist_directory=persist_directory
)
vectordb.persist()


In [43]:
# Chat model that answers the questions (gpt-3.5-turbo, temperature=0).
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Open the Chroma store from persist_directory with the same embedding function.
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

In [83]:
def qa(query, company='', k=3):
    """Answer ``query`` from the benefits vector store and print the answer.

    Builds a similarity retriever over the module-level ``vectordb``, wraps it
    in a ``RetrievalQA`` chain around the module-level ``llm``, runs the chain
    and prints the ``result`` field of its output.

    Args:
        query: Question text passed to the chain under the ``"query"`` key.
        company: Value of the ``company`` metadata field to restrict retrieval
            to (the notebook tags pages as ``'CS'`` or ``'UBS'``). An empty
            string or ``None`` means no filter, i.e. all documents are searched.
        k: Number of chunks requested from the retriever. Defaults to 3.

    Returns:
        None. The answer is printed rather than returned.
    """
    search_kwargs = {"k": k}
    # Truthiness check instead of `!= ''`: a caller passing None would otherwise
    # send {"company": None} as a metadata filter.
    if company:
        search_kwargs["filter"] = {"company": company}

    retriever = vectordb.as_retriever(search_type="similarity", search_kwargs=search_kwargs)
    qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)
    result = qa_chain({"query": query})
    print(result["result"])

In [77]:
qa(query="what is cost for basic life insurance?", company="CS")

The cost for basic life insurance is not mentioned in the provided context.


In [78]:
qa(query="what is cost for basic life insurance?", company="UBS")

The basic life insurance coverage is provided at no cost to employees.


In [79]:
qa(query="what is cost for basic life insurance?")

The basic life insurance coverage is provided at no cost to eligible employees.


In [80]:
qa(query="when is open enrollment?", company="CS")

The information provided does not specify the exact dates for open enrollment. It states that Benefits Annual Enrollment typically takes place in the fall. For specific dates, it is recommended to refer to the Benefits Service Center or the provided 2023 U.S. Benefits Guide.


In [81]:
qa(query="when is open enrollment?", company="UBS")

Open Enrollment is from November 1 to November 17.


In [82]:
qa(query="when is open enrollment?")

The information provided does not specify the exact dates of the open enrollment period.
