In [32]:
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingFaceEndpointEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

import weaviate
from weaviate.classes.init import Auth

import os 
from dotenv import load_dotenv

In [7]:
# Load variables from a local .env file into the process environment so the
# os.environ lookups in the following cell can resolve them. The call's
# return value is what the cell displays.
load_dotenv()

True

In [10]:
# Weaviate Cloud connection settings, read from the environment.
# os.environ[...] is indexed directly (no .get), so a missing variable raises
# KeyError right here instead of surfacing later as a connection failure.
weaviate_url, weaviate_api_key = (
    os.environ["WEAVIATE_URL"],
    os.environ["WEAVIATE_API_KEY"],
)

In [58]:
# Open an API-key-authenticated connection to the Weaviate Cloud cluster.
# `client` is reused by the vector-store cell further down.
client = weaviate.connect_to_weaviate_cloud(
    auth_credentials=Auth.api_key(weaviate_api_key),
    cluster_url=weaviate_url,
)

# Show the readiness flag so a failed connection is visible before ingestion.
print(client.is_ready())

True


In [8]:
# Text-generation endpoint for the answering model.
llm = HuggingFaceEndpoint(
    repo_id="google/gemma-2-2b-it",
    task="text-generation",
    max_new_tokens=512,
    temperature=0.2,
)

# Chat-style wrapper around the endpoint; this is the object placed in the chain.
model = ChatHuggingFace(llm=llm)

# Embedding model handed to the vector store when the chunks are indexed.
embeddings = HuggingFaceEndpointEmbeddings(
    repo_id="sentence-transformers/all-MiniLM-L6-v2",
)

# Final chain step: turns the model output into a plain string.
parser = StrOutputParser()

In [39]:
def loadDocs(folderPath: str) -> list:
    """Load the PDF files found in a folder.

    Args:
        folderPath: Directory handed to ``DirectoryLoader``; only entries
            matching the ``*.pdf`` glob are read, each through ``PyPDFLoader``.

    Returns:
        The documents produced by ``DirectoryLoader.load()``.
    """
    pdf_loader = DirectoryLoader(
        loader_cls=PyPDFLoader,
        glob="*.pdf",
        path=folderPath,
    )
    return pdf_loader.load()

In [38]:
# Read the PDFs from the local "documents" folder, then report how many
# documents were loaded and preview the first one.
docs = loadDocs("documents")
print(len(docs))
print(docs[0])

63
page_content='Health Insurance Coverage Status and Type 
by Geography: 2021 and 2022
American Community Survey Briefs
ACSBR-015
Issued September 2023
Douglas Conway and Breauna Branch
INTRODUCTION
Demographic shifts as well as economic and govern-
ment policy changes can affect people’s access to 
health coverage. For example, between 2021 and 2022, 
the labor market continued to improve, which may 
have affected private coverage in the United States 
during that time.
1 Public policy changes included 
the renewal of the Public Health Emergency, which 
allowed Medicaid enrollees to remain covered under 
the Continuous Enrollment Provision.
2 The American 
Rescue Plan (ARP) enhanced Marketplace premium 
subsidies for those with incomes above 400 percent 
of the poverty level as well as for unemployed people.
3
In addition to national policies, individual states and 
the District of Columbia can affect health insurance 
coverage by making Marketplace or Medicaid more 
accessible and a

In [44]:
def chunkDocs(docs, chunk_size: int = 1200, chunk_overlap: int = 250):
    """Split loaded documents into overlapping chunks for embedding.

    Args:
        docs: Documents to split (as returned by ``loadDocs``).
        chunk_size: Maximum size of each chunk, passed straight to
            ``RecursiveCharacterTextSplitter``. Defaults to 1200, the value
            this notebook was originally hard-coded with.
        chunk_overlap: Overlap between neighbouring chunks, passed straight
            to the splitter. Defaults to 250 (the original value).

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

In [45]:
# Split the loaded documents, then report the chunk count and preview the
# first chunk.
chunks = chunkDocs(docs=docs)
print(len(chunks))
print(chunks[0])

270
page_content='Health Insurance Coverage Status and Type 
by Geography: 2021 and 2022
American Community Survey Briefs
ACSBR-015
Issued September 2023
Douglas Conway and Breauna Branch
INTRODUCTION
Demographic shifts as well as economic and govern-
ment policy changes can affect people’s access to 
health coverage. For example, between 2021 and 2022, 
the labor market continued to improve, which may 
have affected private coverage in the United States 
during that time.
1 Public policy changes included 
the renewal of the Public Health Emergency, which 
allowed Medicaid enrollees to remain covered under 
the Continuous Enrollment Provision.
2 The American 
Rescue Plan (ARP) enhanced Marketplace premium 
subsidies for those with incomes above 400 percent 
of the poverty level as well as for unemployed people.
3
In addition to national policies, individual states and 
the District of Columbia can affect health insurance 
coverage by making Marketplace or Medicaid more 
accessible and 

In [59]:
from langchain_weaviate.vectorstores import WeaviateVectorStore

INDEX_NAME = "ClusterForRag"

# from_documents() embeds and inserts every chunk each time it is called.
# The collection lives in a persistent cloud cluster, so re-running this cell
# used to store the whole corpus again and make similarity_search return
# duplicate passages. Ingest only when the collection does not exist yet;
# otherwise just attach to it.
# To force a rebuild (e.g. after changing the PDFs or the chunking), drop the
# collection first with client.collections.delete(INDEX_NAME) and re-run.
if client.collections.exists(INDEX_NAME):
    vectorstore = WeaviateVectorStore(
        client=client,
        index_name=INDEX_NAME,
        # "text" is the default text property from_documents() writes to.
        text_key="text",
        embedding=embeddings,
        # Intended to mirror from_documents(), which registers the metadata
        # keys of the ingested documents as extra attributes
        # (NOTE(review): confirm against the installed langchain-weaviate).
        attributes=list(chunks[0].metadata) if chunks else None,
    )
else:
    vectorstore = WeaviateVectorStore.from_documents(
        documents=chunks,
        embedding=embeddings,
        client=client,
        index_name=INDEX_NAME,
    )

In [61]:
def retrieveFromDB(query: str, k: int = 5):
    """Return the stored chunks most similar to ``query``.

    Args:
        query: Free-text question or search phrase.
        k: Number of results to request from the vector store. Defaults to 5,
            the value that was previously hard-coded.

    Returns:
        The result of ``vectorstore.similarity_search`` for the query.
    """
    return vectorstore.similarity_search(query, k=k)


In [None]:
# Sanity-check the retriever with a sample query; the returned list of
# Document objects is displayed as the cell output.
retrieveFromDB("DIFFERENCES IN THE UNINSURED RATE BY STATE IN 2022")

[Document(metadata={'source': 'documents\\1.pdf', 'page': 1.0, 'subject': None, 'author': 'U.S. Census Bureau', 'page_label': '2', 'moddate': datetime.datetime(2023, 9, 12, 14, 44, 47, tzinfo=datetime.timezone(datetime.timedelta(seconds=3600))), 'total_pages': 18.0, 'creationdate': datetime.datetime(2023, 9, 9, 7, 52, 17, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=72000))), 'keywords': 'acsbr-015', 'creator': 'Adobe InDesign 18.2 (Windows)', 'trapped': '/false', 'producer': 'Adobe PDF Library 17.0', 'title': 'Health Insurance Coverage Status and Type by Geography: 2021 and 2022'}, page_content='allows for an examination of the \nuninsured rate and coverage by \ntype for subnational geographies.8\nKey Findings\n• In 2022, the uninsured rate \nvaried from 2.4 percent in \nMassachusetts to 16.6 percent \nin Texas (Figure 1 and Figure \n2). The District of Columbia \nwas among the lowest with an \nuninsured rate of 2.9 percent, \nnot statistically different from \nMassach

In [67]:
def generateResponse(query):
    """Answer ``query`` from the indexed documents.

    Retrieves the most relevant chunks via ``retrieveFromDB``, joins their
    text into one context block, fills the question-answering prompt and runs
    it through ``model`` and ``parser``.

    Args:
        query: The user's question.

    Returns:
        The parsed output of the prompt -> model -> parser chain.
    """
    retrieved = retrieveFromDB(query)
    # Passages are separated by a blank line inside the <context> section.
    context = "\n\n".join(doc.page_content for doc in retrieved)

    template_text = """
    You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know. Use ten sentences maximum and keep the answer concise.
    <context>
    {context}
    </context>
    Question : {question}
    Answer:
    """
    prompt = PromptTemplate(
        template=template_text,
        input_variables=["context", "question"],
    )

    rag_chain = prompt | model | parser
    chain_inputs = {"context": context, "question": query}
    return rag_chain.invoke(chain_inputs)

In [68]:
# Interactive Q&A loop: keep asking for questions until the user declines to
# continue. Blank input is rejected and the prompt is shown again.
while True:
    query = input("Enter your question related to the document: ")

    if query.strip():
        print("\nGenerating answer...\n")
        answer = generateResponse(query)
        print("Answer:", answer)

        continue_chat = input("Do you want to ask another question? (yes/no): ").strip().lower()
        if continue_chat in ("yes", "y"):
            continue

        # Anything other than "yes"/"y" ends the session.
        print("\nThank you for using the document Q&A system. Goodbye!")
        break

    print("Please enter a valid question.\n")


Generating answer...

Answer: From 2021 to 2022, 27 states saw a decrease in their uninsured rates, while only Maine saw an increase. The uninsured rate in Maine increased from 5.7 percent to 6.6 percent.  Non-expansion states experienced a decrease in the uninsured rate from 12.8 percent to 11.8 percent, which was driven by an increase in private coverage. 


Thank you for using the document Q&A system. Goodbye!
