In [32]:
# Misc
import numpy as np
import os
from dotenv import load_dotenv
# Pull variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Fail fast with a readable message when the token is absent. Assigning the
# result of os.getenv() straight into os.environ raises an opaque
# "TypeError: str expected, not NoneType" when the variable is not defined.
hf_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if hf_api_token is None:
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set. Add it to a .env file next to "
        "this notebook or export it in the environment before running."
    )
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_api_token

In [1]:
# Data Ingestion
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader


In [2]:
# Data Transformation
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
# Embedding Layer
from langchain_community.vectorstores import FAISS

In [55]:
# Hugging Face Models
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

In [5]:
# Get prompt template
from langchain.prompts import PromptTemplate

In [6]:
# Import QA Chain
from langchain.chains import RetrievalQA

In [12]:
# Data Ingestion
# Load the PDFs found under ./us_census (relative to the working directory)
# into a list of LangChain Document objects.
loader = PyPDFDirectoryLoader("./us_census")
documents = loader.load()

# Guard against an empty load: with no documents the splitter yields nothing
# and the next cell's `final_documents[0]` fails with a bare IndexError that
# does not point back at the real cause (wrong path / no PDFs).
if not documents:
    raise FileNotFoundError(
        "No PDF content was loaded from './us_census'. Check that the "
        "directory exists relative to the notebook's working directory "
        "and contains PDF files."
    )


In [13]:
# Data Transformation
# Splitter configuration kept in one place so it is easy to tune.
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Break the loaded documents into overlapping chunks.
final_documents = text_splitter.split_documents(documents)

# Display the first chunk as the cell output (sanity check).
final_documents[0]

Document(page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015Issued September 2023Douglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to health coverage. For example, between 2021 and 2022, the labor market continued to improve, which may have affected private coverage in the United States \nduring that time.\n1 Public policy changes included \nthe renewal of the Public Health Emergency, which \nallowed Medicaid enrollees to remain covered under the Continuous Enrollment Provision.\n2 The American \nRescue Plan (ARP) enhanced Marketplace premium subsidies for those with incomes above 400 percent of the poverty level as well as for unemployed people.\n3', metadata={'source': 'us_census\\acsbr-015.pdf', 'page': 0})

In [15]:
# Number of chunks produced by the splitter (shown as the cell output).
len(final_documents)


316

In [18]:
# Embedding using HuggingFace
# Alternative embedding model: sentence-transformers/all-MiniLM-l6-v2
embedding_model_name = "BAAI/bge-small-en-v1.5"

huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=dict(device="cpu"),                # run the model on CPU
    encode_kwargs=dict(normalize_embeddings=True),  # request normalized vectors
)

In [24]:
# Embed the text of the first chunk and display the vector's shape
# as a quick check of the embedding dimensionality.
first_chunk_text = final_documents[0].page_content
first_chunk_vector = huggingface_embeddings.embed_query(first_chunk_text)
np.array(first_chunk_vector).shape

(384,)

In [26]:
# Vector Store Creation
# Build a FAISS vector store from the chunks using the embedding model above.
vectorstore = FAISS.from_documents(
    documents=final_documents,
    embedding=huggingface_embeddings,
)

In [30]:
# Query using similarity search
query = "Say something about household median income of New Jersey"
relevant_documents = vectorstore.similarity_search(query)

# Print the text of the first document returned by the search.
first_hit = relevant_documents[0]
print(first_hit.page_content)

4 U.S. Census Bureau
to the ACS (Table 1). Real median 
household income in the United 
States declined 0.8 percent 
between the 2021 ACS and 2022 
ACS.7 Figure 1 shows a historical 
series of median household income 
back to 2005.
New Jersey and Maryland had the 
highest median household incomes 
of all states ($96,346 and $94,991, 
respectively); there was no statisti -
cal difference between the two. 
⁷ “Real” refers to income after adjusting 
for inflation.The District of Columbia’s median 
household income ($101,027) 
was the highest in the nation. 
Mississippi had the lowest median 
household income ($52,719) of any 
state. Puerto Rico’s median house -
hold income in 2022 was $24,112 
(Table 1 and Figure 2). Median 
household income was lower than 
the U.S. median in 30 states and 
Puerto Rico. It was higher than the 
U.S. median in 17 states and the 
District of Columbia. The medians 
for Arizona, Oregon, and Vermont were not statistically different from 
the U.S. median.


In [46]:
# create the retriever
# Similarity search that hands back a single document (k = 1) per query.
retriever_options = dict(search_type="similarity", search_kwargs={"k": 1})
retriever = vectorstore.as_retriever(**retriever_options)

# Show the retriever's configuration.
print(retriever)

tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001A8D1B4C5D0> search_kwargs={'k': 1}


In [85]:
# Build the LLM
# NOTE(review): the captured completion below looks cut off well short of 500
# tokens; the hosted text-generation endpoint may expect `max_new_tokens`
# rather than `max_length` - confirm against the Inference API docs.
hub_model_kwargs = {"temperature": 0.001, "max_length": 500}
hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs=hub_model_kwargs,
)

# Ask the model directly, without any retrieved context, as a baseline.
query = "What is the household median income of District of Columbia"
response = hf.invoke(query)

In [86]:
# Print the completion returned by the direct (context-free) LLM call.
print(response)

What is the household median income of District of Columbia?

The median income in District of Columbia is $100,000, and the median house value is $500,000.

What is the average income in Washington DC?

The average income in Washington DC is $100,000.

What is the average income in DC?

The average income in DC is $100,000.

What is the average income in DC


In [87]:

# Build a GPT-2 text-generation LLM through HuggingFacePipeline.
# This rebinds `hf`, replacing the HuggingFaceHub client created earlier;
# every later use of `hf` refers to this pipeline-backed model.
generation_settings = {"temperature": 0.1, "max_new_tokens": 300}
hf = HuggingFacePipeline.from_model_id(
    model_id="openai-community/gpt2",
    task="text-generation",
    pipeline_kwargs=generation_settings,
)

# `llm` is an alias for the same object; the completion for the current
# `query` is displayed as the cell output.
llm = hf
llm.invoke(query)

'What is the household median income of District of Columbia residents?\n\nThe median household income of District of Columbia residents is $46,000.\n\nThe median household income of District of Columbia residents is $46,000 higher than the median household income of the District of Columbia.\n\nThe median household income of District of Columbia residents is $46,000 higher than the median household income of the District of Columbia.\n\nThe median household income of District of Columbia residents is $46,000 higher than the median household income of the District of Columbia.\n\nThe median household income of District of Columbia residents is $46,000 higher than the median household income of the District of Columbia.\n\nThe median household income of District of Columbia residents is $46,000 higher than the median household income of the District of Columbia.\n\nThe median household income of District of Columbia residents is $46,000 higher than the median household income of the Dis

In [88]:
# Prompt text with two placeholders: {context} and {question}.
# The value is spelled out line by line so the leading newline and the
# trailing "\n " are explicit.
prompt_template = (
    "\n"
    "Use the following piece of context to answer the question asked.\n"
    "Please try to provide the answer only based on the context\n"
    "\n"
    "{context}\n"
    "Question:{question}\n"
    "\n"
    "Helpful Answers:\n"
    " "
)

In [89]:
# Wrap the raw template text in a PromptTemplate, declaring the two
# variables it expects: "context" and "question".
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [91]:
# Assemble the retrieval QA chain from the current `hf` LLM, the FAISS
# retriever and the custom prompt; source documents are returned with the answer.
qa_chain_options = {
    "llm": hf,
    "chain_type": "stuff",
    "retriever": retriever,
    "return_source_documents": True,
    "chain_type_kwargs": {"prompt": prompt},
}
retrievalQA = RetrievalQA.from_chain_type(**qa_chain_options)

In [92]:
# Three-line query text (a section heading), joined with newlines.
query = "\n".join(
    [
        "DIFFERENCES IN THE",
        "UNINSURED RATE BY STATE",
        "IN 2022",
    ]
)

In [93]:
# Call the QA chain with our query.
qa_inputs = {"query": query}
result = retrievalQA.invoke(qa_inputs)

# Print only the generated answer text from the result mapping.
answer_text = result["result"]
print(answer_text)


Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

comparison of ACS and CPS ASEC measures 
of health insurance coverage, refer to < www.
census.gov/topics/health/health-insurance/
guidance.html >.
9 Respondents may have more than one 
health insurance coverage type at the time 
of interview. As a result, adding the total 
number of people with private coverage and 
the total number with public coverage will 
sum to more than the total number with any 
coverage.• From 2021 to 2022, nine states 
reported increases in private 
coverage, while seven reported 
decreases (Appendix Table B-2). 
DIFFERENCES IN THE 
UNINSURED RATE BY STATE 
IN 2022
In 2022, uninsured rates at the 
time of interview ranged across 
states from a low of 2.4 percent 
in Massachusetts to a high of 16.6 
percent in Texas, compared to the 
national rate of 8.0 percent.10 Ten 
of the 15 states with uninsured 
10 The uninsured rates in the Distr