In [14]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import  numpy as np
import os

In [11]:
#Read the ppdfs from the folder
loader=PyPDFDirectoryLoader("./us_census")
documents=loader.load()
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
final_documents=text_splitter.split_documents(documents)
from tqdm import tqdm

In [8]:
print(len(final_documents))
final_documents[0]

316


Document(metadata={'source': 'us_census/acsbr-017.pdf', 'page': 0}, page_content='KEY DEFINITIONS\nHousehold income: Includes income of the \nhouseholder and all other people 15 years and older in the household, whether or not they are related to the householder.\nMedian: The point that divides the household \nincome distribution into halves, one half with income above the median and the other with income below the median. The median is based on the income distribution of all households, including those with no income.\nGini index: A summary measure of income \ninequality. The Gini index varies from 0 to 1, with 0 indicating perfect equality, where there is a proportional distribution of income. A Gini index of 1 indicates perfect inequality, where one household has all the income.Household Income in States and \nMetropolitan Areas: 2022\nAmerican  Community S urvey  Briefs\nBy Kirby G. Posey\nACSBR-017\nDecember 2023\nINTRODUCTION\nThis brief presents statistics on median household')

In [12]:
# Embedding Using Huggingface
huggingface_embeddings=HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",      
    model_kwargs={'device':'cpu'},
    encode_kwargs={'normalize_embeddings':True}

)



In [15]:
'''Test'''
print(np.array(huggingface_embeddings.embed_query(final_documents[0].page_content)))

[-1.71477459e-02 -2.76800878e-02  4.26293686e-02  2.39637569e-02
 -4.71268892e-02  6.54380023e-03 -6.81887642e-02  1.56108551e-02
 -8.66209418e-02 -1.72416624e-02  7.30304606e-03 -5.59468642e-02
  2.54343934e-02 -2.15982944e-02 -8.21789503e-02 -5.13732545e-02
 -7.67956674e-03 -1.57067012e-02  2.58647464e-02  7.06210211e-02
  9.43597704e-02  1.35466196e-02  1.10754641e-02 -4.42244671e-02
  1.43394217e-01 -3.71766649e-02 -7.54728308e-03 -4.82792407e-02
  1.39716184e-02  1.10534549e-01  5.20765483e-02  6.84058145e-02
  1.51482940e-01 -4.48741112e-03 -2.82990951e-02 -5.87211177e-02
  5.34513071e-02  3.21172141e-02  6.85726851e-02 -4.84611616e-02
  2.02024858e-02 -5.20371571e-02 -4.88764467e-03 -5.69964610e-02
 -2.56069954e-02 -3.28649092e-03  2.01870743e-02  3.31615061e-02
  3.18519734e-02  5.57018220e-02 -8.64430442e-02  8.28257278e-02
  5.69304302e-02  6.52428344e-02  5.10994829e-02  2.24289205e-02
 -7.25073391e-04 -4.40494418e-02  2.98080537e-02  4.56850231e-02
 -4.17241938e-02  5.72430

In [16]:
## VectorStore Creation
vectorstore=FAISS.from_documents(final_documents[:120],huggingface_embeddings)

In [17]:
## Query using Similarity Search
query="WHAT IS HEALTH INSURANCE COVERAGE?"
relevant_docments=vectorstore.similarity_search(query)

print(relevant_docments[0].page_content)

detailed estimates of income and 
to measure change in national-
level estimates. The CPS ASEC 
is the official source of national 
poverty estimates. For more infor -
mation from the CPS ASEC about 
national income estimates, refer to 
the report “ Income in the United 
States: 2022 .”
For information on income esti -
mates from the ACS and how they 
differ from those based on the 
CPS ASEC, refer to “ Fact Sheet: 
Differences Between the American 
Community Survey and the Annual 
Social and Economic Supplement 
to the Current Population Survey  
(CPS ASEC) .”
WHAT IS THE AMERICAN COMMUNITY SURVEY?
The American Community Survey (ACS) is a nationwide survey designed to provide reliable and timely 
demographic, social, economic, and housing data for the nation, states, congressional districts, counties, 
places, and other localities every year. It has an annual sample size of about 3.5 million addresses across


In [19]:
'''Creating retriever with top 3 ranked results being retrieved on the basis of similarity search'''
retriever=vectorstore.as_retriever(search_type="similarity",search_kwargs={"k":3})
print(retriever)

tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f7875e19180> search_kwargs={'k': 3}


In [None]:

os.environ['HUGGINGFACEHUB_API_TOKEN']=""