In [41]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate

from  langchain.chains import RetrievalQA



In [6]:
## Read the pdfs from the folder

loader = PyPDFDirectoryLoader("./us_census")

documents =loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)

final_documents = text_splitter.split_documents(documents)

In [7]:
final_documents[0]

Document(metadata={'source': 'us_census\\acsbr-015.pdf', 'page': 0}, page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015\nIssued September 2023\nDouglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to \nhealth coverage. For example, between 2021 and 2022, \nthe labor market continued to improve, which may \nhave affected private coverage in the United States \nduring that time.1 Public policy changes included \nthe renewal of the Public Health Emergency, which \nallowed Medicaid enrollees to remain covered under \nthe Continuous Enrollment Provision.2 The American \nRescue Plan (ARP) enhanced Marketplace premium \nsubsidies for those with incomes above 400 percent \nof the poverty level as well as for unemployed people.3\nIn addition to national policies, individual states and \nthe District of Columbia can affect 

In [8]:
len(final_documents)

316

In [13]:
##embedding Using Huggingface
hugginface_embeddings = HuggingFaceBgeEmbeddings(
    model_name = "BAAI/bge-large-en-v1.5",
    #model_name = "sentence-tansformers/all-MiniLM-16-v2"
    model_kwargs = {'device':'cpu'},
    encode_kwargs = {'normalize_embeddings':True}
)


  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [26]:
#testing 
import numpy as np
print(np.array(hugginface_embeddings.embed_query(final_documents[0].page_content)))
print(np.array(hugginface_embeddings.embed_query(final_documents[0].page_content)).shape)

[ 0.01662007  0.02551923  0.00805529 ... -0.04528228 -0.00728949
 -0.01784871]
(1024,)


In [27]:
## VectorStore Creation
vectorstore = FAISS.from_documents(final_documents[:100], hugginface_embeddings)

In [28]:
## Query using Similarity Search
query = "WHAT IS HEALTH INSAURANCE COVERAGE?"
relevant_documents=vectorstore.similarity_search(query)
print(relevant_documents[0].page_content)

private health insurance as a plan provided through an employer 
or a union, coverage purchased directly by an individual from an 
insurance company or through an exchange (such as healthcare.
gov), or coverage through TRICARE. Public insurance coverage 
includes federal programs (such as Medicare, Medicaid, and the 
Children’s Health Insurance Program or CHIP), individual state 
health plans, and CHAMPVA (Civilian Health and Medical Program 
at the Department of Veterans Affairs), as well as care provided 
by the Department of Veterans Affairs. In the ACS, people are 
considered insured if they were covered by any of these types 
of health insurance at time of interview. People are considered 
uninsured if they were not covered by any of these types of health 
insurance at time of interview or if they only had coverage through 
the Indian Health Service (IHS), as IHS coverage is not considered 
comprehensive.


In [29]:
retriever =vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":3})
print(retriever)

tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x00000197683C6D80> search_kwargs={'k': 3}


In [32]:
import os
os.environ['HUGGINGFACEHUB_API_TOKEN'] = "jf_PcMedRaThKltmnDhyvcQJsXWvVwQzxhtwh"


The Hugging Face Hub is an platform with over 350k models, 75k datasets, and 150k demo apps (Spaces), all open source and publicly available, in an online platform where people can easily collaborate and build ML together.

In [48]:
from langchain_community.llms import HuggingFaceHub

hf=HuggingFaceHub(
    repo_id ="mistralai/Mistral-7B-v0.1",
    model_kwargs={"temperature":0.1,"max_length":500}
)

query="What is the health insaurance coverage?"
hf.invoke(query)


#this is not specific to our document and is using information
#from the web

'What is the health insaurance coverage?\n\nThe health insurance coverage is a contract between an individual and a health insurance company. The contract is called a policy. The policy states that the health insurance company will pay for the individual’s medical expenses. The individual pays a premium to the health insurance company. The premium is the amount of money that the individual pays to the health insurance company. The premium is usually paid monthly. The health insurance company pays for the individual’s medical expenses. The health insurance company pays for the individual’'

In [47]:
## Hugging Face Models can be run locally through the HuggingFacePipeline class.
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

hf = HuggingFacePipeline.from_model_id(
    model_id="mistralai/Mistral-7B-v0.1",
    task="text-generation",
    pipeline_kwargs = {"temperature":0.1,"max_length":500}
)

llm = hf
llm.invoke(query)


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/mistralai/Mistral-7B-v0.1.
401 Client Error. (Request ID: Root=1-6728b767-31c61b2f7c18e8b564436efe;828c8331-d456-42f5-b0e6-b6d4ea01ec42)

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json.
Access to model mistralai/Mistral-7B-v0.1 is restricted. You must have access to it and be authenticated to access it. Please log in.

In [49]:
prompt_template="""
Use the following piece of context to answer the question asked. 
Please try to provide the answer only based on the context

{context}
Question:{question}

Helpful Answers:
"""

In [50]:
prompt = PromptTemplate(template = prompt_template, input_variables=["context","question"])

In [51]:
retrievalQA = RetrievalQA.from_chain_type(
    llm = hf,
    chain_type = "stuff",
    retriever = retriever,
    return_source_documents = True,
    chain_type_kwargs={"prompt":prompt}
)

In [54]:
query="""What is health insaurance coverage"""


In [55]:
# Call the QA chain with our query.
result = retrievalQA.invoke({"query" : query})
print(result['result'])


Use the following piece of context to answer the question asked. 
Please try to provide the answer only based on the context

private health insurance as a plan provided through an employer 
or a union, coverage purchased directly by an individual from an 
insurance company or through an exchange (such as healthcare.
gov), or coverage through TRICARE. Public insurance coverage 
includes federal programs (such as Medicare, Medicaid, and the 
Children’s Health Insurance Program or CHIP), individual state 
health plans, and CHAMPVA (Civilian Health and Medical Program 
at the Department of Veterans Affairs), as well as care provided 
by the Department of Veterans Affairs. In the ACS, people are 
considered insured if they were covered by any of these types 
of health insurance at time of interview. People are considered 
uninsured if they were not covered by any of these types of health 
insurance at time of interview or if they only had coverage through 
the Indian Health Service (IHS),