In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
import numpy as np
from langchain.prompts import PromptTemplate

from langchain.chains import RetrievalQA


In [None]:
# Load every PDF found in the ./us_census directory.
loader = PyPDFDirectoryLoader("./us_census")
documents = loader.load()

# Split the loaded documents into overlapping chunks (size 1000, overlap 250).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=250,
    chunk_size=1000,
)
final_documents = text_splitter.split_documents(documents)

# Preview the first chunk, including its metadata.
final_documents[0]


Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.2 (Windows)', 'creationdate': '2023-09-09T07:52:17-04:00', 'author': 'U.S. Census Bureau', 'keywords': 'acsbr-015', 'moddate': '2023-09-12T14:44:47+01:00', 'title': 'Health Insurance Coverage Status and Type by Geography: 2021 and 2022', 'trapped': '/false', 'source': 'us_census/acsbr-015.pdf', 'total_pages': 18, 'page': 0, 'page_label': '1'}, page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015\nIssued September 2023\nDouglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect peopleâ€™s access to \nhealth coverage. For example, between 2021 and 2022, \nthe labor market continued to improve, which may \nhave affected private coverage in the United States \nduring that time.1 Public policy changes included \nthe renewal of the Public Health Emergency, w

In [4]:
# Embedding model used to vectorise the chunks (BGE, loaded via HuggingFace).
# Alternative model that could be swapped in: sentence-transformers/all-MiniLM-l6-v2
hugging_face = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs=dict(device="cpu"),  # run the model on CPU
    encode_kwargs=dict(normalize_embeddings=True),  # request normalised vectors
)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Embed the text of the first chunk and keep the vector as a NumPy array.
first_chunk_text = final_documents[0].page_content
first_chunk_embedding = hugging_face.embed_query(first_chunk_text)
arrary = np.array(first_chunk_embedding)

In [7]:
# Display the embedding vector computed for the first chunk.
arrary

array([-0.07903483, -0.01134113, -0.023121  ,  0.02844467,  0.05053339,
        0.05317826, -0.01907787,  0.0345603 , -0.10211373, -0.02915701,
        0.08524261,  0.05650727, -0.02545442, -0.03308494, -0.00635735,
        0.04090862, -0.00628107,  0.0035674 , -0.03854132,  0.03667679,
       -0.04289804,  0.03425257, -0.03116897, -0.03793732,  0.0172839 ,
        0.01214924,  0.00653121,  0.01463568, -0.05529053, -0.15320703,
        0.00730846,  0.03202944, -0.04701128, -0.01595979,  0.01874445,
        0.02642935, -0.02306375,  0.08438037,  0.04182493,  0.05278177,
       -0.03057601,  0.01564265, -0.01689073,  0.00529408, -0.02417439,
        0.00412994, -0.01889933, -0.0015063 , -0.00836946, -0.03390065,
        0.03515958, -0.00553132,  0.04910939,  0.0597186 ,  0.05615968,
       -0.05105155,  0.01475136, -0.01849963, -0.03284639,  0.03576629,
        0.04947708, -0.00938879, -0.26202118,  0.09750336,  0.01715689,
        0.04781392, -0.0055632 , -0.00298304, -0.02207354, -0.04

In [8]:
# Shape of the embedding vector held in `arrary`.
arrary.shape

(384,)

In [9]:
# Build the FAISS index from the first 120 chunks only; the remaining chunks
# are not indexed and therefore cannot be retrieved.
chunks_to_index = final_documents[:120]
vectore_store = FAISS.from_documents(
    documents=chunks_to_index,
    embedding=hugging_face,
)


In [10]:
# Look up the chunks most similar to the question directly in the vector store.
query = "WHAT IS HEALTH INSURANCE COVERAGE?"
relevant_info = vectore_store.similarity_search(query=query)

# Show the text of the first returned chunk.
first_match = relevant_info[0]
first_match.page_content

'2 U.S. Census Bureau\nWHAT IS HEALTH INSURANCE COVERAGE?\nThis brief presents state-level estimates of health insurance coverage \nusing data from the American Community Survey (ACS). The  \nU.S. Census Bureau conducts the ACS throughout the year; the \nsurvey asks respondents to report their coverage at the time of \ninterview. The resulting measure of health insurance coverage, \ntherefore, reflects an annual average of current comprehensive \nhealth insurance coverage status.* This uninsured rate measures a \ndifferent concept than the measure based on the Current Population \nSurvey Annual Social and Economic Supplement (CPS ASEC). \nFor reporting purposes, the ACS broadly classifies health insurance \ncoverage as private insurance or public insurance. The ACS defines \nprivate health insurance as a plan provided through an employer \nor a union, coverage purchased directly by an individual from an \ninsurance company or through an exchange (such as healthcare.'

In [12]:
# Expose the vector store as a retriever that performs a similarity search
# and returns the 3 best-matching chunks per query.
retriever = vectore_store.as_retriever(
    search_kwargs=dict(k=3),
    search_type="similarity",
)
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x1685a0d50>, search_kwargs={'k': 3})

In [20]:
import os
import warnings

# Read the Hugging Face token from the environment so it is never hardcoded in
# the notebook. HUGGINGFACE_API_KEY is tried first; HUGGINGFACEHUB_API_TOKEN
# (the variable LangChain's HuggingFaceHub looks up on its own) is the fallback.
huggingface_api = os.getenv("HUGGINGFACE_API_KEY") or os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Report a missing token here, where the cause is obvious, rather than letting
# it surface later as an opaque error when the LLM is built or invoked.
if not huggingface_api:
    warnings.warn(
        "No Hugging Face token found: set HUGGINGFACE_API_KEY "
        "(or HUGGINGFACEHUB_API_TOKEN) before running the LLM cells.",
        stacklevel=2,
    )

In [21]:
from langchain_community.llms import HuggingFaceHub

# LLM accessed through the HuggingFaceHub wrapper, authenticated with the token
# stored in `huggingface_api`.
# NOTE(review): `max_length` is passed through as a model kwarg; confirm the
# serving backend honours it (some text-generation backends expect
# `max_new_tokens` instead).
generation_kwargs = dict(temperature=0.1, max_length=500)
hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    huggingfacehub_api_token=huggingface_api,
    model_kwargs=generation_kwargs,
)

# Ask the bare LLM the question, without any retrieved context.
hf.invoke(query)



"WHAT IS HEALTH INSURANCE COVERAGE? Health insurance coverage is a type of insurance that covers the cost of medical and surgical expenses incurred by the insured. Here are some key aspects of health insurance coverage:\n\n1. **Premium**: This is the amount you pay regularly (usually monthly) to maintain your insurance policy. It's like renting your insurance.\n\n2. **Deductible**: This is the amount you have to pay out-of-pocket for covered health care services before your insurance starts to pay. For example, if your deductible is $1,000, you'll pay the first $1,000 of covered services.\n\n3. **Copayment (Copay)**: This is a fixed amount you pay for a specific service, like a doctor's visit or a prescription. For example, you might have a $20 copay for a primary care visit.\n\n4. **Coinsurance**: This is a percentage of the cost of a service that you pay after you've met your deductible. For example, if your coinsurance is 20%, you'll pay 20% of the cost of a service, and your insura

In [22]:
# Prompt text for the QA chain. It contains two placeholders, {context} and
# {question}, and instructs the model to answer only from the given context.
prompt_template="""
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

{context}
Question:{question}

Helpful Answers:
 """

In [23]:
# Wrap the raw template text in a PromptTemplate and declare its two
# placeholders: the retrieved context and the user's question.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)


In [24]:
# Assemble the retrieval QA chain: the retriever supplies the chunks, the
# "stuff" chain type is used with the custom prompt, and the LLM produces the
# answer. Source documents are returned alongside the answer.
qa_chain_kwargs = dict(prompt=prompt)
retrievalqa = RetrievalQA.from_chain_type(
    llm=hf,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=qa_chain_kwargs,
    return_source_documents=True,
)

In [None]:
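# Alternative question to try. Assign it to `query` (the variable the next cell
# passes to the chain), not `prompt`, which holds the PromptTemplate:
# query="""DIFFERENCES IN THE
# UNINSURED RATE BY STATE
# IN 2022"""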

In [None]:
# Run the retrieval QA chain on the question and print the generated answer.
qa_inputs = {"query": query}
result = retrievalqa.invoke(qa_inputs)
print(result["result"])