# Installing dependencies

In [None]:
!pip install -q langchain
!pip install -q torch
!pip install -q transformers
!pip install -q sentence-transformers
!pip install -q datasets
!pip install -q faiss-cpu
!pip install -U langchain-community
!pip install pypdf

from IPython.display import clear_output
clear_output()

In [None]:
import os
from urllib.request import urlretrieve
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Downloading data

In [None]:
# Download documents from U.S. Census Bureau to local directory.
# Re-running the cell is cheap: files that are already present are skipped.
os.makedirs("us_census", exist_ok=True)
files = [
    "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
]
for url in files:
    # The local file name is the last path component of the URL.
    file_path = os.path.join("us_census", url.rpartition("/")[2])
    if os.path.exists(file_path):
        continue
    # Download under a temporary name and rename only on success, so an
    # interrupted transfer never leaves a truncated PDF under the final name.
    partial_path = file_path + ".part"
    urlretrieve(url, partial_path)
    os.replace(partial_path, file_path)

In [None]:
# Splitter configured with chunk_size=700 and chunk_overlap=50.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=50)

# Load the PDF files from the local directory, then split them into chunks.
loader = PyPDFDirectoryLoader("./us_census/")
docs_before_split = loader.load()
docs_after_split = text_splitter.split_documents(docs_before_split)

# Display the first chunk as a quick sanity check.
docs_after_split[0]

Document(metadata={'source': 'us_census/acsbr-015.pdf', 'page': 0}, page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015Issued September 2023Douglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to health coverage. For example, between 2021 and 2022, the labor market continued to improve, which may have affected private coverage in the United States \nduring that time.\n1 Public policy changes included \nthe renewal of the Public Health Emergency, which \nallowed Medicaid enrollees to remain covered under the Continuous Enrollment Provision.\n2 The American')

In [None]:
def avg_doc_length(docs):
    """Return the average `page_content` length of `docs`, floored to an int.

    Args:
        docs: A sized collection of objects that expose a `page_content` string.

    Returns:
        The total number of characters across all documents floor-divided by
        the number of documents, or 0 when `docs` is empty.
    """
    # An empty collection has no meaningful average; avoid dividing by zero.
    if not docs:
        return 0
    return sum(len(doc.page_content) for doc in docs) // len(docs)
# Average length in characters, first for the loaded documents and then for the chunks.
avg_char_before_split, avg_char_after_split = map(
    avg_doc_length, (docs_before_split, docs_after_split)
)

# Summarise how splitting changed the number and size of the documents.
print(f'Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}.')
print(f'After split, there were {len(docs_after_split)} documents (chunks), with average characters equal to {avg_char_after_split} (average chunk length).')

Before split, there were 63 documents loaded, with average characters equal to 3830.
After split, there were 400 documents (chunks), with average characters equal to 618 (average chunk length).


# Embeddings using Sentence transformers (HF)

In [None]:
import torch

# Run the encoder on the GPU when one is available and fall back to the CPU
# otherwise; a hard-coded device index of 0 fails on machines without CUDA.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

huggingface_embeddings = HuggingFaceBgeEmbeddings(
    # model_name="BAAI/bge-small-en-v1.5",
    model_name="sentence-transformers/all-MiniLM-l6-v2",  # For light and faster experience.
    model_kwargs={"device": embedding_device},
    encode_kwargs={"normalize_embeddings": True},
    # The BGE wrapper prepends a retrieval instruction to every query by
    # default. That prefix is meant for BGE models, so it is disabled for
    # MiniLM; drop this argument when switching back to a BAAI/bge-* model.
    query_instruction="",
)

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Embed the chunk through the document path (`embed_documents`) rather than
# `embed_query`: the query path is meant for search questions and may apply
# query-specific preprocessing, so its vector need not match the one produced
# for a stored chunk.
sample_embedding = np.array(
    huggingface_embeddings.embed_documents([docs_after_split[0].page_content])[0]
)
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [ 3.62205021e-02  2.08621453e-02  6.65481910e-02  6.49368623e-03
  6.40590861e-02  1.03430465e-01 -2.49341391e-02 -1.88159440e-02
 -8.36598873e-02  4.00242731e-02 -2.08017379e-02  1.11279137e-01
 -6.59805955e-03 -7.69475773e-02  2.79084388e-02 -5.11178700e-03
 -1.75802317e-02 -2.69326456e-02 -2.69205794e-02  6.55195117e-02
  1.30409235e-02  4.62137759e-02 -1.29379882e-02  4.29107063e-02
 -8.32869858e-03  8.16422328e-03  6.18163198e-02 -2.42102202e-02
  5.30796964e-03  5.58978133e-02  4.98896427e-02  3.47685106e-02
 -7.17519037e-03  1.98417827e-02  2.15068981e-02 -8.55922177e-02
 -4.41659428e-02  2.56719980e-02 -5.36422320e-02  2.86921915e-02
 -3.75566483e-02 -8.61203969e-02 -7.37674385e-02  9.83738378e-02
  2.79736817e-02  1.51086645e-02 -6.12781942e-02  7.05660656e-02
 -5.04608778e-03  5.60577177e-02  2.26505548e-02 -2.16067377e-02
  3.39270122e-02 -3.91953513e-02  4.61027287e-02 -3.05536371e-02
 -8.71735532e-03 -1.78834889e-02 -1.64443795e-02  4

# Vector store using FAISS

In [None]:
# Build the FAISS vector store from the split chunks, using
# `huggingface_embeddings` as the embedding model.
vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)

In [None]:
# Sample question used to probe the vector store.
query = """What were the trends in median household income across
           different states in the United States between 2021 and 2022."""

# Look up the chunks most similar to the question (library default result count).
relevant_documents = vectorstore.similarity_search(query)

print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')

# Show only the first hit.
first_hit = relevant_documents[0]
print(first_hit.page_content)

There are 4 documents retrieved which are relevant to the query. Display the first one:

hold income in 2022 was $24,112 
(Table 1 and Figure 2). Median 
household income was lower than 
the U.S. median in 30 states and 
Puerto Rico. It was higher than the 
U.S. median in 17 states and the 
District of Columbia. The medians 
for Arizona, Oregon, and Vermont were not statistically different from 
the U.S. median.
From 2021 to 2022, five states—
Alabama, Alaska, Delaware, Florida, 
and Utah—showed a statistically 
significant increase in real median 
household income; 17 states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different from that in 
2021 for 28 states, the District of 
Columbia, and Puerto Rico  
(Table 1).


In [None]:
# Wrap the vector store as a retriever that uses similarity search and
# returns the 3 most relevant documents (k=3) for each query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# HuggingFace API

In [None]:
from google.colab import userdata

In [None]:
# Copy the Colab secret named 'HF_TOKEN' into the environment variable that
# the Hugging Face Hub client cell reads with os.getenv.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = userdata.get('HF_TOKEN')

In [None]:
from langchain_community.llms import HuggingFaceHub

# GPT-2 accessed through the Hugging Face Hub wrapper, authenticated with the
# token stored in the HUGGINGFACEHUB_API_TOKEN environment variable.
hf = HuggingFaceHub(
    huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
    repo_id="openai-community/gpt2",
    model_kwargs={"temperature": 0.1, "max_length": 500},
)

# Ask the model directly, without any retrieved context.
query = """What were the trends in median household income across different states in the United States between 2021 and 2022."""  # Sample question, change to other questions you are interested in.
hf.invoke(query)

  warn_deprecated(


'What were the trends in median household income across different states in the United States between 2021 and 2022.\n\nThe median household income in the United States is $37,000 higher than the median household income in the United States in 2016.\n\nThe median household income in the United States is $36,000 higher than the median household income in the United States in 2016.\n\nThe median household income in the United States is $36,000 higher than the median household income in the United States in 2016.\n\nThe median household income in the United States is $36,000 higher'

# HuggingFace Pipeline

In [None]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

# Text-generation pipeline for GPT-2, bound to both `hf` and `llm`.
hf = llm = HuggingFacePipeline.from_model_id(
    task="text-generation",
    # model_id="mistralai/Mistral-7B-Instruct-v0.2",
    model_id="openai-community/gpt2",
    pipeline_kwargs={"temperature": 0.3, "max_new_tokens": 300},
)

# Ask the same question as before, still without retrieved context.
llm.invoke(query)



'What were the trends in median household income across different states in the United States between 2021 and 2022.\n\nThe next chart shows median household income across the United States between 2021 and 2022, which is the median household income for all states.\n\nThe next chart shows median household income across all states, which is the median household income for all states.\n\nThe next chart shows median household income across all states, which is the median household income for all states.\n\nThe next chart shows median household income across all states, which is the median household income for all states.\n\nThe next chart shows median household income across all states, which is the median household income for all states.\n\nThe next chart shows median household income across all states, which is the median household income for all states.\n\nThe next chart shows median household income across all states, which is the median household income for all states.\n\nThe next ch

In [None]:
# Alias `hf` as `llm`, the name passed to the retrieval QA chain.
llm = hf

In [None]:
# Instructions given to the model; {context} and {question} are the two
# placeholders declared as input variables below.
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{context}

Question: {question}

Helpful Answer:
"""

# Prompt object handed to the QA chain.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [None]:
# Assemble the question-answering chain from the retriever, the LLM and the
# custom prompt defined in the earlier cells.
retrievalQA = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
    chain_type_kwargs={"prompt": PROMPT},
    # Keep the retrieved documents in the result so they can be inspected.
    return_source_documents=True,
)

In [None]:
# Call the QA chain with our query.
# The returned mapping is kept in `result`; its 'result' entry is the text
# printed here, and the chain was built with return_source_documents=True.
result = retrievalQA.invoke({"query": query})
print(result['result'])

Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

hold income in 2022 was $24,112 
(Table 1 and Figure 2). Median 
household income was lower than 
the U.S. median in 30 states and 
Puerto Rico. It was higher than the 
U.S. median in 17 states and the 
District of Columbia. The medians 
for Arizona, Oregon, and Vermont were not statistically different from 
the U.S. median.
From 2021 to 2022, five states—
Alabama, Alaska, Delaware, Florida, 
and Utah—showed a statistically 
significant increase in real median 
household income; 17 states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different from that in 
2021 for 28 states, the District of 
Columbia, and 

In [None]:
# Documents the chain retrieved to answer the query.
relevant_docs = result['source_documents']

# Print the summary line once, then each document with its source file and page.
print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
print("*" * 100)
for i, doc in enumerate(relevant_docs):
    print(f"Relevant Document #{i+1}:\nSource file: {doc.metadata['source']}, Page: {doc.metadata['page']}\nContent: {doc.page_content}")
    # Separator between documents; the summary line is not repeated per document.
    print("-"*100)

There are 3 documents retrieved which are relevant to the query.
****************************************************************************************************
Relevant Document #1:
Source file: us_census/acsbr-017.pdf, Page: 3
Content: hold income in 2022 was $24,112 
(Table 1 and Figure 2). Median 
household income was lower than 
the U.S. median in 30 states and 
Puerto Rico. It was higher than the 
U.S. median in 17 states and the 
District of Columbia. The medians 
for Arizona, Oregon, and Vermont were not statistically different from 
the U.S. median.
From 2021 to 2022, five states—
Alabama, Alaska, Delaware, Florida, 
and Utah—showed a statistically 
significant increase in real median 
household income; 17 states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different from that in 
2021 for 28 states, the District of 
Columbia, and Puerto Rico  
(Table 1).
----------------------------------------------------------------------------------