## RAG implementation using hugging face 

In [7]:
from langchain_community.document_loaders import UnstructuredURLLoader
urls = ['https://www.livemint.com/economy/budget-2024-key-highlights-live-updates-nirmala-sitharaman-infrastructure-defence-income-tax-modi-budget-23-july-11721654502862.html',
        'https://cleartax.in/s/budget-2024-highlights',
        'https://www.hindustantimes.com/budget',
        'https://economictimes.indiatimes.com/news/economy/policy/budget-2024-highlights-india-nirmala-sitharaman-capex-fiscal-deficit-tax-slab-key-announcement-in-union-budget-2024-25/articleshow/111942707.cms?from=mdr']
loader = UnstructuredURLLoader(urls=urls)
data = loader.load()  


In [8]:
len(data)


4

In [9]:
# data

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# split data
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
docs = text_splitter.split_documents(data)


print("Total number of documents: ",len(docs))


Total number of documents:  176


In [11]:
print(docs[7])

page_content='24 Jul 2024, 09:45:48 PM IST

Budget 2024 Key Highlights Live Updates: Rohan Bhargava, Co-Founder of CashKaro, said

Budget 2024 Key Highlights Live Updates: "The Union Budget 2024 brings substantial changes to personal finance that will benefit a large number of taxpayers. The increase in the standard deduction from ₹50,000 to ₹75,000 and the revision of the tax slab limit for the 5% tax rate from ₹5 lakh to ₹7 lakh will significantly enhance disposable income. These changes will provide much-needed financial relief to the middle class and boost overall consumption.' metadata={'source': 'https://www.livemint.com/economy/budget-2024-key-highlights-live-updates-nirmala-sitharaman-infrastructure-defence-income-tax-modi-budget-23-july-11721654502862.html'}


In [12]:
# Embedding models: https://python.langchain.com/v0.1/docs/integrations/text_embedding/
# Let's load the Hugging Face Embedding class.  sentence_transformers
from langchain_community.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()

vector = embeddings.embed_query("hello, world!")
vector[:5] #to check first 5 dimensions such that it is working or not
#vector


  embeddings = HuggingFaceEmbeddings()


[0.034922655671834946,
 0.018830018118023872,
 -0.017854738980531693,
 0.0001388440141454339,
 0.0740736871957779]

In [13]:
#Now we need to store these embeddings in a vector database. Lets use chroma vector database for this!


In [14]:
from langchain_chroma import Chroma
vectorstore = Chroma.from_documents(documents=docs, embedding=HuggingFaceEmbeddings())


  vectorstore = Chroma.from_documents(documents=docs, embedding=HuggingFaceEmbeddings())


In [15]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

retrieved_docs = retriever.invoke("Budget highlights")


In [16]:
len(retrieved_docs)


3

In [17]:
print(retrieved_docs[2].page_content)


You Might Also Like:

You Might Also Like thumb-111943807

Budget 2o24: What's cheaper and what's costlier? Here's the list

Budget 2024 Announcements for STT, Short-term capital gains and LTCG


In [18]:
#Lets set up or LLM using HuggingFaceHub

In [22]:
from langchain_huggingface import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


#model_id = "meta-llama/Meta-Llama-3-8B"
model_id = "tiiuae/falcon-7b"

text_generation_pipeline = pipeline(
    "text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, max_new_tokens=400, device=0)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

prompt_template = """
<|system|>
Answer the question based on your knowledge. Use the following context to help:

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>

 """

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

llm_chain = prompt | llm | StrOutputParser()


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Fetching 2 files: 100%|██████████| 2/2 [01:00<00:00, 30.49s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.51it/s]
Device set to use cpu


In [23]:
from langchain_core.runnables import RunnablePassthrough

rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain


In [24]:
question = "2024 Budget Highlights"


In [25]:
rag_chain.invoke(question)


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


'\n<|system|>\nAnswer the question based on your knowledge. Use the following context to help:\n\n[Document(id=\'2d8f091f-64be-474c-8688-ede02b59fda7\', metadata={\'source\': \'https://economictimes.indiatimes.com/news/economy/policy/budget-2024-highlights-india-nirmala-sitharaman-capex-fiscal-deficit-tax-slab-key-announcement-in-union-budget-2024-25/articleshow/111942707.cms?from=mdr\'}, page_content="2024 Budget Announcement for Space Industry\\n\\n➤ Budget 2024 Highlights: A venture capital fund of Rs 1,000 crore will be set up. ➤ 2024 India Budget Highlights: Land records in urban areas will be digitized with GIS mapping. An IT based system for property record administration, updating, and tax administration will be established. ➤ Union Budget 2024 Highlights: Sitharaman said that the Budget is focused on poor, women, youth and farmer. ➤ India Budget 2024 Highlights: Global uncertainty poses a downside for growth. Despite these headwinds India\'s growth continue to be higher. ➤ 202

In [26]:
question = "2024 Budget Highlights"
response = rag_chain.invoke(question)

# Making the response readable
response = response.replace("</s>", "").strip()
print("Response:", response)


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Response: <|system|>
Answer the question based on your knowledge. Use the following context to help:

[Document(id='2d8f091f-64be-474c-8688-ede02b59fda7', metadata={'source': 'https://economictimes.indiatimes.com/news/economy/policy/budget-2024-highlights-india-nirmala-sitharaman-capex-fiscal-deficit-tax-slab-key-announcement-in-union-budget-2024-25/articleshow/111942707.cms?from=mdr'}, page_content="2024 Budget Announcement for Space Industry\n\n➤ Budget 2024 Highlights: A venture capital fund of Rs 1,000 crore will be set up. ➤ 2024 India Budget Highlights: Land records in urban areas will be digitized with GIS mapping. An IT based system for property record administration, updating, and tax administration will be established. ➤ Union Budget 2024 Highlights: Sitharaman said that the Budget is focused on poor, women, youth and farmer. ➤ India Budget 2024 Highlights: Global uncertainty poses a downside for growth. Despite these headwinds India's growth continue to be higher. ➤ 2024 Uni

In [27]:
question = "What is the Union Budget?"
response = rag_chain.invoke(question)

# Making the response readable
response = response.replace("</s>", "").strip()
print("Response:", response)


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Response: <|system|>
Answer the question based on your knowledge. Use the following context to help:

[Document(id='dd03baa7-7b90-4c86-8d7f-66e1b21077e7', metadata={'source': 'https://economictimes.indiatimes.com/news/economy/policy/budget-2024-highlights-india-nirmala-sitharaman-capex-fiscal-deficit-tax-slab-key-announcement-in-union-budget-2024-25/articleshow/111942707.cms?from=mdr'}, page_content='announcement for Women: Working women hostels and creche to be opened to facilitate higher participation of women in the workforce. ➤ Union Budget 2024 Highlights: Internship scheme for 1 crore youth in 500 top companies with Rs 5,000 per month as internship allowance and one-time assistance of Rs 6,000 to be provided. ➤ Budget 2024 announcement for Students: Financial support for loans up to Rs 10 lakh will be provided for higher education in domestic institutions. E-vouchers will be given directly to 1 lakh students every year for annual interest subvention of 3 per cent of the loan amou