In [15]:
%pip install sentence-transformers langchain torch unstructured huggingface_hub

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import streamlit as st
import pickle
import time
from langchain.llms import HuggingFaceHub
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.vectorstores import FAISS
import torch 

In [2]:
# Set Hugging Face API token and model parameters
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"
repo_id = "google/flan-t5-large"

In [3]:
# Define model kwargs with GPU support if available
model_kwargs = {
    'torch_dtype': torch.float16,
    'trust_remote_code': True,
    'device_map': 'cpu'  # Load model automatically on GPU if available
}

In [4]:
# Initialize LLM with HuggingFace
llm = HuggingFaceHub(
    repo_id=repo_id,
    model_kwargs=model_kwargs,
    huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
    task="text-generation"
)

  llm = HuggingFaceHub(
  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Define the embeddings model ID
embeddings_model_id = "sentence-transformers/all-MiniLM-L6-v2"

# Initialize embeddings with HuggingFace
embeddings = HuggingFaceEmbeddings(
    model_name=embeddings_model_id,
    encode_kwargs={'device': 'cpu'}  # Adjust based on your device
)

  embeddings = HuggingFaceEmbeddings(


In [6]:
# Load URL Data
loaders = UnstructuredURLLoader(urls=[
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html"
])
data = loaders.load()
len(data)

2

In [18]:
# Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
docs = text_splitter.split_documents(data)

In [19]:
len(docs)

15

In [20]:
docs[0]

Document(metadata={'source': 'https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html'}, page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nHello, Login\n\nHello, Login\n\nLog-inor Sign-Up\n\nMy Account\n\nMy Profile\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nMy Profile\n\nMy PRO\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nLogout\n\nLoans up to ₹50 LAKHS\n\nFixed Deposits\n\nCredit CardsLifetime Free\n\nCredit Score\n\nChat with Us\n\nDownload App\n\nFollow us on:\n\nGo Ad-Free\n\nMy Alerts\n\n>->MC_ENG_DESKTOP/MC_ENG_NEWS/MC_ENG_MARKETS_AS/MC_ENG_ROS_NWS_MKTS_AS_ATF_728\n\nGo PRO @₹99 PRO\n\nAdvertisement\n\nRemove Ad\n\nBusiness\n\nMarkets\n\nStocks\n\nEconomy\n\nCompanies\n\nTrends\n\nIPO\n\nOpinion\n\nEV Special\n\nOptions FestWebinar\n\nHomeNewsBusinessMarketsWall Street rises as Tesla soars on AI optimism\n\nTrending Topics\n\nSensex TodaySensex C

In [21]:
vectorindex_huggingface = FAISS.from_documents(docs, embeddings)# Create FAISS vector index

In [22]:
# Store the vectorindex_huggingface object in a pickle file
with open('vectorindex_huggingface.pkl', 'wb') as f:
    pickle.dump(vectorindex_huggingface, f)

In [23]:
# Storing vector index create in local
file_path="vectorindex_huggingface.pkl"
with open(file_path, "wb") as f:
    pickle.dump(vectorindex_huggingface, f)

In [24]:
if os.path.exists(file_path):
    with open(file_path, "rb") as f:
        vectorIndex = pickle.load(f)

# (4) Retrieve similar embeddings for a given question and call LLM to retrieve final answer

In [25]:
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorIndex.as_retriever())
chain



In [26]:
import langchain

query = "what is the price of Tiago iCNG?"
# query = "what are the main features of punch iCNG?"

langchain.debug=True

# Convert torch_dtype to string
model_kwargs['torch_dtype'] = str(model_kwargs['torch_dtype'])

# Update the model_kwargs in the llm object
llm.model_kwargs = model_kwargs

# Execute the chain with the query
result = chain({"question": query}, return_only_outputs=True)
result

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "what is the price of Tiago iCNG?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "The company also said it has also introduced the twin-cylinder technology on its Tiago and Tigor models.\n\nThe Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh, while the Tigor iCNG comes at a price range of Rs 7.8 lakh to Rs 8.95 lakh.\n\nTata Motors Passenger Vehicles Ltd Head-Marketing, Vinay Pant said these introductions put together will make the company's CNG line up \"appealing, holistic, and stronger than ever\".\n\nPTI\n\nfirst published: Aug 4, 2023 02:17 pm\n\nDiscover 



[36;1m[1;3m[llm/end][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain > llm:HuggingFaceHub] [3.42s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Use the following portion of a long document to see if any of the text is relevant to answer the question. \nReturn any relevant text verbatim.\nThe company also said it has also introduced the twin-cylinder technology on its Tiago and Tigor models.\n\nThe Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh, while the Tigor iCNG comes at a price range of Rs 7.8 lakh to Rs 8.95 lakh.\n\nTata Motors Passenger Vehicles Ltd Head-Marketing, Vinay Pant said these introductions put together will make the company's CNG line up \"appealing, holistic, and stronger than ever\".\n\nPTI\n\nfirst published: Aug 4, 2023 02:17 pm\n\nDiscover the latest Business News, Budget 2025 News, Sensex, and Nifty updates. Obtain Personal Finance insights, tax queries, and expert o



[36;1m[1;3m[llm/end][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:StuffDocumentsChain > chain:LLMChain > llm:HuggingFaceHub] [2.99s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null,
  "type": "LLMResult"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:StuffDocumentsChain > chain:LLMChain] [2.99s] Exiting Chain run with output:
[0m{
}
[36;1m[1;3m[chain/end][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:StuffDocumentsChain] [3.00s] Exiting Chain run with output:
[0m{
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > c



[36;1m[1;3m[llm/end][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:StuffDocumentsChain > chain:LLMChain > llm:HuggingFaceHub] [2.18s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null,
  "type": "LLMResult"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:StuffDocumentsChain > chain:LLMChain] [2.19s] Exiting Chain run with output:
[0m{
}
[36;1m[1;3m[chain/end][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:StuffDocumentsChain] [2.20s] Exiting Chain run with output:
[0m{
}
[31;1m[1;3m[chain/error][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] [9.07s] Chain run errored with error:
[0m"ValueError('A single document was longer than the context length, we cannot handle this.')Traceback (m

ValueError: A single document was longer than the context length, we cannot handle this.