In [1]:
import os
import streamlit as st
import pickle
import time
import faiss
import langchain
from langchain import OpenAI
from dotenv import load_dotenv
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [16]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# The project keeps the key under OPEN_API_KEY; it is re-exported below as
# OPENAI_API_KEY. If OPEN_API_KEY is absent, fall back to an OPENAI_API_KEY
# that is already present in the environment.
open_api_key = os.getenv('OPEN_API_KEY') or os.getenv('OPENAI_API_KEY')
if not open_api_key:
    # os.environ only accepts strings: assigning None would fail with an
    # opaque TypeError, so fail here with an actionable message instead.
    raise RuntimeError(
        "OpenAI API key not found: set OPEN_API_KEY (or OPENAI_API_KEY) "
        "in your .env file or environment."
    )
os.environ['OPENAI_API_KEY'] = open_api_key


In [17]:
# Build the OpenAI LLM that the question-answering chain uses later on.
llm = OpenAI(
    max_tokens=500,
    temperature=0.9,
)


In [18]:
# News articles to ingest.
article_urls = [
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html",
]
loaders = UnstructuredURLLoader(urls=article_urls)

# Load the pages and show how many items came back.
data = loaders.load()
len(data)

2

In [19]:
# Chunking configuration handed to the recursive splitter.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# `data` holds documents rather than raw strings, so split_documents is used
# instead of split_text to obtain the chunks.
docs = text_splitter.split_documents(data)

In [20]:
# Display how many chunks the splitter produced.
len(docs)

6

Create embeddings for these chunks and save them to a FAISS index

In [21]:
# Embedding model used to vectorise each chunk.
embeddings = OpenAIEmbeddings()

# Build the FAISS vector index from the chunks and the embedding model.
vectorindex_openai = FAISS.from_documents(documents=docs, embedding=embeddings)

In [22]:
# vectorindex_openai was built with FAISS.from_documents; pull out the
# index object it holds and display it.
index = vectorindex_openai.index
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000001E33DFE5920> >

In [None]:
import os
import pickle
import faiss
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

file_path = "vector_index.faiss"
docstore_path = "vector_index_docstore.pkl"
index_to_docstore_id_path = "vector_index_index_to_docstore_id.pkl"

# The vector store is persisted as three files; all of them must be present
# for the saved copy to be considered complete.
index_files = (file_path, docstore_path, index_to_docstore_id_path)

if not all(os.path.exists(path) for path in index_files):
    # Persist the in-memory store (vectorindex_openai) rather than rebuilding
    # it, so no new embeddings client is constructed here.

    # Save the FAISS index to a file
    index = vectorindex_openai.index
    faiss.write_index(index, file_path)

    # Save the docstore and index_to_docstore_id separately
    with open(docstore_path, "wb") as f:
        pickle.dump(vectorindex_openai.docstore, f)

    with open(index_to_docstore_id_path, "wb") as f:
        pickle.dump(vectorindex_openai.index_to_docstore_id, f)

    # The embedding function is intentionally not persisted; it has to be
    # recreated when the index is loaded back.
else:
    print("FAISS index already exists. Skipping creation.")

In [None]:
import faiss
import pickle
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

file_path = "vector_index.faiss"
docstore_path = "vector_index_docstore.pkl"
index_to_docstore_id_path = "vector_index_index_to_docstore_id.pkl"

# Fail fast when any persisted file is missing. Merely printing a message
# would leave `vectorIndex` undefined and make the following cell fail with
# an unrelated NameError.
missing_files = [
    path
    for path in (file_path, docstore_path, index_to_docstore_id_path)
    if not os.path.exists(path)
]
if missing_files:
    raise FileNotFoundError(
        "FAISS index files do not exist. Please create the index first. "
        f"Missing: {missing_files}"
    )

# Load the FAISS index from a file
index = faiss.read_index(file_path)

# Load the docstore and index_to_docstore_id.
# SECURITY: pickle.load can execute arbitrary code; only load files that this
# notebook wrote itself, never files obtained from an untrusted source.
with open(docstore_path, "rb") as f:
    docstore = pickle.load(f)

with open(index_to_docstore_id_path, "rb") as f:
    index_to_docstore_id = pickle.load(f)

# The embedding function is not stored on disk, so recreate it here.
embedding_function = OpenAIEmbeddings()

# Recreate the FAISS object with the loaded index and additional data
vectorIndex = FAISS(
    index=index,
    embedding_function=embedding_function,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id
)

Retrieve the chunks most similar to a given question and call the LLM to produce the final answer

In [None]:
# Expose the reloaded vector store as a retriever and wire it to the LLM.
retriever = vectorIndex.as_retriever()
chain = RetrievalQAWithSourcesChain.from_llm(
    llm=llm,
    retriever=retriever,
)
chain

In [25]:
# Question to put to the chain; the commented line is an alternative to try.
query = "what is the price of Tiago iCNG?"
# query = "what are the main features of punch iCNG?"

# Switch on LangChain's debug flag before running the chain.
langchain.debug = True

# Run the chain and display only its outputs.
chain_inputs = {"question": query}
chain(chain_inputs, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "what is the price of Tiago iCNG?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Please select (*) all mandatory conditions to continue.\n\nPage Generated = 2024-12-20 20:13:21 AKAMAI_DEVICE_CHARACTERISTICS =\n\nis_mobile=false\n\ndevice_type = desktop api_url = https://www.moneycontrol.com/https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html?PAGE_VIEW_IN_EU_REGION=1&DEVICE_TYPE=desktop continent = EU region_code = SN url = https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-sta

{'answer': ' The price of Tata Punch iCNG starts at Rs 7.1 lakh.\n',
 'sources': 'https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html'}