In [5]:
import os
import glob
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# --- 1. FIND ALL PDF FILES ---
raw_data_path = '../data/raw/'
pdf_files = glob.glob(os.path.join(raw_data_path, '*.pdf'))

if not pdf_files:
    print("Error: No PDF files found in the '/data/raw/' folder.")
else:
    print(f"Found {len(pdf_files)} PDF files to process.")
    
    all_documents = []
    # --- 2. LOAD ALL DOCUMENTS ---
    for file_path in pdf_files:
        print(f"Loading file: {os.path.basename(file_path)}...")
        try:
            loader = PyPDFLoader(file_path)
            documents = loader.load()
            all_documents.extend(documents)
        except Exception as e:
            print(f"--> Error loading {os.path.basename(file_path)}: {e}")

    print(f"\nSuccessfully loaded a total of {len(all_documents)} pages from all files.")

    # --- 3. SPLIT DOCUMENTS INTO CHUNKS ---
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    splits = text_splitter.split_documents(all_documents)
    print(f"Split all documents into {len(splits)} chunks.")

    # --- 4. GENERATE EMBEDDINGS AND CREATE VECTOR STORE ---
    # This will use the same local model running on your CPU
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    model_kwargs = {'device': 'cpu'}
    embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

    persist_directory = '../vector_store'
    
    # This will overwrite your old vector store with the new, comprehensive one
    print("\nCreating and persisting the new, expanded vector store... This will take a significant amount of time.")
    # 1. Create the vector store in memory
    vectordb = FAISS.from_documents(
        documents=splits, 
        embedding=embeddings
    )

    # 2. Save it to the specified folder
    vectordb.save_local(folder_path=persist_directory)
    
    print("\nNew vector store created successfully!")

Found 15 PDF files to process.
Loading file: Annual_Report_RBI_2020-2021.pdf...
Loading file: Annual_Report_RBI_2021-2022.pdf...
Loading file: Annual_Report_RBI_2022-2023.pdf...
Loading file: Annual_Report_RBI_2023-2024.pdf...
Loading file: Annual_Report_RBI_2024-2025.pdf...
Loading file: Budget_Speech_2020-2021.pdf...
Loading file: Budget_Speech_2021-2022.pdf...
Loading file: Budget_Speech_2022-2023.pdf...
Loading file: Budget_Speech_2023-2024.pdf...
Loading file: Budget_Speech_2024-2025.pdf...
Loading file: Economic_Survey_2020-2021.pdf...
Loading file: Economic_Survey_2022-2023.pdf...
Loading file: Economic_Survey_2023-2024.pdf...
Loading file: Economic_Survey_2024-2025.pdf...
Loading file: Economix_Survey_2021-2022.pdf...

Successfully loaded a total of 4148 pages from all files.
Split all documents into 13843 chunks.

Creating and persisting the new, expanded vector store... This will take a significant amount of time.

New vector store created successfully!


In [6]:
# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# Split the document into chunks
splits = text_splitter.split_documents(documents)

print(f"Split the document into {len(splits)} chunks.")

Split the document into 1263 chunks.


In [7]:
# Specify the embedding model you want to use
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# We will use the CPU for embedding
model_kwargs = {'device': 'cpu'}

# Initialize the HuggingFaceEmbeddings model
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs
)

# Define the path for the persistent vector store
persist_directory = r'C:\economic_data_navigator\vector_store'

# Create the vector store from the document splits and save it to disk
print("Creating and persisting vector store... This will take a while.")
print("\nCreating and persisting the new FAISS vector store...")
vectordb = FAISS.from_documents(
    documents=splits,
    embedding=embeddings
)
vectordb.save_local(persist_directory)

print("Vector store created successfully!")

Creating and persisting vector store... This will take a while.

Creating and persisting the new FAISS vector store...
Vector store created successfully!


In [8]:
from langchain_community.vectorstores import FAISS

# Load the persisted vector store from disk using the correct method
vectordb = FAISS.load_local(
    folder_path=persist_directory,
    embeddings=embeddings,
    allow_dangerous_deserialization=True  # Required for loading local FAISS indexes
)

# Define a test query
query = "What is the outlook for the Indian economy in FY26?"

# Perform a similarity search (this part remains the same)
retrieved_docs = vectordb.similarity_search(query, k=3)

# Print the content of the most relevant chunk
print("--- Most Relevant Chunk ---")
print(retrieved_docs[0].page_content)

--- Most Relevant Chunk ---
20 per cent reflecting higher international petroleum prices. Although the high WPI inflation is 
partly due to base effects that will even out, India does need to be wary of imported inflation, 
especially from elevated global energy prices.
Figure 33: Consumer Price Inflation Rates Figure 34: CPI and WPI Inflation
0
2
4
6
8
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
Per cent
AEs EMDEs
-8%
-4%
0%
4%
8%
12%
16%
Apr-20
Jun-20
Aug-20
Oct-20
Dec-20
Feb-21
Apr-21
Jun-21
Aug-21
Oct-21
Dec-21
CPI WPI
Source: World Economic Outlook, January 2022 Update, 
IMF
Note: Figures are annual averages; Figures for 2021 are 
projections. Advanced Economies include 40 economies 
and Emerging Markets and Developing Economies 
(EMDEs) include 156 economies as per IMF classification
Source: MoSPI, DPIIT
1.38 Overall, macro-econo mic stability indicators suggest that the Indian economy is well-
placed to take on the challenges of 2022-23.
Box 2: Global Supply-Side

In [9]:
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# --- 1. SET UP YOUR API KEY ---
# IMPORTANT: Replace "YOUR_API_KEY" with the key you just created.
# For better security, it's best to set this as an environment variable.
os.environ['GOOGLE_API_KEY'] = "AIzaSyDytLcfHpglegmavoKuF8YJvJjTQRYh-1I"


# --- 2. LOAD YOUR EXISTING VECTOR STORE & EMBEDDINGS ---
# (This is the same code as your test step from before)
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

persist_directory = '../vector_store'
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)


# --- 3. INITIALIZE THE LLM (GEMINI) ---
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")


# --- 4. CREATE A PROMPT TEMPLATE ---
# This template instructs the LLM to answer the question based ONLY on the provided context.
prompt_template = """
Answer the user's question based only on the following context:

<context>
{context}
</context>

Question: {input}
"""

prompt = ChatPromptTemplate.from_template(prompt_template)


# --- 5. CREATE THE RAG CHAIN ---
# First, create a chain to combine the documents into a single prompt ("stuff" chain)
document_chain = create_stuff_documents_chain(llm, prompt)

# Now, create the main retrieval chain
# This chain takes a question, retrieves documents, and then passes them to the document_chain
retriever = vectordb.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)


# --- 6. INVOKE THE CHAIN AND GET AN ANSWER ---
query = "What is the outlook for the Indian economy in FY26?"

# The invoke method runs the entire chain and returns a dictionary
response = retrieval_chain.invoke({"input": query})

# Let's see the full response dictionary
print("--- Full Response Dictionary ---")
print(response)

# Now, let's print just the clean answer
print("\n--- Generated Answer ---")
print(response['answer'])

ModuleNotFoundError: No module named 'langchain_google_genai'