In [1]:
import os
import glob
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# --- 1. FIND ALL PDF FILES ---
raw_data_path = '../data/raw/'
pdf_files = glob.glob(os.path.join(raw_data_path, '*.pdf'))

if not pdf_files:
    print("Error: No PDF files found in the '/data/raw/' folder.")
else:
    print(f"Found {len(pdf_files)} PDF files to process.")
    
    all_documents = []
    # --- 2. LOAD ALL DOCUMENTS ---
    for file_path in pdf_files:
        print(f"Loading file: {os.path.basename(file_path)}...")
        try:
            loader = PyPDFLoader(file_path)
            documents = loader.load()
            all_documents.extend(documents)
        except Exception as e:
            print(f"--> Error loading {os.path.basename(file_path)}: {e}")

    print(f"\nSuccessfully loaded a total of {len(all_documents)} pages from all files.")

    # --- 3. SPLIT DOCUMENTS INTO CHUNKS ---
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    splits = text_splitter.split_documents(all_documents)
    print(f"Split all documents into {len(splits)} chunks.")

    # --- 4. GENERATE EMBEDDINGS AND CREATE VECTOR STORE ---
    # This will use the same local model running on your CPU
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    model_kwargs = {'device': 'cpu'}
    embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

    persist_directory = '../vector_store'
    
    # This will overwrite your old vector store with the new, comprehensive one
    print("\nCreating and persisting the new, expanded vector store... This will take a significant amount of time.")
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=persist_directory
    )
    
    print("\nNew vector store created successfully!")

Found 15 PDF files to process.
Loading file: Annual_Report_RBI_2020-2021.pdf...
Loading file: Annual_Report_RBI_2021-2022.pdf...
Loading file: Annual_Report_RBI_2022-2023.pdf...
Loading file: Annual_Report_RBI_2023-2024.pdf...
Loading file: Annual_Report_RBI_2024-2025.pdf...
Loading file: Budget_Speech_2020-2021.pdf...
Loading file: Budget_Speech_2021-2022.pdf...
Loading file: Budget_Speech_2022-2023.pdf...
Loading file: Budget_Speech_2023-2024.pdf...
Loading file: Budget_Speech_2024-2025.pdf...
Loading file: Economic_Survey_2020-2021.pdf...
Loading file: Economic_Survey_2022-2023.pdf...
Loading file: Economic_Survey_2023-2024.pdf...
Loading file: Economic_Survey_2024-2025.pdf...
Loading file: Economix_Survey_2021-2022.pdf...

Successfully loaded a total of 4148 pages from all files.
Split all documents into 13843 chunks.


  embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
  from .autonotebook import tqdm as notebook_tqdm



Creating and persisting the new, expanded vector store... This will take a significant amount of time.

New vector store created successfully!


In [2]:
# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# Split the document into chunks
splits = text_splitter.split_documents(documents)

print(f"Split the document into {len(splits)} chunks.")

Split the document into 1517 chunks.


In [3]:
# Specify the embedding model you want to use
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# We will use the CPU for embedding
model_kwargs = {'device': 'cpu'}

# Initialize the HuggingFaceEmbeddings model
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs
)

# Define the path for the persistent vector store
persist_directory = r'C:\economic_data_navigator\vector_store'

# Create the vector store from the document splits and save it to disk
print("Creating and persisting vector store... This will take a while.")
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory=persist_directory
)

print("Vector store created successfully!")

  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Creating and persisting vector store... This will take a while.
Vector store created successfully!


In [4]:
# Load the persisted vector store from disk
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings
)

# Define a test query
query = "What is the outlook for the Indian economy in FY26?"

# Perform a similarity search
retrieved_docs = vectordb.similarity_search(query, k=3) # k=3 retrieves the top 3 most similar chunks

# Print the content of the most relevant chunk
print("--- Most Relevant Chunk ---")
print(retrieved_docs[0].page_content)

--- Most Relevant Chunk ---
economic stability.
In this global context, India displayed steady economic growth. As per the 
first advance estimates of national accounts, India’s real GDP is estimated to 
grow by 6.4 per cent in FY25. Growth in the first half of FY25 was supported by 
agriculture and services, with rural demand improving on the back of record 
Kharif production and favourable agricultural conditions. The manufacturing 
sector faced pressures due to weak global demand and domestic seasonal 
conditions. Private consumption remained stable, reflecting steady domestic 
demand. Fiscal discipline and strong external balance supported by a services 
trade surplus and healthy remittance growth contributed to macroeconomic 
stability. Together, these factors provided a solid foundation for sustained 
growth amid external uncertainties.
Looking ahead, India’s economic prospects for FY26 are balanced. Headwinds 
to growth include elevated geopolitical and trade uncertainties and p

  vectordb = Chroma(


In [5]:
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# --- 1. SET UP YOUR API KEY ---
# IMPORTANT: Replace "YOUR_API_KEY" with the key you just created.
# For better security, it's best to set this as an environment variable.
os.environ['GOOGLE_API_KEY'] = "AIzaSyDytLcfHpglegmavoKuF8YJvJjTQRYh-1I"


# --- 2. LOAD YOUR EXISTING VECTOR STORE & EMBEDDINGS ---
# (This is the same code as your test step from before)
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

persist_directory = '../vector_store'
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)


# --- 3. INITIALIZE THE LLM (GEMINI) ---
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")


# --- 4. CREATE A PROMPT TEMPLATE ---
# This template instructs the LLM to answer the question based ONLY on the provided context.
prompt_template = """
Answer the user's question based only on the following context:

<context>
{context}
</context>

Question: {input}
"""

prompt = ChatPromptTemplate.from_template(prompt_template)


# --- 5. CREATE THE RAG CHAIN ---
# First, create a chain to combine the documents into a single prompt ("stuff" chain)
document_chain = create_stuff_documents_chain(llm, prompt)

# Now, create the main retrieval chain
# This chain takes a question, retrieves documents, and then passes them to the document_chain
retriever = vectordb.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)


# --- 6. INVOKE THE CHAIN AND GET AN ANSWER ---
query = "What is the outlook for the Indian economy in FY26?"

# The invoke method runs the entire chain and returns a dictionary
response = retrieval_chain.invoke({"input": query})

# Let's see the full response dictionary
print("--- Full Response Dictionary ---")
print(response)

# Now, let's print just the clean answer
print("\n--- Generated Answer ---")
print(response['answer'])

--- Full Response Dictionary ---
{'input': 'What is the outlook for the Indian economy in FY26?', 'context': [Document(metadata={'moddate': '2025-01-30T12:27:34+05:30', 'page': 45, 'producer': 'Adobe PDF Library 17.0', 'trapped': '/False', 'page_label': '1', 'creationdate': '2025-01-30T12:15:36+05:30', 'total_pages': 482, 'creator': 'Adobe InDesign 18.1 (Windows)', 'source': 'C:\\economic_data_navigator\\data\\raw\\echapter.pdf'}, page_content='economic stability.\nIn this global context, India displayed steady economic growth. As per the \nfirst advance estimates of national accounts, India’s real GDP is estimated to \ngrow by 6.4 per cent in FY25. Growth in the first half of FY25 was supported by \nagriculture and services, with rural demand improving on the back of record \nKharif production and favourable agricultural conditions. The manufacturing \nsector faced pressures due to weak global demand and domestic seasonal \nconditions. Private consumption remained stable, reflecting s