In [1]:
import os
print(os.getcwd())
os.chdir('..')
print(os.getcwd())

/Users/nikhil/Documents/GitHub_portfolio/Enterprise-Grade RAG System/notebook
/Users/nikhil/Documents/GitHub_portfolio/Enterprise-Grade RAG System


In [3]:
%%capture
!pip install -r requirements.txt

In [4]:
from llama_index.readers.file import PyMuPDFReader
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceWindowNodeParser , HierarchicalNodeParser , SemanticSplitterNodeParser,SentenceSplitter
import chromadb
import re
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

In [6]:
loading_mapping = {'.pdf':PyMuPDFReader()}

documents = SimpleDirectoryReader("./data" , file_extractor = loading_mapping).load_data()

In [7]:
current_section = "Unknown"

for doc in documents:
    match = re.search(r"(\d{4})", doc.metadata.get("file_name", ""))
    if match:
        doc.metadata['year'] = int(match.group(1))
        doc.metadata['company'] = "coca-cola"
        doc.metadata["filing_type"] = "10-K"

    # --- detect ALL CAPS ITEM headers ---
    # normalize spaces, but keep case
    text = doc.text.replace("\n", " ").replace("  ", " ")

    # Regex: ITEM + number + optional letter + . + ALL CAPS TITLE
    match = re.search(r"(ITEM\s+\d+[A-Z]?\.\s+[A-Z\s,&\-]+)", text)
    if match:
        current_section = match.group(0).strip()   # full "ITEM 1. BUSINESS"
        doc.metadata["section"] = current_section
        continue

    # If no new match, keep previous section
    doc.metadata["section"] = current_section

In [9]:
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
llm = Ollama(model="deepseek-r1:1.5b")

In [12]:
splitter = SentenceSplitter(chunk_size=1024)

index = VectorStoreIndex.from_documents(documents, transformations=[splitter],embed_model=embed_model)

In [14]:
from llama_index.core.vector_stores import MetadataFilters ,MetadataFilter
year_filter = MetadataFilters(
    filters=[
        MetadataFilter(key="year", value=2015 , operator = ">="),
        MetadataFilter(key="year",value=2025, operator = "<="),
    ]
)

In [15]:
vector_retriever = index.as_retriever(similarity_top_k=3,filters=year_filter)

In [61]:
from llama_index.retrievers.bm25 import BM25Retriever



bm25_retriever = BM25Retriever.from_defaults(
    docstore=index.docstore, similarity_top_k=2
)

In [63]:

from llama_index.core.retrievers import QueryFusionRetriever

retriever = QueryFusionRetriever(
    [vector_retriever, bm25_retriever],
    similarity_top_k=3,
    mode="reciprocal_rerank",
    verbose=True,
    use_async=True,
    num_queries=4,
    llm=llm,
    query_gen_prompt=
    "You are a helpful assistant that generates multiple search queries based on a "
    "single input query. Generate {num_queries} search queries, one on each line, "
    "related to the following input query:\n"
    "Query: {query}\n"
    "Queries:\n"
    )

In [37]:
import nest_asyncio

nest_asyncio.apply()

In [73]:
query = "what were the unit case sales figures for Coca-Cola in 2014, 2013, and 2012 respectively?"
ans=bm25_retriever.retrieve(query)

In [75]:
for i in ans:
    print(i.metadata)
    print(i.text)
    print("-"*100)

{'file_path': '/Users/nikhil/Documents/GitHub_portfolio/Enterprise-Grade RAG System/data/2015-cocacola-10k-filing.pdf', 'file_name': '2015-cocacola-10k-filing.pdf', 'file_type': 'application/pdf', 'file_size': 1717600, 'creation_date': '2025-08-30', 'last_modified_date': '2025-05-23', 'total_pages': 216, 'source': '49', 'year': 2015, 'company': 'coca-cola', 'filing_type': '10-K', 'section': 'ITEM 6. SELECTED FINANCIAL DATA T'}
Unit case volume for Bottling Investments decreased 17 percent. This decrease primarily reflects the sale of a majority ownership interest in our previously consolidated bottling operations in the
Philippines to Coca-Cola FEMSA, S.A.B. de C.V. ("Coca-Cola FEMSA") in January 2013, as well as the deconsolidation of our bottling operations in Brazil during July 2013 as a result of their
combination with an independent bottling partner. The unfavorable impact of these transactions on the group's unit case volume results was partially offset by growth in other key mar