In [1]:
import os
print(os.getcwd())
os.chdir('..')
print(os.getcwd())


/Users/nikhil/Documents/GitHub_portfolio/Enterprise-Grade RAG System/notebook
/Users/nikhil/Documents/GitHub_portfolio/Enterprise-Grade RAG System


In [2]:
%%capture
!pip install -r requirements.txt

In [3]:
from llama_index.readers.file import PyMuPDFReader
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceWindowNodeParser , HierarchicalNodeParser , SemanticSplitterNodeParser,SentenceSplitter
import chromadb
import re
from llama_index.core import VectorStoreIndex, StorageContext,get_response_synthesizer
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.query_engine import RetrieverQueryEngine

In [4]:
loading_mapping = {'.pdf':PyMuPDFReader()}

documents = SimpleDirectoryReader("./data" , file_extractor = loading_mapping).load_data()

In [5]:
current_section = "Unknown"

for doc in documents:
    match = re.search(r"(\d{4})", doc.metadata.get("file_name", ""))
    if match:
        doc.metadata['year'] = int(match.group(1))
        doc.metadata['company'] = "coca-cola"
        doc.metadata["filing_type"] = "10-K"

    # --- detect ALL CAPS ITEM headers ---
    # normalize spaces, but keep case
    text = doc.text.replace("\n", " ").replace("  ", " ")

    # Regex: ITEM + number + optional letter + . + ALL CAPS TITLE
    match = re.search(r"(ITEM\s+\d+[A-Z]?\.\s+[A-Z\s,&\-]+)", text)
    if match:
        current_section = match.group(0).strip()   # full "ITEM 1. BUSINESS"
        doc.metadata["section"] = current_section
        continue

    # If no new match, keep previous section
    doc.metadata["section"] = current_section

In [6]:
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")


In [7]:
llm = Ollama(model="deepseek-r1:1.5b")

In [8]:
node_parser = SentenceWindowNodeParser(
    window_size = 3,
    window_metadata_key ="window",
    original_text_metadata_key = "original_text"
)

In [9]:
parser = SemanticSplitterNodeParser(buffer_size =3 ,embed_model=embed_model ,breakpoint_percentile_threshold=95,include_metadata=True)

In [10]:
node = parser.get_nodes_from_documents(documents)

In [11]:
len(node)

4934

In [12]:
node[0].metadata

{'file_path': '/Users/nikhil/Documents/GitHub_portfolio/Enterprise-Grade RAG System/data/2015-cocacola-10k-filing.pdf',
 'file_name': '2015-cocacola-10k-filing.pdf',
 'file_type': 'application/pdf',
 'file_size': 1717600,
 'creation_date': '2025-08-31',
 'last_modified_date': '2025-05-23',
 'total_pages': 216,
 'source': '1',
 'year': 2015,
 'company': 'coca-cola',
 'filing_type': '10-K',
 'section': 'Unknown'}

In [13]:
db =chromadb.PersistentClient(".auto")
chroma_collection = db.get_or_create_collection("new")
vector_store = ChromaVectorStore(chroma_collection = chroma_collection)
storage_context = StorageContext.from_defaults(vector_store = vector_store)

index = VectorStoreIndex(node,storage_context=storage_context,embed_model=embed_model)

In [14]:
from llama_index.core.indices.vector_store.retrievers import VectorIndexAutoRetriever
from llama_index.core.vector_stores.types import MetadataInfo , VectorStoreInfo

In [15]:
vector_store_info = VectorStoreInfo(
    content_info="10-K filings for Coca-Cola",
    metadata_info=[
        MetadataInfo(
            name="year",
            type="int",
            description=(
                "The company filing on year"
            ),
        ),
        MetadataInfo(
            name="company",
            type="str",
            description=(
                "The company name (e.g., Coca-Cola)"
            ),
        ),
        MetadataInfo(
            name="section",
            type="str",
            description=(
                "The section of the filing (e.g., ITEM 1. BUSINESS)"
            ),
        ),
    ]
)

In [37]:
retriever = VectorIndexAutoRetriever(
    index ,vector_store_info = vector_store_info,llm=llm,
    system_prompt=(
        "You must always output JSON in the following schema:\n"
        "{ 'query': '<string>', 'filters': { 'year': <int> } }"
    )
)

In [39]:
query = 'In 2015 document identifies several factors that could negatively impact Coca-Cola’s profitability?'
ans=retriever.retrieve(query)

In [40]:
for i in ans:
    print(i.metadata)
    print(i.text)
    print("-"*100)

{'file_path': '/Users/nikhil/Documents/GitHub_portfolio/Enterprise-Grade RAG System/data/2022-cocacola-10k-filing.pdf', 'file_name': '2022-cocacola-10k-filing.pdf', 'file_type': 'application/pdf', 'file_size': 9762541, 'creation_date': '2025-08-17', 'last_modified_date': '2025-05-23', 'total_pages': 183, 'source': '14', 'year': 2022, 'company': 'coca-cola', 'filing_type': '10-K', 'section': 'ITEM 1A. RISK FACTORS I'}
Business” of this report. Our ability to maintain or gain share of
sales in the global market or in local markets may be limited as a result of actions by competitors. Competitive pressures may cause the Company and our bottling partners to
reduce prices we charge customers or may restrict our and our bottlers’ ability to increase prices, as may be necessary in response to commodity and other cost increases. Such
pressures may also increase marketing costs along with in-store placement and slotting fees. In addition, the rapid growth of e‑commerce may create additional con

In [60]:
from llama_index.llms.ollama import Ollama
llm =Ollama(model="gemma3:latest",request_timeout=120)
response_synthesizer = get_response_synthesizer(response_mode ="compact",llm=llm)
query_engine = RetrieverQueryEngine(retriever = retriever, response_synthesizer = response_synthesizer)
response=query_engine.query("In 2015 document identifies several factors that could negatively impact Coca-Cola’s profitability?")
print(response)

ReadTimeout: timed out

In [20]:
from deepeval import evaluate
from deepeval.test_case import LLMTestCase

from deepeval.metrics import FaithfulnessMetric,AnswerRelevancyMetric,ContextualPrecisionMetric,ContextualRecallMetric,ContextualRelevancyMetric

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [28]:
import json
with open("notebook/coca_cola_qa_dataset.json", "r") as f:
    dataset = json.load(f)

In [43]:
def run_rag(query):
    # Ask your RetrieverQueryEngine
    response = query_engine.query(query)
    
    # Answer generated by your response_synthesizer
    rag_answer = str(response)
    
    # Retrieved contexts (nodes)
    retrieved_docs = [str(node) for node in response.source_nodes]
    
    return rag_answer, retrieved_docs

In [57]:
test_cases = []
for item in dataset[:1]:
    rag_answer, retrieved_docs = run_rag(item["query"])
    
    test_cases.append(
        LLMTestCase(
            input=item["query"],                       # the query
            actual_output=rag_answer,                  # RAG's generated answer
            expected_output=item["reference_answer"],  # gold answer
            retrieval_context=retrieved_docs          # what your retriever returned
        )
    )

ReadTimeout: timed out

In [49]:
!deepeval set-ollama gemma3:latest

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


🙌 Congratulations! You're now using a local Ollama model for all evals that 
require an LLM.


In [None]:
metrics = [
    AnswerRelevancyMetric(),
    FaithfulnessMetric(),
    ContextualPrecisionMetric(),
    ContextualRecallMetric(),
    ContextualRelevancyMetric()
    
]

In [None]:
evaluate(test_cases=test_cases, metrics=metrics)

In [None]:
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
correctness_metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
        "You should also heavily penalize omission of detail",
        "Vague language, or contradicting OPINIONS, are OK"
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT]
)

In [None]:
evaluate(test_cases=test_cases, metrics=[correctness_metric])