In [1]:
import os
print(os.getcwd())
os.chdir('..')
print(os.getcwd())

/Users/nikhil/Documents/GitHub_portfolio/1-Enterprise-Grade RAG System/notebook
/Users/nikhil/Documents/GitHub_portfolio/1-Enterprise-Grade RAG System


In [3]:
%%capture
!pip install -r requirements.txt


In [45]:
import re
import nest_asyncio

nest_asyncio.apply()
from llama_index.readers.file import PyMuPDFReader
from llama_index.core import SimpleDirectoryReader , VectorStoreIndex, StorageContext,get_response_synthesizer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.node_parser import SentenceSplitter , SentenceWindowNodeParser , HierarchicalNodeParser , SemanticSplitterNodeParser
from llama_index.llms.ollama import Ollama
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.vector_stores import MetadataFilter, MetadataFilters

In [6]:
#load data
loading_mapping = {".pdf":PyMuPDFReader()}

documents = SimpleDirectoryReader("./data",file_extractor = loading_mapping).load_data()

In [7]:
current_section ="unknown"

for doc in documents:
    match = re.search(r"(\d{4})" , doc.metadata.get("file_name",""))
    if match :
        doc.metadata['year']=int(match.group(0))
        doc.metadata['company'] = "Coca-cola"
        doc.metadata["filing_type"] = "10-K"
    text = doc.text.replace("\n"," ").replace("  "," ")
    match = re.search(r"(ITEM\s+\d+[A-Z]?\.\s+[A-Z\s,&\-]+)", text)
    if match:
        current_section = match.group(0).strip()
        doc.metadata['section']=current_section
        continue
    doc.metadata['section']=current_section

In [9]:
documents[22].metadata

{'file_path': '/Users/nikhil/Documents/GitHub_portfolio/1-Enterprise-Grade RAG System/data/2015-cocacola-10k-filing.pdf',
 'file_name': '2015-cocacola-10k-filing.pdf',
 'file_type': 'application/pdf',
 'file_size': 1717600,
 'creation_date': '2025-09-15',
 'last_modified_date': '2025-05-23',
 'total_pages': 216,
 'source': '23',
 'year': 2015,
 'company': 'Coca-cola',
 'filing_type': '10-K',
 'section': 'ITEM 1B. UNRESOLVED STAFF COMMENTS N'}

In [10]:
parent_parser = SentenceSplitter(chunk_size=2048,chunk_overlap=0)
parent_nodes = parent_parser.get_nodes_from_documents(documents)

In [19]:
for i,n in enumerate(parent_nodes):
    n.id_ = f"parent-{i} "

In [21]:
sub_chunk_sizes = [256,512,1024]
child_splitter = [
    SentenceSplitter(chunk_size = c ,chunk_overlap=120) for c in sub_chunk_sizes
]

In [23]:
from llama_index.core.schema import IndexNode
all_nodes =[]
for nodes in parent_nodes:
    for n in child_splitter:
        sub_nodes = n.get_nodes_from_documents([nodes])
        sub_inodes = [
            IndexNode.from_text_node(sn ,nodes.node_id) for sn in sub_nodes
        ]
        all_nodes.extend(sub_inodes)
    original_nodes = IndexNode.from_text_node(nodes,nodes.node_id)
    all_nodes.append(original_nodes)

In [24]:
for n in all_nodes[0]:
    print(n)

('id_', '54f1f3e7-b9ad-4c63-94e1-2e7499d0a1e9')
('embedding', None)
('metadata', {'file_path': '/Users/nikhil/Documents/GitHub_portfolio/1-Enterprise-Grade RAG System/data/2015-cocacola-10k-filing.pdf', 'file_name': '2015-cocacola-10k-filing.pdf', 'file_type': 'application/pdf', 'file_size': 1717600, 'creation_date': '2025-09-15', 'last_modified_date': '2025-05-23', 'total_pages': 216, 'source': '1', 'year': 2015, 'company': 'Coca-cola', 'filing_type': '10-K', 'section': 'unknown'})
('excluded_embed_metadata_keys', ['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'])
('excluded_llm_metadata_keys', ['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'])
('relationships', {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='6ded93ad-32e7-407f-878c-41951c07452b', node_type='4', metadata={'file_path': '/Users/nikhil/Documents/GitHub_portfolio/1-Enterprise-Grade RAG System/data/2015-cocacola-

In [25]:
all_node_dict = {
    n.node_id: n for n in all_nodes
}

In [29]:
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")


In [30]:
db = chromadb.PersistentClient(".recursivedb")

chroma_collection = db.get_or_create_collection("new_1")

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex(all_nodes,storage_context=storage_context,embed_model=embed_model)

In [84]:
def get_documents_by_year_from_query_llamaindex(query, index=index):
    # Valid years
    valid_years = [2015, 2016, 2017, 2018, 2019, 2020, 
                   2021, 2022, 2023, 2024, 2025]

    # Extract year from query
    match = re.search(r"\b(\d{4})\b", query)
    
    filters = None  # default: no filter

    if match:
        year = int(match.group(1))
        if year in valid_years:
            filters = MetadataFilters(filters=[
                MetadataFilter(key="year", value=year + 1)
            ])
        elif year < 2015:
            filters = MetadataFilters(filters=[
                MetadataFilter(key="year", value=2015)
            ])

    # Create retriever from index with filter
    retriever = index.as_retriever(
        similarity_top_k=5,
        filters=filters
    )
    vector_retriever_chunk = RecursiveRetriever(
        "vector",
        retriever_dict={"vector": retriever},
        node_dict=all_node_dict,
        verbose=True,)

    # Perform retrieval
    results = vector_retriever_chunk.retrieve(query)
    llm =Ollama(model="gemma3:latest")
    response_synthesizer = get_response_synthesizer(response_mode ="compact",llm=llm)

    query_engine = RetrieverQueryEngine(retriever = vector_retriever_chunk, response_synthesizer = response_synthesizer)
    response=query_engine.query(query)
    
    return results,response

In [86]:

query = "GROSS PROFIT for The Coca-Cola Company in this 10-K in 2013?"
retrive,ans= get_documents_by_year_from_query_llamaindex(query)

[1;3;34mRetrieving with query id None: GROSS PROFIT for The Coca-Cola Company in this 10-K in 2013?
[0m[1;3;38;5;200mRetrieved node with id, entering: parent-140 
[0m[1;3;34mRetrieving with query id parent-140 : GROSS PROFIT for The Coca-Cola Company in this 10-K in 2013?
[0m[1;3;38;5;200mRetrieved node with id, entering: parent-48 
[0m[1;3;34mRetrieving with query id parent-48 : GROSS PROFIT for The Coca-Cola Company in this 10-K in 2013?
[0m[1;3;38;5;200mRetrieved node with id, entering: parent-30 
[0m[1;3;34mRetrieving with query id parent-30 : GROSS PROFIT for The Coca-Cola Company in this 10-K in 2013?
[0m[1;3;38;5;200mRetrieved node with id, entering: parent-74 
[0m[1;3;34mRetrieving with query id parent-74 : GROSS PROFIT for The Coca-Cola Company in this 10-K in 2013?
[0m[1;3;34mRetrieving with query id None: GROSS PROFIT for The Coca-Cola Company in this 10-K in 2013?
[0m[1;3;38;5;200mRetrieved node with id, entering: parent-140 
[0m[1;3;34mRetrieving wit

In [87]:
print(ans)

$28,433


In [72]:
from deepeval import evaluate
from deepeval.test_case import LLMTestCase

from deepeval.metrics import FaithfulnessMetric,AnswerRelevancyMetric,ContextualPrecisionMetric,ContextualRecallMetric,ContextualRelevancyMetric

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [76]:
import json
with open("coca_cola_2015_qa.json", "r") as f:
    dataset = json.load(f)

In [88]:
def run_rag(query):
    # Ask your RetrieverQueryEngine
    retri,response = get_documents_by_year_from_query_llamaindex(query)
    
    # Answer generated by your response_synthesizer
    rag_answer = str(response)
    
    # Retrieved contexts (nodes)
    retrieved_docs = [str(node) for node in retri]
    
    return rag_answer, retrieved_docs

In [92]:
test_cases = []
for item in dataset:
    rag_answer, retrieved_docs = run_rag(item["question"])
    
    test_cases.append(
        LLMTestCase(
            input=item["question"],                       # the query
            actual_output=rag_answer,                  # RAG's generated answer
            expected_output=item["answer"],  # gold answer
            retrieval_context=retrieved_docs          # what your retriever returned
        )
    )

[1;3;34mRetrieving with query id None: What was the total net operating revenue in 2014?
[0m[1;3;38;5;200mRetrieved node with id, entering: parent-49 
[0m[1;3;34mRetrieving with query id parent-49 : What was the total net operating revenue in 2014?
[0m[1;3;38;5;200mRetrieved node with id, entering: parent-74 
[0m[1;3;34mRetrieving with query id parent-74 : What was the total net operating revenue in 2014?
[0m[1;3;38;5;200mRetrieved node with id, entering: parent-30 
[0m[1;3;34mRetrieving with query id parent-30 : What was the total net operating revenue in 2014?
[0m[1;3;38;5;200mRetrieved node with id, entering: parent-48 
[0m[1;3;34mRetrieving with query id parent-48 : What was the total net operating revenue in 2014?
[0m[1;3;34mRetrieving with query id None: What was the total net operating revenue in 2014?
[0m[1;3;38;5;200mRetrieved node with id, entering: parent-49 
[0m[1;3;34mRetrieving with query id parent-49 : What was the total net operating revenue in 20

In [93]:
!deepeval set-ollama gemma3:latest

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


🙌 Congratulations! You're now using a local Ollama model for all evals that 
require an LLM.


In [94]:
metrics = [
    AnswerRelevancyMetric(),
    FaithfulnessMetric(),
    ContextualPrecisionMetric(),
    ContextualRecallMetric(),
    ContextualRelevancyMetric()
    
]

In [95]:
import json
evaluate(test_cases=test_cases, metrics=metrics)

Evaluating 10 test case(s) in parallel: | |  0% (0/10) [Time Taken: 00:00, ?test

KeyboardInterrupt: 

In [97]:
import json
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
correctness_metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
        "You should also heavily penalize omission of detail",
        "Vague language, or contradicting OPINIONS, are OK"
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT]
)

In [99]:
evaluate(test_cases=test_cases, metrics=[correctness_metric])


Evaluating 10 test case(s) in parallel: | |  0% (0/10) [Time Taken: 20:22, ?test[A

Evaluating 10 test case(s) in parallel: | | 10% (1/10) [Time Taken: 04:10, 250.5[A
Evaluating 10 test case(s) in parallel: |▏| 20% (2/10) [Time Taken: 04:12, 104.4[A
Evaluating 10 test case(s) in parallel: |▎| 30% (3/10) [Time Taken: 04:14, 57.62[A
Evaluating 10 test case(s) in parallel: |▍| 40% (4/10) [Time Taken: 04:16, 35.45[A
Evaluating 10 test case(s) in parallel: |▌| 50% (5/10) [Time Taken: 04:18, 23.44[A
Evaluating 10 test case(s) in parallel: |▌| 60% (6/10) [Time Taken: 04:20, 16.17[A
Evaluating 10 test case(s) in parallel: |▋| 70% (7/10) [Time Taken: 04:22, 11.52[A
Evaluating 10 test case(s) in parallel: |▊| 80% (8/10) [Time Taken: 04:23,  8.39[A
Evaluating 10 test case(s) in parallel: |▉| 90% (9/10) [Time Taken: 04:26,  6.43[A
Evaluating 10 test case(s) in parallel: |█|100% (10/10) [Time Taken: 04:28, 26.8[A



Metrics Summary

  - ❌ Correctness (GEval) (score: 0.1, threshold: 0.5, strict: False, evaluation model: gemma3:latest (Ollama), reason: The actual output discusses strategies like 'brand-building initiatives' and 'execution at the point of sale', but does not mention any challenges like 'economic and political instability' or 'currency volatility' as outlined in the expected output. It focuses solely on operational strategies rather than challenges., error: None)

For test case:

  - input: What challenges did Coca-Cola face in emerging and developing markets?
  - actual output: Coca-Cola focuses on ensuring that our customers have the right product and package offerings and the right promotional tools to deliver enhanced value to themselves and the Company. We are constantly looking to build new beverage consumption occasions in our customers' outlets through unique and innovative consumer experiences, product availability and delivery systems, and beverage merchandising and displa




EvaluationResult(test_results=[TestResult(name='test_case_8', success=False, metrics_data=[MetricData(name='Correctness (GEval)', threshold=0.5, success=False, score=0.1, reason="The actual output discusses strategies like 'brand-building initiatives' and 'execution at the point of sale', but does not mention any challenges like 'economic and political instability' or 'currency volatility' as outlined in the expected output. It focuses solely on operational strategies rather than challenges.", strict_mode=False, evaluation_model='gemma3:latest (Ollama)', error=None, evaluation_cost=0.0, verbose_logs='Criteria:\nDetermine whether the actual output is factually correct based on the expected output. \n \nEvaluation Steps:\n[\n    "Check whether the facts in \'actual output\' contradicts any facts in \'expected output\'",\n    "You should also heavily penalize omission of detail",\n    "Vague language, or contradicting OPINIONS, are OK"\n]')], conversational=False, multimodal=False, input=