In [1]:
import os
print(os.getcwd())
os.chdir('..')
print(os.getcwd())

/Users/nikhil/Documents/GitHub_portfolio/1-Enterprise-Grade RAG System/notebook
/Users/nikhil/Documents/GitHub_portfolio/1-Enterprise-Grade RAG System


In [3]:
%%capture 
!pip install -r requirements.txt

In [5]:
import regex as re
import chromadb
from llama_index.readers.file import PyMuPDFReader
from llama_index.core.node_parser import HierarchicalNodeParser
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex, StorageContext, get_response_synthesizer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.query_engine import RetrieverQueryEngine

In [6]:
loading_mapping = {'.pdf':PyMuPDFReader()}

documents = SimpleDirectoryReader("./data" , file_extractor = loading_mapping).load_data()

In [7]:
current_section = "Unknown"

for doc in documents:
    match = re.search(r"(\d{4})", doc.metadata.get("file_name", ""))
    if match:
        doc.metadata['year'] = int(match.group(1))
        doc.metadata['company'] = "coca-cola"
        doc.metadata["filing_type"] = "10-K"

    # --- detect ALL CAPS ITEM headers ---
    # normalize spaces, but keep case
    text = doc.text.replace("\n", " ").replace("  ", " ")

    # Regex: ITEM + number + optional letter + . + ALL CAPS TITLE
    match = re.search(r"(ITEM\s+\d+[A-Z]?\.\s+[A-Z\s,&\-]+)", text)
    if match:
        current_section = match.group(0).strip()   # full "ITEM 1. BUSINESS"
        doc.metadata["section"] = current_section
        continue

    # If no new match, keep previous section
    doc.metadata["section"] = current_section


In [8]:
node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=[1024, 512, 256])

In [10]:
nodes = node_parser.get_nodes_from_documents(documents)

In [11]:
from llama_index.core.node_parser import get_leaf_nodes, get_root_nodes
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.vector_stores import MetadataFilters ,MetadataFilter

In [12]:
leaf_nodes = get_leaf_nodes(nodes)
root_nodes = get_root_nodes(nodes)

In [13]:
len(nodes),len(root_nodes),len(leaf_nodes)

(23154, 2991, 14095)

In [17]:
docstore = SimpleDocumentStore()

# insert nodes into docstore
docstore.add_documents(nodes)

storage_context = StorageContext.from_defaults(docstore=docstore)


In [23]:
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

In [24]:
index = VectorStoreIndex(leaf_nodes,storage_context=storage_context, embed_model= embed_model)

In [25]:
year_filter = MetadataFilters(
    filters=[
        MetadataFilter(key="year", value=2015 , operator = ">="),
        MetadataFilter(key="year",value=2025, operator = "<="),
    ]
)

In [26]:
retriever = index.as_retriever(similarity_top_k=5,
                              filters=year_filter)

In [28]:
from llama_index.core.retrievers.auto_merging_retriever import AutoMergingRetriever
retriever = AutoMergingRetriever(retriever, storage_context, verbose=True)

In [29]:
query = "Operating Segments in 2015"
ans = retriever.retrieve(query)

> Merging 1 nodes into parent node.
> Parent node id: e2102b55-0e36-40e5-8267-9443f0e4382f.
> Parent node text: Operating Segments
The Company's operating structure is the basis for our internal financial repo...



In [30]:
len(ans)

5

In [31]:
for i in ans:
    print(i.metadata)
    print(i.text)
    print("-"*100)

{'file_path': '/Users/nikhil/Documents/GitHub_portfolio/1-Enterprise-Grade RAG System/data/2016-cocacola-10k-filing.pdf', 'file_name': '2016-cocacola-10k-filing.pdf', 'file_type': 'application/pdf', 'file_size': 1744160, 'creation_date': '2025-08-17', 'last_modified_date': '2025-05-23', 'total_pages': 218, 'source': '138', 'year': 2016, 'company': 'coca-cola', 'filing_type': '10-K', 'section': 'ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA  TABLE OF CONTENTS P'}
NOTE 19: OPERATING SEGMENTS
As of December 31, 2015, our organizational structure consisted of the following operating segments: Eurasia and Africa; Europe; Latin America; North America; Asia Pacific;
Bottling Investments; and Corporate.
136
----------------------------------------------------------------------------------------------------
{'file_path': '/Users/nikhil/Documents/GitHub_portfolio/1-Enterprise-Grade RAG System/data/2015-cocacola-10k-filing.pdf', 'file_name': '2015-cocacola-10k-filing.pdf', 'file_type': 'app

In [39]:
from llama_index.llms.ollama import Ollama
llm =Ollama(model="gemma3:latest")
response_synthesizer = get_response_synthesizer(response_mode ="compact",llm=llm)
query_engine = RetrieverQueryEngine(retriever = retriever, response_synthesizer = response_synthesizer)
response=query_engine.query("What was Coca-Cola’s total unit case volume in 2014?")
print(response)

> Merging 2 nodes into parent node.
> Parent node id: a4acff55-7c52-4f1a-b006-302a8bd66215.
> Parent node text: The unit case volume in 2015 and 2014 reflects the
discontinuance of certain brands owned by our ...

In 2014, unit case volume in the United States represented 19 percent of the Company's worldwide unit case volume. Of the U.S. unit case volume for 2014, 68 percent was attributable to sparkling beverages and 32 percent to still beverages. Trademark Coca-Cola Beverages accounted for 45 percent of U.S. unit case volume for 2014. Unit case volume outside the United States represented 81 percent of the Company's worldwide unit case volume for 2014.


In [41]:

from deepeval import evaluate
from deepeval.test_case import LLMTestCase

from deepeval.metrics import FaithfulnessMetric,AnswerRelevancyMetric,ContextualPrecisionMetric,ContextualRecallMetric,ContextualRelevancyMetric

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [43]:
with open("notebook/coca_cola_qa_unified.json", "r") as f:
    dataset = json.load(f)

In [45]:
def run_rag(query):
    # Ask your RetrieverQueryEngine
    response = query_engine.query(query)
    
    # Answer generated by your response_synthesizer
    rag_answer = str(response)
    
    # Retrieved contexts (nodes)
    retrieved_docs = [str(node) for node in response.source_nodes]
    
    return rag_answer, retrieved_docs

In [49]:
test_cases = []
for item in dataset:
    rag_answer, retrieved_docs = run_rag(item["question"])
    
    test_cases.append(
        LLMTestCase(
            input=item["question"],                       # the query
            actual_output=rag_answer,                  # RAG's generated answer
            expected_output=item["answer"],  # gold answer
            retrieval_context=retrieved_docs          # what your retriever returned
        )
    )

> Merging 1 nodes into parent node.
> Parent node id: 18198548-f024-4e99-8e1f-316f700bf110.
> Parent node text: Operating Segments
The Company's operating structure is the basis for our internal financial repo...

> Merging 1 nodes into parent node.
> Parent node id: e2102b55-0e36-40e5-8267-9443f0e4382f.
> Parent node text: Operating Segments
The Company's operating structure is the basis for our internal financial repo...

> Merging 2 nodes into parent node.
> Parent node id: 8d05aecd-8ee8-4241-94d0-19fcdf1d0069.
> Parent node text: General
The Coca-Cola Company is a total beverage company, and beverage products bearing our trad...

> Merging 1 nodes into parent node.
> Parent node id: 15b42d39-43ef-400c-8d28-059102a937f3.
> Parent node text: In addition, from time to time we
establish and publicly announce goals and commitments to reduce...

> Merging 1 nodes into parent node.
> Parent node id: 928ffbd9-b23c-4811-a59d-b4c151a3d82d.
> Parent node text: Effective January 1, 2021, we tr

In [50]:

!deepeval set-ollama deepseek-r1:1.5b

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


🙌 Congratulations! You're now using a local Ollama model for all evals that 
require an LLM.


In [51]:
metrics = [
    AnswerRelevancyMetric(),
    FaithfulnessMetric(),
    ContextualPrecisionMetric(),
    ContextualRecallMetric(),
    ContextualRelevancyMetric()
    
]

In [59]:
evaluate(test_cases=test_cases, metrics=metrics)

Evaluating 4 test case(s) in parallel: |█|100% (4/4) [Time Taken: 03:39, 54.84s/



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: deepseek-r1:1.5b (Ollama), reason: The score is 1.00 because the response accurately and completely addresses the question regarding Coca-Cola's internal financial reporting operating segments as of 2018, with no irrelevant information present., error: None)
  - ✅ Faithfulness (score: 0.8, threshold: 0.5, strict: False, evaluation model: deepseek-r1:1.5b (Ollama), reason: The score is 0.80 because the actual output contradicts the retrieval context by stating that as of December 31, 2018, the Company’s operating structure included Europe, Middle East and Africa, rather than the specifically mentioned Latin America as detailed in the retrieval context., error: None)
  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: deepseek-r1:1.5b (Ollama), reason: The score is 1.00 because the first two retrieval contexts successfully provided the precise detai




EvaluationResult(test_results=[TestResult(name='test_case_1', success=True, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.5, success=True, score=1.0, reason="The score is 1.00 because the response accurately and completely addresses the question regarding Coca-Cola's internal financial reporting operating segments as of 2018, with no irrelevant information present.", strict_mode=False, evaluation_model='deepseek-r1:1.5b (Ollama)', error=None, evaluation_cost=0.0, verbose_logs='Statements:\n[\n    "As of December 31, 2018, the Company’s operating structure included Europe, Middle East and Africa.",\n    "It also included Latin America.",\n    "It also included North America.",\n    "It also included Asia Pacific.",\n    "It also included Bottling Investments.",\n    "It also included Corporate.",\n    "Corporate consists of a center focused on strategic initiatives.",\n    "Corporate consists of a center focused on policy and governance.",\n    "Corporate consists of an 

## Overall Metric Pass Rates

Answer Relevancy: 80.00% pass rate

Faithfulness: 70.00% pass rate


Contextual Precision: 100.00% pass rate


Contextual Recall: 90.00% pass rate


Contextual Relevancy: 100.00% pass rate

In [57]:
!deepeval set-ollama gemma3:latest

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


🙌 Congratulations! You're now using a local Ollama model for all evals that 
require an LLM.


In [60]:
import json
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
correctness_metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
        "You should also heavily penalize omission of detail",
        "Vague language, or contradicting OPINIONS, are OK"
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT]
)

In [61]:
evaluate(test_cases=test_cases, metrics=[correctness_metric])

Evaluating 4 test case(s) in parallel: |█|100% (4/4) [Time Taken: 00:07,  1.81s/



Metrics Summary

  - ✅ Correctness (GEval) (score: 0.9, threshold: 0.5, strict: False, evaluation model: gemma3:latest (Ollama), reason: The actual output describes the transformation as a 'networked global organization' combining 'scale with local execution' and 'faster scaling of products,' aligning with the expected output's description of a networked global organization focused on combining scale with local execution., error: None)

For test case:

  - input: In the 2022 filing, how did Coca-Cola change its organizational structure effective January 1, 2021?
  - actual output: Effective January 1, 2021, the company transformed its organizational structure in an effort to better enable it to capture growth in the fast-changing marketplace by building a networked global organization designed to combine the power of scale with the deep knowledge required to win locally. They created new operating units focused on regional and local execution. These operating units, which sit under f




EvaluationResult(test_results=[TestResult(name='test_case_3', success=True, metrics_data=[MetricData(name='Correctness (GEval)', threshold=0.5, success=True, score=0.9, reason="The actual output describes the transformation as a 'networked global organization' combining 'scale with local execution' and 'faster scaling of products,' aligning with the expected output's description of a networked global organization focused on combining scale with local execution.", strict_mode=False, evaluation_model='gemma3:latest (Ollama)', error=None, evaluation_cost=0.0, verbose_logs='Criteria:\nDetermine whether the actual output is factually correct based on the expected output. \n \nEvaluation Steps:\n[\n    "Check whether the facts in \'actual output\' contradicts any facts in \'expected output\'",\n    "You should also heavily penalize omission of detail",\n    "Vague language, or contradicting OPINIONS, are OK"\n]')], conversational=False, multimodal=False, input='In the 2022 filing, how did Co

Overall Metric Pass Rates

Correctness (GEval): 40.00% pass rate