In [1]:
#directory reader
import os
print(os.getcwd())
os.chdir('..')
print(os.getcwd())

/Users/nikhil/Documents/GitHub_portfolio/Enterprise-Grade RAG System/notebook
/Users/nikhil/Documents/GitHub_portfolio/Enterprise-Grade RAG System


In [3]:
%%capture
!pip install -r requirements.txt

In [40]:
# load the function required
import regex as re
import chromadb
from llama_index.readers.file import PyMuPDFReader
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core import VectorStoreIndex, StorageContext , get_response_synthesizer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.llms.ollama import Ollama
from llama_index.core.query_engine import RetrieverQueryEngine


In [7]:
loading_mapping = {'.pdf':PyMuPDFReader()}

documents = SimpleDirectoryReader("./data" , file_extractor = loading_mapping).load_data()

In [8]:
len(documents)

2147

In [9]:
import re
current_section = "Unknown"

for doc in documents:
    match = re.search(r"(\d{4})", doc.metadata.get("file_name", ""))
    if match:
        doc.metadata['year'] = int(match.group(1))
        doc.metadata['company'] = "coca-cola"
        doc.metadata["filing_type"] = "10-K"

    # --- detect ALL CAPS ITEM headers ---
    # normalize spaces, but keep case
    text = doc.text.replace("\n", " ").replace("  ", " ")

    # Regex: ITEM + number + optional letter + . + ALL CAPS TITLE
    match = re.search(r"(ITEM\s+\d+[A-Z]?\.\s+[A-Z\s,&\-]+)", text)
    if match:
        current_section = match.group(0).strip()   # full "ITEM 1. BUSINESS"
        doc.metadata["section"] = current_section
        continue

    # If no new match, keep previous section
    doc.metadata["section"] = current_section



In [13]:
documents[12].metadata

{'file_path': '/Users/nikhil/Documents/GitHub_portfolio/Enterprise-Grade RAG System/data/2015-cocacola-10k-filing.pdf',
 'file_name': '2015-cocacola-10k-filing.pdf',
 'file_type': 'application/pdf',
 'file_size': 1717600,
 'creation_date': '2025-08-31',
 'last_modified_date': '2025-05-23',
 'total_pages': 216,
 'source': '13',
 'year': 2015,
 'company': 'coca-cola',
 'filing_type': '10-K',
 'section': 'ITEM 1.  BUSINESS I'}

In [15]:
node_parser = SentenceWindowNodeParser(
    window_size = 3,
    window_metadata_key ="window",
    original_text_metadata_key = "original_text"
)

In [17]:
nodes = node_parser.get_nodes_from_documents(documents)

In [19]:
embed_model = HuggingFaceEmbedding(model_name='BAAI/bge-base-en-v1.5')

In [20]:
db = chromadb.PersistentClient("data/.sentencedb")

chroma_collection = db.get_or_create_collection("new")

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex(nodes,storage_context=storage_context,embed_model=embed_model)


In [22]:
from llama_index.core.vector_stores import MetadataFilters, MetadataFilter

year_filter = MetadataFilters(
    filters=[
        MetadataFilter(key="year", value=2015, operator=">="),
        MetadataFilter(key="year", value=2025, operator="<="),
    ]
)

In [23]:
retriever = index.as_retriever(
    similarity_top_k=3,
    filters = year_filter,
    # the target key defaults to `window` to match the node_parser's default
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)

In [125]:
query = "What was Coca-Cola‚Äôs total unit case volume in 2014?"
ans = retriever.retrieve(query)

In [127]:
for i in ans:
    print(i.metadata)
    print(i.text)
    print("-"*100)

{'file_path': '/Users/nikhil/Documents/GitHub_portfolio/Enterprise-Grade RAG System/data/2015-cocacola-10k-filing.pdf', 'file_name': '2015-cocacola-10k-filing.pdf', 'file_type': 'application/pdf', 'file_size': 1717600, 'creation_date': '2025-08-31', 'last_modified_date': '2025-05-23', 'total_pages': 216, 'source': '8', 'year': 2015, 'company': 'coca-cola', 'filing_type': '10-K', 'section': 'ITEM 1.  BUSINESS I', 'window': "Unit case volume outside the United States represented 81 percent of the Company's worldwide unit case volume for 2014.  The countries outside the United States in which our unit case volumes\nwere the largest in 2014 were Mexico, China, Brazil and Japan, which together accounted for 31 percent of our worldwide unit case volume.  Of the non-U.S.  unit case volume for 2014, 74 percent\nwas attributable to sparkling beverages and 26 percent to still beverages.  Trademark Coca-Cola Beverages accounted for 47 percent of non-U.S.  unit case volume for 2014.\n Our five lar

In [44]:
llm =Ollama(model="gemma3:latest")

In [153]:
response_synthesizer = get_response_synthesizer(response_mode ="compact",llm=llm)

In [155]:
query_engine = RetrieverQueryEngine(retriever = retriever, response_synthesizer = response_synthesizer)

In [157]:
response=query_engine.query("What was Coca-Cola‚Äôs total unit case volume in 2014?")

In [158]:
print(response)

74 percent was attributable to sparkling beverages and 26 percent to still beverages.


In [79]:

from deepeval import evaluate
from deepeval.test_case import LLMTestCase

from deepeval.metrics import FaithfulnessMetric,AnswerRelevancyMetric,ContextualPrecisionMetric,ContextualRecallMetric,ContextualRelevancyMetric

In [64]:
with open("notebook/coca_cola_qa_dataset.json", "r") as f:
    dataset = json.load(f)

In [99]:
def run_rag(query):
    # Ask your RetrieverQueryEngine
    response = query_engine.query(query)
    
    # Answer generated by your response_synthesizer
    rag_answer = str(response)
    
    # Retrieved contexts (nodes)
    retrieved_docs = [str(node) for node in response.source_nodes]
    
    return rag_answer, retrieved_docs


In [103]:
test_cases = []
for item in dataset:
    rag_answer, retrieved_docs = run_rag(item["query"])
    
    test_cases.append(
        LLMTestCase(
            input=item["query"],                       # the query
            actual_output=rag_answer,                  # RAG's generated answer
            expected_output=item["reference_answer"],  # gold answer
            retrieval_context=retrieved_docs          # what your retriever returned
        )
    )

In [105]:
test_cases

[LLMTestCase(input='What was Coca-Cola‚Äôs total unit case volume in 2014?', actual_output='74 percent was attributable to sparkling beverages and 26 percent to still beverages.', expected_output='Coca-Cola sold 28.6 billion unit cases of products in 2014.', context=None, retrieval_context=['Node ID: f0d88745-e9ec-4983-b347-191a3995b9a2\nText: unit case volume for 2014, 74 percent was attributable to\nsparkling beverages and 26 percent to still beverages.\nScore:  0.606\n', 'Node ID: bb150266-ea86-461c-9704-e2c449ca3a62\nText: unit case volume for 2014, 68 percent was attributable to\nsparkling beverages and 32 percent to still beverages.\nScore:  0.604\n', 'Node ID: 8e29258c-89dc-4d61-aa49-645e9fe91e77\nText: We believe unit case volume is one of the measures of the\nunderlying strength of the Coca-Cola system because it measures trends\nat the consumer level.\nScore:  0.603\n'], additional_metadata=None, comments=None, tools_called=None, expected_tools=None, reasoning=None, name=None

In [183]:

!deepeval set-ollama gemma3:latest


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


üôå Congratulations! You're now using a local Ollama model for all evals that 
require an LLM.


In [107]:
metrics = [
    AnswerRelevancyMetric(),
    FaithfulnessMetric(),
    ContextualPrecisionMetric(),
    ContextualRecallMetric(),
    ContextualRelevancyMetric()
    
]

In [119]:
evaluate(test_cases=test_cases, metrics=metrics)

Evaluating 10 test case(s) in parallel: |‚ñà|100% (10/10) [Time Taken: 02:14, 13.4



Metrics Summary

  - ‚úÖ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: deepseek-r1:1.5b (Ollama), reason: The answer is not relevant to the question asked because it discusses unit case volumes, which are unrelated to specific countries outside the U.S. but instead relates to the volume of cases per country., error: None)
  - ‚ùå Faithfulness (score: 0.25, threshold: 0.5, strict: False, evaluation model: deepseek-r1:1.5b (Ollama), reason: The score is 0.25 because none of the contradictions provided support this score., error: None)
  - ‚úÖ Contextual Precision (score: 0.9166666666666666, threshold: 0.5, strict: False, evaluation model: deepseek-r1:1.5b (Ollama), reason: The contextual precision score is 0.92, which means that {quotations from retrieval contexts with a yes verdict} are ranked higher than {quotations from retrieval contexts with a no verdict}. Since there are 3 retrieval contexts and only one (with a 'no' verdict) has no mention of Mex




EvaluationResult(test_results=[TestResult(name='test_case_9', success=False, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.5, success=True, score=1.0, reason='The answer is not relevant to the question asked because it discusses unit case volumes, which are unrelated to specific countries outside the U.S. but instead relates to the volume of cases per country.', strict_mode=False, evaluation_model='deepseek-r1:1.5b (Ollama)', error=None, evaluation_cost=0.0, verbose_logs='Statements:\n[] \n \nVerdicts:\n[]'), MetricData(name='Faithfulness', threshold=0.5, success=False, score=0.25, reason='The score is 0.25 because none of the contradictions provided support this score.', strict_mode=False, evaluation_model='deepseek-r1:1.5b (Ollama)', error=None, evaluation_cost=0.0, verbose_logs='Truths (limit=None):\n[\n    "Mexico",\n    "China",\n    "Brazil",\n    "Japan",\n    "accounted for 31 percent of our worldwide unit case volume."\n] \n \nClaims:\n[\n    "Mexico is a count

In [179]:
import json
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
correctness_metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
        "You should also heavily penalize omission of detail",
        "Vague language, or contradicting OPINIONS, are OK"
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT]
)

In [185]:
evaluate(test_cases=test_cases, metrics=[correctness_metric])

Evaluating 10 test case(s) in parallel: |‚ñà|100% (10/10) [Time Taken: 00:50,  5.0



Metrics Summary

  - ‚ùå Correctness (GEval) (score: 0.3, threshold: 0.5, strict: False, evaluation model: deepseek-r1:1.5b (Ollama), reason: The actual output (26 percent) contradicts the expected output (32% of U.S. unit case volume). While the actual output provides a numerical value, it differs from the percentage specified in the expected output., error: None)

For test case:

  - input: What percentage of Coca-Cola‚Äôs U.S. unit case volume in 2014 was from still beverages?
  - actual output: 26 percent.
  - expected output: 32% of U.S. unit case volume came from still beverages in 2014.
  - context: None
  - retrieval context: ['Node ID: f0d88745-e9ec-4983-b347-191a3995b9a2\nText: unit case volume for 2014, 74 percent was attributable to\nsparkling beverages and 26 percent to still beverages.\nScore:  0.635\n', 'Node ID: bb150266-ea86-461c-9704-e2c449ca3a62\nText: unit case volume for 2014, 68 percent was attributable to\nsparkling beverages and 32 percent to still beverages.\




EvaluationResult(test_results=[TestResult(name='test_case_4', success=False, metrics_data=[MetricData(name='Correctness (GEval)', threshold=0.5, success=False, score=0.3, reason='The actual output (26 percent) contradicts the expected output (32% of U.S. unit case volume). While the actual output provides a numerical value, it differs from the percentage specified in the expected output.', strict_mode=False, evaluation_model='deepseek-r1:1.5b (Ollama)', error=None, evaluation_cost=0.0, verbose_logs='Criteria:\nDetermine whether the actual output is factually correct based on the expected output. \n \nEvaluation Steps:\n[\n    "Check whether the facts in \'actual output\' contradicts any facts in \'expected output\'",\n    "You should also heavily penalize omission of detail",\n    "Vague language, or contradicting OPINIONS, are OK"\n]')], conversational=False, multimodal=False, input='What percentage of Coca-Cola‚Äôs U.S. unit case volume in 2014 was from still beverages?', actual_outp

Overall Metric Pass Rates

Correctness (GEval): 10.00% pass rate