<a href="https://colab.research.google.com/github/arockiasachin/ContextualFinAi/blob/main/NaiveRAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [None]:
# Install necessary libraries
!pip install -U langchain langchain-openai langchain-chroma pyprojroot python-dotenv pypdf langchain-community langchain google-generativeai

Collecting langchain-chroma
  Downloading langchain_chroma-0.1.4-py3-none-any.whl.metadata (1.6 kB)
Collecting pyprojroot
  Downloading pyprojroot-0.3.0-py3-none-any.whl.metadata (4.8 kB)
Downloading langchain_chroma-0.1.4-py3-none-any.whl (10 kB)
Downloading pyprojroot-0.3.0-py3-none-any.whl (7.6 kB)
Installing collected packages: pyprojroot, langchain-chroma
Successfully installed langchain-chroma-0.1.4 pyprojroot-0.3.0


In [None]:
%pip install -Uq "unstructured[all-docs]" pillow lxml pillow
%pip install -Uq chromadb tiktoken
%pip install -Uq langchain langchain-community langchain-openai langchain-groq
%pip install -Uq python_dotenv

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.4/149.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.9 MB/s[0m et

In [None]:
import os

# Credentials and endpoints for the external services used in this notebook.
# Replace every placeholder with your own value and never commit real keys.

os.environ["OPENAI_API_KEY"] = "OpenAI Key here"
# NOTE(review): tracing is switched on, but no LANGCHAIN_API_KEY is set anywhere in
# this notebook; LangSmith tracing presumably needs one -- confirm, or disable tracing.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# The placeholder used to read "Langchain API key here", which mislabelled the
# value expected for the Unstructured API.
os.environ["UNSTRUCTURED_API_KEY"] = "Unstructured API key here"
os.environ["UNSTRUCTURED_API_URL"] = "https://api.unstructuredapp.io/general/v0/general"
os.environ["GOOGLE_API_KEY"] = "Genai API Key"

## Chunking + Embedding using OpenAI and ChromaDB

In [None]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.schema.output_parser import StrOutputParser
import shutil
import sqlite3

# Configuration
# The two directories default to the original local Windows paths but can be
# redirected (e.g. on Colab or Linux) with the RAG_DOC_DIR / RAG_VECTORDB_DIR
# environment variables, without editing the notebook.
doc_dir = os.environ.get("RAG_DOC_DIR", r"D:\WorkingDir\data")  # Directory where PDF files are stored
vectordb_dir = os.environ.get("RAG_VECTORDB_DIR", r"D:\WorkingDir\vectordb")  # Directory to save the Chroma vector database (deleted and recreated below)
chunk_size = 500  # Chunk size for text splitting
chunk_overlap = 50  # Overlap between chunks
embedding_model = "text-embedding-3-small"  # OpenAI embedding model
llm_model = "gpt-4o-mini"  # LLM model to generate responses (not referenced by the cells visible in this notebook)

# Step 1: Rebuild the vector database directory from scratch.
# A previous store, if any, is removed before an empty directory is created.
if os.path.exists(vectordb_dir):
    shutil.rmtree(vectordb_dir)
    print("Existing vector database deleted.")
os.makedirs(vectordb_dir)
print(f"VectorDB directory created at: {os.path.abspath(vectordb_dir)}")

# Fail fast when the fresh directory cannot be written to.
if os.access(vectordb_dir, os.W_OK):
    print("Write permission verified for VectorDB directory.")
else:
    raise PermissionError(f"Write permission denied for directory: {vectordb_dir}")

# Show the directory listing right after recreation.
print("Contents of VectorDB directory:", os.listdir(vectordb_dir))

# Load PDF files
# Documents are loaded without splitting: chunking happens exactly once, in the
# text-splitter step that follows. The earlier `load_and_split()` call pre-split
# the documents with the loader's default splitter, so they were split twice.
docs = []
# Sorted so the load order (and therefore chunk order) is the same on every run.
for file in sorted(os.listdir(doc_dir)):
    # Accept any capitalisation of the extension (".pdf", ".PDF", ...).
    if file.lower().endswith(".pdf"):
        print(f"Loading file: {file}")
        loader = PyPDFLoader(os.path.join(doc_dir, file))
        docs.extend(loader.load())
print("Loaded documents:", len(docs))

# Cut the loaded documents into chunks using the configured size and overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
doc_splits = text_splitter.split_documents(docs)
print("Total document chunks:", len(doc_splits))

# Abort early when there is nothing to embed.
if not doc_splits:
    raise ValueError("No document chunks were created. Check input PDFs.")
print("Document chunks successfully created.")

# Announce the embedding model, then build the embedding client for it.
print(f"Initializing embeddings with model: {embedding_model}")
embedding_function = OpenAIEmbeddings(model=embedding_model)




In [None]:

# Create and save to Chroma VectorDB
# `doc_splits` and `embedding_function` come from the previous cell, so the
# embedding client is not constructed a second time here.
try:
    vectordb = Chroma.from_documents(
        documents=doc_splits,
        persist_directory=vectordb_dir,
        embedding=embedding_function,
    )
    print("VectorDB created and saved successfully.")
    # Ask the collection for its size instead of fetching every stored id.
    print("Number of vectors in database:", vectordb._collection.count())
except Exception as e:
    # Print diagnostics about the target directory, then re-raise the original error.
    print(f"Error creating VectorDB: {e}")
    print("Debug: Permissions check for VectorDB directory")
    print("Directory permissions:", oct(os.stat(vectordb_dir).st_mode))
    raise


VectorDB created and saved successfully.
Number of vectors in database: 3816


In [None]:
pip install instructor deepeval chromadb

## Custom GenAI Function for Evaluation

In [None]:
from pydantic import BaseModel
import google.generativeai as genai
import instructor

from deepeval.models import DeepEvalBaseLLM


class CustomGeminiFlash(DeepEvalBaseLLM):
    """DeepEval-compatible wrapper around a Google Gemini model.

    Despite the class name, the default model is ``gemini-1.5-pro``. Each request
    goes through ``instructor`` with ``response_model=schema``, so the caller
    receives whatever object instructor builds for the supplied schema.

    Args:
        model_name: Identifier passed to ``genai.GenerativeModel``. Defaults to
            ``"models/gemini-1.5-pro"``, the value that used to be hard-coded.
    """

    def __init__(self, model_name: str = "models/gemini-1.5-pro"):
        # Single source of truth for the model id; `get_model_name` derives from it.
        self.model_name = model_name
        self.model = genai.GenerativeModel(model_name=model_name)
        # Created on first use, then reused for every subsequent request.
        self._instructor_client = None

    def load_model(self):
        """Return the underlying ``genai.GenerativeModel`` instance."""
        return self.model

    def _get_instructor_client(self):
        """Return the cached instructor client, building it on first access."""
        client = getattr(self, "_instructor_client", None)
        if client is None:
            client = instructor.from_gemini(
                client=self.load_model(),
                mode=instructor.Mode.GEMINI_JSON,
            )
            self._instructor_client = client
        return client

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Send ``prompt`` as a single user message and return the structured reply.

        Args:
            prompt: Text of the user message.
            schema: Forwarded to instructor as ``response_model``; callers in this
                notebook pass a pydantic model class.

        Returns:
            The object produced by instructor for ``schema``.
        """
        resp = self._get_instructor_client().messages.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            response_model=schema,
        )
        return resp

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """Async entry point; delegates to the synchronous :meth:`generate`.

        NOTE(review): nothing is awaited here, so the call runs synchronously
        inside the coroutine -- confirm this is acceptable for async evaluation.
        """
        return self.generate(prompt, schema)

    def get_model_name(self):
        """Return the model id without its ``models/`` prefix (default: ``gemini-1.5-pro``)."""
        return self.model_name.split("/")[-1]

In [None]:
from pydantic import BaseModel

class ResponseSchema(BaseModel):
    """Minimal pydantic schema with a single string field, ``message``."""
    # Free-text payload.
    message: str


In [None]:
import os
import google.generativeai as genai

# Configure the GenAI client with the key already present in the environment.
# GOOGLE_API_KEY is set in the Setup cell; it is deliberately not re-assigned here,
# because overwriting it with a placeholder would discard a real key set earlier.
if not os.getenv("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Set it in the Setup cell before running this cell."
    )
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])


In [None]:
# Smoke test: send one prompt through the custom Gemini wrapper and print the
# `message` field of the structured reply.
# `ResponseSchema` is defined in an earlier cell, so it is not redefined here.
gemini_model = CustomGeminiFlash()

# Names are specific to this check so they do not collide with the `prompt` and
# `response` variables that the RAG chain assigns later in the notebook.
smoke_prompt = "Write a short story about a heroic cat."
smoke_response = gemini_model.generate(smoke_prompt, schema=ResponseSchema)

print(smoke_response.message)


Whiskers, a small grey tabby with emerald eyes, wasn't your average house cat. He preferred patrolling the backyard to napping, and bird-watching held more appeal than catnip. One breezy afternoon, a careless flick of a neighbor's cigarette ignited a dry patch of grass near Mrs. Higgins' shed. Whiskers, perched on the fence, saw the danger.  He raced towards the house, yowling and scratching at the back door until Mrs. Higgins, startled, opened it. Seeing the smoke, she gasped and rushed to call the fire department.  Whiskers, knowing time was of the essence, darted towards the shed where Tilly, Mrs. Higgins' beloved tortoise, was housed for the afternoon. Braving the heat, he squeezed through a crack, nudged Tilly awake, and guided her slowly towards the exit.  Firefighters arrived just as Whiskers, coughing but triumphant, emerged from the smoke-filled shed with Tilly in tow. The small fire was quickly extinguished, and Whiskers, soot-smudged but proud, was hailed a hero. He received

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

## LangChain Chain

In [None]:
# 1. Retriever over the vector store, configured with k=3
retriever = vectordb.as_retriever(search_kwargs=dict(k=3))

# 2. Chat model used to write the answer
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0,
)

# 3. Prompt with placeholders for the retrieved context and the user's question
rag_template = (
    "Answer the following question using only the provided context:\n\n"
    "Context:\n{context}\n\n"
    "Question:\n{question}\n\n"
    "Answer:"
)
prompt = ChatPromptTemplate.from_template(rag_template)

# 4. Define document formatting function (combines retrieved documents)
def format_docs(docs):
    """Join the ``page_content`` of every document, separated by blank lines."""
    texts = [doc.page_content for doc in docs]
    return "\n\n".join(texts)

# 5. Assemble the chain: retrieve + format context -> prompt -> LLM -> plain string
chain_inputs = {
    "context": retriever | RunnableLambda(format_docs),  # retrieved text for the prompt
    "question": RunnablePassthrough(),  # the query is forwarded unchanged
}
retrieval_chain = chain_inputs | prompt | llm | StrOutputParser()

# 6. Run a sample question through the pipeline
query = "What is AWS?"
response = retrieval_chain.invoke(query)

# Show the answer under a header line
print("\nLLM Response:", response, sep="\n")


LLM Response:
AWS offers a broad set of on-demand technology services, including compute, storage, database, analytics, and machine learning, and other services to developers and enterprises of all sizes.



In [None]:
# Inspect what the retriever returns for a sample question.
retriever.invoke("What is the summary of the 2021 shareholder letter?")

## Generating Test Data

In [None]:
import pandas as pd

# Load the CSV file containing the test dataset
# Raw string: the backslashes of the Windows path must not be parsed as escape
# sequences (`\R` and `\m` are invalid escapes in a normal string literal).
csv_file = r"D:\RagEval\master_testsetv1.csv"  # Replace with the correct path
df = pd.read_csv(csv_file)

# Preview the dataset
print(df.head())


def format_docs_with_metadata(docs):
    """Render the documents as one string, each entry labelled with its page.

    The page comes from ``doc.metadata['page']`` and falls back to ``'N/A'``;
    entries are separated by a blank line.
    """
    entries = []
    for doc in docs:
        page = doc.metadata.get('page', 'N/A')
        entries.append(f"Page: {page}\nContent: {doc.page_content}")
    return "\n\n".join(entries)
print("Preparing evaluation inputs...")

In [None]:
from deepeval.test_case import LLMTestCase
def format_docs_with_metadata(docs):
    """Return one "Page/Content" string per document, in input order.

    The page comes from ``doc.metadata['page']`` and falls back to ``'N/A'``.
    """
    formatted = []
    for doc in docs:
        page = doc.metadata.get('page', 'N/A')
        formatted.append(f"Page: {page}\nContent: {doc.page_content}")
    return formatted

# Build one DeepEval test case per row of the evaluation dataset.
test_cases = []

for _, row in df.iterrows():
    query = row['user_input']  # question sent to the pipeline
    ground_truth = row['reference']  # expected answer
    reference_contexts = row['reference_contexts']  # read here but not used further in this cell

    # Contexts recorded on the test case, one formatted string per retrieved document.
    combined_contexts = format_docs_with_metadata(retriever.invoke(query))

    # Answer produced by the RAG chain (the chain runs its own retrieval for the query).
    generated_answer = retrieval_chain.invoke(query)

    case = LLMTestCase(
        input=query,
        actual_output=generated_answer,
        expected_output=ground_truth,
        retrieval_context=combined_contexts,
    )
    test_cases.append(case)


In [None]:
import json
# Serialize the test cases to JSON format
def save_test_cases_to_file(test_cases, file_path= r"D:\WorkingDir\Vanilla\test_cases.json"):
    """Save test cases to a JSON file.

    Args:
        test_cases: Iterable of objects exposing ``input``, ``actual_output``,
            ``expected_output`` and ``retrieval_context`` attributes.
        file_path: Destination file. Missing parent directories are created,
            so a first run into a fresh output folder no longer fails.
    """
    import os  # local import keeps the function usable on its own

    serialized_test_cases = [
        {
            "input": test_case.input,
            "actual_output": test_case.actual_output,
            "expected_output": test_case.expected_output,
            "retrieval_context": test_case.retrieval_context,
        }
        for test_case in test_cases
    ]

    # A bare file name has no parent directory to create.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(serialized_test_cases, file, indent=4)

# Write the generated test cases to disk and confirm.
file_path = r"D:\WorkingDir\Vanilla\test_cases.json"
save_test_cases_to_file(test_cases, file_path=file_path)

print("Test cases have been saved to 'test_cases.json'.")

Test cases have been saved to 'test_cases.json'.


In [None]:
# Peek at the first two generated test cases.
test_cases[:2]

## Sample LLM Case

In [None]:
from deepeval.test_case import LLMTestCase

# Two hand-written samples. In both, the actual output contradicts the expected
# output while the retrieval context agrees with the expected output.
example_case_1 = LLMTestCase(
    input="What is the capital of France?",
    actual_output="The capital of France is London.",
    expected_output="The capital of France is Paris.",
    retrieval_context=["Paris is the capital of France and is located in Europe."],
)

example_case_2 = LLMTestCase(
    input="Who wrote 'Pride and Prejudice'?",
    actual_output="It was written by Charles Dickens.",
    expected_output="It was written by Jane Austen.",
    retrieval_context=["'Pride and Prejudice' is a novel by Jane Austen."],
)

# Add more test cases as needed


## DeepEval Evaluation

In [None]:
from deepeval import evaluate
from deepeval.test_case import LLMTestCase
from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric,
    AnswerRelevancyMetric,
    FaithfulnessMetric
)


In [None]:


# One judge model is shared by every metric instead of constructing five
# identical Gemini wrappers.
judge_model = CustomGeminiFlash()
# Threshold handed to every metric below (previously repeated inline).
METRIC_THRESHOLD = 0.7

# Retrieval Metrics
contextual_precision = ContextualPrecisionMetric(threshold=METRIC_THRESHOLD, model=judge_model)
contextual_recall = ContextualRecallMetric(threshold=METRIC_THRESHOLD, model=judge_model)
contextual_relevancy = ContextualRelevancyMetric(threshold=METRIC_THRESHOLD, model=judge_model)

# Generation Metrics
answer_relevancy = AnswerRelevancyMetric(threshold=METRIC_THRESHOLD, model=judge_model)
faithfulness = FaithfulnessMetric(threshold=METRIC_THRESHOLD, model=judge_model)

# Evaluate with all metrics
results = evaluate(
    test_cases=test_cases,
    run_async=True,
    max_concurrent=200,
    show_indicator=True,
    metrics=[
        contextual_precision,
        contextual_recall,
        contextual_relevancy,
        answer_relevancy,
        faithfulness,
    ],
)

In [None]:

import json

# Assuming `results` is your EvaluationResult object
# Define a function to convert custom objects to dictionaries
# Attributes copied from every metric result, in output order.
_METRIC_FIELDS = (
    "name",
    "threshold",
    "success",
    "score",
    "reason",
    "strict_mode",
    "evaluation_model",
    "error",
    "evaluation_cost",
    "verbose_logs",
)


def _metric_to_dict(metric):
    """Copy the reported attributes of one metric result into a plain dict."""
    return {field: getattr(metric, field) for field in _METRIC_FIELDS}


def evaluation_result_to_dict(evaluation_result):
    """Convert an evaluation result into nested dicts and lists.

    Args:
        evaluation_result: Object exposing ``test_results``; each entry provides
            ``name``, ``success``, ``metrics_data``, ``input``, ``actual_output``,
            ``expected_output`` and ``retrieval_context``.

    Returns:
        ``{"test_results": [...]}`` with one dict per test result. A test result
        whose ``metrics_data`` is ``None`` yields an empty ``metrics_data`` list
        instead of raising ``TypeError``.
    """
    return {
        "test_results": [
            {
                "name": test_result.name,
                "success": test_result.success,
                "metrics_data": [
                    _metric_to_dict(metric)
                    for metric in (test_result.metrics_data or [])
                ],
                "input": test_result.input,
                "actual_output": test_result.actual_output,
                "expected_output": test_result.expected_output,
                "retrieval_context": test_result.retrieval_context,
            }
            for test_result in evaluation_result.test_results
        ]
    }

# Flatten the evaluation results into plain dicts and lists.
results_dict = evaluation_result_to_dict(results)

# Serialise with a 4-space indent and write the text out as UTF-8.
file_path = r"D:\WorkingDir\Vanilla\EvalResult.json"
serialized_results = json.dumps(results_dict, indent=4)
with open(file_path, "w", encoding="utf-8") as out_file:
    out_file.write(serialized_results)

print(f"Evaluation results have been saved to {file_path}")


Evaluation results have been saved to D:\WorkingDir\Vanilla\EvalResult.json
