In [20]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [21]:
import getpass
import os
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFaceHub

def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")

os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"]="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"
os.environ["LANGCHAIN_PROJECT"]="langchain-project"

EMBEDDING_MODEL_NAME = "BAAI/bge-small-en-v1.5"
EMBEDDING_PATH = "C:/code/qp-ai-assessment/api/misc/embeddings/"
MODEL_KWARGS = {"device": "cpu"}
ENCODE_KWARGS = {"normalize_embeddings": True}



huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs=MODEL_KWARGS,
    encode_kwargs=ENCODE_KWARGS
)



In [22]:
from langsmith import Client


# QA
inputs = [
    "What are some key achievements and recognitions in Aniket Thorat's professional career?",
    "What are Aniket Thorat's core competencies in AI and machine learning?",
    "What is Aniket Thorat's current role and responsibilities at Persistent Systems?",
    "What educational background and certifications does Aniket Thorat have?",
    "Can you describe some of the projects Aniket Thorat has undertaken?"
]

outputs = [
    "Aniket Thorat has received several notable achievements and recognitions: 1. Topper in GEMS category for Persistent 2022 batch. 2. Multiple Bravo Awards from CTO for exceptional project contributions. 3. Best Project Honor in February 2019 by Rotary Club of Poona West. 4. Project with Best Social Impact in March 2019 by ZEAL Institutes. 5. Conducts AI tools workshops for clients and internal teams. These highlight his technical expertise, project success, and commitment to knowledge sharing in AI and software engineering.",

    "Aniket Thorat's core competencies in AI and machine learning include: 1. Machine Learning & Deep Learning. 2. Data Preprocessing. 3. Natural Language Processing. 4. Neural Network Architecture Design. 5. Model Deployment & Optimization. 6. AI Frameworks & Libraries. 7. Computer Vision. 8. Algorithm Development. 9. Data Analysis & Visualization. These skills demonstrate his comprehensive expertise in various aspects of AI and machine learning.",

    "At Persistent Systems, Aniket Thorat is a Senior Software Engineer. His responsibilities include: 1. Designing and implementing AI-driven solutions for complex customer challenges. 2. Developing AI Ops applications and collaborating with hyperscale providers. 3. Working with pre-sales and delivery excellence teams to demonstrate AI capabilities. 4. Evaluating emerging technologies to improve developer productivity. 5. Managing the inter-company Knowledge Traverser Platform. 6. Designing and implementing structured knowledge graphs for unstructured data exploration.",

    "Aniket Thorat's educational background and certifications include: 1. B.E. (Computer Engineering) from Sinhgad Institute of Technology and Science (SITS), completed in 2022. 2. Data Science Specialization certification from IBM, obtained in 2022. This combination of formal education and specialized certification demonstrates his strong foundation in computer engineering and data science.",

    "Aniket Thorat has undertaken several significant projects: 1. Production Failure and Root Cause Analysis (RCA) Automation: Automated prediction and identification of production failures to minimize downtime and enhance system reliability. 2. MingMate (14 Weeks): Improved developer productivity by integrating AI and Generative AI into software development processes. 3. Root Navigation for Blind Person (5 Months): Developed an assistant tool to help visually impaired individuals navigate and find routes. These projects showcase his ability to apply AI solutions to diverse real-world problems."
]


qa_pairs = [{"question": q, "answer": a} for q, a in zip(inputs, outputs)]

# Create dataset
client = Client()
dataset_name = "RAG_test_LCEL22"
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="QA pairs about LCEL.",
)
client.create_examples(
    inputs=[{"question": q} for q in inputs],
    outputs=[{"answer": a} for a in outputs],
    dataset_id=dataset.id,
)

In [23]:
### INDEX
'''
from bs4 import BeautifulSoup as Soup
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

# Load docs
url = "https://python.langchain.com/v0.1/docs/expression_language/"
loader = RecursiveUrlLoader(
    url=url, max_depth=20, extractor=lambda x: Soup(x, "html.parser").text
)
docs = loader.load()

# Split into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4500, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed and store in Chroma
vectorstore = FAISS.from_documents(documents=splits, embedding=huggingface_embeddings)

# Index
retriever = vectorstore.as_retriever()
'''
vectorstore = FAISS.load_local("C:/code/qp-ai-assessment/api/misc/embeddings", huggingface_embeddings,allow_dangerous_deserialization=True)
retriever = vectorstore.as_retriever()


In [26]:
from typing import List
from langchain_community.llms import HuggingFaceHub
from langchain.schema import Document
from langsmith import traceable

class RagBot:
    def __init__(self, retriever, repo_id: str = "mistralai/Mistral-7B-v0.1"):
        self._retriever = retriever
        self._llm = HuggingFaceHub(
            repo_id=repo_id,
            model_kwargs={"temperature": 0.1, "max_length": 500}
        )

    @traceable()
    def retrieve_docs(self, question):
        return self._retriever.get_relevant_documents(question)

    @traceable()
    def invoke_llm(self, question: str, docs: List[Document]):
        docs_content = "\n\n".join([doc.page_content for doc in docs])
        system_prompt = (
            "You are a helpful AI code assistant with expertise in LCEL. "
            "Use the following docs to produce a concise code solution to the user question.\n\n"
            f"## Docs\n\n{docs_content}\n\n"
        )
        
        input_text = f"{system_prompt}Human: {question}\nAssistant:"
        
        response = self._llm(input_text)
        
        return {
            "answer": response,
            "contexts": [doc.page_content for doc in docs],
        }

    @traceable()
    def get_answer(self, question: str):
        docs = self.retrieve_docs(question)
        return self.invoke_llm(question, docs)

# Initialize the RagBot with a retriever
rag_bot = RagBot(retriever)


In [27]:
response = rag_bot.get_answer("What is LCEL?")
response["answer"][:150]

'You are a helpful AI code assistant with expertise in LCEL. Use the following docs to produce a concise code solution to the user question.\n\n## Docs\n\n'

In [28]:
# RAG chain
def predict_rag_answer(example: dict):
    """Use this for answer evaluation"""
    response = rag_bot.get_answer(example["question"])
    return {"answer": response["answer"]}

def predict_rag_answer_with_context(example: dict):
    """Use this for evaluation of retrieved documents and hallucinations"""
    response = rag_bot.get_answer(example["question"])
    return {"answer": response["answer"], "contexts": response["contexts"]}

### Reference Answer

In [29]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate
from langchain.llms import HuggingFaceEndpoint
from langchain.evaluation import load_evaluator

# Define the LLM


# Evaluator
qa_evaluator = [
    LangChainStringEvaluator(
        "cot_qa",
        config={"llm": llm},  # Pass the LLM to the evaluator
        prepare_data=lambda run, example: {
            "prediction": run.outputs["answer"],
            "reference": example.outputs["answer"],
            "input": example.inputs["question"],
        },
    )
]

#dataset_name = "RAG_test_LCEL"

experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="rag-qa-mistral",
    metadata={"variant": "LCEL context, Mistral-7B"},
)


View the evaluation results for experiment: 'rag-qa-mistral-79ab9e2e' at:
https://smith.langchain.com/o/b59f3614-9d00-5c85-a713-5c5f2675f256/datasets/e4013450-3b90-4ce2-8f20-1e24f36e61ff/compare?selectedSessions=f23a9953-6be4-43b3-91c2-2d812280dd2b




0it [00:00, ?it/s]

### Answer Hallucination

In [30]:
from langsmith.evaluation import LangChainStringEvaluator


answer_hallucination_evaluator = LangChainStringEvaluator(
    "labeled_score_string",
    config={
        "criteria": {
            "accuracy": """Is the Assistant's Answer grounded in the Ground Truth documentation? A score of [[1]] means that the
            Assistant answer contains is not at all based upon / grounded in the Ground Truth documentation. A score of [[5]] means 
            that the Assistant answer contains some information (e.g., a hallucination) that is not captured in the Ground Truth 
            documentation. A score of [[10]] means that the Assistant answer is fully based upon the in the Ground Truth documentation."""
        },
        "llm": llm,  # Explicitly pass the Hugging Face model here
        # If you want the score to be saved on a scale from 0 to 1
        "normalize_by": 10,
    },
    prepare_data=lambda run, example: {
        "prediction": run.outputs["answer"],
        "reference": run.outputs["contexts"],
        "input": example.inputs["question"],
    },
)
dataset_name = "RAG_test_LCEL22"
experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=[answer_hallucination_evaluator],
    experiment_prefix="rag-qa-oai-hallucination",
    # Any experiment metadata can be specified here
    metadata={
        "variant": "LCEL context, gpt-3.5-turbo",
    },
)

View the evaluation results for experiment: 'rag-qa-oai-hallucination-06a72931' at:
https://smith.langchain.com/o/b59f3614-9d00-5c85-a713-5c5f2675f256/datasets/e4013450-3b90-4ce2-8f20-1e24f36e61ff/compare?selectedSessions=79d79738-4051-411b-a939-bf69987c60e9




0it [00:00, ?it/s]

### Document Relevance to Question

In [33]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate
import textwrap

docs_relevance_evaluator = LangChainStringEvaluator(
    "score_string",
    config={
        "criteria": {
            "document_relevance": textwrap.dedent(
                """The response is a set of documents retrieved from a vectorstore. The input is a question
            used for retrieval. You will score whether the Assistant's response (retrieved docs) is relevant to the Ground Truth 
            question. A score of [[1]] means that none of the  Assistant's response documents contain information useful in answering or addressing the user's input.
            A score of [[5]] means that the Assistant answer contains some relevant documents that can at least partially answer the user's question or input. 
            A score of [[10]] means that the user input can be fully answered using the content in the first retrieved doc(s)."""
            )
        },
        "llm": llm,
        # If you want the score to be saved on a scale from 0 to 1
        "normalize_by": 10,
    },
    prepare_data=lambda run, example: {
        "prediction": run.outputs["contexts"],
        "input": example.inputs["question"],
    },
)


dataset_name = "RAG_test_LCEL22"
experiment_results = evaluate(
    predict_rag_answer_with_context,
    data=dataset_name,
    evaluators=[docs_relevance_evaluator],
    experiment_prefix="rag-qa-oai-doc-relevance",
    # Any experiment metadata can be specified here
    metadata={
        "variant": "LCEL context, gpt-3.5-turbo",
    },
)

This chain was only tested with GPT-4. Performance may be significantly worse with other models.


View the evaluation results for experiment: 'rag-qa-oai-doc-relevance-579f0392' at:
https://smith.langchain.com/o/b59f3614-9d00-5c85-a713-5c5f2675f256/datasets/e4013450-3b90-4ce2-8f20-1e24f36e61ff/compare?selectedSessions=6785e49e-2b66-4875-b099-83a46ea144bb




0it [00:00, ?it/s]

### Evaluating intermediate traces


In [37]:
from langchain_openai import ChatOpenAI
from langsmith.schemas import Example, Run
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

def document_relevance_grader(root_run: Run, example: Example) -> dict:
    """
    A simple evaluator that checks to see if retrieved documents are relevant to the question
    """

    # Get documents and question
    rag_pipeline_run = next(run for run in root_run.child_runs if run.name == "get_answer")
    retrieve_run = next(run for run in rag_pipeline_run.child_runs if run.name == "retrieve_docs")
    doc_txt = "\n\n".join(doc.page_content for doc in retrieve_run.outputs["output"])
    question = retrieve_run.inputs["question"] 

    # Data model for grade
    class GradeDocuments(BaseModel):
        """Binary score for relevance check on retrieved documents."""
        binary_score: int = Field(description="Documents are relevant to the question, 1 or 0")
    
    # LLM with function call 
    structured_llm_grader = llm.with_structured_output(GradeDocuments)
    
    # Prompt 
    system = """You are a grader assessing relevance of a retrieved document to a user question. \n 
        If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
        It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
        Give a binary score 1 or 0 score, where 1 means that the document is relevant to the question."""
    
    grade_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system),
            ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
        ]
    )
    
    retrieval_grader = grade_prompt | structured_llm_grader
    score = retrieval_grader.invoke({"question": question, "document": doc_txt})
    return {"key": "document_relevance", "score": int(score.binary_score)}

def answer_hallucination_grader(root_run: Run, example: Example) -> dict:
    """
    A simple evaluator that checks to see the answer is grounded in the documents
    """

    # Get documents and answer
    rag_pipeline_run = next(run for run in root_run.child_runs if run.name == "get_answer")
    retrieve_run = next(run for run in rag_pipeline_run.child_runs if run.name == "retrieve_docs")
    doc_txt = "\n\n".join(doc.page_content for doc in retrieve_run.outputs["output"])
    generation = rag_pipeline_run.outputs["answer"]
    
    # Data model
    class GradeHallucinations(BaseModel):
        """Binary score for hallucination present in generation answer."""
    
        binary_score: int = Field(description="Answer is grounded in the facts, 1 or 0")
    
    # LLM with function call 
    structured_llm_grader = llm.with_structured_output(GradeHallucinations)
    
    # Prompt 
    system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n 
         Give a binary score 1 or 0, where 1 means that the answer is grounded in / supported by the set of facts."""
    hallucination_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system),
            ("human", "Set of facts: \n\n {documents} \n\n LLM generation: {generation}"),
        ]
    )
    
    hallucination_grader = hallucination_prompt | structured_llm_grader
    score = hallucination_grader.invoke({"documents": doc_txt, "generation": generation})
    return {"key": "answer_hallucination", "score": int(score.binary_score)}

from langsmith.evaluation import evaluate

dataset_name = "RAG_test_LCEL22"
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=[document_relevance_grader,answer_hallucination_grader],
    experiment_prefix= "LCEL context"
)

View the evaluation results for experiment: 'LCEL context-b816af4e' at:
https://smith.langchain.com/o/b59f3614-9d00-5c85-a713-5c5f2675f256/datasets/e4013450-3b90-4ce2-8f20-1e24f36e61ff/compare?selectedSessions=f035bb9e-193a-4708-80d8-5c0762bfd45b




0it [00:00, ?it/s]

{'name': 'GradeDocuments', 'description': 'Binary score for relevance check on retrieved documents.', 'parameters': {'type_': 6, 'properties': {'binary_score': {'type_': 3, 'description': 'Documents are relevant to the question, 1 or 0', 'format_': '', 'nullable': False, 'enum': [], 'max_items': '0', 'min_items': '0', 'properties': {}, 'required': []}}, 'required': ['binary_score'], 'format_': '', 'description': '', 'nullable': False, 'enum': [], 'max_items': '0', 'min_items': '0'}}
{'name': 'GradeDocuments', 'description': 'Binary score for relevance check on retrieved documents.', 'parameters': {'type_': 6, 'properties': {'binary_score': {'type_': 3, 'description': 'Documents are relevant to the question, 1 or 0', 'format_': '', 'nullable': False, 'enum': [], 'max_items': '0', 'min_items': '0', 'properties': {}, 'required': []}}, 'required': ['binary_score'], 'format_': '', 'description': '', 'nullable': False, 'enum': [], 'max_items': '0', 'min_items': '0'}}
{'name': 'GradeDocuments