#### Chatbot Evaluation

In [11]:
import os
from dotenv import load_dotenv
load_dotenv()


os.environ['LANGSMITH_API_KEY']=os.getenv('LANGSMITH_API_KEY')
os.environ['OPENAI_API_KEY']=os.getenv('OPENAI_API_KEY')
os.environ['LANGSMITH_TRACING']='true'

In [12]:
from langsmith import Client
client=Client()

dataset_name = 'simple_chatbot_evaluation'
dataset = client.create_dataset(dataset_name=dataset_name)

examples = [
    {
        "inputs": {"question": "What is an iterator in Python?"},
        "outputs": {"answer": "An iterator in Python is an object that implements the __iter__() and __next__() methods, allowing iteration over elements one at a time."}
    },
    {
        "inputs": {"question": "What is the purpose of the attention mechanism in NLP?"},
        "outputs": {"answer": "The attention mechanism helps models focus on the most relevant parts of the input sequence when generating output, improving performance in tasks like translation and summarization."}
    },
    {
        "inputs": {"question": "Who is the founder of OpenAI?"},
        "outputs": {"answer": "OpenAI was co-founded by Elon Musk, Sam Altman, Greg Brockman, Ilya Sutskever, John Schulman, and Wojciech Zaremba."}
    },
    {
        "inputs": {"question": "What is RAG in LLMs?"},
        "outputs": {"answer": "RAG, or Retrieval-Augmented Generation, combines retrieval from an external knowledge base with generative models to provide more accurate and grounded responses."}
    },
    {
        "inputs": {"question": "Tell me about the dummy company LexoraTech."},
        "outputs": {"answer": "LexoraTech is a fictional AI research and consulting firm specializing in machine learning solutions, regulatory compliance, and enterprise automation."}
    },
    {
        "inputs": {"question": "What is LangChain?"},
        "outputs": {"answer": "LangChain is a framework for building applications with large language models by integrating retrieval, chaining, and external tools."}
    },
    {
        "inputs": {"question": "What is the capital of France?"},
        "outputs": {"answer": "The capital of France is Paris."}
    },
    {
        "inputs": {"question": "Who won the FIFA World Cup in 2018?"},
        "outputs": {"answer": "France won the FIFA World Cup in 2018, defeating Croatia 4-2 in the final."}
    },
    {
        "inputs": {"question": "Explain the difference between iterable and iterator in Python."},
        "outputs": {"answer": "An iterable is any object capable of returning its elements one at a time, like lists or strings, and implements __iter__(). An iterator is the object returned by __iter__() that produces elements using __next__()."}
    },
    {
        "inputs": {"question": "What is the main benefit of Transformers over RNNs?"},
        "outputs": {"answer": "Transformers can process sequences in parallel using self-attention, avoiding the sequential bottleneck of RNNs and enabling better handling of long-range dependencies."}
    }
]

# Upload each example
for ex in examples:
    client.create_example(
        dataset_id=dataset.id,
        inputs=ex["inputs"],
        outputs=ex["outputs"]
    )

print("All examples uploaded successfully!")


LangSmithConflictError: Conflict for /datasets. HTTPError('409 Client Error: Conflict for url: https://api.smith.langchain.com/datasets', '{"detail":"Dataset with this name already exists."}')

#### Define Metrics(LLM as a Judge)

In [22]:
import openai
from langsmith import wrappers


openai_client=wrappers.wrap_openai(
    openai.OpenAI()
)

eval_instruction="you are an expert professor specialized in grading student's answer to questions " 

In [14]:
def correctness(inputs:dict,outputs:dict,reference_outputs:dict) -> bool:
    user_content=f"""You are grading the following question:
    {inputs['question']}
Here is the real answer:
{reference_outputs['answer']}
You are grading the following predicted answer:
{outputs['response']}
Respond with CORRECT or INCORRECT:
Grade:
"""
    response=openai.chat.completions.create(
    model='gpt-4o-mini',
    temperature=0,
    messages=[
        {'role':"system",'content':eval_instruction},
        {'role':'user','content':user_content}
    ]
).choices[0].message.content
    

    return response=="CORRECT"
    


In [15]:
def concision(outputs:dict,reference_outputs:dict)->bool:
    return int(len(outputs['response'])) < 2 * len(reference_outputs['answer'])

#### Run Evaluation

In [23]:
default_instruction="Respond to the user question in a short,concise manner (one short sentence)."

def my_app(question:str,model:str='gpt-4o-mini',instruction:str=default_instruction)-> str:
    return openai_client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[{'role':'system','content':instruction},
                  {'role':'user','content':question}],
    ).choices[0].message.content

In [19]:
from langchain_groq import ChatGroq

In [24]:
def ls_target(inputs:str) -> dict:
    return {'response':my_app[inputs['question']]}

In [25]:
# experiment_result=client.evaluate(
#     ls_target,
#     data=dataset_name,
#     evaluators=[correctness,concision],
#     experiment_prefix="openai-4o-mini-chatbot"
# )