In [None]:
%pip install sentence_transformers

In [None]:

from sentence_transformers import SentenceTransformer

# Sample knowledge base: every entry shares the same section label and
# carries one sentence of employee information.
documents = [
    {"section": "Employee Info", "content": text}
    for text in (
        "John's pay is processed on the 1st of every month.",
        "Mark is on a leave of absence until next Monday.",
        "Julie is a software engineer.",
        "Julie's pay is processed on the 1st of every month.",
        "Mark is a product manager.",
        "John is an AI architect and has salary of 500K USD.",
    )
]

# Plain list of the document texts, index-aligned with `documents`.
content_corpus = [entry["content"] for entry in documents]

# Embed every document once up front; the vectors are reused for each query.
model = SentenceTransformer("all-MiniLM-L6-v2")
doc_vectors = model.encode(content_corpus)

# Sanity check on the size of the embedding matrix.
print(doc_vectors.shape)


In [None]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Read settings from the local env file; override=True lets values from the
# file replace variables that are already set in the process environment.
load_dotenv(dotenv_path="../.env.local", override=True)

# Shared OpenAI client for the notebook, authenticated with the key above.
my_api_key = os.environ.get("OPEN_AI_API_KEY")
my_client = OpenAI(api_key=my_api_key)


# Helper that sends a question, together with its retrieved context, to the OpenAI chat model
def ask_question_open_ai(prompt, context=''):
    """Answer ``prompt`` with the chat model, grounded in ``context``.

    The question is taken from the ``prompt`` argument (never from a
    notebook-level variable), so every evaluation example is answered on
    its own terms.

    Args:
        prompt: The user's question.
        context: Text the model is told to base its answer on; defaults to
            an empty string.

    Returns:
        The message content of the first completion choice.
    """
    system_message = {"role": "system", "content": '''
             You are an assistant who answers only based on the given context.
             '''}
    user_message = {
        "role": "user",
        "content": f"Context: {context}\n\nUser Question: {prompt}",
    }

    completion = my_client.chat.completions.create(
        model="gpt-5-nano",
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [None]:
# from langchain_openai import ChatOpenAI
from langsmith import traceable # Need to enable tracing on LangSmith

# Define your target function that performs retrieval per-question
@traceable
def ask_question(inputs):
    """Answer one question with retrieval-augmented generation.

    Embeds the question, ranks the vectors in ``doc_vectors`` by their
    similarity to it, joins the three best-matching texts from
    ``content_corpus`` into a context string, and asks the LLM to answer
    from that context.

    Args:
        inputs: Mapping with a ``"question"`` key holding the question text.

    Returns:
        A dict of the form ``{"answer": <LLM response text>}``.

    Raises:
        KeyError: If ``inputs`` has no ``"question"`` key.
    """
    import numpy as np

    question = inputs["question"]

    # encode() is given a one-item list, so take the single resulting vector.
    query_vec = model.encode([question])[0]

    # Score the query against every document vector. reshape(-1) always
    # yields a 1-D array, whereas squeeze() would collapse a one-document
    # corpus to a 0-d array that argsort cannot handle.
    similarities = np.asarray(model.similarity(query_vec, doc_vectors)).reshape(-1)

    # Indices of the three highest-scoring documents, best first.
    top_3_indices = np.argsort(similarities)[::-1][:3]

    # Build the context from the supporting documents.
    top_docs = [content_corpus[i] for i in top_3_indices]
    context = "\n---\n".join(top_docs)

    # Call the LLM with the question and its retrieved context.
    answer = ask_question_open_ai(question, context)

    return {"answer": answer}


In [None]:
# Smoke test: run the full retrieve-then-answer pipeline on a single question.
ask_question({"question": "When is John's pay processed?"})

##### Run LangSmith Evaluation

In [None]:
from langsmith import Client

client = Client()

dataset_name = "2025Dec-Employee-Info-QA-Dataset-5"

# Question / reference-answer pairs used to grade the RAG pipeline.
examples = [
    {"input": "When is John's pay processed?", "output": "John's pay is processed on the 1st of every month."},
    {"input": "What is Julie's job title?", "output": "Julie is a software engineer."},
    {"input": "What is John's salary?", "output": "John has a salary of 500K USD."},
    {"input": "What is Mark's current work status?", "output": "Mark is on a leave of absence until next Monday."},
]

# Creating a dataset whose name is already taken fails, so on a re-run
# (Restart & Run All) reuse the existing dataset instead of creating it again
# and do not insert the examples a second time.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
    print(f" Dataset '{dataset_name}' already exists; reusing it.")
else:
    dataset = client.create_dataset(dataset_name=dataset_name)
    for ex in examples:
        client.create_example(inputs={"question": ex["input"]}, outputs={"answer": ex["output"]}, dataset_id=dataset.id)
    print(f" Dataset '{dataset_name}' created with {len(examples)} examples.")


In [None]:
import os
import json
from openai import OpenAI
from langsmith.evaluation import RunEvaluator

class SimpleCorrectness(RunEvaluator):
    """LLM-as-a-judge correctness evaluator (version-safe).

    For each run, the question, the reference answer and the model's
    prediction are sent to an OpenAI chat model, which is asked to reply
    with a JSON object holding a score between 0 and 1 and a short reason.
    """

    @staticmethod
    def _parse_judgement(content):
        """Extract ``(score, reason)`` from the judge's raw reply.

        Only the text between the first ``{`` and the last ``}`` is parsed,
        so a reply wrapped in code fences or surrounded by prose still
        parses.

        Raises:
            json.JSONDecodeError: If no valid JSON object can be parsed.
            KeyError: If the object has no ``"score"`` entry.
            TypeError, ValueError: If the score cannot be converted to float.
        """
        text = content or ""
        start, end = text.find("{"), text.rfind("}")
        if start != -1 and end > start:
            text = text[start:end + 1]

        data = json.loads(text)
        score = float(data["score"])
        # Clamp, in case the judge strays outside the requested range.
        return max(0.0, min(1.0, score)), data.get("reason", "")

    def evaluate_run(self, run, example, **kwargs):
        """Score one run against its reference example.

        Args:
            run: The run to grade; its ``outputs["answer"]`` is the prediction.
            example: The dataset example; ``inputs["question"]`` and
                ``outputs["answer"]`` supply the question and reference.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A dict with ``key`` ("correctness"), ``score`` (clamped to
            [0, 1]) and ``comment`` (the judge's short explanation).
        """
        # `inputs` / `outputs` may be None (e.g. a run that produced no
        # output), so fall back to an empty mapping before calling .get().
        question = (example.inputs or {}).get("question", "")
        reference = (example.outputs or {}).get("answer", "")
        prediction = (run.outputs or {}).get("answer")

        # The rest of the notebook reads the key from OPEN_AI_API_KEY;
        # accept either spelling so the judge works with the same env file.
        api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPEN_AI_API_KEY")
        client = OpenAI(api_key=api_key)

        response = client.chat.completions.create(
            model="gpt-5-nano",
            messages=[
                {
                    "role": "system",
                    "content": "You are a semantic correctness evaluator."
                },
                {
                    "role": "user",
                    "content": f"""
                        Question: {question}
                        Reference answer: {reference}
                        Model prediction: {prediction}

                        Return JSON only:
                        {{"score": <number between 0 and 1>, "reason": "<short explanation>"}}
                        """
                }
            ],
        )

        score, reason = self._parse_judgement(response.choices[0].message.content)

        # The explanation goes under "comment", the field name used by
        # LangSmith's EvaluationResult.
        return {
            "key": "correctness",
            "score": score,
            "comment": reason,
        }



In [None]:
from langsmith.evaluation import evaluate

# Instantiate the LLM-as-a-judge evaluator.
simple_correctness = SimpleCorrectness()

# Run the target function over the dataset and grade each run with the
# evaluator; the experiment is recorded under the given prefix.
results = evaluate(
    ask_question,
    evaluators=[simple_correctness],
    data=dataset_name,
    experiment_prefix="langsmith_eval_test",
)


In [None]:
# Dump each raw result row as-is for inspection.
for r in results:
    print(r)

In [None]:
# Readable report: question, expected answer, predicted answer and the
# evaluator feedback for every evaluated example.
for r in results:   # each r is a dict
    example = r["example"]
    eval_results = r["evaluation_results"]["results"]
    run = r["run"]

    print(f"Question: {example.inputs['question']}")
    print(f"Expected: {example.outputs['answer']}")

    # Extract model output. `outputs` may be missing or None (e.g. when the
    # run produced nothing), so normalise it to a dict before the lookup.
    run_outputs = getattr(run, "outputs", None) or {}
    if "answer" in run_outputs:
        print(f"Predicted: {run_outputs['answer']}")
    else:
        print("Predicted: (no output found)")

    # Print evaluator results; the explanation is stored in `comment`.
    for e in eval_results:
        print(f"Evaluator: {e.key}, Score: {e.score}, Explanation: {getattr(e, 'comment', None)}")
