In [None]:

# Step 1: Define Sample Documents
documents = [
    {"section": "Employee Info", "content": "John's pay is processed on the 1st of every month."},
    {"section": "Employee Info", "content": "Mark is on a leave of absence until next Monday."},
    {"section": "Employee Info", "content": "Julie is a software engineer."},
    {"section": "Employee Info", "content": "Julie's pay is processed on the 1st of every month."},
    {"section": "Employee Info", "content": "Mark is a product manager."},
    {"section": "Employee Info", "content": "John is an AI architect and has salary of 500K USD."},
]

# Step 2: Get Content Texts
content_corpus = [doc["content"] for doc in documents]
content_corpus

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
doc_vectors = model.encode(content_corpus)

doc_vectors
print(doc_vectors.shape)


["John's pay is processed on the 1st of every month.",
 'Mark is on a leave of absence until next Monday.',
 'Julie is a software engineer.',
 "Julie's pay is processed on the 1st of every month.",
 'Mark is a product manager.',
 'John is an AI architect and has salary of 500K USD.']

In [64]:
# Step 3: User Query and Semantic Matching
import numpy as np

query = "Tell me about John's role."
query_vec = model.encode([query])[0]
# query_vec


similarities = model.similarity(query_vec, doc_vectors)

# Ensure it's a 1D numpy array
similarities = np.asarray(similarities).squeeze()

# Now get top 3
top_3_indices = np.argsort(similarities)[::-1][:3]
top_scores = similarities[top_3_indices]
top_scores

top_docs = [documents[i]['content'] for i in top_3_indices]

top_docs
context = "\n---\n".join(top_docs)
top_docs, context

(["John's pay is processed on the 1st of every month.",
  'John is an AI architect and has salary of 500K USD.',
  'Mark is a product manager.'],
 "John's pay is processed on the 1st of every month.\n---\nJohn is an AI architect and has salary of 500K USD.\n---\nMark is a product manager.")

In [65]:
import os
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv(override=True, dotenv_path="../.env")
my_api_key = os.getenv("OPEN_AI_API_KEY")

my_client = OpenAI(api_key=my_api_key)
# my_client

def ask_question_open_ai(prompt, context=''):
    """Call the LLM with the provided prompt and context.

    IMPORTANT: use the passed-in prompt (not a global variable) so each
    evaluation example can be answered correctly.
    """
    llm_response = my_client.chat.completions.create(
        model="gpt-5-nano",
        messages=[
            {"role": "system", "content": '''
             You are an assistant who answers only based on the given context.
             '''},
            {"role": "user", "content": f"Context: {context}\n\nUser Question: {prompt}"}
        ]

    )
    return llm_response.choices[0].message.content


In [69]:
print (query)
response = ask_question_open_ai(query, context)

response

Tell me about John's role.


'John is an AI architect.'

#### Create a LangSmith Dataset

In [70]:
from langsmith import Client

client = Client()

dataset_name = "Employee-Info-QA-Dataset_3"
dataset = client.create_dataset(dataset_name=dataset_name)

examples = [
    {"input": "When is John's pay processed?", "output": "John's pay is processed on the 1st of every month."},
    {"input": "What is Julie's job title?", "output": "Julie is a software engineer."},
    {"input": "What is John's salary?", "output": "John has a salary of 500K USD."},
    {"input": "What is Mark's current work status?", "output": "Mark is on a leave of absence until next Monday."},
]

for ex in examples:
    client.create_example(inputs={"question": ex["input"]}, outputs={"answer": ex["output"]}, dataset_id=dataset.id)

print(f" Dataset '{dataset_name}' created with {len(examples)} examples.")


 Dataset 'Employee-Info-QA-Dataset_3' created with 4 examples.


##### Run LangSmith Evaluation

In [71]:
# %pip install --upgrade --no-cache-dir langsmith

# import langsmith
# print(langsmith.__version__)

In [72]:
# !pip index versions langsmith

In [80]:
from langsmith.evaluation import evaluate, RunEvaluator

from langsmith.evaluation import RunEvaluator
from openai import OpenAI
import os

class SimpleCorrectness(RunEvaluator):
    """Use an LLM to evaluate correctness of prediction vs reference."""

    def evaluate_run(self, run, example, **kwargs):
        ref = example.outputs.get("answer", "").strip()
        pred = run.outputs.get("answer", "").strip()

        if not ref or not pred:
            return {
                "key": "correctness",
                "score": 0.0,
                "explanation": "Missing reference or prediction"
            }

        # Replace substring match with LLM call
        score, reason = self.llm_judge(ref, pred, example.inputs.get("question", ""))

        return {
            "key": "correctness",
            "score": score,
            "explanation": reason
        }

    # Separate LLM evaluation logic
    def llm_judge(self, reference: str, prediction: str, question: str = ""):
        """Ask the LLM to grade the prediction against the reference."""
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

        system_prompt = (
            "You are an impartial evaluator that rates how correct a model's answer is "
            "compared to a ground truth reference. "
            "Respond with a numeric score between 0 and 1, where 1 = fully correct, 0 = incorrect."
        )

        user_prompt = f"""
        Question: {question}
        Reference answer: {reference}
        Model prediction: {prediction}

        Rate correctness (0–1). Then briefly justify your score.
        Return your answer in JSON as:
        {{"score": <number>, "reason": "<short explanation>"}}.
        """

        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            temperature=0,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )

        content = resp.choices[0].message.content.strip()
        score, reason = 0.0, "Unparsed LLM response"

        # Try to extract score/reason if JSON-like
        import json, re
        try:
            if "{" in content:
                parsed = json.loads(content)
                score = float(parsed.get("score", 0))
                reason = parsed.get("reason", reason)
            else:
                # Fallback: parse first number
                num_match = re.search(r"([01](?:\.\d+)?)", content)
                if num_match:
                    score = float(num_match.group(1))
                reason = content
        except Exception as e:
            reason = f"Parse error: {e}. Raw: {content}"

        # Clamp score to [0, 1]
        score = max(0.0, min(1.0, score))
        return score, reason

# Instantiate the evaluator
simple_correctness = SimpleCorrectness()


In [81]:

# Define your target function that performs retrieval per-question
def ask_question(inputs):
    question = inputs["question"]
    # compute embedding for the question
    query_vec = model.encode([question])[0]

    # compute cosine similarities between query and doc_vectors
    import numpy as np
    q_norm = np.linalg.norm(query_vec) + 1e-8
    doc_norms = np.linalg.norm(doc_vectors, axis=1) + 1e-8
    sims = np.dot(doc_vectors, query_vec) / (doc_norms * q_norm)

    # pick top-3 supporting docs and build context
    top_3_indices = np.argsort(sims)[::-1][:3]
    top_docs = [content_corpus[i] for i in top_3_indices]
    context = "\n---\n".join(top_docs)

    # call LLM with question and its retrieved context
    answer = ask_question_open_ai(question, context)
    return {"answer": answer}


In [None]:

# Run evaluation
results = evaluate(
    ask_question,
    data="Employee-Info-QA-Dataset_2",
    evaluators=[simple_correctness],
    experiment_prefix="langsmith_eval_test",
)


View the evaluation results for experiment: 'langsmith_eval_test-890e4da5' at:
https://smith.langchain.com/o/79133fd6-316c-4b99-b886-38847131d1e1/datasets/e46dc4f2-57e6-4bbc-971b-5ddfb58ad91c/compare?selectedSessions=239cddfc-0fbd-4cbd-9ac9-b6343bd069f3




0it [00:00, ?it/s]

In [None]:
for r in results:   # each r is a dict
    example = r["example"]
    eval_results = r["evaluation_results"]["results"]

    print(f"\n Question: {example.inputs['question']}")
    print(f"Expected: {example.outputs['answer']}")

    # Extract model output
    run = r["run"]
    if hasattr(run, "outputs") and "answer" in run.outputs:
        print(f"Predicted: {run.outputs['answer']}")
    else:
        print("Predicted: (no output found)")

    # Print evaluator results
    for e in eval_results:
        print(f"Evaluator: {e.key}, Score: {e.score}, Explanation: {getattr(e, 'explanation', None)}")



 Question: What is Mark's current work status?
Expected: Mark is on a leave of absence until next Monday.
Predicted: Mark is currently on a leave of absence until next Monday.
Evaluator: correctness, Score: 0.0, Explanation: None

 Question: What is John's salary?
Expected: John has a salary of 500K USD.
Predicted: John's salary is 500K USD (500,000 USD).
Evaluator: correctness, Score: 0.0, Explanation: None

 Question: What is Julie's job title?
Expected: Julie is a software engineer.
Predicted: Software engineer.
Evaluator: correctness, Score: 0.0, Explanation: None

 Question: When is John's pay processed?
Expected: John's pay is processed on the 1st of every month.
Predicted: John's pay is processed on the 1st of every month.
Evaluator: correctness, Score: 1.0, Explanation: None
