In [1]:
from config import settings
import os

os.environ["LANGSMITH_TRACING"] = str(settings.LANGSMITH_TRACING)
os.environ["LANGSMITH_API_KEY"] = settings.LANGSMITH_API_KEY
os.environ["OPENAI_API_KEY"] = settings.OPENAI_API_KEY

### Setup LLM

In [None]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.8)

### Setup RAG

In [26]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

THINKPYTHON_CHAPTERS = [
    "https://allendowney.github.io/ThinkPython/chap01.html",
    "https://allendowney.github.io/ThinkPython/chap02.html",
    "https://allendowney.github.io/ThinkPython/chap03.html",
]

def load_python_docs():
    bs4_strainer = bs4.SoupStrainer(
        name=("h1", "h2", "h3", "p", "pre", "code")
    )

    loader = WebBaseLoader(
        web_paths=THINKPYTHON_CHAPTERS,
        bs_kwargs={"parse_only": bs4_strainer},
    )

    docs = loader.load()

    for d in docs:
        d.metadata["topic"] = "python"
        d.metadata["source"] = "thinkpython"

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True,
    )

    return splitter.split_documents(docs)


In [27]:

all_splits = load_python_docs()

print(f"Split blog post into {len(all_splits)} sub-documents.")

Split blog post into 68 sub-documents.


In [37]:
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore

embeddings = OpenAIEmbeddings()

vector_store = InMemoryVectorStore(embeddings)

document_ids = vector_store.add_documents(documents=all_splits)

retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5}
)


In [38]:
retriever.invoke("What is a function in Python?")

[Document(id='e19b4585-043b-4c6a-949b-a9eba7e2ec15', metadata={'source': 'thinkpython', 'topic': 'python', 'start_index': 0}, page_content='Think PythonFront MatterChaptersEnd MatterFunctions Contents You can order print and ebook versions of Think Python 3e from\nBookshop.org and\nAmazon.3. Functions#In the previous chapter we used several functions provided by Python, like int and float, and a few provided by the math module, like sqrt and pow.\nIn this chapter, you will learn how to create your own functions and run them.\nAnd we’ll see how one function can call another.\nAs examples, we’ll display lyrics from Monty Python songs.\nThese silly examples demonstrate an important feature – the ability to write your own functions is the foundation of programming.This chapter also introduces a new statement, the for loop, which is used to repeat a computation.3.1. Defining new functions#A function definition specifies the name of a new function and the sequence of statements that run when

### RAG Chain with Logging + Latency

In [138]:
import time
import json
from uuid import uuid4
import traceback

def rag_chain(query, retriever, llm):
    start = time.time()
    error = None
    traceback_str = None

    try:
        docs = retriever.invoke(query)
        context = "\n\n".join(d.page_content for d in docs)
        answer = llm.invoke(
            f"Answer using the following context:\n{context}\n\nQ: {query}"
        ).content

    except Exception as e:
        answer = None
        context = None
        error = str(e)
        traceback_str = traceback.format_exc()

    latency = time.time() - start

    log = {
        "id": str(uuid4()),
        "query": query,
        "answer": answer,
        "retrieved_context": context,
        "latency": latency,
        "error": error,
        "traceback": traceback_str,
    }

    with open("logs/rag_logs.jsonl", "a") as f:
        f.write(json.dumps(log) + "\n")

    return answer, context


In [139]:
answer, context = rag_chain(
    query="What is recursion in Python?",
    retriever=retriever,
    llm=llm
)
print("Context:", context)
print("Answer:", answer)

Context: If you are curious about any of them, ask for more information.In this chapter we imported the math module and used some of the variable and functions it provides. Ask an assistant, “What variables and functions are in the math module?” and “Other than math, what modules are considered core Python?”2.11.2. Exercise#Repeating my advice from the previous chapter, whenever you learn a new feature, you should make errors on purpose to see what goes wrong.We’ve seen that n = 17 is legal. What about 17 = n?How about x = y = 1?In some languages every statement ends with a semi-colon (;). What
happens if you put a semi-colon at the end of a Python statement?What if you put a period at the end of a statement?What happens if you spell the name of a module wrong and try to import maath?2.11.3. Exercise#Practice using the Python interpreter as a calculator:Part 1.  The volume of a sphere with radius \(r\) is \(\frac{4}{3} \pi r^3\).

Think PythonFront MatterChaptersEnd MatterFunctions Con

### Generate Synthetic Queries using LLM

In [48]:
FEATURES = [
    "Concept Explanation",
    "Code Example",
#     "Debugging",
#     "Comparison",
#     "Best Practices",
]

DIMENSIONS = {
    "difficulty": ["Beginner", "Intermediate", "Advanced"],
    "ambiguity": ["Clear", "Ambiguous"],
    # "answer_style": ["Concise", "Example-heavy"],
}

PERSONAS = [
    "Beginner Student",
    "Working Developer",
    # "Interview Candidate",
    # "Python Enthusiast",
]


In [148]:
import itertools
import json
import pandas as pd

def generate_synthetic_queries():
    rows = []

    # Iterate through your personas and features
    for persona, feature, difficulty in itertools.product(
        PERSONAS, FEATURES, DIMENSIONS["difficulty"]
    ):
        print(f"Generating → {persona} | {feature} | {difficulty}")

        # --- UPDATED PROMPT ---
        prompt = f"""
        You are a {persona}.
        Topic: {feature}
        Difficulty: {difficulty}

        Generate EXACTLY 3 pairs of Python questions and their ideal, correct answers.
        
        Return the output as a JSON List of Objects. 
        Each object must have these exact keys:
        - "question": The question string
        - "ground_truth": The detailed, correct answer string

        Rules:
        - The "ground_truth" should be factual and concise.
        - No markdown formatting in the JSON keys.
        - Return ONLY valid JSON.
        """

        response = llm.invoke(prompt)

        try:
            # Parse the JSON response
            data = json.loads(response.content)
        except json.JSONDecodeError:
            print("⚠️ JSON parse failed, skipping this combination")
            continue

        # Extract both question and ground_truth
        for item in data:
            # specific check to ensure structure is correct
            if isinstance(item, dict) and "question" in item and "ground_truth" in item:
                q = item["question"]
                gt = item["ground_truth"]
                
                if len(q.strip()) > 10:
                    rows.append({
                        "persona": persona,
                        "feature": feature,
                        "difficulty": difficulty,
                        "query": q.strip(),
                        "ground_truth": gt.strip() # <--- NEW COLUMN
                    })

    # Save the dataframe with the new column
    df = pd.DataFrame(rows)
    df.to_csv("datasets/synthetic_queries_with_gt.csv", index=False)

    print(f"✅ Generated {len(df)} questions with ground truth")
    return df

In [149]:
generate_synthetic_queries()

Generating → Beginner Student | Concept Explanation | Beginner
Generating → Beginner Student | Concept Explanation | Intermediate
Generating → Beginner Student | Concept Explanation | Advanced
Generating → Beginner Student | Code Example | Beginner
Generating → Beginner Student | Code Example | Intermediate
Generating → Beginner Student | Code Example | Advanced
Generating → Working Developer | Concept Explanation | Beginner
Generating → Working Developer | Concept Explanation | Intermediate
Generating → Working Developer | Concept Explanation | Advanced
Generating → Working Developer | Code Example | Beginner
Generating → Working Developer | Code Example | Intermediate
Generating → Working Developer | Code Example | Advanced
✅ Generated 36 questions with ground truth


Unnamed: 0,persona,feature,difficulty,query,ground_truth
0,Beginner Student,Concept Explanation,Beginner,What is a variable in Python?,A variable in Python is a named location in me...
1,Beginner Student,Concept Explanation,Beginner,What is a list in Python?,"A list in Python is a mutable, ordered collect..."
2,Beginner Student,Concept Explanation,Beginner,What does 'if __name__ == '__main__':' do in P...,The statement 'if __name__ == '__main__':' che...
3,Beginner Student,Concept Explanation,Intermediate,"What is a list in Python, and how is it differ...",A list in Python is a mutable collection of it...
4,Beginner Student,Concept Explanation,Intermediate,Explain the difference between '==' and 'is' o...,"'==' checks for value equality, meaning it tes..."
5,Beginner Student,Concept Explanation,Intermediate,"What is a dictionary in Python, and how do you...",A dictionary in Python is an unordered collect...
6,Beginner Student,Concept Explanation,Advanced,What is the difference between a shallow copy ...,A shallow copy creates a new object but insert...
7,Beginner Student,Concept Explanation,Advanced,What are Python decorators and how do they work?,Python decorators are functions that modify th...
8,Beginner Student,Concept Explanation,Advanced,Explain the concept of generators in Python.,Generators are a type of iterable that allows ...
9,Beginner Student,Code Example,Beginner,What is a variable in Python?,A variable in Python is a named location in me...


### Run RAG Chain on Synthetic Queries

In [152]:
def run_rag_on_synthetic():
    # Load the new dataset containing ground_truth
    df = pd.read_csv("datasets/synthetic_queries_with_gt.csv")
    results = []

    for _, row in df.iterrows():
        # Run your actual RAG system
        answer, context = rag_chain(
            row["query"], 
            retriever=retriever, 
            llm=llm
        )
        
        # Save results, preserving the ground_truth from the input row
        results.append({
            **row.to_dict(),          # Copies persona, feature, query, AND ground_truth
            "answer": answer,         # The answer your RAG system gave
            "retrieved_context": context
        })

    # Save to CSV (This is now ready for Ragas evaluation)
    pd.DataFrame(results).to_csv("datasets/rag_results.csv", index=False)

In [153]:
run_rag_on_synthetic()

### RAGAS Evaluation

In [154]:
from openai import AsyncOpenAI
from ragas.llms.base import llm_factory
from ragas.embeddings.base import embedding_factory

openai_client = AsyncOpenAI()


ragas_embeddings = embedding_factory(
    provider="openai",
    model="text-embedding-ada-002",
    client=openai_client,
    interface="modern",
)

ragas_llm = llm_factory(
    "gpt-4o-mini",
    client=openai_client,
)


In [155]:
import pandas as pd
from datasets import Dataset
from ragas import evaluate
# UPDATED IMPORTS to fix DeprecationWarning
from ragas.metrics import (
    AnswerRelevancy,
    Faithfulness,
    ContextPrecision,
    ContextRecall,
)

def run_ragas():
    df = pd.read_csv("datasets/rag_results.csv")

    # 1. Map columns to Ragas requirements
    data_dict = {
        "user_input": df["query"].tolist(),          # Ragas v1.0 prefers 'user_input' over 'question'
        "response": df["answer"].fillna("").tolist(), # Ragas v1.0 prefers 'response' over 'answer'
        "retrieved_contexts": df["retrieved_context"] # Ragas v1.0 prefers 'retrieved_contexts'
            .fillna("")
            .apply(lambda x: [x]) 
            .tolist(),
        # CRITICAL FIX: Map your ground truth column to "reference"
        "reference": df["ground_truth"].tolist() 
    }

    rag_dataset = Dataset.from_dict(data_dict)

    # 2. Instantiate metrics
    metrics_list = [
        AnswerRelevancy(),
        Faithfulness(),
        ContextPrecision(),
        ContextRecall(),
    ]

    scores = evaluate(
        dataset=rag_dataset,
        metrics=metrics_list,
        llm=ragas_llm,
        embeddings=ragas_embeddings,
        raise_exceptions=False,
    )
    
    # ... rest of your export logic ...
    return scores.to_pandas()

  from ragas.metrics import (
  from ragas.metrics import (
  from ragas.metrics import (
  from ragas.metrics import (


In [156]:
rag_eval = run_ragas()

Evaluating:   0%|          | 0/144 [00:00<?, ?it/s]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[8]: AttributeError('OpenAIEmbeddings' object has no attribute 'embed_query')
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[0]: AttributeError('OpenAIEmbeddings' object has no attribute 'embed_query')
Evaluating:   1%|          | 1/144 [00:05<14:16,  5.99s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[12]: AttributeError('OpenAIEmbeddings' object has no attribute 'embed_query')
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[4]: AttributeError('OpenAIEmbeddings' object has no attribute 'embed_query')
Evaluating:   7%|▋         | 10/144 [00:39<07:06,  3.18s/it]LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raise

In [161]:
rag_eval.head()

Unnamed: 0,user_input,retrieved_contexts,response,reference,answer_relevancy,faithfulness,context_precision,context_recall
0,What is a variable in Python?,[Think PythonFront MatterChaptersEnd MatterVar...,A variable in Python is a name that refers to ...,A variable in Python is a named location in me...,,1.0,1.0,0.5
1,What is a list in Python?,[None break except in r...,"In Python, a list is a built-in data structure...","A list in Python is a mutable, ordered collect...",,0.0,0.0,0.0
2,What does 'if __name__ == '__main__':' do in P...,[When you create a variable outside of any fun...,The statement `if __name__ == '__main__':` in ...,The statement 'if __name__ == '__main__':' che...,,0.0,0.0,1.0
3,"What is a list in Python, and how is it differ...",[None break except in r...,"In Python, a list is a mutable, ordered collec...",A list in Python is a mutable collection of it...,,0.0,0.0,1.0
4,Explain the difference between '==' and 'is' o...,"[Like scientists, they observe the behavior of...","In Python, the `==` operator and the `is` oper...","'==' checks for value equality, meaning it tes...",,0.0,0.0,0.0


In [162]:
rag_eval.to_csv("datasets/rag_evaluation_results.csv", index=False)

In [159]:
import json
import numpy as np

def latency_report():
    latencies = []
    with open("logs/rag_logs.jsonl") as f:
        for line in f:
            latencies.append(json.loads(line)["latency"])

    return {
        "avg_latency": np.mean(latencies),
        "p95_latency": np.percentile(latencies, 95),
        "p99_latency": np.percentile(latencies, 99),
    }


In [160]:
latency_report()

{'avg_latency': np.float64(6.217635737525092),
 'p95_latency': np.float64(10.250079989433289),
 'p99_latency': np.float64(10.540699219703674)}