# Eval Dataset

In [None]:
from openai import OpenAI
import os
import psycopg2
import instructor
from pydantic import BaseModel
import openai


In [2]:
# Database credentials are taken from the environment so they never appear in
# the notebook; either value is None when its variable is not set.
postgres_username = os.environ.get("POSTGRES_USERNAME")
postgres_pwd = os.environ.get("POSTGRES_PASSWORD")

In [3]:
# Connection settings for the attractions database. The user and password are
# the values read from the environment in the previous cell.
db_settings = {
    "dbname": "postgresdb",
    "user": postgres_username,
    "password": postgres_pwd,
    "host": "host.docker.internal",  # use e.g. "localhost" when not running inside Docker
    "port": "5433",  # NOTE: non-default port (PostgreSQL's default is 5432)
}
conn = psycopg2.connect(**db_settings)
cursor = conn.cursor()

In [4]:
# Fetch every attraction record and flatten each one into a single
# comma-separated string; these strings are the text chunks used by the
# prompt and upload cells further down.
cursor.execute("SELECT * FROM public.us_attractions")
rows = cursor.fetchall()

row_list = []
for record in rows:
    row_list.append(", ".join(str(value) for value in record))
formatted_rows = "\n".join(row_list)

# Preview the first two chunks as the cell output.
row_list[:2]

['Forsyth Park, Park, 4.8, 16538.0, Park, Tourist attraction, Forsyth Park, Savannah, GA 31401, Savannah, USA, GA, None, Nature, 79382.4, 4.67, Atlanta, Augusta, Chattanooga, Savannah',
 'The Cathedral Basilica of St. John the Baptist, Catholic cathedral, 4.8, 5911.0, Catholic cathedral, Catholic church, Tourist attraction, The Cathedral Basilica of St. John the Baptist, 222 E Harris St, Savannah, GA 31401, Savannah, USA, GA, None, Religious, 28372.8, 4.8, Atlanta, Augusta, Chattanooga, Savannah']

## Render prompt to generate synthetic Eval reference dataset

In [5]:
import json

# JSON Schema for the structure the model is asked to return: an array of
# objects, each pairing a question with the indexes of the chunks that ground
# it, an example answer, and the reasoning behind the chunk selection.
output_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "question": {
                "type": "string",
                "description": "Suggested question.",
            },
            "chunk_ids": {
                "type": "array",
                "items": {
                    "type": "integer",
                    "description": "Index of the chunk that could be used to answer the question.",
                },
            },
            "answer_example": {
                "type": "string",
                "description": "Suggested answer grounded in the context.",
            },
            "reasoning": {
                "type": "string",
                "description": "Reasoning why the question could be answered with the chunks.",
            },
        },
    },
}

# Every row becomes one chunk whose id is its position in row_list; the upload
# cell later resolves the returned chunk ids against the same list.
chunks = [{"id": i, "text": data} for i, data in enumerate(row_list)]

# The chunk count is interpolated from the data rather than hard-coded, so the
# instructions always agree with the number of chunks actually sent.
SYSTEM_PROMPT = f"""
I am building a RAG application. I have a collection of {len(chunks)} chunks of text.
The RAG application will act as a travel booking assistant that can answer questions about US tourist attraction places we have available.
I will provide all of the available locations to you with indexes of each chunk.
Each chunk represents a record from SQL table with the following schema:
Columns:
- name VARCHAR(250)
- main_category VARCHAR(250)
- rating REAL
- reviews REAL
- categories VARCHAR(250)
- address VARCHAR(250)
- city VARCHAR(250)
- country VARCHAR(250)
- state VARCHAR(250)
- zipcode INTEGER
- broader_category VARCHAR(250)
- weighted_score REAL
- weighted_average REAL
- all_cities VARCHAR(250)
I want you to come up with 30 questions to which the answers could be grounded in the chunk context.
As an output I need you to provide me the list of questions and the indexes of the chunks that could be used to answer them.
Also, provide an example answer to the question given the context of the chunks.
Also, provide the reason why you chose the chunks to answer the questions.
Try to have a mix of questions that could use multiple chunks and questions that could use single chunk.
Also, include 5 questions that can't be answered with the available chunks.


{json.dumps(output_schema, indent=2)}


I need to be able to parse the json output.
"""

# The chunk list is rendered with Python's default list/dict formatting.
USER_PROMPT = f"""
Here is the list of chunks, each list element is a dictionary with id and text:
{chunks}
"""

In [10]:
# The user prompt embeds every chunk, so show only its beginning instead of
# flooding the cell output with the whole table.
PROMPT_PREVIEW_CHARS = 2000
print(USER_PROMPT[:PROMPT_PREVIEW_CHARS])
if len(USER_PROMPT) > PROMPT_PREVIEW_CHARS:
    print(f"... [{len(USER_PROMPT) - PROMPT_PREVIEW_CHARS} more characters not shown]")


Here is the list of chunks, each list element is a dictionary with id and text:
[{'id': 0, 'text': 'Forsyth Park, Park, 4.8, 16538.0, Park, Tourist attraction, Forsyth Park, Savannah, GA 31401, Savannah, USA, GA, None, Nature, 79382.4, 4.67, Atlanta, Augusta, Chattanooga, Savannah'}, {'id': 1, 'text': 'The Cathedral Basilica of St. John the Baptist, Catholic cathedral, 4.8, 5911.0, Catholic cathedral, Catholic church, Tourist attraction, The Cathedral Basilica of St. John the Baptist, 222 E Harris St, Savannah, GA 31401, Savannah, USA, GA, None, Religious, 28372.8, 4.8, Atlanta, Augusta, Chattanooga, Savannah'}, {'id': 2, 'text': 'Fort Pulaski National Monument, Monument, 4.8, 5221.0, Monument, Historical place, Historical landmark, Tourist attraction, Fort Pulaski National Monument, 101 Fort Pulaski Rd, Savannah, GA, Savannah, USA, GA, None, Cultural, 25060.8, 4.53, Atlanta, Augusta, Chattanooga, Savannah'}, {'id': 3, 'text': 'Fountain at Forsyth Park, Historical landmark, 4.8, 4234.

## Generate synthetic eval reference data

In [6]:
# Ask the model for the synthetic evaluation set: the instructions go in the
# system message and the chunk listing in the user message.
chat_messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": USER_PROMPT},
]
response = openai.chat.completions.create(model="gpt-4.1", messages=chat_messages)

# Show the raw reply of the first choice for inspection before it is parsed.
print(response.choices[0].message.content)

```json
[
  {
    "question": "What are some top-rated tourist attractions in Savannah, Georgia?",
    "chunk_ids": [0, 1, 2, 3, 4, 5, 9],
    "answer_example": "Some top-rated tourist attractions in Savannah, Georgia include Forsyth Park (4.8 rating), The Cathedral Basilica of St. John the Baptist (4.8), Fort Pulaski National Monument (4.8), and Hearse Ghost Tours (4.9).",
    "reasoning": "Chunks 0–5 and 9 represent highly-rated tourist attractions in Savannah, GA, based on their ratings and city information."
  },
  {
    "question": "Are there any Catholic cathedrals listed as attractions?",
    "chunk_ids": [1, 90, 272, 763, 1178, 1479, 1517, 1620, 1783, 1862, 1950, 2177, 2722, 2917],
    "answer_example": "Some Catholic cathedrals featured include The Cathedral Basilica of St. John the Baptist in Savannah, Cathedral of St. John the Evangelist in Lafayette, and the Cathedral Basilica of the Immaculate Conception in Denver.",
    "reasoning": "Chunks contain 'Catholic cathedral' in

## Clean up output and make it json parseable

In [7]:
import json
import re


def _parse_model_json(raw_text):
    """Extract and parse the JSON array contained in a chat model reply.

    The reply is expected to hold a JSON array, usually wrapped in a Markdown
    code fence. Text outside a complete fence is ignored; when no complete
    fence is present, stray fence markers are removed instead. Lines made up
    solely of a ``//`` comment are dropped before parsing, since they are not
    valid JSON.

    Args:
        raw_text: The message content returned by the model.

    Returns:
        The parsed list of question records.

    Raises:
        ValueError: If the reply is empty or the parsed payload is not a list.
        json.JSONDecodeError: If the extracted payload is not valid JSON.
    """
    if not raw_text:
        raise ValueError("Model returned an empty response; nothing to parse.")

    fenced = re.search(r"```[A-Za-z0-9_-]*\s*(.*?)```", raw_text, flags=re.DOTALL)
    if fenced:
        payload = fenced.group(1)
    else:
        # No complete fence (e.g. a truncated reply): strip stray markers.
        payload = raw_text.replace("```json", "").replace("```", "")

    # A JSON string cannot span lines, so a line that starts with "//" can
    # only be a comment added by the model, never part of a value.
    payload = "\n".join(
        line for line in payload.splitlines() if not line.lstrip().startswith("//")
    )

    parsed = json.loads(payload)
    if not isinstance(parsed, list):
        raise ValueError(
            f"Expected a JSON array of questions, got {type(parsed).__name__}."
        )
    return parsed


json_output = _parse_model_json(response.choices[0].message.content)

## Upload dataset to LangSmith

In [8]:
from langsmith import Client
import os

dataset_name = "travel-rag-evaluation-dataset"

# The API key is read from the environment; a missing variable raises KeyError
# here rather than failing later inside the client.
client = Client(api_key=os.environ["LANGSMITH_API_KEY"])

# NOTE(review): presumably this call fails if a dataset with this name already
# exists, which would block re-running the notebook; confirm before re-running.
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Dataset for evaluating RAG pipeline",
)

In [14]:

for item in json_output:
    # Chunk ids are produced by the model, so they are validated before being
    # used as indexes: an out-of-range id would raise IndexError part-way
    # through the upload, and a negative id would silently select the wrong
    # row. Questions meant to be unanswerable legitimately have no ids.
    requested_ids = item.get("chunk_ids") or []
    valid_ids, rejected_ids = [], []
    for idx in requested_ids:
        in_range = isinstance(idx, int) and 0 <= idx < len(row_list)
        (valid_ids if in_range else rejected_ids).append(idx)
    if rejected_ids:
        print(
            f"Dropping invalid chunk ids {rejected_ids} "
            f"for question: {item['question']!r}"
        )

    # context_ids and contexts are built from the same validated list so they
    # always line up one-to-one.
    client.create_example(
        dataset_id=dataset.id,
        inputs={"question": item["question"]},
        outputs={
            "ground_truth": item["answer_example"],
            "context_ids": valid_ids,
            "contexts": [row_list[idx] for idx in valid_ids],
        },
    )
     