**Step 0: Imports, constants, and API Keys!**

In [1]:
from dotenv import load_dotenv
import os

# Read the app's .env file so its variables become visible through the
# process environment.
load_dotenv('../app/.env')

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# A missing or empty key is only reported here; execution is not stopped.
# NOTE(review): later cells that build OpenAI clients will presumably fail
# without the key — confirm whether this should raise instead.
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    print("Error retrieving API keys")

**Step 1: Generate synthetic data**

In [11]:
from ragas.testset import TestsetGenerator
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Clients used for synthetic data generation. ChatOpenAI() and
# OpenAIEmbeddings() are built with the library defaults; the critic is
# explicitly set to gpt-4o.
generator_llm = ChatOpenAI()
critic_llm = ChatOpenAI(model="gpt-4o")
embeddings = OpenAIEmbeddings()

# Initialize the test set generator with the generator LLM configured above,
# rather than constructing a second, separate ChatOpenAI() instance.
# NOTE(review): critic_llm and embeddings are not handed to the generator in
# this cell — confirm whether they are consumed elsewhere or can be removed.
generator = TestsetGenerator.from_langchain(llm=generator_llm)

In [12]:
import json
from langchain.schema import Document

# Load the document corpus that the synthetic test set will be generated from.
# The file is expected to hold a JSON array of objects, each with a
# "page_content" string and an optional "metadata" object.
myfile = "source_documents.json"

# Read with an explicit UTF-8 encoding so decoding does not depend on the
# platform's default locale encoding.
with open(myfile, 'r', encoding="utf-8") as file:
    data = json.load(file)

# Convert JSON data into a list of LangChain Document objects.
# `or {}` covers both a missing "metadata" key and an explicit null value.
docs = [
    Document(page_content=item["page_content"], metadata=item.get("metadata") or {})
    for item in data
]

print(f"loaded {len(docs)} docs")


loaded 216 docs


In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter configuration: the chunk size is intentionally different from the
# one the app uses.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=100)

# Break every source document into chunks. Each chunk becomes its own Document
# that carries the parent's metadata plus a "chunk_id" holding the chunk's
# position within that parent.
split_docs = [
    Document(
        page_content=chunk_text,
        metadata={**parent.metadata, "chunk_id": chunk_idx},
    )
    for parent in docs
    for chunk_idx, chunk_text in enumerate(text_splitter.split_text(parent.page_content))
]

print(f"len(docs): {len(docs)}, len(split_docs):{len(split_docs)}")

len(docs): 216, len(split_docs):687


No distribution is passed to the generator in the next cell, so the default distributions are kept: 50% reasoning, 30% simple, 20% multi-context.

In [None]:
# Value passed as the second positional argument of
# generate_with_langchain_docs (the requested test set size per the ragas API).
TESTSET_SIZE = 30

# Run the generator over the chunked corpus with debugging logs switched on.
testset = generator.generate_with_langchain_docs(
    split_docs,
    TESTSET_SIZE,
    with_debugging_logs=True,
)

Applying [SummaryExtractor, HeadlinesExtractor]:  34%|███▍      | 466/1374 [00:50<01:51,  8.17it/s]Prompt fix_output_format failed to parse output: The output parser failed to parse the output after 0 retries.
Prompt headlines_extractor_prompt failed to parse output: The output parser failed to parse the output after 0 retries.
unable to apply transformation: The output parser failed to parse the output after 0 retries.
Applying [SummaryExtractor, HeadlinesExtractor]:  67%|██████▋   | 923/1374 [01:43<00:52,  8.64it/s]Prompt headlines_extractor_prompt failed to parse output: The output parser failed to parse the output after 0 retries.
unable to apply transformation: The output parser failed to parse the output after 0 retries.
Applying HeadlineSplitter:   0%|          | 1/687 [00:00<01:45,  6.51it/s]                          unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
Applying [E

**Step 2: Save for later re-use**

In [9]:
import pandas as pd

# Generating the test data costs money, time, and compute, so make sure to save it for later re-use.
# DataFrame.to_csv returns None when it is given a file path, so the DataFrame
# is bound to test_df first and written to disk as a separate step.
test_df = testset.to_pandas()
test_df.to_csv("ragas_test_data.csv", index=False)
