**Step 0: Imports, constants, and API Keys!**

In [1]:
!pip install -q langchain==0.2.16 langchain_core==0.2.38 langchain_community==0.2.16 pymupdf openai 
!pip install -q langchain_openai==0.1.23 langchain-qdrant qdrant_client asyncio ragas==0.1.14 pandas
!pip install -q langsmith

In [3]:
import os
import openai
from getpass import getpass

# Prompt for the OpenAI key once, then store the same value on the openai
# module and in the OPENAI_API_KEY environment variable.
openai.api_key = os.environ["OPENAI_API_KEY"] = getpass("OpenAI API Key: ")

**Step 1: Generate synthetic data**

In [6]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context, conditional
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from vars import GENERATOR_LLM, CRITIC_LLM

# Models handed to the RAGAS test-set generator: a generator LLM, a critic LLM,
# and an embedding model. Model names come from the project's `vars` module.
embeddings = OpenAIEmbeddings()
critic_llm = ChatOpenAI(model=CRITIC_LLM)
generator_llm = ChatOpenAI(model=GENERATOR_LLM)

# Build the test-set generator from the LangChain model objects.
generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)

# Weight assigned to each evolution type; the weights sum to 1.0.
distributions = dict(
    [
        (simple, 0.5),
        (multi_context, 0.3),
        (reasoning, 0.1),
        (conditional, 0.1),
    ]
)

In [7]:
# re-chunk the data using a different size, then generate the synthetic test set
from vars import RAGAS_CHUNK_SIZE, RAGAS_OVERLAP, N_EVAL_QUESTIONS

importlib.reload(vanilla_rag)
for pdf in PDFS:
    ragas_chunks = await vanilla_rag.load_and_chunk_pdf(pdf,RAGAS_CHUNK_SIZE,RAGAS_OVERLAP)

testset = generator.generate_with_langchain_docs(ragas_chunks, N_EVAL_QUESTIONS, distributions, with_debugging_logs=True)

Loading https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf...
Chunking...
Loading https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf...
Chunking...


embedding nodes:   0%|          | 0/520 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/30 [00:00<?, ?it/s]

[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 1, 'structure': 1, 'relevance': 1, 'score': 1.0}
[ragas.testset.evolutions.INFO] retrying evolution: 0 times
[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 2, 'structure': 1, 'relevance': 2, 'score': 1.5}
[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['AI life cycle', 'Harmful Bias', 'Fact-checking techniques', 'GAI systems', 'Information Integrity']
[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 2, 'structure': 1, 'relevance': 2, 'score': 1.5}
[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Organizational risk tolerance', 'GAI system outputs', 'Safety and validity review', 'Information integrity', 'Security anomalies']
[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}
[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Sensitive information', 'Adversarial attack

**Step 2: Save for later re-use**

In [9]:
import pandas as pd
from vars import TEST_DATASET_FILE

# Generating the test data costs money, time, and compute, so make sure to save it for later re-use.
# DataFrame.to_csv returns None when it is given a path, so keep the DataFrame
# in `test_df` first and write the file as a separate step.
test_df = testset.to_pandas()
test_df.to_csv(TEST_DATASET_FILE, index=False)
