# Dataset Creation

## RAG Evaluation Dataset Creation

In [2]:
import sys
import os

# sys.path.append(".")
# sys.path.append(os.path.abspath('.'))  # Add project root
# sys.path.append(os.path.abspath('./src'))  # Add src directory

# The project root sits two directories above the current working directory;
# put it at the front of sys.path so project packages resolve from there first.
project_root = os.path.abspath(os.path.join(os.pardir, os.pardir))
sys.path.insert(0, project_root)


from evaluations.utils.dataset_generator import DatasetGenerator
import json
import pandas as pd

  llm = Ollama(model=OLLAMA_MODEL, temperature=DEFAULT_TEMPERATURE)


In [8]:
#!/usr/bin/env python3
"""
Standalone script to create RAG evaluation dataset.
This shows exactly what the Jupyter notebook does to generate the evaluation data.
"""

def main(
    city: str = "seattle",
    num_listings: int = 100,
    chunk_size: int = 100,
    seed: int = 42,
    output_dir: str = "../../evaluations/datasets/rag_evaluation",
) -> None:
    """Create a RAG evaluation dataset and print a summary of its contents.

    The function builds a ``DatasetGenerator``, asks it to create the chunk
    and query files, loads them back, and prints counts, sample records,
    the query category distribution, chunk length statistics, and a
    breakdown of one randomly chosen listing. All results are printed;
    nothing is returned.

    Args:
        city: City passed to ``create_rag_evaluation_dataset``.
        num_listings: Number of listings requested. Also used as the
            denominator for the per-listing averages.
        chunk_size: Target chunk size passed to the generator (printed as
            a word count).
        seed: Seed passed to the generator for reproducibility.
        output_dir: Directory handed to ``DatasetGenerator``.
    """
    import random

    print("=== RAG Evaluation Dataset Creation ===\n")

    # 1. Initialize Dataset Generator
    print("1. Initializing Dataset Generator...")
    generator = DatasetGenerator(output_dir=output_dir)
    print(f"   Output directory: {generator.output_dir}")

    # 2. Create Evaluation Dataset
    print("\n2. Creating evaluation dataset...")
    print(f"   City: {city}")
    print(f"   Number of listings: {num_listings}")
    print(f"   Chunk size: {chunk_size} words")

    # Broad catch is deliberate: this is the top-level boundary of a
    # walkthrough, so any generator failure is reported and the run stops.
    try:
        chunks_file, queries_file = generator.create_rag_evaluation_dataset(
            city=city,
            num_listings=num_listings,
            chunk_size=chunk_size,
            seed=seed,  # For reproducibility
        )
    except Exception as e:
        print(f"   ❌ Error creating dataset: {e}")
        return

    print("\n   ✅ Dataset created successfully!")
    print(f"   📁 Chunks file: {chunks_file}")
    print(f"   📁 Queries file: {queries_file}")

    # 3. Load and Explore the Generated Dataset
    print("\n3. Exploring generated dataset...")
    chunks, queries = generator.load_evaluation_dataset(chunks_file, queries_file)

    print(f"   Total chunks: {len(chunks)}")
    print(f"   Total queries: {len(queries)}")
    # Guard the division so a zero listing count cannot raise.
    if num_listings > 0:
        print(f"   Average chunks per listing: {len(chunks) / num_listings:.1f}")
        print(f"   Average queries per listing: {len(queries) / num_listings:.1f}")

    # 4./5. Show Sample Data (an empty dataset must not raise IndexError)
    print("\n4. Sample chunk:")
    if chunks:
        print(json.dumps(chunks[0], indent=2))
    else:
        print("   (no chunks generated)")

    print("\n5. Sample query:")
    if queries:
        print(json.dumps(queries[0], indent=2))
    else:
        print("   (no queries generated)")

    # 6. Analyze Query Categories
    print("\n6. Query category distribution:")
    categories = [q.get("category", "unknown") for q in queries]
    # Explicit dtype keeps an empty list from triggering dtype inference.
    category_counts = pd.Series(categories, dtype="object").value_counts()

    for category, count in category_counts.items():
        print(f"   {category}: {count} queries")

    # 7. Analyze Chunk Statistics
    print("\n7. Chunk length statistics:")
    chunk_lengths = [len(chunk["chunk_text"].split()) for chunk in chunks]

    if chunk_lengths:
        length_series = pd.Series(chunk_lengths)  # built once for mean and median
        print(f"   Mean length: {length_series.mean():.1f} words")
        print(f"   Median length: {length_series.median():.1f} words")
        print(f"   Min length: {min(chunk_lengths)} words")
        print(f"   Max length: {max(chunk_lengths)} words")
    else:
        print("   (no chunks to analyze)")
    print(f"   Target chunk size: {chunk_size} words")

    # 8. Show Example Listing with its Chunks and Queries
    print("\n8. Example listing breakdown:")

    if queries:
        # Pick a listing that is known to have at least one query.
        sample_listing_id = random.choice([q["listing_id"] for q in queries])

        listing_chunks = [c for c in chunks if c["listing_id"] == sample_listing_id]
        listing_queries = [q for q in queries if q["listing_id"] == sample_listing_id]
        listing_name = listing_chunks[0]["listing_name"] if listing_chunks else "Unknown"

        print(f"   Listing ID: {sample_listing_id}")
        print(f"   Listing Name: {listing_name}")
        print(f"   Number of chunks: {len(listing_chunks)}")
        print(f"   Number of queries: {len(listing_queries)}")

        if listing_chunks:
            print("\n   First chunk preview:")
            print(f"   {listing_chunks[0]['chunk_text'][:200]}...")

        print("\n   Sample queries for this listing:")
        for i, query in enumerate(listing_queries[:3], start=1):
            print(f"   {i}. Q: {query['query']}")
            print(f"      Expected: {query['expected_answer']}")
            # Same fallback as the category distribution above.
            print(f"      Category: {query.get('category', 'unknown')}")
    else:
        print("   (no queries generated; skipping listing breakdown)")

    print("\n=== Dataset Creation Complete! ===")
    print("You can now use these files with the RAG evaluator:")
    print(f"- Chunks: {chunks_file}")
    print(f"- Queries: {queries_file}")


# Run the walkthrough only when this code executes as the top-level program;
# importing it as a module defines main() without running it.
if __name__ == "__main__":
    main()


=== RAG Evaluation Dataset Creation ===

1. Initializing Dataset Generator...
   Output directory: ../../evaluations/datasets/rag_evaluation

2. Creating evaluation dataset...
   City: seattle
   Number of listings: 100
   Chunk size: 100 words
Successfully loaded 6770 records from /Users/aus10powell/Documents/Projects/AirRanker/data/seattle/listings.parquet
Generated 250 chunks and 596 queries
Chunks saved to: ../../evaluations/datasets/rag_evaluation/chunks_seattle_100listings_20250529_204239.json
Queries saved to: ../../evaluations/datasets/rag_evaluation/queries_seattle_100listings_20250529_204239.json

   ✅ Dataset created successfully!
   📁 Chunks file: ../../evaluations/datasets/rag_evaluation/chunks_seattle_100listings_20250529_204239.json
   📁 Queries file: ../../evaluations/datasets/rag_evaluation/queries_seattle_100listings_20250529_204239.json

3. Exploring generated dataset...
   Total chunks: 250
   Total queries: 596
   Average chunks per listing: 2.5
   Average queries 