In [1]:
# 03_retrieval_testing.ipynb - Vector Database and Retrieval Evaluation

import os
import time
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import uuid
import warnings
warnings.filterwarnings('ignore')

# Setup paths
PROJECT_ROOT = Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()
PROCESSED_DIR = PROJECT_ROOT / 'data' / 'processed'

print("Loading processed chunks...")
chunks_file = PROCESSED_DIR / 'processed_chunks.pkl'

if not chunks_file.exists():
    print("ERROR: Processed chunks not found. Please run 02_chunking_strategy.ipynb first.")
    exit()

with open(chunks_file, 'rb') as f:
    chunks = pickle.load(f)

print(f"Loaded {len(chunks)} chunks")
print(f"Average tokens per chunk: {np.mean([c['token_count'] for c in chunks]):.1f}")

# Initialize embedding model
print("\nInitializing embedding model...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_dimension = embedding_model.get_sentence_embedding_dimension()
print(f"Model loaded. Embedding dimension: {embedding_dimension}")

# Generate embeddings for all chunks
print(f"\nGenerating embeddings for {len(chunks)} chunks...")
start_time = time.time()

chunk_embeddings = []
for i, chunk in enumerate(chunks):
    embedding = embedding_model.encode(chunk['text'])
    chunk_embeddings.append(embedding)
    
    if (i + 1) % 5 == 0 or i == len(chunks) - 1:
        print(f"  Generated {i + 1}/{len(chunks)} embeddings")

embedding_time = time.time() - start_time
print(f"Embeddings generated in {embedding_time:.2f} seconds")
print(f"Average time per chunk: {embedding_time/len(chunks):.3f} seconds")

# Initialize Qdrant client
print(f"\nConnecting to Qdrant...")
try:
    qdrant_client = QdrantClient("localhost", port=6333)
    collections = qdrant_client.get_collections()
    print(f"Connected to Qdrant. Existing collections: {len(collections.collections)}")
except Exception as e:
    print(f"ERROR: Failed to connect to Qdrant: {e}")
    print("Make sure Qdrant is running: docker run -p 6333:6333 qdrant/qdrant")
    exit()

# Create collection
collection_name = "pandas_docs_improved"
print(f"\nCreating collection: {collection_name}")

try:
    # Delete if exists
    try:
        qdrant_client.delete_collection(collection_name)
        print("  Deleted existing collection")
    except:
        pass
    
    # Create new collection
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
    )
    print("  Collection created successfully")
except Exception as e:
    print(f"  ERROR creating collection: {e}")
    exit()

# Prepare and insert points
print(f"\nPreparing data points...")
points = []

for i, (chunk, embedding) in enumerate(zip(chunks, chunk_embeddings)):
    
    # Create rich metadata
    point = PointStruct(
        id=str(uuid.uuid4()),
        vector=embedding.tolist(),
        payload={
            "text": chunk['text'],
            "token_count": chunk['token_count'],
            "content_type": chunk['content_type'],
            "chunk_index": chunk['chunk_index'],
            "global_chunk_id": chunk['global_chunk_id'],
            "source_pages": chunk['source_pages'],
            "page_count": chunk['page_count'],
            "code_score": chunk['features']['code_score'],
            "concept_score": chunk['features']['concept_score'],
            "is_code_heavy": chunk['features']['is_code_heavy'],
            "is_concept_heavy": chunk['features']['is_concept_heavy'],
            "preview": chunk['text'][:300] + "..." if len(chunk['text']) > 300 else chunk['text']
        }
    )
    points.append(point)

print(f"Prepared {len(points)} points for insertion")

# Insert points
print(f"Inserting points into Qdrant...")
try:
    result = qdrant_client.upsert(
        collection_name=collection_name,
        points=points
    )
    print(f"  Inserted {len(points)} points successfully")
    print(f"  Operation result: {result}")
except Exception as e:
    print(f"  ERROR inserting points: {e}")
    exit()

# Verify insertion
print(f"\nVerifying insertion...")
try:
    count_result = qdrant_client.count(collection_name)
    print(f"  Total points in collection: {count_result.count}")
    
    if count_result.count != len(chunks):
        print(f"  WARNING: Expected {len(chunks)} points, found {count_result.count}")
    else:
        print(f"  Verification successful")
except Exception as e:
    print(f"  ERROR during verification: {e}")

# Create comprehensive test queries
test_queries = [
    # Basic concepts
    "What is a pandas DataFrame?",
    "What is a pandas Series?", 
    "Difference between Series and DataFrame",
    
    # How-to questions
    "How to create a DataFrame?",
    "How to read CSV files in pandas?",
    "How to select data from DataFrame?",
    "How to filter DataFrame rows?",
    
    # Specific functions
    "pandas groupby function",
    "pandas merge function", 
    "pandas concat function",
    "DataFrame indexing methods",
    
    # Code examples
    "pandas DataFrame code examples",
    "Series creation syntax",
    "pandas data manipulation examples",
    
    # Advanced topics
    "pandas data cleaning techniques",
    "handling missing data in pandas",
    "pandas performance optimization",
    "pandas date and time handling"
]

def evaluate_retrieval(query, collection_name, top_k=3, show_results=False):
    """Evaluate retrieval quality for a single query"""
    
    # Generate query embedding
    query_embedding = embedding_model.encode(query)
    
    # Search
    search_results = qdrant_client.query_points(
        collection_name=collection_name,
        query=query_embedding.tolist(),
        limit=top_k
    )
    
    results = search_results.points
    
    if show_results:
        print(f"\nQuery: '{query}'")
        print(f"Found {len(results)} results:")
        
        for i, result in enumerate(results, 1):
            score = result.score
            content_type = result.payload['content_type']
            tokens = result.payload['token_count']
            pages = result.payload['page_count']
            code_score = result.payload['code_score']
            concept_score = result.payload['concept_score']
            
            # Score quality indicator
            quality = "Excellent" if score > 0.8 else "Good" if score > 0.6 else "Fair" if score > 0.4 else "Poor"
            
            print(f"\n  Result {i}: {score:.4f} ({quality})")
            print(f"    Content: {content_type} | Tokens: {tokens} | Pages: {pages}")
            print(f"    Code Score: {code_score} | Concept Score: {concept_score}")
            print(f"    Preview: {result.payload['preview']}")
    
    return results

# Run comprehensive evaluation
print(f"\nRunning comprehensive retrieval evaluation...")
print(f"Testing {len(test_queries)} queries with improved chunks:")

evaluation_results = {}
total_scores = []
quality_distribution = {"Excellent": 0, "Good": 0, "Fair": 0, "Poor": 0}

for i, query in enumerate(test_queries):
    results = evaluate_retrieval(query, collection_name, top_k=3, show_results=False)
    
    if results:
        top_score = results[0].score
        avg_score = np.mean([r.score for r in results])
        
        # Categorize quality
        if top_score > 0.8:
            quality = "Excellent"
        elif top_score > 0.6:
            quality = "Good"
        elif top_score > 0.4:
            quality = "Fair"
        else:
            quality = "Poor"
        
        quality_distribution[quality] += 1
        total_scores.append(top_score)
        
        evaluation_results[query] = {
            'top_score': top_score,
            'avg_score': avg_score,
            'quality': quality,
            'results_count': len(results)
        }
        
        print(f"  {i+1:2d}. {quality:9s} ({top_score:.3f}) - {query}")
    else:
        print(f"  {i+1:2d}. No Results - {query}")

# Calculate comprehensive metrics
print(f"\nRetrieval Quality Analysis:")
print(f"Total queries tested: {len(test_queries)}")
print(f"Average top score: {np.mean(total_scores):.3f}")
print(f"Median top score: {np.median(total_scores):.3f}")
print(f"Standard deviation: {np.std(total_scores):.3f}")

print(f"\nQuality Distribution:")
for quality, count in quality_distribution.items():
    percentage = (count / len(test_queries)) * 100
    print(f"  {quality:9s}: {count:2d} queries ({percentage:4.1f}%)")

# Improvement analysis (compare with baseline expectations)
excellent_rate = quality_distribution["Excellent"] / len(test_queries)
good_plus_rate = (quality_distribution["Excellent"] + quality_distribution["Good"]) / len(test_queries)

print(f"\nPerformance Metrics:")
print(f"  Excellent retrieval rate: {excellent_rate:.1%}")
print(f"  Good+ retrieval rate: {good_plus_rate:.1%}")
print(f"  Average relevance score: {np.mean(total_scores):.3f}")

# Show best and worst performing queries
if evaluation_results:
    best_queries = sorted(evaluation_results.items(), key=lambda x: x[1]['top_score'], reverse=True)[:3]
    worst_queries = sorted(evaluation_results.items(), key=lambda x: x[1]['top_score'])[:3]
    
    print(f"\nBest Performing Queries:")
    for query, results in best_queries:
        print(f"  {results['top_score']:.3f} - {query}")
    
    print(f"\nWorst Performing Queries:")
    for query, results in worst_queries:
        print(f"  {results['top_score']:.3f} - {query}")

# Detailed analysis for a few sample queries
print(f"\nDetailed Analysis for Sample Queries:")

sample_queries = [
    "What is a pandas DataFrame?",
    "How to create a DataFrame?", 
    "pandas groupby function"
]

for query in sample_queries:
    results = evaluate_retrieval(query, collection_name, top_k=3, show_results=True)

print(f"\nRetrieval testing complete!")
print(f"Collection '{collection_name}' ready with {len(chunks)} high-quality chunks.")

# Save evaluation results
eval_file = PROCESSED_DIR / 'retrieval_evaluation.pkl'
with open(eval_file, 'wb') as f:
    pickle.dump({
        'test_queries': test_queries,
        'evaluation_results': evaluation_results,
        'quality_distribution': quality_distribution,
        'metrics': {
            'avg_score': np.mean(total_scores),
            'median_score': np.median(total_scores),
            'std_score': np.std(total_scores),
            'excellent_rate': excellent_rate,
            'good_plus_rate': good_plus_rate
        }
    }, f)

print(f"\nEvaluation results saved to: {eval_file}")
print(f"Ready for LLM integration testing!")

  from .autonotebook import tqdm as notebook_tqdm


Loading processed chunks...
Loaded 13 chunks
Average tokens per chunk: 1160.5

Initializing embedding model...
Model loaded. Embedding dimension: 384

Generating embeddings for 13 chunks...
  Generated 5/13 embeddings
  Generated 10/13 embeddings
  Generated 13/13 embeddings
Embeddings generated in 1.29 seconds
Average time per chunk: 0.099 seconds

Connecting to Qdrant...
Connected to Qdrant. Existing collections: 0

Creating collection: pandas_docs_improved
  Deleted existing collection
  Collection created successfully

Preparing data points...
Prepared 13 points for insertion
Inserting points into Qdrant...
  Inserted 13 points successfully
  Operation result: operation_id=0 status=<UpdateStatus.COMPLETED: 'completed'>

Verifying insertion...
  Total points in collection: 13
  Verification successful

Running comprehensive retrieval evaluation...
Testing 18 queries with improved chunks:
   1. Good      (0.615) - What is a pandas DataFrame?
   2. Fair      (0.578) - What is a pandas