In [1]:
# 04_retrieval_improvements.ipynb - Advanced Retrieval Optimization

import os
import time
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import uuid
import re
import warnings
warnings.filterwarnings('ignore')

# Setup: when the working-directory path mentions 'notebooks', the project
# root is taken to be its parent; otherwise the working directory is the root.
if 'notebooks' in str(Path.cwd()):
    PROJECT_ROOT = Path.cwd().parent
else:
    PROJECT_ROOT = Path.cwd()
PROCESSED_DIR = PROJECT_ROOT.joinpath('data', 'processed')

# Load the pickled results of the earlier retrieval evaluation
print("Loading previous results...")
with (PROCESSED_DIR / 'retrieval_evaluation.pkl').open('rb') as f:
    baseline_results = pickle.load(f)

print(
    "Baseline Performance:",
    f"  Average score: {baseline_results['metrics']['avg_score']:.3f}",
    f"  Good+ rate: {baseline_results['metrics']['good_plus_rate']:.1%}",
    sep="\n",
)

# Strategy 1: Test Different Embedding Models
print()
print("=" * 60)
print("STRATEGY 1: TESTING DIFFERENT EMBEDDING MODELS")
print("=" * 60)

# Candidate models as (model name, short description) pairs
embedding_models = list({
    'all-MiniLM-L6-v2': 'Current baseline model',
    'all-mpnet-base-v2': 'Higher quality general model',
    'multi-qa-mpnet-base-dot-v1': 'Optimized for Q&A',
    'paraphrase-multilingual-mpnet-base-v2': 'Better semantic understanding',
}.items())

def test_embedding_model(model_name, description, chunks, test_queries):
    """Score one embedding model by its average top-1 retrieval similarity.

    Encodes every chunk with ``model_name``, indexes the vectors in a
    temporary Qdrant collection on localhost:6333, runs the first 10 test
    queries against it and averages the score of each query's best hit.

    Args:
        model_name: SentenceTransformer model identifier to load.
        description: Human-readable label; only printed.
        chunks: Sequence of dicts providing 'text', 'content_type',
            'token_count' and 'features' (with 'code_score' and
            'concept_score').
        test_queries: Sequence of query strings; only the first 10 are used.

    Returns:
        The average top-1 score as a float. 0.0 is returned when no query
        produced a hit or when any step failed (the error is printed, not
        raised, so one broken model does not abort the whole comparison).
    """

    print(f"\nTesting: {model_name}")
    print(f"Description: {description}")

    collection_name = f"test_{model_name.replace('-', '_').replace('.', '_')}"
    qdrant_client = None
    collection_created = False

    try:
        # Load model
        model = SentenceTransformer(model_name)
        embedding_dim = model.get_sentence_embedding_dimension()
        print(f"Embedding dimension: {embedding_dim}")

        # Encode all chunks in one batched call instead of one call per chunk
        print("Generating chunk embeddings...")
        chunk_embeddings = model.encode([chunk['text'] for chunk in chunks])

        qdrant_client = QdrantClient("localhost", port=6333)

        # Best-effort removal of a collection left over from an earlier run;
        # a failure here (e.g. nothing to delete) is not an error.
        try:
            qdrant_client.delete_collection(collection_name)
        except Exception:
            pass

        # NOTE(review): every model is compared using cosine distance; raw
        # similarity scores from different models may not share a scale -
        # confirm the cross-model comparison is meaningful.
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE)
        )
        collection_created = True

        # Insert points
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding.tolist(),
                payload={
                    "text": chunk['text'],
                    "content_type": chunk['content_type'],
                    "token_count": chunk['token_count'],
                    "code_score": chunk['features']['code_score'],
                    "concept_score": chunk['features']['concept_score']
                }
            )
            for chunk, embedding in zip(chunks, chunk_embeddings)
        ]
        qdrant_client.upsert(collection_name=collection_name, points=points)

        # Test retrieval quality
        scores = []
        for query in test_queries[:10]:  # Test subset for speed
            query_embedding = model.encode(query)

            results = qdrant_client.query_points(
                collection_name=collection_name,
                query=query_embedding.tolist(),
                limit=3
            )

            if results.points:
                scores.append(results.points[0].score)

        # np.mean([]) would yield nan, which breaks the later max()/sorting
        # over model scores; report 0.0 when nothing was retrieved.
        avg_score = float(np.mean(scores)) if scores else 0.0
        print(f"Average top-1 score: {avg_score:.3f}")

        return avg_score

    except Exception as e:
        print(f"Error testing {model_name}: {e}")
        return 0.0

    finally:
        # Always drop the temporary collection, including on failure, and do
        # not let a cleanup problem discard an already computed score.
        if collection_created:
            try:
                qdrant_client.delete_collection(collection_name)
            except Exception as cleanup_error:
                print(f"Warning: could not delete collection {collection_name}: {cleanup_error}")

# Load the processed chunks and reuse the query set stored with the baseline
with (PROCESSED_DIR / 'processed_chunks.pkl').open('rb') as f:
    chunks = pickle.load(f)

test_queries = baseline_results['test_queries']

# Score every candidate model on the same chunks and queries
model_results = {}
for model_name, description in embedding_models:
    model_results[model_name] = test_embedding_model(model_name, description, chunks, test_queries)

# Ranking, best model first, with the relative change versus the baseline
print("\nEmbedding Model Comparison:")
for model, score in sorted(model_results.items(), key=lambda item: item[1], reverse=True):
    baseline_avg = baseline_results['metrics']['avg_score']
    improvement = (score - baseline_avg) / baseline_avg * 100
    print(f"  {model:35s}: {score:.3f} ({improvement:+.1f}%)")

# Strategy 2: Expand Content Coverage
print()
print("=" * 60)
print("STRATEGY 2: EXPANDING CONTENT COVERAGE")
print("=" * 60)

# Per-page content analysis, used to look for valuable pages not yet chunked
content_analysis = pd.read_csv(PROCESSED_DIR / 'content_analysis.csv')

print("Current coverage analysis:")
# Union of the source pages referenced by the existing chunks
current_pages = set()
for chunk in chunks:
    current_pages |= set(chunk['source_pages'])

print(f"  Currently using {len(current_pages)} pages: {min(current_pages)}-{max(current_pages)}")

# Pages not yet used, with pandas_score > 1 and more than 400 characters,
# ordered by pandas_score from highest to lowest
available_pages = content_analysis.loc[
    ~content_analysis['page'].isin(current_pages)
    & (content_analysis['char_count'] > 400)
    & (content_analysis['pandas_score'] > 1)
].sort_values(by='pandas_score', ascending=False)

print(
    f"  Found {len(available_pages)} additional high-value pages",
    "  Top unused pages by pandas score:",
    sep="\n",
)
for _, page in available_pages.iloc[:10].iterrows():
    print(f"    Page {page['page']:3d}: pandas_score={page['pandas_score']:2d}, type={page['content_type']}")

# Strategy 3: Query Preprocessing and Expansion
print()
print("=" * 60)
print("STRATEGY 3: QUERY PREPROCESSING")
print("=" * 60)

def preprocess_pandas_query(query):
    """Normalize a query and append pandas-specific context terms.

    The query is lowercased, spelling variants of pandas terms are replaced
    by a canonical form, a fixed expansion phrase is appended for every
    recognized function name, and ``pandas`` is prepended when the query
    mentions a data-related term but not pandas itself.

    Args:
        query: Raw query string.

    Returns:
        The processed query string.
    """

    # Spelling variants -> canonical form, applied in order to the
    # lowercased query.
    pandas_normalizations = {
        'dataframe': 'DataFrame',
        'data frame': 'DataFrame',
        'data-frame': 'DataFrame',
        'series': 'Series',
        'groupby': 'group by',
        'group-by': 'group by'
    }

    processed_query = query.lower()
    for variant, canonical in pandas_normalizations.items():
        processed_query = processed_query.replace(variant, canonical)

    # Context phrases appended when a function name is mentioned
    function_expansions = {
        'groupby': 'pandas groupby aggregation function',
        'merge': 'pandas merge join DataFrames',
        'concat': 'pandas concatenate combine DataFrames',
        'pivot': 'pandas pivot table reshape data',
        'melt': 'pandas melt reshape data'
    }

    # The normalization step rewrites every 'groupby' spelling to 'group by',
    # so looking for the 'groupby' trigger in the normalized text could never
    # match. Search a copy in which that spelling is folded back.
    searchable = processed_query.lower().replace('group by', 'groupby')
    expansions = [
        expansion
        for func, expansion in function_expansions.items()
        if func in searchable
    ]
    if expansions:
        processed_query = " ".join([processed_query, *expansions])

    # Add pandas context if missing
    lowered = processed_query.lower()
    mentions_data = any(term in lowered for term in ('dataframe', 'series', 'csv', 'data'))
    if 'pandas' not in lowered and mentions_data:
        processed_query = f"pandas {processed_query}"

    return processed_query

def test_query_preprocessing(chunks, test_queries, model_name='all-mpnet-base-v2'):
    """Compare top-1 retrieval scores of raw and preprocessed queries.

    Indexes all chunks in a temporary Qdrant collection on localhost:6333
    using ``model_name``, then queries it with the first 8 test queries, once
    as given and once after ``preprocess_pandas_query``. Prints the rewritten
    queries and the two average top-1 scores.

    Args:
        chunks: Sequence of dicts providing 'text' and 'content_type'.
        test_queries: Sequence of query strings; only the first 8 are used.
        model_name: SentenceTransformer model identifier to load.

    Returns:
        Average top-1 score of the preprocessed queries as a float (0.0 when
        no query returned a hit).
    """

    print(f"Testing query preprocessing with {model_name}...")

    # Setup
    model = SentenceTransformer(model_name)
    qdrant_client = QdrantClient("localhost", port=6333)

    collection_name = "test_preprocessing"

    # Best-effort removal of a collection left over from an earlier run
    try:
        qdrant_client.delete_collection(collection_name)
    except Exception:
        pass

    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=model.get_sentence_embedding_dimension(), distance=Distance.COSINE)
    )

    def top1_score(text):
        """Return the score of the best hit for ``text``, or None without hits."""
        results = qdrant_client.query_points(
            collection_name=collection_name,
            query=model.encode(text).tolist(),
            limit=1
        )
        return results.points[0].score if results.points else None

    original_scores = []
    preprocessed_scores = []

    try:
        # Generate and insert embeddings (one batched encode call)
        embeddings = model.encode([chunk['text'] for chunk in chunks])
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding.tolist(),
                payload={"text": chunk['text'], "content_type": chunk['content_type']}
            )
            for chunk, embedding in zip(chunks, embeddings)
        ]
        qdrant_client.upsert(collection_name=collection_name, points=points)

        # Test original vs preprocessed queries
        print("\nQuery preprocessing examples:")
        for query in test_queries[:8]:
            processed_query = preprocess_pandas_query(query)

            if query != processed_query:
                print(f"  Original:    '{query}'")
                print(f"  Processed:   '{processed_query}'")
                print()

            orig_score = top1_score(query)
            proc_score = top1_score(processed_query)

            # Record scores only as pairs so both averages always cover the
            # same set of queries.
            if orig_score is not None and proc_score is not None:
                original_scores.append(orig_score)
                preprocessed_scores.append(proc_score)
    finally:
        # Drop the temporary collection even when indexing or querying fails
        qdrant_client.delete_collection(collection_name)

    orig_avg = float(np.mean(original_scores)) if original_scores else 0.0
    proc_avg = float(np.mean(preprocessed_scores)) if preprocessed_scores else 0.0

    print(f"Results:")
    print(f"  Original queries avg score:     {orig_avg:.3f}")
    print(f"  Preprocessed queries avg score: {proc_avg:.3f}")
    # A relative change is undefined against a zero baseline (e.g. no hits)
    if orig_avg:
        print(f"  Improvement: {((proc_avg - orig_avg) / orig_avg * 100):+.1f}%")
    else:
        print("  Improvement: n/a (original average score is 0)")

    return proc_avg

# Run the preprocessing comparison with the highest-scoring embedding model
best_model = max(model_results, key=model_results.get)
preprocessing_score = test_query_preprocessing(chunks, test_queries, model_name=best_model)

# Strategy 4: Content-Specific Chunking
print()
print("=" * 60)
print("STRATEGY 4: FUNCTION-SPECIFIC CONTENT EXTRACTION")
print("=" * 60)

def extract_function_specific_content(pdf_path, content_analysis_df):
    """Find PDF pages that mention specific pandas functions.

    Every page listed in ``content_analysis_df`` with ``code_score > 0`` is
    read from the PDF and searched for usage patterns of a fixed list of
    function names. A summary is printed.

    Args:
        pdf_path: Path of the PDF file to scan.
        content_analysis_df: DataFrame with at least 'code_score' and 'page'
            columns.

    Returns:
        Dict mapping a function name to a list of
        ``{'page', 'score', 'preview'}`` dicts sorted by descending score.
        Functions without any match are absent from the dict.
    """

    import PyPDF2

    # Target functions that performed poorly in retrieval
    target_functions = ['groupby', 'concat', 'merge', 'pivot', 'melt', 'apply', 'map']

    # Usage patterns per function, compiled once instead of per page.
    # The patterns overlap (e.g. '.merge(' also matches 'merge('), so a score
    # is a relative weight rather than an exact call count.
    pattern_templates = (
        r'{func}\(',
        r'\.{func}\(',
        r'df\.{func}',
        r'pd\.{func}',
        r'{func} function',
        r'{func} method',
    )
    function_patterns = {
        func: [
            re.compile(template.format(func=re.escape(func)), re.IGNORECASE)
            for template in pattern_templates
        ]
        for func in target_functions
    }

    function_pages = {}
    skipped_pages = []

    # Find pages containing specific functions
    pages_to_check = content_analysis_df[
        content_analysis_df['code_score'] > 0
    ]['page'].tolist()

    print(f"Scanning {len(pages_to_check)} code-containing pages for specific functions...")

    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        for page_num in pages_to_check:
            # NOTE(review): page_num is used directly as an index into
            # pdf_reader.pages - confirm the 'page' column uses the same
            # numbering as that index.
            try:
                text = pdf_reader.pages[page_num].extract_text() or ""
            except Exception:
                # Keep scanning, but remember the page so the failure is
                # reported instead of silently ignored.
                skipped_pages.append(page_num)
                continue

            for func, patterns in function_patterns.items():
                func_score = sum(len(pattern.findall(text)) for pattern in patterns)

                if func_score > 0:
                    function_pages.setdefault(func, []).append({
                        'page': page_num,
                        'score': func_score,
                        'preview': text[:200]
                    })

    if skipped_pages:
        print(f"  Skipped {len(skipped_pages)} unreadable pages (first few: {skipped_pages[:5]})")

    # Sort by relevance
    for pages in function_pages.values():
        pages.sort(key=lambda entry: entry['score'], reverse=True)

    print(f"\nFunction-specific content found:")
    for func, pages in function_pages.items():
        print(f"  {func}: {len(pages)} pages")
        if pages:
            top_page = pages[0]
            print(f"    Best: page {top_page['page']} (score: {top_page['score']})")

    return function_pages

# Scan the source PDF for function-specific content
PDF_FILE = PROJECT_ROOT.joinpath('data', 'raw', 'mastering_pandas_2025.pdf')
function_content = extract_function_specific_content(PDF_FILE, content_analysis)

# Strategy 5: Recommendations Summary
print()
print("=" * 60)
print("IMPROVEMENT RECOMMENDATIONS")
print("=" * 60)

print("Based on analysis:")

# Best embedding model as a (name, score) pair and its relative gain over
# the stored baseline average
best_embedding = max(model_results.items(), key=lambda item: item[1])
baseline_avg = baseline_results['metrics']['avg_score']
improvement = (best_embedding[1] - baseline_avg) / baseline_avg * 100

print(
    "\n1. EMBEDDING MODEL UPGRADE:",
    f"   Switch to: {best_embedding[0]}",
    f"   Expected improvement: {improvement:+.1f}%",
    f"   New average score: {best_embedding[1]:.3f}",
    sep="\n",
)

print(
    "\n2. CONTENT EXPANSION:",
    f"   Add {len(available_pages)} high-pandas-score pages",
    f"   Focus on pages: {available_pages.head(5)['page'].tolist()}",
    sep="\n",
)

# NOTE(review): preprocessing_score was measured with best_model, so this
# percentage also contains the embedding-model gain - confirm that is intended.
print(
    "\n3. QUERY PREPROCESSING:",
    f"   Expected improvement: {((preprocessing_score - baseline_results['metrics']['avg_score']) / baseline_results['metrics']['avg_score'] * 100):+.1f}%",
    sep="\n",
)

total_func_pages = sum(len(pages) for pages in function_content.values())
print(
    "\n4. FUNCTION-SPECIFIC CONTENT:",
    f"   Found {total_func_pages} pages with specific function examples",
    f"   Priority functions: {list(function_content.keys())}",
    sep="\n",
)

# The extra 15 points are a fixed estimate for content expansion + preprocessing
total_improvement = improvement + 15
print(
    "\n5. COMBINED IMPACT ESTIMATE:",
    f"   Potential combined improvement: {total_improvement:+.1f}%",
    f"   Projected average score: {baseline_results['metrics']['avg_score'] * (1 + total_improvement/100):.3f}",
    sep="\n",
)

# Save improvement analysis
improvement_data = dict(
    embedding_model_results=model_results,
    best_embedding_model=best_embedding[0],
    available_expansion_pages=available_pages.to_dict('records'),
    function_specific_content=function_content,
    preprocessing_improvement=preprocessing_score,
    recommendations=dict(
        best_embedding=best_embedding[0],
        expected_improvement=improvement,
        expansion_pages=len(available_pages),
        function_pages=total_func_pages,
    ),
)

with (PROCESSED_DIR / 'improvement_analysis.pkl').open('wb') as f:
    pickle.dump(improvement_data, f)

print(f"\nImprovement analysis saved to: {PROCESSED_DIR / 'improvement_analysis.pkl'}")
print(
    "\nNext steps:",
    f"  1. Implement best embedding model ({best_embedding[0]})",
    f"  2. Expand content with top {min(10, len(available_pages))} pages",
    "  3. Add query preprocessing",
    "  4. Re-test retrieval quality",
    sep="\n",
)

  from .autonotebook import tqdm as notebook_tqdm


Loading previous results...
Baseline Performance:
  Average score: 0.509
  Good+ rate: 22.2%

STRATEGY 1: TESTING DIFFERENT EMBEDDING MODELS

Testing: all-MiniLM-L6-v2
Description: Current baseline model
Embedding dimension: 384
Generating chunk embeddings...
Average top-1 score: 0.489

Testing: all-mpnet-base-v2
Description: Higher quality general model


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Embedding dimension: 768
Generating chunk embeddings...
Average top-1 score: 0.525

Testing: multi-qa-mpnet-base-dot-v1
Description: Optimized for Q&A


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Embedding dimension: 768
Generating chunk embeddings...
Average top-1 score: 0.599

Testing: paraphrase-multilingual-mpnet-base-v2
Description: Better semantic understanding


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Embedding dimension: 768
Generating chunk embeddings...
Average top-1 score: 0.542

Embedding Model Comparison:
  multi-qa-mpnet-base-dot-v1         : 0.599 (+17.7%)
  paraphrase-multilingual-mpnet-base-v2: 0.542 (+6.4%)
  all-mpnet-base-v2                  : 0.525 (+3.1%)
  all-MiniLM-L6-v2                   : 0.489 (-4.0%)

STRATEGY 2: EXPANDING CONTENT COVERAGE
Current coverage analysis:
  Currently using 76 pages: 8-396
  Found 8 additional high-value pages
  Top unused pages by pandas score:
    Page  40: pandas_score=11, type=navigation
    Page  20: pandas_score=10, type=navigation
    Page 372: pandas_score= 6, type=navigation
    Page 204: pandas_score= 5, type=conceptual
    Page  72: pandas_score= 3, type=conceptual
    Page   4: pandas_score= 3, type=navigation
    Page  96: pandas_score= 2, type=general
    Page 356: pandas_score= 2, type=general

STRATEGY 3: QUERY PREPROCESSING
Testing query preprocessing with multi-qa-mpnet-base-dot-v1...

Query preprocessing examples:
 