In [1]:
# 05_implement_improvements.ipynb - Implementing Retrieval Improvements

import os
import time
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import uuid
import re
import PyPDF2
import warnings
warnings.filterwarnings('ignore')

# Setup
PROJECT_ROOT = Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()
PROCESSED_DIR = PROJECT_ROOT / 'data' / 'processed'
PDF_FILE = PROJECT_ROOT / 'data' / 'raw' / 'mastering_pandas_2025.pdf'

print("Loading improvement analysis...")
with open(PROCESSED_DIR / 'improvement_analysis.pkl', 'rb') as f:
    improvement_data = pickle.load(f)

with open(PROCESSED_DIR / 'retrieval_evaluation.pkl', 'rb') as f:
    baseline_results = pickle.load(f)

print("Baseline Performance:")
print(f"  Average score: {baseline_results['metrics']['avg_score']:.3f}")
print(f"  Good+ rate: {baseline_results['metrics']['good_plus_rate']:.1%}")

print(f"\nImprovement Plan:")
print(f"  Best embedding model: {improvement_data['best_embedding_model']}")
print(f"  Additional pages to add: {improvement_data['recommendations']['expansion_pages']}")
print(f"  Expected improvement: +{improvement_data['recommendations']['expected_improvement']:.1f}%")

# Step 1: Extract Additional High-Value Pages
print(f"\n" + "="*60)
print("STEP 1: EXTRACTING ADDITIONAL HIGH-VALUE CONTENT")
print("="*60)

def extract_additional_pages(pdf_path, expansion_pages, max_pages=8):
    """Extract content from additional high-value pages"""
    
    print(f"Extracting content from {min(len(expansion_pages), max_pages)} additional pages...")
    
    additional_content = []
    pages_to_process = expansion_pages[:max_pages]
    
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        
        for page_info in pages_to_process:
            page_num = page_info['page']
            
            try:
                text = pdf_reader.pages[page_num].extract_text()
                
                if text.strip() and len(text) > 200:
                    additional_content.append({
                        'page_num': page_num,
                        'text': text,
                        'content_type': page_info['content_type'],
                        'pandas_score': page_info['pandas_score'],
                        'char_count': page_info['char_count']
                    })
                    print(f"  Extracted page {page_num} ({page_info['content_type']}, pandas_score={page_info['pandas_score']})")
                    
            except Exception as e:
                print(f"  Error extracting page {page_num}: {e}")
    
    print(f"Successfully extracted {len(additional_content)} additional pages")
    return additional_content

# Extract additional pages
expansion_pages = improvement_data['available_expansion_pages']
additional_pages = extract_additional_pages(PDF_FILE, expansion_pages, max_pages=8)

# Step 2: Improved Content Processing
print(f"\n" + "="*60)
print("STEP 2: IMPROVED CONTENT PROCESSING")
print("="*60)

# Load existing chunks
with open(PROCESSED_DIR / 'processed_chunks.pkl', 'rb') as f:
    existing_chunks = pickle.load(f)

print(f"Existing chunks: {len(existing_chunks)}")
print(f"Additional pages: {len(additional_pages)}")

# Import chunking functions from previous notebooks
def clean_pdf_text(text):
    """Clean PDF extraction artifacts"""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    text = re.sub(r'\bwher\s+e\b', 'where', text)
    text = re.sub(r'\btransfor\s+ms\b', 'transforms', text)
    text = re.sub(r'\bData\s+Frame\b', 'DataFrame', text)
    text = re.sub(r'\bdata\s+frame\b', 'DataFrame', text)
    text = re.sub(r'\bGroup\s+By\b', 'GroupBy', text)
    text = re.sub(r'\bgroup\s+by\b', 'groupby', text, flags=re.IGNORECASE)
    text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)
    text = re.sub(r'\n\s*\n', '\n\n', text)
    return text.strip()

def detect_content_features(text):
    """Analyze content characteristics"""
    code_patterns = [
        r'import\s+\w+', r'from\s+\w+\s+import', r'pd\.\w+', r'df\.\w+', 
        r'print\s*\(', r'=\s*pd\.', r'\.groupby\(', r'\.merge\(',
        r'\.iloc\[', r'\.loc\[', r'def\s+\w+', r'class\s+\w+'
    ]
    code_score = sum(len(re.findall(pattern, text, re.IGNORECASE)) for pattern in code_patterns)
    
    pandas_concepts = [
        'DataFrame', 'Series', 'Index', 'groupby', 'merge', 'concat',
        'pivot', 'melt', 'apply', 'lambda', 'iloc', 'loc', 'query'
    ]
    concept_score = sum(text.lower().count(concept.lower()) for concept in pandas_concepts)
    
    has_headers = bool(re.search(r'^[A-Z][^.!?]*:?\s*$', text, re.MULTILINE))
    has_code_blocks = bool(re.search(r'```|>>>|\n\s*\w+\s*=', text))
    has_examples = bool(re.search(r'example|Example|for instance|For instance', text, re.IGNORECASE))
    
    return {
        'code_score': code_score,
        'concept_score': concept_score,
        'has_headers': has_headers,
        'has_code_blocks': has_code_blocks,
        'has_examples': has_examples,
        'is_code_heavy': code_score > 3,
        'is_concept_heavy': concept_score > 5
    }

import tiktoken
tokenizer = tiktoken.get_encoding("cl100k_base")

def count_tokens(text):
    return len(tokenizer.encode(text))

def process_additional_content(additional_pages, target_tokens=1000):
    """Process additional pages into chunks"""
    
    additional_chunks = []
    
    for page_data in additional_pages:
        cleaned_text = clean_pdf_text(page_data['text'])
        total_tokens = count_tokens(cleaned_text)
        
        # If page is substantial enough to be its own chunk
        if total_tokens >= 400:
            features = detect_content_features(cleaned_text)
            
            additional_chunks.append({
                'text': cleaned_text,
                'token_count': total_tokens,
                'content_type': page_data['content_type'],
                'chunk_index': 0,
                'global_chunk_id': len(existing_chunks) + len(additional_chunks),
                'source_pages': [page_data['page_num']],
                'page_count': 1,
                'features': features,
                'is_additional': True,
                'pandas_score': page_data['pandas_score']
            })
            
            print(f"  Created chunk from page {page_data['page_num']} ({total_tokens} tokens)")
    
    return additional_chunks

# Process additional content
additional_chunks = process_additional_content(additional_pages)

# Combine existing and additional chunks
all_chunks = existing_chunks + additional_chunks

print(f"\nCombined content:")
print(f"  Original chunks: {len(existing_chunks)}")
print(f"  Additional chunks: {len(additional_chunks)}")
print(f"  Total chunks: {len(all_chunks)}")
print(f"  Average tokens: {np.mean([c['token_count'] for c in all_chunks]):.1f}")

# Step 3: Implement Best Embedding Model
print(f"\n" + "="*60)
print("STEP 3: IMPLEMENTING BEST EMBEDDING MODEL")
print("="*60)

best_model_name = improvement_data['best_embedding_model']
print(f"Switching to: {best_model_name}")

# Initialize improved embedding model
embedding_model = SentenceTransformer(best_model_name)
embedding_dimension = embedding_model.get_sentence_embedding_dimension()
print(f"Model loaded. Embedding dimension: {embedding_dimension}")

# Generate embeddings for all chunks
print(f"\nGenerating embeddings for {len(all_chunks)} chunks...")
start_time = time.time()

chunk_embeddings = []
for i, chunk in enumerate(all_chunks):
    embedding = embedding_model.encode(chunk['text'])
    chunk_embeddings.append(embedding)
    
    if (i + 1) % 5 == 0 or i == len(all_chunks) - 1:
        print(f"  Generated {i + 1}/{len(all_chunks)} embeddings")

embedding_time = time.time() - start_time
print(f"Embeddings generated in {embedding_time:.2f} seconds")

# Step 4: Implement Query Preprocessing
print(f"\n" + "="*60)
print("STEP 4: IMPLEMENTING QUERY PREPROCESSING")
print("="*60)

def preprocess_pandas_query(query):
    """Advanced pandas-specific query preprocessing"""
    
    # Normalize pandas terminology
    pandas_normalizations = {
        'dataframe': 'DataFrame',
        'data frame': 'DataFrame', 
        'data-frame': 'DataFrame',
        'series': 'Series',
        'groupby': 'group by aggregation',
        'group-by': 'group by aggregation',
        'concat': 'concatenate combine',
        'merge': 'join merge DataFrames'
    }
    
    processed_query = query.lower()
    for wrong, correct in pandas_normalizations.items():
        processed_query = processed_query.replace(wrong, correct)
    
    # Add context for function queries
    function_expansions = {
        'group by': 'pandas groupby aggregation function examples',
        'merge': 'pandas merge join DataFrames function',
        'concatenate': 'pandas concat combine DataFrames function',
        'pivot': 'pandas pivot table reshape data function',
        'melt': 'pandas melt reshape data function'
    }
    
    for func, expansion in function_expansions.items():
        if func in processed_query:
            processed_query = f"{processed_query} {expansion}"
    
    # Add pandas context if missing
    if 'pandas' not in processed_query and any(term in processed_query 
                                              for term in ['DataFrame', 'Series', 'csv', 'data']):
        processed_query = f"pandas {processed_query}"
    
    return processed_query

# Test preprocessing on sample queries
test_queries = baseline_results['test_queries'][:5]
print("Query preprocessing examples:")
for query in test_queries:
    processed = preprocess_pandas_query(query)
    if query != processed:
        print(f"  Original:  '{query}'")
        print(f"  Processed: '{processed}'")
        print()

# Step 5: Setup Improved Vector Database
print(f"\n" + "="*60)
print("STEP 5: SETTING UP IMPROVED VECTOR DATABASE")
print("="*60)

# Connect to Qdrant
qdrant_client = QdrantClient("localhost", port=6333)

collection_name = "pandas_docs_optimized"
print(f"Creating optimized collection: {collection_name}")

try:
    qdrant_client.delete_collection(collection_name)
    print("  Deleted existing collection")
except:
    pass

qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
)
print("  Collection created successfully")

# Prepare and insert points with rich metadata
print(f"Preparing {len(all_chunks)} data points...")
points = []

for i, (chunk, embedding) in enumerate(zip(all_chunks, chunk_embeddings)):
    point = PointStruct(
        id=str(uuid.uuid4()),
        vector=embedding.tolist(),
        payload={
            "text": chunk['text'],
            "token_count": chunk['token_count'],
            "content_type": chunk['content_type'],
            "chunk_index": chunk['chunk_index'],
            "global_chunk_id": chunk['global_chunk_id'],
            "source_pages": chunk['source_pages'],
            "page_count": chunk['page_count'],
            "code_score": chunk['features']['code_score'],
            "concept_score": chunk['features']['concept_score'],
            "is_code_heavy": chunk['features']['is_code_heavy'],
            "is_concept_heavy": chunk['features']['is_concept_heavy'],
            "is_additional": chunk.get('is_additional', False),
            "pandas_score": chunk.get('pandas_score', 0),
            "preview": chunk['text'][:300] + "..." if len(chunk['text']) > 300 else chunk['text']
        }
    )
    points.append(point)

# Insert points
print(f"Inserting points into Qdrant...")
result = qdrant_client.upsert(collection_name=collection_name, points=points)
print(f"  Inserted {len(points)} points successfully")

# Verify
count_result = qdrant_client.count(collection_name)
print(f"  Verification: {count_result.count} points in collection")

# Step 6: Comprehensive Testing
print(f"\n" + "="*60)
print("STEP 6: COMPREHENSIVE RETRIEVAL TESTING")
print("="*60)

def improved_retrieval_test(query, collection_name, top_k=3, use_preprocessing=True):
    """Test retrieval with improvements"""
    
    # Preprocess query if enabled
    if use_preprocessing:
        processed_query = preprocess_pandas_query(query)
    else:
        processed_query = query
    
    # Generate embedding
    query_embedding = embedding_model.encode(processed_query)
    
    # Search
    search_results = qdrant_client.query_points(
        collection_name=collection_name,
        query=query_embedding.tolist(),
        limit=top_k
    )
    
    return search_results.points, processed_query

# Test all queries with improvements
print(f"Testing retrieval with all improvements...")
improved_results = {}
improved_scores = []
quality_distribution = {"Excellent": 0, "Good": 0, "Fair": 0, "Poor": 0}

for i, query in enumerate(test_queries):
    results, processed_query = improved_retrieval_test(query, collection_name)
    
    if results:
        top_score = results[0].score
        avg_score = np.mean([r.score for r in results])
        
        # Categorize quality
        if top_score > 0.8:
            quality = "Excellent"
        elif top_score > 0.6:
            quality = "Good"
        elif top_score > 0.4:
            quality = "Fair"
        else:
            quality = "Poor"
        
        quality_distribution[quality] += 1
        improved_scores.append(top_score)
        
        improved_results[query] = {
            'top_score': top_score,
            'avg_score': avg_score,
            'quality': quality,
            'processed_query': processed_query,
            'results_count': len(results)
        }
        
        print(f"  {i+1:2d}. {quality:9s} ({top_score:.3f}) - {query}")

# Calculate improvement metrics
print(f"\nIMPROVED RETRIEVAL QUALITY ANALYSIS:")
print(f"Total queries tested: {len(test_queries)}")
print(f"Average top score: {np.mean(improved_scores):.3f}")
print(f"Median top score: {np.median(improved_scores):.3f}")

print(f"\nQuality Distribution:")
for quality, count in quality_distribution.items():
    percentage = (count / len(test_queries)) * 100
    print(f"  {quality:9s}: {count:2d} queries ({percentage:4.1f}%)")

# Compare with baseline
baseline_avg = baseline_results['metrics']['avg_score']
improved_avg = np.mean(improved_scores)
actual_improvement = ((improved_avg - baseline_avg) / baseline_avg) * 100

excellent_rate = quality_distribution["Excellent"] / len(test_queries)
good_plus_rate = (quality_distribution["Excellent"] + quality_distribution["Good"]) / len(test_queries)

print(f"\nIMPROVEMENT ANALYSIS:")
print(f"  Baseline average score: {baseline_avg:.3f}")
print(f"  Improved average score: {improved_avg:.3f}")
print(f"  Actual improvement: {actual_improvement:+.1f}%")
print(f"  Baseline Good+ rate: {baseline_results['metrics']['good_plus_rate']:.1%}")
print(f"  Improved Good+ rate: {good_plus_rate:.1%}")

# Detailed analysis for problem queries
print(f"\nDetailed Analysis for Previously Poor Queries:")

problem_queries = [
    "pandas groupby function",
    "Series creation syntax", 
    "pandas concat function"
]

for query in problem_queries:
    if query in improved_results:
        result = improved_results[query]
        baseline_score = baseline_results['evaluation_results'].get(query, {}).get('top_score', 0)
        
        print(f"\nQuery: '{query}'")
        print(f"  Baseline score: {baseline_score:.3f}")
        print(f"  Improved score: {result['top_score']:.3f}")
        print(f"  Improvement: {((result['top_score'] - baseline_score) / baseline_score * 100):+.1f}%")
        print(f"  Processed query: '{result['processed_query']}'")
        
        # Show top result
        results, _ = improved_retrieval_test(query, collection_name, top_k=1)
        if results:
            top_result = results[0]
            print(f"  Top result: {top_result.payload['content_type']} chunk")
            print(f"  Source pages: {top_result.payload['source_pages']}")
            print(f"  Preview: {top_result.payload['preview'][:150]}...")

# Save improved results
improved_evaluation = {
    'improved_results': improved_results,
    'quality_distribution': quality_distribution,
    'metrics': {
        'avg_score': improved_avg,
        'median_score': np.median(improved_scores),
        'excellent_rate': excellent_rate,
        'good_plus_rate': good_plus_rate,
        'actual_improvement': actual_improvement
    },
    'configuration': {
        'embedding_model': best_model_name,
        'total_chunks': len(all_chunks),
        'additional_chunks': len(additional_chunks),
        'query_preprocessing': True
    }
}

with open(PROCESSED_DIR / 'improved_evaluation.pkl', 'wb') as f:
    pickle.dump(improved_evaluation, f)

with open(PROCESSED_DIR / 'optimized_chunks.pkl', 'wb') as f:
    pickle.dump(all_chunks, f)

print(f"\nSUCCESS: RETRIEVAL IMPROVEMENTS IMPLEMENTED")
print(f"========================================")
print(f"✓ Embedding model upgraded to: {best_model_name}")
print(f"✓ Content expanded: +{len(additional_chunks)} chunks")
print(f"✓ Query preprocessing implemented")
print(f"✓ {actual_improvement:+.1f}% improvement achieved")
print(f"✓ Good+ rate: {baseline_results['metrics']['good_plus_rate']:.1%} → {good_plus_rate:.1%}")

print(f"\nOptimized retrieval system ready for LLM integration!")
print(f"Collection: '{collection_name}' with {len(all_chunks)} high-quality chunks")

print(f"\nResults saved:")
print(f"  Evaluation: {PROCESSED_DIR / 'improved_evaluation.pkl'}")
print(f"  Chunks: {PROCESSED_DIR / 'optimized_chunks.pkl'}")

  from .autonotebook import tqdm as notebook_tqdm


Loading improvement analysis...
Baseline Performance:
  Average score: 0.509
  Good+ rate: 22.2%

Improvement Plan:
  Best embedding model: multi-qa-mpnet-base-dot-v1
  Additional pages to add: 8
  Expected improvement: +17.7%

STEP 1: EXTRACTING ADDITIONAL HIGH-VALUE CONTENT
Extracting content from 8 additional pages...
  Extracted page 40 (navigation, pandas_score=11)
  Extracted page 20 (navigation, pandas_score=10)
  Extracted page 372 (navigation, pandas_score=6)
  Extracted page 204 (conceptual, pandas_score=5)
  Extracted page 72 (conceptual, pandas_score=3)
  Extracted page 4 (navigation, pandas_score=3)
  Extracted page 96 (general, pandas_score=2)
  Extracted page 356 (general, pandas_score=2)
Successfully extracted 8 additional pages

STEP 2: IMPROVED CONTENT PROCESSING
Existing chunks: 13
Additional pages: 8

Combined content:
  Original chunks: 13
  Additional chunks: 0
  Total chunks: 13
  Average tokens: 1160.5

STEP 3: IMPLEMENTING BEST EMBEDDING MODEL
Switching to: mul