In [1]:
# 06_llm_integration.ipynb - Complete RAG System with LLM Integration

import os
import time
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from groq import Groq
from dotenv import load_dotenv
import warnings
warnings.filterwarnings('ignore')

# Load variables from a local .env file into the process environment.
# GROQ_API_KEY is read from the environment further down via os.getenv().
load_dotenv()

# Resolve the project layout. If the current working directory's path contains
# the substring 'notebooks', its parent directory is taken as the project root;
# otherwise the working directory itself is used.
# NOTE(review): this is a substring test on the whole path string, so any
# ancestor directory whose name contains 'notebooks' also triggers the
# "use parent" branch — confirm that is the intended behavior.
PROJECT_ROOT = Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()
PROCESSED_DIR = PROJECT_ROOT / 'data' / 'processed'

# Load the evaluation summary from data/processed/improved_evaluation.pkl.
# The code below reads its 'metrics' and 'configuration' entries.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that were produced by this project.
print("Loading optimized system...")
with open(PROCESSED_DIR / 'improved_evaluation.pkl', 'rb') as f:
    evaluation_data = pickle.load(f)

# Report the stored retrieval metrics (avg_score, good_plus_rate,
# actual_improvement) for reference.
print("Optimized Retrieval Performance:")
print(f"  Average score: {evaluation_data['metrics']['avg_score']:.3f}")
print(f"  Good+ rate: {evaluation_data['metrics']['good_plus_rate']:.1%}")
print(f"  Improvement: +{evaluation_data['metrics']['actual_improvement']:.1f}%")

# Initialize the retrieval components used by the RAG pipeline
print(f"\nInitializing optimized RAG components...")

# Instantiate the sentence-transformers model named in the stored configuration,
# so queries are embedded with the model recorded alongside the evaluation.
embedding_model_name = evaluation_data['configuration']['embedding_model']
embedding_model = SentenceTransformer(embedding_model_name)
print(f"✓ Embedding model: {embedding_model_name}")

# Create a Qdrant client for localhost:6333 and record the collection name.
# Nothing here checks that the server is reachable or that the collection
# exists; the confirmation line below only echoes the configured name.
qdrant_client = QdrantClient("localhost", port=6333)
collection_name = "pandas_docs_optimized"
print(f"✓ Vector database: {collection_name}")

# Query preprocessing function
def preprocess_pandas_query(query):
    """Normalize and expand a pandas question before it is embedded.

    The query is lower-cased, common pandas terms are rewritten to a canonical
    spelling, function-specific keyword expansions are appended, and the word
    ``pandas`` is prefixed when the query looks data-related but does not
    mention pandas.

    Normalization matches whole words only. A plain substring replace would
    rewrite the inside of longer words (for example ``concat`` inside
    ``concatenate`` producing ``concatenate combineenate``).

    Args:
        query: The raw user question.

    Returns:
        The processed query string.
    """
    import re

    # (pattern, replacement) pairs, applied in order to the lower-cased query.
    # The DataFrame pattern accepts "dataframe" / "data frame" and leaves an
    # optional plural "s" in place, so "dataframes" becomes "DataFrames".
    pandas_normalizations = (
        (r'\bdata ?frame(?=s?\b)', 'DataFrame'),
        (r'\bseries\b', 'Series'),
        (r'\bgroupby\b', 'group by aggregation'),
        (r'\bconcat\b', 'concatenate combine'),
        (r'\bmerge\b', 'join merge DataFrames'),
    )

    processed_query = query.lower()
    for pattern, replacement in pandas_normalizations:
        processed_query = re.sub(pattern, replacement, processed_query)

    # Keyword expansions: when the trigger phrase occurs anywhere in the
    # (already normalized) query, extra search terms are appended.
    function_expansions = {
        'group by': 'pandas groupby aggregation function examples',
        'merge': 'pandas merge join DataFrames function',
        'concatenate': 'pandas concat combine DataFrames function'
    }

    for func, expansion in function_expansions.items():
        if func in processed_query:
            processed_query = f"{processed_query} {expansion}"

    # Anchor data-related queries to pandas when the word is still missing.
    data_terms = ('DataFrame', 'Series', 'csv', 'data')
    if 'pandas' not in processed_query and any(term in processed_query
                                              for term in data_terms):
        processed_query = f"pandas {processed_query}"

    return processed_query

print("✓ Query preprocessing ready")

# Initialize Groq LLM
print(f"\nInitializing Groq LLM...")
groq_api_key = os.getenv('GROQ_API_KEY')

if not groq_api_key:
    print("⚠️  GROQ_API_KEY not found in environment")
    # Prompt without echo: a key typed through input() is displayed in the
    # cell output and would be stored with the notebook.
    import getpass
    groq_api_key = getpass.getpass("Enter your Groq API key: ").strip()

# Build the client used by every later LLM call. Any failure is reported and
# execution stops here, because nothing below can run without a client.
try:
    groq_client = Groq(api_key=groq_api_key)
    print("✓ Groq client initialized")
except Exception as e:
    print(f"✗ Error initializing Groq: {e}")
    exit()

# Candidate Groq chat models to probe, as (model id, description) pairs.
# Each id is sent a one-line test prompt by the probing loop that follows, and
# only ids that answer are kept.
# NOTE(review): in the run recorded at the end of this file only
# llama-3.1-8b-instant responded; the other ids may no longer be offered —
# confirm against Groq's current model list.
available_models = [
    ("llama-3.1-8b-instant", "Fast 8B model - good for testing"),
    ("llama-3.1-70b-versatile", "Advanced 70B model - better reasoning"),
    ("mixtral-8x7b-32768", "Mixture of experts - balanced performance"),
    ("gemma-7b-it", "Google model - alternative option")
]

def test_groq_model(model_name):
    """Probe one Groq model with a tiny prompt and report whether it answered.

    Returns a ``(ok, detail)`` tuple: ``(True, reply_text)`` when the chat
    completion succeeds, or ``(False, error_message)`` when any exception is
    raised while calling the API or reading the reply.
    """
    probe_messages = [
        {"role": "user", "content": "Say 'working' if you can process this."},
    ]
    try:
        completion = groq_client.chat.completions.create(
            messages=probe_messages,
            model=model_name,
            max_tokens=10,
            temperature=0.1,
        )
        reply_text = completion.choices[0].message.content
    except Exception as err:
        # Any failure counts as "model not usable"; the message is handed
        # back to the caller instead of being raised.
        return False, str(err)
    return True, reply_text

print("\nTesting available Groq models...")
working_models = []

# Probe every candidate; keep the (id, description) pairs that answered.
for model_name, description in available_models:
    works, response = test_groq_model(model_name)
    if not works:
        # The error text returned by the probe is not shown here.
        print(f"✗ {model_name}: Not available")
        continue
    working_models.append((model_name, description))
    print(f"✓ {model_name}: {response}")

# Without at least one responding model the rest of the notebook cannot run.
if not working_models:
    print("ERROR: No working models found. Check your API key and account.")
    exit()

# Use best available model (prefer 70B > mixtral > 8B > others).
# The first id in priority order that also responded wins; if none of the
# prioritized ids responded, fall back to the first responding model.
model_priority = ["llama-3.1-70b-versatile", "mixtral-8x7b-32768", "llama-3.1-8b-instant", "gemma-7b-it"]
selected_model = next(
    (
        candidate
        for candidate in model_priority
        if candidate in {name for name, _ in working_models}
    ),
    None,
)

if not selected_model:
    selected_model = working_models[0][0]

print(f"\n🎯 Selected model: {selected_model}")

# Complete RAG Pipeline
def complete_rag_pipeline(question, top_k=3, model_name=None, show_context=False):
    """Answer *question* by retrieving context from Qdrant and prompting the LLM.

    Steps: preprocess the question, embed it, request the ``top_k`` nearest
    points from ``collection_name``, format them into a prompt, and ask the
    Groq chat model for an answer.

    Args:
        question: The user's question as plain text.
        top_k: Maximum number of points requested from the vector store.
        model_name: Groq model id. Falls back to the module-level
            ``selected_model`` when falsy.
        show_context: When true, print the processed query and a retrieval
            summary after a successful generation.

    Returns:
        A dict with the same keys on every path:

        - ``answer``: the generated text, a fixed "couldn't find" message when
          nothing was retrieved, or ``"Error generating response: ..."`` when
          the LLM call raised.
        - ``chunks``: the retrieved points (empty list when none).
        - ``avg_relevance``: mean of the retrieved points' scores (0.0 when
          none were retrieved).
        - ``processed_query``: the query text that was embedded.
        - ``model_used``: the model id that was requested.
        - ``context_length``: character length of the context block placed in
          the prompt (0 when nothing was retrieved).
    """
    if not model_name:
        model_name = selected_model

    # Step 1: Preprocess query
    processed_query = preprocess_pandas_query(question)

    # Step 2: Generate query embedding
    query_embedding = embedding_model.encode(processed_query)

    # Step 3: Retrieve relevant chunks
    search_results = qdrant_client.query_points(
        collection_name=collection_name,
        query=query_embedding.tolist(),
        limit=top_k
    )

    retrieved_chunks = search_results.points

    if not retrieved_chunks:
        return {
            "answer": "I couldn't find relevant information to answer your question.",
            "chunks": [],
            "avg_relevance": 0.0,
            "processed_query": processed_query,
            "model_used": model_name,
            "context_length": 0
        }

    # Computed once and reused by the summary print and both return paths.
    avg_relevance = np.mean([c.score for c in retrieved_chunks])

    # Step 4: Create context for LLM. Each chunk's text is capped at 1500
    # characters, with "..." marking a truncation.
    context_chunks = []
    for i, chunk in enumerate(retrieved_chunks, 1):
        chunk_text = chunk.payload['text']
        snippet = chunk_text[:1500] + ("..." if len(chunk_text) > 1500 else "")
        context_chunks.append(f"""
--- Context {i} (Relevance: {chunk.score:.3f}) ---
Source: Pages {chunk.payload['source_pages']} | Content Type: {chunk.payload['content_type']}
{snippet}
""")

    context_text = "\n".join(context_chunks)

    # Step 5: Create RAG prompt
    prompt = f"""You are an expert pandas assistant. Use the provided context to answer the user's question about pandas data analysis.

CONTEXT:
{context_text}

INSTRUCTIONS:
- Answer based primarily on the provided context
- Include relevant code examples when available in the context
- If the context doesn't fully answer the question, acknowledge this but provide helpful guidance based on what's available
- Be comprehensive but concise
- Format code examples clearly with proper syntax
- Cite which context sections you're using when relevant

USER QUESTION: {question}

COMPREHENSIVE ANSWER:"""

    # Step 6: Generate response with LLM. A failure is reported through the
    # "answer" field rather than raised, so batch runs keep going.
    try:
        response = groq_client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model_name,
            max_tokens=1500,
            temperature=0.1
        )
        answer = response.choices[0].message.content
    except Exception as e:
        return {
            "answer": f"Error generating response: {str(e)}",
            "chunks": retrieved_chunks,
            "avg_relevance": avg_relevance,
            "processed_query": processed_query,
            "model_used": model_name,
            "context_length": len(context_text)
        }

    if show_context:
        print(f"\nProcessed Query: {processed_query}")
        print(f"Retrieved {len(retrieved_chunks)} chunks with avg relevance: {avg_relevance:.3f}")

    return {
        "answer": answer,
        "chunks": retrieved_chunks,
        "avg_relevance": avg_relevance,
        "processed_query": processed_query,
        "model_used": model_name,
        "context_length": len(context_text)
    }

# Run the full pipeline over a fixed set of questions and print each answer.
print("\n" + "=" * 60, "TESTING COMPLETE RAG PIPELINE", "=" * 60, sep="\n")

# Test queries ranging from basic to advanced
test_questions = [
    "What is a pandas DataFrame?",
    "How do I create a DataFrame from a dictionary?",
    "What's the difference between loc and iloc?",
    "How do I use groupby in pandas?",
    "How can I handle missing data in pandas?",
    "What are the best practices for reading CSV files with pandas?",
]

print(f"Testing {len(test_questions)} questions with optimized RAG system...")

# Results are keyed by the question text.
rag_results = {}
for i, question in enumerate(test_questions, start=1):
    print(f"\n--- Question {i}: {question} ---")

    result = complete_rag_pipeline(question, show_context=True, top_k=3)
    rag_results[question] = result

    # Fall back to the globally selected model when the result dict carries
    # no 'model_used' entry.
    print(f"Model: {result.get('model_used', selected_model)}")
    print(f"Answer ({len(result['answer'])} chars):", result['answer'], sep="\n")
    print(f"\nRelevance Score: {result['avg_relevance']:.3f}")

# Summarize retrieval relevance and answer length across all test questions.
print("\n" + "=" * 60, "COMPREHENSIVE RAG SYSTEM EVALUATION", "=" * 60, sep="\n")

relevance_scores = [entry['avg_relevance'] for entry in rag_results.values()]
answer_lengths = list(map(len, (entry['answer'] for entry in rag_results.values())))

print("RAG System Performance:")
print(f"  Questions answered: {len(rag_results)}")
print(f"  Average relevance: {np.mean(relevance_scores):.3f}")
print(f"  Min relevance: {np.min(relevance_scores):.3f}")
print(f"  Max relevance: {np.max(relevance_scores):.3f}")
print(f"  Average answer length: {np.mean(answer_lengths):.0f} characters")

# Quality assessment: count questions whose mean retrieval score clears each
# threshold (strictly greater than 0.6 / 0.8).
high_relevance = len([score for score in relevance_scores if score > 0.6])
excellent_relevance = len([score for score in relevance_scores if score > 0.8])

print("\nAnswer Quality Assessment:")
print(f"  High relevance (>0.6): {high_relevance}/{len(test_questions)} ({high_relevance/len(test_questions)*100:.1f}%)")
print(f"  Excellent relevance (>0.8): {excellent_relevance}/{len(test_questions)} ({excellent_relevance/len(test_questions)*100:.1f}%)")

# Model comparison: only runs when at least two models responded to the probe.
if len(working_models) >= 2:
    print("\n" + "=" * 60, "MODEL COMPARISON", "=" * 60, sep="\n")

    comparison_question = "What is a pandas DataFrame and how do I create one?"
    model_comparison = {}

    # Ask the same question of up to the first three responding models.
    for model_name, description in working_models[:3]:
        print(f"\nTesting {model_name}...")
        result = complete_rag_pipeline(comparison_question, model_name=model_name)
        model_comparison[model_name] = result

        print(f"  Relevance: {result['avg_relevance']:.3f}")
        print(f"  Answer length: {len(result['answer'])} chars")
        print(f"  Preview: {result['answer'][:200]}...")

    # Recommend the entry with the highest (avg_relevance, answer length).
    # NOTE(review): retrieval in complete_rag_pipeline does not depend on
    # model_name, so avg_relevance is presumably identical for every candidate
    # and answer length ends up deciding — confirm this is the intended
    # criterion. The "Quality" line below is a fixed string, not a measurement.
    best_model = max(
        model_comparison.items(),
        key=lambda item: (item[1]['avg_relevance'], len(item[1]['answer'])),
    )

    print(f"\n🏆 Recommended model: {best_model[0]}")
    print(f"   Relevance: {best_model[1]['avg_relevance']:.3f}")
    print("   Quality: Comprehensive answers with good context usage")

# Interactive testing function
def interactive_rag_test():
    """Run a console loop that answers pandas questions until the user quits.

    Each non-empty line read from ``input()`` is passed to
    ``complete_rag_pipeline`` with ``show_context=True``; the answer and its
    average relevance are printed. Entering ``quit``, ``exit`` or ``q``
    (case-insensitive) ends the loop. Blank input is ignored.
    """
    stop_commands = ('quit', 'exit', 'q')

    print("\n" + "=" * 60)
    print("INTERACTIVE RAG TESTING")
    print("=" * 60)
    print("Ask pandas questions! Type 'quit' to exit.")

    while True:
        question = input("\nYour question: ").strip()

        if question.lower() in stop_commands:
            return

        if question:
            print(f"\nProcessing: {question}")
            result = complete_rag_pipeline(question, show_context=True)

            print("\nAnswer:")
            print(result['answer'])
            print(f"\nRelevance: {result['avg_relevance']:.3f}")

# Bundle the questions, per-question results, aggregate metrics and the system
# configuration, then pickle the bundle next to the other processed artifacts.
# NOTE(review): rag_results holds the raw retrieved point objects, so loading
# this pickle presumably requires qdrant_client to be importable — confirm.
rag_evaluation = dict(
    test_questions=test_questions,
    rag_results=rag_results,
    performance_metrics=dict(
        avg_relevance=np.mean(relevance_scores),
        min_relevance=np.min(relevance_scores),
        max_relevance=np.max(relevance_scores),
        high_relevance_rate=high_relevance / len(test_questions),
        excellent_relevance_rate=excellent_relevance / len(test_questions),
    ),
    system_config=dict(
        embedding_model=embedding_model_name,
        llm_model=selected_model,
        collection_name=collection_name,
        query_preprocessing=True,
    ),
)

with (PROCESSED_DIR / 'rag_evaluation.pkl').open('wb') as f:
    pickle.dump(rag_evaluation, f)

# Final status summary. The "Production-ready" line is a fixed string.
print(
    "\n🎉 COMPLETE RAG SYSTEM READY!",
    "============================",
    f"✓ Optimized retrieval: {evaluation_data['metrics']['good_plus_rate']:.1%} Good+ rate",
    f"✓ Advanced LLM: {selected_model}",
    "✓ Query preprocessing: Enabled",
    f"✓ Average relevance: {np.mean(relevance_scores):.3f}",
    "✓ System performance: Production-ready",
    sep="\n",
)

print("\nTo test interactively, run:", "interactive_rag_test()", sep="\n")

print(f"\nResults saved to: {PROCESSED_DIR / 'rag_evaluation.pkl'}")
print("Ready for Streamlit app integration!")

  from .autonotebook import tqdm as notebook_tqdm


Loading optimized system...
Optimized Retrieval Performance:
  Average score: 0.665
  Good+ rate: 80.0%
  Improvement: +30.6%

Initializing optimized RAG components...
✓ Embedding model: multi-qa-mpnet-base-dot-v1
✓ Vector database: pandas_docs_optimized
✓ Query preprocessing ready

Initializing Groq LLM...
✓ Groq client initialized

Testing available Groq models...
✓ llama-3.1-8b-instant: working
✗ llama-3.1-70b-versatile: Not available
✗ mixtral-8x7b-32768: Not available
✗ gemma-7b-it: Not available

🎯 Selected model: llama-3.1-8b-instant

TESTING COMPLETE RAG PIPELINE
Testing 6 questions with optimized RAG system...

--- Question 1: What is a pandas DataFrame? ---

Processed Query: what is a pandas DataFrame?
Retrieved 3 chunks with avg relevance: 0.652
Model: llama-3.1-8b-instant
Answer (1463 chars):
Based on the provided context, a pandas DataFrame is a two-dimensional table in pandas that can seamlessly integrate different data types and organize them into a single, cohesive stru