In [3]:
# ===================================================================
# TIMBER MOUNTAIN AI CHATBOT - DATA PROCESSING PIPELINE
# ===================================================================
# Step 1: Process and Combine Your Data
# This notebook processes A/B test metadata and PDF presentations
# to create a unified dataset for the Neo4j GraphRAG system.

import pandas as pd
import pdfplumber
import json
import os
from pathlib import Path
from datetime import datetime

# Startup banner: title, separator, import confirmation, and the wall-clock
# time at which this run began.
started_at = datetime.now()
for banner_line in (
    "🌲 Timber Mountain AI Chatbot - Data Processing Pipeline",
    "=" * 60,
    "✅ All libraries imported successfully!",
    f"📅 Processing started at: {started_at:%Y-%m-%d %H:%M:%S}",
):
    print(banner_line)

🌲 Timber Mountain AI Chatbot - Data Processing Pipeline
✅ All libraries imported successfully!
📅 Processing started at: 2025-07-01 06:06:47


In [4]:
# ===================================================================
# STEP 1: LOAD AND ANALYZE A/B TEST METADATA
# ===================================================================

# Input locations, all resolved relative to the notebook's parent directory.
project_root = Path("..")
metadata_path = project_root.joinpath("2 - Synthetic Metadata", "Timber Mountain - AB Test Metadata.xlsx")
pdf_directory = project_root.joinpath("3 - Synthetic A:B Test Results Decks", "2 - Results Presentations")

print("📊 LOADING A/B TEST METADATA")
print("-" * 40)

# Dates are rendered as plain ISO day strings everywhere in this cell.
date_format = '%Y-%m-%d'

try:
    # Read the 'Metadata' sheet of the workbook into a DataFrame.
    metadata_df = pd.read_excel(metadata_path, sheet_name='Metadata')

    print(f"✅ Successfully loaded metadata for {len(metadata_df)} A/B tests")
    print(f"📁 Metadata file: {metadata_path.name}")
    print(f"📋 Columns: {list(metadata_df.columns)}")

    # Human-readable overview: one numbered entry per spreadsheet row.
    print(f"\n🧪 A/B TEST OVERVIEW:")
    print("-" * 40)
    for i, row in metadata_df.iterrows():
        print(f"{i + 1}. {row['Test Name']}")
        print(f"   📄 PDF: {row['PDF File Name']}")
        launch_day = row['Test Launch'].strftime(date_format)
        end_day = row['Test End'].strftime(date_format)
        print(f"   📅 Duration: {launch_day} → {end_day}")
        print(f"   🎯 Target: {row['Target Segment']}")
        print()

    # Index the rows by PDF file name so later cells can join on it.
    # A repeated file name keeps the last row seen.
    metadata_lookup = {
        row['PDF File Name']: {
            'test_name': row['Test Name'],
            'test_launch': row['Test Launch'].strftime(date_format),
            'test_end': row['Test End'].strftime(date_format),
            'country': row['Country'],
            'target_segment': row['Target Segment'],
            'page_placement': row['Page / Placement'],
            'test_hypothesis': row['Test Hypothesis'],
            'test_result': row['Test Result & Interpretation']
        }
        for _, row in metadata_df.iterrows()
    }

    print(f"🔍 Created metadata lookup for {len(metadata_lookup)} tests")

except FileNotFoundError:
    # Workbook is not where we expect it: say so, then propagate.
    print(f"❌ ERROR: Metadata file not found at {metadata_path}")
    raise
except Exception as exc:
    # Anything else (missing sheet/column, bad date cell, ...): report and propagate.
    print(f"❌ ERROR loading metadata: {exc}")
    raise

📊 LOADING A/B TEST METADATA
----------------------------------------
✅ Successfully loaded metadata for 5 A/B tests
📁 Metadata file: Timber Mountain - AB Test Metadata.xlsx
📋 Columns: ['Test Name', 'PDF File Name', 'Test Launch', 'Test End', 'Country', 'Target Segment', 'Page / Placement', 'Test Hypothesis', 'Test Result & Interpretation']

🧪 A/B TEST OVERVIEW:
----------------------------------------
1. Homepage: Domestic vs. International Visitors — Content Personalization Test
   📄 PDF: 1 - Locale-Aware-Experience-How-We-Boosted-International-Conversions-at-Timber-Mountain.pdf
   📅 Duration: 2024-07-08 → 2024-07-28
   🎯 Target: Browser-locale ≠ “en-US” (Int’l) vs “en-US” (Domestic)

2. AI Planner: Add Verified Star Ratings — Trust & Adoption Test
   📄 PDF: 2 - Wild-Willy-AI-Planner-Trust-and-Adoption-AB-Test-Results.pdf
   📅 Duration: 2024-08-05 → 2024-08-25
   🎯 Target: Wild Willy AI Travel Planner Users

3. Checkout: Unified Booking.com Bundle Flow — Seamless-Booking Test
   📄 PDF

In [5]:
# ===================================================================
# STEP 2: PROCESS PDF FILES AND EXTRACT TEXT CONTENT
# ===================================================================

print("\n📄 PROCESSING PDF FILES")
print("-" * 40)

# Find all PDF files in the results presentations directory.
# Sorted once so listing and processing share the same stable order; the
# is_file() check skips directories that merely have a ".pdf" suffix.
pdf_files = sorted(
    f for f in pdf_directory.iterdir()
    if f.is_file() and f.suffix.lower() == '.pdf'
)

if not pdf_files:
    print(f"❌ ERROR: No PDF files found in {pdf_directory}")
    raise FileNotFoundError("PDF files not found")

print(f"📂 Found {len(pdf_files)} PDF files to process:")
for pdf_file in pdf_files:
    print(f"   - {pdf_file.name}")

# Maps PDF file name -> dict with keys:
#   file_path, page_count, pages_with_text, text_length, full_text
#   (+ 'error' when extraction raised).
pdf_text_content = {}

print(f"\n🔍 EXTRACTING TEXT FROM PDF FILES:")
print("-" * 40)

for pdf_path in pdf_files:
    print(f"Processing: {pdf_path.name}")

    try:
        with pdfplumber.open(pdf_path) as pdf:
            all_pages_text = []
            # Counted separately because the "[No text extracted]" placeholder
            # below still contributes characters to full_text; without this
            # counter an image-only PDF would look like a successful extraction.
            pages_with_text = 0

            for page_num, page in enumerate(pdf.pages, 1):
                page_text = page.extract_text()
                if page_text:
                    pages_with_text += 1
                    all_pages_text.append(f"=== PAGE {page_num} ===\n{page_text}")
                else:
                    all_pages_text.append(f"=== PAGE {page_num} ===\n[No text extracted]")

            # Combine all pages, each introduced by its page marker
            full_text = "\n\n".join(all_pages_text)
            page_count = len(pdf.pages)

            pdf_text_content[pdf_path.name] = {
                'file_path': str(pdf_path),
                'page_count': page_count,
                'pages_with_text': pages_with_text,
                'text_length': len(full_text),
                'full_text': full_text
            }

            print(f"   ✅ Extracted {len(full_text):,} characters from {page_count} pages")
            if pages_with_text < page_count:
                print(f"   ⚠️  {page_count - pages_with_text} of {page_count} pages had no extractable text")

    except Exception as e:
        # Best effort: record the failure and carry on with the remaining PDFs.
        print(f"   ❌ ERROR processing {pdf_path.name}: {e}")
        pdf_text_content[pdf_path.name] = {
            'file_path': str(pdf_path),
            'page_count': 0,
            'pages_with_text': 0,
            'text_length': 0,
            'full_text': f"[ERROR: Could not extract text - {e}]",
            'error': str(e)
        }

failed_count = sum(1 for v in pdf_text_content.values() if 'error' in v)
textless_count = sum(
    1 for v in pdf_text_content.values()
    if 'error' not in v and v['pages_with_text'] == 0
)

print(f"\n📊 PDF PROCESSING SUMMARY:")
print(f"   • Successfully processed: {len(pdf_text_content) - failed_count}")
print(f"   • Failed to process: {failed_count}")
print(f"   • Total text extracted: {sum(v['text_length'] for v in pdf_text_content.values()):,} characters")
if textless_count:
    print(f"   • ⚠️  PDFs opened but containing no extractable text: {textless_count}")


📄 PROCESSING PDF FILES
----------------------------------------
📂 Found 5 PDF files to process:
   - 1 - Locale-Aware-Experience-How-We-Boosted-International-Conversions-at-Timber-Mountain.pdf
   - 2 - Wild-Willy-AI-Planner-Trust-and-Adoption-AB-Test-Results.pdf
   - 3 - Timber-Mountain-Unified-Bundle-Flow-Checkout-Test-Results.pdf
   - 4 - Timber-Mountain-CTA-Copy-Test-Results.pdf
   - 5 - Homepage-Special-Offers-Carousel-Merchandising-Test-Results.pdf

🔍 EXTRACTING TEXT FROM PDF FILES:
----------------------------------------
Processing: 1 - Locale-Aware-Experience-How-We-Boosted-International-Conversions-at-Timber-Mountain.pdf
   ✅ Extracted 2,992 characters from 7 pages
Processing: 2 - Wild-Willy-AI-Planner-Trust-and-Adoption-AB-Test-Results.pdf
   ✅ Extracted 4,031 characters from 9 pages
Processing: 3 - Timber-Mountain-Unified-Bundle-Flow-Checkout-Test-Results.pdf
   ✅ Extracted 4,595 characters from 10 pages
Processing: 4 - Timber-Mountain-CTA-Copy-Test-Results.pdf
   ✅ Extract

In [7]:
# ===================================================================
# STEP 3: COMBINE METADATA WITH PDF TEXT CONTENT
# ===================================================================

print("\n🔗 COMBINING METADATA WITH PDF CONTENT")
print("-" * 40)

# One unified record per extracted PDF, plus running counters for the summary.
unified_documents = []
processing_summary = {
    'successful_matches': 0,
    'missing_metadata': 0,
    'missing_pdf_content': 0,
    'total_processed': 0
}

# The PDFs drive the join: each extracted PDF is paired with the metadata row
# whose 'PDF File Name' equals its file name (empty dict when there is none).
for pdf_filename, pdf_data in pdf_text_content.items():
    print(f"Combining data for: {pdf_filename}")

    metadata = metadata_lookup.get(pdf_filename, {})

    # Create unified document; ids are sequential in processing order.
    unified_doc = {
        'document_id': f"timber_mountain_{len(unified_documents) + 1:03d}",
        'source_pdf_filename': pdf_filename,
        'source_pdf_path': pdf_data.get('file_path', ''),
        'pdf_processing': {
            'page_count': pdf_data.get('page_count', 0),
            'text_length': pdf_data.get('text_length', 0),
            'extraction_error': pdf_data.get('error', None)
        },
        'metadata': metadata,
        'content': {
            'full_text': pdf_data.get('full_text', ''),
            'processed_timestamp': datetime.now().isoformat()
        }
    }

    # Update processing summary
    processing_summary['total_processed'] += 1

    if metadata:
        processing_summary['successful_matches'] += 1
        print(f"   ✅ Metadata found: {metadata.get('test_name', 'Unknown')}")
    else:
        processing_summary['missing_metadata'] += 1
        print(f"   ⚠️  No metadata found for {pdf_filename}")

    if pdf_data.get('full_text') and 'error' not in pdf_data:
        print(f"   ✅ PDF content: {pdf_data['text_length']:,} characters")
    else:
        processing_summary['missing_pdf_content'] += 1
        print(f"   ⚠️  PDF content extraction failed")

    unified_documents.append(unified_doc)
    print()

# The loop above only visits PDFs, so a metadata row whose PDF is absent would
# otherwise vanish without a trace. Report such rows explicitly.
orphaned_metadata = [name for name in metadata_lookup if name not in pdf_text_content]
if orphaned_metadata:
    print(f"⚠️  Metadata rows with no matching PDF (not included in output): {len(orphaned_metadata)}")
    for name in orphaned_metadata:
        print(f"   - {name}")
    print()

print(f"📊 COMBINATION SUMMARY:")
print(f"   • Total documents processed: {processing_summary['total_processed']}")
print(f"   • Successful metadata matches: {processing_summary['successful_matches']}")
print(f"   • Missing metadata: {processing_summary['missing_metadata']}")
print(f"   • PDF extraction failures: {processing_summary['missing_pdf_content']}")
print(f"   • Unified documents created: {len(unified_documents)}")

# Display sample unified document structure
if unified_documents:
    print(f"\n📋 SAMPLE UNIFIED DOCUMENT STRUCTURE:")
    print("-" * 40)
    sample_doc = unified_documents[0]
    print(f"Document ID: {sample_doc['document_id']}")
    print(f"Source PDF: {sample_doc['source_pdf_filename']}")
    print(f"Test Name: {sample_doc['metadata'].get('test_name', 'N/A')}")
    # This line shows the first 100 characters of the text, so it is labelled
    # as a preview (it was previously mislabelled "Content Length").
    print(f"Content Preview: {sample_doc['content']['full_text'][:100]}...")
    print(f"Metadata Keys: {list(sample_doc['metadata'].keys()) if sample_doc['metadata'] else 'None'}")


🔗 COMBINING METADATA WITH PDF CONTENT
----------------------------------------
Combining data for: 1 - Locale-Aware-Experience-How-We-Boosted-International-Conversions-at-Timber-Mountain.pdf
   ✅ Metadata found: Homepage: Domestic vs. International Visitors — Content Personalization Test
   ✅ PDF content: 2,992 characters

Combining data for: 2 - Wild-Willy-AI-Planner-Trust-and-Adoption-AB-Test-Results.pdf
   ✅ Metadata found: AI Planner: Add Verified Star Ratings — Trust & Adoption Test
   ✅ PDF content: 4,031 characters

Combining data for: 3 - Timber-Mountain-Unified-Bundle-Flow-Checkout-Test-Results.pdf
   ✅ Metadata found: Checkout: Unified Booking.com Bundle Flow — Seamless-Booking Test
   ✅ PDF content: 4,595 characters

Combining data for: 4 - Timber-Mountain-CTA-Copy-Test-Results.pdf
   ✅ Metadata found: Site-wide CTA Copy: “Learn More” vs. “Explore More” — Engagement Nudge Test
   ✅ PDF content: 4,244 characters

Combining data for: 5 - Homepage-Special-Offers-Carousel-Merch

In [8]:
# ===================================================================
# STEP 4: GENERATE UNIFIED JSON OUTPUT FOR NEO4J GRAPHRAG
# ===================================================================

print("\n💾 GENERATING UNIFIED JSON OUTPUT")
print("-" * 40)

# Define output path (written next to the source folders, under project_root)
output_json_path = project_root / "processed_documents.json"

# Final structure: run-level provenance plus the list of unified documents.
final_output = {
    'processing_metadata': {
        'created_timestamp': datetime.now().isoformat(),
        'source_excel_file': str(metadata_path),
        'source_pdf_directory': str(pdf_directory),
        'total_documents': len(unified_documents),
        'processing_summary': processing_summary
    },
    'documents': unified_documents
}

try:
    # ensure_ascii=False keeps characters such as "—" and curly quotes
    # readable in the file instead of \uXXXX escapes.
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(final_output, f, indent=2, ensure_ascii=False)

    # Calculate file size
    file_size_mb = output_json_path.stat().st_size / (1024 * 1024)

    print(f"✅ Successfully saved unified dataset!")
    print(f"📁 Output file: {output_json_path.name}")
    print(f"📏 File size: {file_size_mb:.2f} MB")
    print(f"📊 Contains {len(unified_documents)} documents")

    # Display JSON structure summary
    print(f"\n📋 JSON STRUCTURE SUMMARY:")
    print("-" * 40)
    print("└── processing_metadata")
    print("    ├── created_timestamp")
    print("    ├── source_excel_file")
    print("    ├── source_pdf_directory")
    print("    ├── total_documents")
    print("    └── processing_summary")
    print("└── documents (array)")
    print("    └── [document]")
    print("        ├── document_id")
    print("        ├── source_pdf_filename")
    print("        ├── source_pdf_path")
    print("        ├── pdf_processing")
    print("        ├── metadata")
    print("        └── content")

    print(f"\n🎯 READY FOR NEXT STEP: Populating Neo4j Graph Database")
    print(f"📝 Use '{output_json_path.name}' as input for graph population")

except Exception as e:
    print(f"❌ ERROR saving JSON file: {e}")
    raise

# Display sample of final JSON structure: run metadata + first document only.
print(f"\n📄 SAMPLE JSON OUTPUT:")
print("-" * 40)
sample_output = {
    'processing_metadata': final_output['processing_metadata'],
    'documents': final_output['documents'][:1]
}

# Serialise once and base the truncation decision on that same string.
# (Previously the length test used str(sample_output), i.e. the Python repr,
# while the text being cut was the JSON dump.) ensure_ascii=False makes the
# preview match what was actually written to disk.
sample_json = json.dumps(sample_output, indent=2, ensure_ascii=False)
preview_limit = 1000
print(sample_json[:preview_limit] + "..." if len(sample_json) > preview_limit else sample_json)


💾 GENERATING UNIFIED JSON OUTPUT
----------------------------------------
✅ Successfully saved unified dataset!
📁 Output file: processed_documents.json
📏 File size: 0.03 MB
📊 Contains 5 documents

📋 JSON STRUCTURE SUMMARY:
----------------------------------------
└── processing_metadata
    ├── created_timestamp
    ├── source_excel_file
    ├── source_pdf_directory
    ├── total_documents
    └── processing_summary
└── documents (array)
    └── [document]
        ├── document_id
        ├── source_pdf_filename
        ├── source_pdf_path
        ├── pdf_processing
        ├── metadata
        └── content

🎯 READY FOR NEXT STEP: Populating Neo4j Graph Database
📝 Use 'processed_documents.json' as input for graph population

📄 SAMPLE JSON OUTPUT:
----------------------------------------
{
  "processing_metadata": {
    "created_timestamp": "2025-07-01T06:16:03.658141",
    "source_excel_file": "../2 - Synthetic Metadata/Timber Mountain - AB Test Metadata.xlsx",
    "source_pdf_directory

In [9]:
# ===================================================================
# STEP 5: DATA VALIDATION AND SUMMARY STATISTICS
# ===================================================================

print("\n✅ DATA VALIDATION AND FINAL SUMMARY")
print("=" * 60)


def _status_icon(passed):
    """Return "✅" for a passing validation check and "❌" for a failing one."""
    return "✅" if passed else "❌"


# Validation checks. Note that 'all_tests_processed' only compares counts
# (documents produced vs. metadata rows), not which tests they are.
validation_results = {
    'all_tests_processed': len(unified_documents) == len(metadata_df),
    'all_metadata_matched': processing_summary['missing_metadata'] == 0,
    'all_pdfs_extracted': processing_summary['missing_pdf_content'] == 0,
    'output_file_created': output_json_path.exists(),
    'total_characters': sum(len(doc['content']['full_text']) for doc in unified_documents),
    'avg_characters_per_doc': 0
}

# Guard against division by zero when nothing was processed.
if len(unified_documents) > 0:
    validation_results['avg_characters_per_doc'] = validation_results['total_characters'] / len(unified_documents)

# Display validation results. The leading icon reflects the actual outcome
# (it used to be a hard-coded "✅", even next to "False").
print("🔍 VALIDATION RESULTS:")
print("-" * 30)
print(f"{_status_icon(validation_results['all_tests_processed'])} All A/B tests processed: {validation_results['all_tests_processed']} ({len(unified_documents)}/{len(metadata_df)})")
print(f"{_status_icon(validation_results['all_metadata_matched'])} All metadata matched: {validation_results['all_metadata_matched']} ({processing_summary['successful_matches']}/{len(unified_documents)})")
print(f"{_status_icon(validation_results['all_pdfs_extracted'])} All PDFs extracted: {validation_results['all_pdfs_extracted']} (failures: {processing_summary['missing_pdf_content']})")
print(f"{_status_icon(validation_results['output_file_created'])} Output file created: {validation_results['output_file_created']} ({output_json_path.name})")

print(f"\n📊 CONTENT STATISTICS:")
print("-" * 30)
print(f"Total text content: {validation_results['total_characters']:,} characters")
print(f"Average per document: {validation_results['avg_characters_per_doc']:,.0f} characters")

# Document-level statistics (test names are cut to 50 characters for display)
print(f"\n📋 DOCUMENT-LEVEL BREAKDOWN:")
print("-" * 30)
for i, doc in enumerate(unified_documents, 1):
    test_name = doc['metadata'].get('test_name', 'Unknown Test')
    char_count = len(doc['content']['full_text'])
    page_count = doc['pdf_processing']['page_count']

    print(f"{i}. {test_name[:50]}{'...' if len(test_name) > 50 else ''}")
    print(f"   📄 {page_count} pages, {char_count:,} characters")

# Check for potential issues: extraction errors, unmatched metadata, and
# suspiciously short content (under 100 characters).
print(f"\n⚠️  POTENTIAL ISSUES:")
print("-" * 30)
issues_found = 0

for doc in unified_documents:
    if doc['pdf_processing'].get('extraction_error'):
        print(f"❌ PDF extraction error in: {doc['source_pdf_filename']}")
        issues_found += 1

    if not doc['metadata']:
        print(f"⚠️  Missing metadata for: {doc['source_pdf_filename']}")
        issues_found += 1

    if len(doc['content']['full_text']) < 100:
        print(f"⚠️  Very short content in: {doc['source_pdf_filename']} ({len(doc['content']['full_text'])} chars)")
        issues_found += 1

if issues_found == 0:
    print("✅ No issues detected - data quality looks good!")

# Final processing summary
print(f"\n🎯 PROCESSING COMPLETE!")
print("=" * 60)
print(f"📈 Successfully processed {len(unified_documents)} A/B test documents")
print(f"💾 Output saved to: {output_json_path}")
print(f"🔗 Ready for Neo4j graph database population")
print(f"📅 Processing completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Next steps guidance (file name taken from output_json_path so it cannot
# drift from the path the data was actually written to)
print(f"\n📋 NEXT STEPS:")
print("-" * 30)
print("1. 🗄️  Set up Neo4j database connection")
print("2. 🏗️  Create graph schema for A/B test data")
print(f"3. 📥 Import {output_json_path.name} into Neo4j")
print("4. 🔍 Build GraphRAG query system")
print("5. 🤖 Integrate with LangChain for chatbot responses")
print("6. 🌐 Deploy Streamlit frontend")

print(f"\n🌲 Timber Mountain AI Chatbot data processing pipeline complete! 🌲")


✅ DATA VALIDATION AND FINAL SUMMARY
🔍 VALIDATION RESULTS:
------------------------------
✅ All A/B tests processed: True (5/5)
✅ All metadata matched: True (5/5)
✅ All PDFs extracted: True (failures: 0)
✅ Output file created: True (processed_documents.json)

📊 CONTENT STATISTICS:
------------------------------
Total text content: 21,031 characters
Average per document: 4,206 characters

📋 DOCUMENT-LEVEL BREAKDOWN:
------------------------------
1. Homepage: Domestic vs. International Visitors — Co...
   📄 7 pages, 2,992 characters
2. AI Planner: Add Verified Star Ratings — Trust & Ad...
   📄 9 pages, 4,031 characters
3. Checkout: Unified Booking.com Bundle Flow — Seamle...
   📄 10 pages, 4,595 characters
4. Site-wide CTA Copy: “Learn More” vs. “Explore More...
   📄 10 pages, 4,244 characters
5. Homepage: Special Offers Carousel — Merchandising ...
   📄 10 pages, 5,169 characters

⚠️  POTENTIAL ISSUES:
------------------------------
✅ No issues detected - data quality looks good!

🎯 PR

In [13]:
# ===================================================================
# STEP 6: ENVIRONMENT SETUP AND NEO4J CONNECTION
# ===================================================================
# Step 2: Populate the Enriched Neo4j Knowledge Graph
# This section uses LangChain's LLMGraphTransformer to automatically
# create an enriched knowledge graph from our processed documents.

import os
from dotenv import load_dotenv
from neo4j import GraphDatabase
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI
from langchain_community.graphs import Neo4jGraph
from langchain_core.documents import Document

print("\n🔗 NEO4J KNOWLEDGE GRAPH POPULATION")
print("=" * 60)
print("🎯 Step 2: Transform documents into enriched knowledge graph")

# Populate os.environ via python-dotenv before checking for required settings.
load_dotenv()

# Every name below must resolve to a non-empty value; collect the ones that don't.
required_env_vars = ['OPENAI_API_KEY', 'NEO4J_URI', 'NEO4J_USERNAME', 'NEO4J_PASSWORD']
missing_vars = list(filter(lambda name: not os.getenv(name), required_env_vars))

if missing_vars:
    # Explain what a complete .env looks like, then stop the cell.
    print(f"❌ ERROR: Missing environment variables: {missing_vars}")
    for help_line in (
        "Please ensure your .env file contains:",
        "  - OPENAI_API_KEY=your_openai_key",
        "  - NEO4J_URI=neo4j+s://your_neo4j_uri",
        "  - NEO4J_USERNAME=your_username",
        "  - NEO4J_PASSWORD=your_password",
    ):
        print(help_line)
    raise ValueError(f"Missing required environment variables: {missing_vars}")

print("✅ Environment variables loaded successfully")

# Open the graph connection and prove it works with two small queries.
try:
    neo4j_graph = Neo4jGraph(
        url=os.getenv('NEO4J_URI'),
        username=os.getenv('NEO4J_USERNAME'),
        password=os.getenv('NEO4J_PASSWORD')
    )

    # Round-trip a constant to confirm the server answers queries.
    result = neo4j_graph.query("RETURN 'Connection successful' as message")
    connection_message = result[0]['message']
    print(f"✅ Neo4j connection established: {connection_message}")

    # Report each server component together with its first listed version.
    db_info = neo4j_graph.query("CALL dbms.components() YIELD name, versions RETURN name, versions[0] as version")
    for component in db_info:
        print(f"📊 {component['name']}: {component['version']}")

except Exception as exc:
    print(f"❌ ERROR connecting to Neo4j: {exc}")
    print("Please verify your Neo4j credentials and connection string")
    raise

print(f"\n🔧 SETUP COMPLETE - Ready for graph transformation!")


🔗 NEO4J KNOWLEDGE GRAPH POPULATION
🎯 Step 2: Transform documents into enriched knowledge graph
✅ Environment variables loaded successfully
✅ Neo4j connection established: Connection successful
📊 Neo4j Kernel: 5.27-aura
📊 Cypher: 5

🔧 SETUP COMPLETE - Ready for graph transformation!


In [14]:
# ===================================================================
# STEP 7: LOAD AND PREPARE PROCESSED DOCUMENTS
# ===================================================================

print("\n📂 LOADING PROCESSED DOCUMENTS")
print("-" * 40)

# Read back the JSON produced by the data-processing cells.
try:
    with open(output_json_path, 'r', encoding='utf-8') as f:
        processed_data = json.load(f)

    run_info = processed_data['processing_metadata']
    print(f"✅ Loaded processed documents from: {output_json_path.name}")
    print(f"📊 Processing metadata: {run_info['created_timestamp']}")
    print(f"📈 Total documents: {run_info['total_documents']}")

except FileNotFoundError:
    print(f"❌ ERROR: {output_json_path.name} not found. Please run the data processing steps first.")
    raise
except Exception as exc:
    print(f"❌ ERROR loading processed documents: {exc}")
    raise

# One LangChain Document per processed record.
langchain_documents = []

# Fields copied from the spreadsheet-derived metadata; a missing field becomes ''.
excel_fields = (
    'test_name',
    'test_launch',
    'test_end',
    'country',
    'target_segment',
    'page_placement',
    'test_hypothesis',
    'test_result',
)

print(f"\n🔄 CONVERTING TO LANGCHAIN DOCUMENTS:")
print("-" * 40)

for doc_data in processed_data['documents']:
    pdf_stats = doc_data['pdf_processing']
    excel_metadata = doc_data['metadata']

    # Flat metadata: identifiers and PDF statistics first, then the
    # spreadsheet fields in the order listed in excel_fields.
    metadata = {
        'document_id': doc_data['document_id'],
        'source_pdf': doc_data['source_pdf_filename'],
        'page_count': pdf_stats['page_count'],
        'text_length': pdf_stats['text_length'],
        **{field: excel_metadata.get(field, '') for field in excel_fields},
    }

    # The page content repeats the structured fields as text ahead of the
    # extracted presentation, so the graph transformer reads both together.
    enriched_content = f"""
A/B TEST: {metadata['test_name']}

METADATA:
- Document ID: {metadata['document_id']}
- Test Launch Date: {metadata['test_launch']}
- Test End Date: {metadata['test_end']}
- Country: {metadata['country']}
- Target Segment: {metadata['target_segment']}
- Page/Placement: {metadata['page_placement']}
- Test Hypothesis: {metadata['test_hypothesis']}
- Test Result: {metadata['test_result']}

FULL PRESENTATION CONTENT:
{doc_data['content']['full_text']}
"""

    langchain_doc = Document(page_content=enriched_content, metadata=metadata)
    langchain_documents.append(langchain_doc)

    print(f"✅ {metadata['document_id']}: {metadata['test_name'][:50]}...")
    print(f"   📝 Content length: {len(enriched_content):,} characters")

total_content_chars = sum(len(doc.page_content) for doc in langchain_documents)

print(f"\n📋 DOCUMENT PREPARATION SUMMARY:")
print(f"   • LangChain documents created: {len(langchain_documents)}")
print(f"   • Total content for transformation: {total_content_chars:,} characters")
print(f"   • Ready for LLMGraphTransformer processing")

# Show the first document as an example of the enriched structure.
if langchain_documents:
    sample_doc = langchain_documents[0]
    print(f"\n📄 SAMPLE ENRICHED CONTENT STRUCTURE:")
    print("-" * 40)
    print(f"Document ID: {sample_doc.metadata['document_id']}")
    print(f"Content preview: {sample_doc.page_content[:300]}...")
    print(f"Metadata keys: {list(sample_doc.metadata.keys())}")


📂 LOADING PROCESSED DOCUMENTS
----------------------------------------
✅ Loaded processed documents from: processed_documents.json
📊 Processing metadata: 2025-07-01T06:16:03.658141
📈 Total documents: 5

🔄 CONVERTING TO LANGCHAIN DOCUMENTS:
----------------------------------------
✅ timber_mountain_001: Homepage: Domestic vs. International Visitors — Co...
   📝 Content length: 3,839 characters
✅ timber_mountain_002: AI Planner: Add Verified Star Ratings — Trust & Ad...
   📝 Content length: 4,802 characters
✅ timber_mountain_003: Checkout: Unified Booking.com Bundle Flow — Seamle...
   📝 Content length: 5,408 characters
✅ timber_mountain_004: Site-wide CTA Copy: “Learn More” vs. “Explore More...
   📝 Content length: 4,931 characters
✅ timber_mountain_005: Homepage: Special Offers Carousel — Merchandising ...
   📝 Content length: 5,939 characters

📋 DOCUMENT PREPARATION SUMMARY:
   • LangChain documents created: 5
   • Total content for transformation: 24,919 characters
   • Ready for LL

In [15]:
# ===================================================================
# STEP 8: CONFIGURE LLMGRAPHTRANSFORMER
# ===================================================================

print("\n🤖 CONFIGURING LLMGRAPHTRANSFORMER")
print("-" * 40)

# Node labels offered to the LLM for the A/B-testing domain.
allowed_node_labels = [
    "ABTest",           # the experiment itself
    "Metric",           # KPIs / measurements
    "Segment",          # audiences being targeted
    "Feature",          # what is being tested
    "Result",           # outcomes and findings
    "Hypothesis",       # what the test set out to show
    "Page",             # page or placement
    "Variant",          # control / treatment arms
    "Conversion",       # conversion events
    "Insight",          # learnings
    "Recommendation"    # follow-up actions
]

# Relationship types offered to the LLM, with the intended direction.
allowed_relationship_types = [
    "TESTED_ON",        # ABTest -> Page
    "MEASURED_BY",      # ABTest -> Metric
    "TARGETS",          # ABTest -> Segment
    "HAS_VARIANT",      # ABTest -> Variant
    "PRODUCED",         # ABTest -> Result
    "VALIDATES",        # Result -> Hypothesis
    "INDICATES",        # Result -> Insight
    "SUGGESTS",         # Insight -> Recommendation
    "AFFECTS",          # Feature -> Metric
    "CONVERTS_TO",      # Segment -> Conversion
    "RELATES_TO",       # catch-all
]

# Chat model used for extraction: gpt-4o-mini at temperature 0.
try:
    llm = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0,
        api_key=os.getenv('OPENAI_API_KEY')
    )
    print("✅ OpenAI LLM initialized successfully")

except Exception as exc:
    print(f"❌ ERROR initializing OpenAI LLM: {exc}")
    raise

# Build the transformer and echo its effective configuration.
try:
    transformer = LLMGraphTransformer(
        llm=llm,
        allowed_nodes=allowed_node_labels,
        allowed_relationships=allowed_relationship_types,
        # Strict mode is deliberately OFF here. Per the LangChain docs this
        # should mean results are not filtered down to the lists above
        # (NOTE(review): confirm against the installed version).
        strict_mode=False,
    )

    print("✅ LLMGraphTransformer configured successfully")
    print(f"📝 Allowed node types: {len(transformer.allowed_nodes)}")
    print(f"🔗 Allowed relationship types: {len(transformer.allowed_relationships)}")

    print(f"\n📊 TRANSFORMATION CONFIGURATION:")
    print("-" * 40)
    node_summary = ', '.join(transformer.allowed_nodes)
    relationship_summary = ', '.join(transformer.allowed_relationships)
    print(f"🎯 Node Types: {node_summary}")
    print(f"🔗 Relationships: {relationship_summary}")
    print(f"🧠 LLM Model: {llm.model_name}")
    print(f"🌡️  Temperature: {llm.temperature}")

except Exception as exc:
    print(f"❌ ERROR configuring LLMGraphTransformer: {exc}")
    raise

# Outline of what the next cell will do.
print(f"\n🎯 READY FOR GRAPH TRANSFORMATION!")
for outline_line in (
    "The LLMGraphTransformer will now:",
    "  1. 📖 Read the enriched document content",
    "  2. 🧠 Use GPT-4 to identify entities and relationships",
    "  3. 🏗️  Create structured graph nodes with metadata properties",
    "  4. 🔗 Establish meaningful connections between entities",
    "  5. 💾 Prepare for Neo4j database population",
):
    print(outline_line)


🤖 CONFIGURING LLMGRAPHTRANSFORMER
----------------------------------------
✅ OpenAI LLM initialized successfully
✅ LLMGraphTransformer configured successfully
📝 Allowed node types: 11
🔗 Allowed relationship types: 11

📊 TRANSFORMATION CONFIGURATION:
----------------------------------------
🎯 Node Types: ABTest, Metric, Segment, Feature, Result, Hypothesis, Page, Variant, Conversion, Insight, Recommendation
🔗 Relationships: TESTED_ON, MEASURED_BY, TARGETS, HAS_VARIANT, PRODUCED, VALIDATES, INDICATES, SUGGESTS, AFFECTS, CONVERTS_TO, RELATES_TO
🧠 LLM Model: gpt-4o-mini
🌡️  Temperature: 0.0

🎯 READY FOR GRAPH TRANSFORMATION!
The LLMGraphTransformer will now:
  1. 📖 Read the enriched document content
  2. 🧠 Use GPT-4 to identify entities and relationships
  3. 🏗️  Create structured graph nodes with metadata properties
  4. 🔗 Establish meaningful connections between entities
  5. 💾 Prepare for Neo4j database population


In [16]:
# ===================================================================
# STEP 9: GRAPH TRANSFORMATION AND NEO4J POPULATION
# ===================================================================

print("\n🏗️  TRANSFORMING DOCUMENTS TO KNOWLEDGE GRAPH")
print("-" * 40)

# Clear existing data in Neo4j (optional - for clean start).
# WARNING: answering "y" deletes every node and relationship in the database.
try:
    clear_db = input("Clear existing Neo4j database? (y/N): ").lower().strip()
except (EOFError, NotImplementedError):
    # No interactive stdin (headless / nbconvert-style execution): input()
    # raises EOFError, or under Jupyter a StdinNotImplementedError, which
    # subclasses NotImplementedError. Fall back to the prompt's advertised
    # default of "N" instead of aborting the whole cell.
    clear_db = 'n'
    print("ℹ️  No interactive input available - keeping existing data (default: N)")

if clear_db == 'y':
    try:
        neo4j_graph.query("MATCH (n) DETACH DELETE n")
        print("🗑️  Cleared existing database")
    except Exception as e:
        # Best effort: a failed wipe is reported but does not stop the run.
        print(f"⚠️  Could not clear database: {e}")

# Transform documents into graph elements
all_graph_documents = []
transformation_summary = {
    'total_documents': len(langchain_documents),
    'successful_transformations': 0,
    'total_nodes': 0,
    'total_relationships': 0,
    'errors': []
}

print(f"\n🔄 PROCESSING {len(langchain_documents)} DOCUMENTS:")
print("-" * 40)

for i, document in enumerate(langchain_documents, 1):
    doc_id = document.metadata['document_id']
    test_name = document.metadata['test_name']
    
    print(f"{i}. Processing {doc_id}: {test_name[:50]}...")
    
    try:
        # Transform single document to graph
        graph_documents = transformer.convert_to_graph_documents([document])
        
        if graph_documents:
            graph_doc = graph_documents[0]
            
            # Count nodes and relationships
            node_count = len(graph_doc.nodes)
            rel_count = len(graph_doc.relationships)
            
            print(f"   ✅ Created {node_count} nodes, {rel_count} relationships")
            
            # Add to collection
            all_graph_documents.extend(graph_documents)
            
            # Update summary
            transformation_summary['successful_transformations'] += 1
            transformation_summary['total_nodes'] += node_count
            transformation_summary['total_relationships'] += rel_count
            
        else:
            print(f"   ⚠️  No graph elements created")
            
    except Exception as e:
        error_msg = f"Error processing {doc_id}: {str(e)}"
        print(f"   ❌ {error_msg}")
        transformation_summary['errors'].append(error_msg)

# Populate Neo4j database
print(f"\n💾 POPULATING NEO4J DATABASE:")
print("-" * 40)

if all_graph_documents:
    try:
        # Add graph documents to Neo4j
        neo4j_graph.add_graph_documents(
            all_graph_documents,
            baseEntityLabel=True,  # Add base Entity label to all nodes
            include_source=True    # Include source document info
        )
        
        print(f"✅ Successfully populated Neo4j database!")
        
        # Verify population with basic queries
        node_count = neo4j_graph.query("MATCH (n) RETURN count(n) as count")[0]['count']
        rel_count = neo4j_graph.query("MATCH ()-[r]->() RETURN count(r) as count")[0]['count']
        
        print(f"📊 Database population verified:")
        print(f"   • Total nodes in database: {node_count}")
        print(f"   • Total relationships in database: {rel_count}")
        
    except Exception as e:
        print(f"❌ ERROR populating Neo4j database: {e}")
        transformation_summary['errors'].append(f"Database population error: {str(e)}")

# Display transformation summary
print(f"\n📈 TRANSFORMATION SUMMARY:")
print("=" * 40)
print(f"Documents processed: {transformation_summary['total_documents']}")
print(f"Successful transformations: {transformation_summary['successful_transformations']}")
print(f"Total nodes created: {transformation_summary['total_nodes']}")
print(f"Total relationships created: {transformation_summary['total_relationships']}")

if transformation_summary['errors']:
    print(f"\\nErrors encountered: {len(transformation_summary['errors'])}")
    for error in transformation_summary['errors']:
        print(f"  • {error}")
else:
    print("\\n✅ No errors encountered!")

print(f"\\n🎉 KNOWLEDGE GRAPH POPULATION COMPLETE!")
print("Your Timber Mountain A/B test data is now structured as an enriched knowledge graph in Neo4j.")


🏗️  TRANSFORMING DOCUMENTS TO KNOWLEDGE GRAPH
----------------------------------------

🔄 PROCESSING 5 DOCUMENTS:
----------------------------------------
1. Processing timber_mountain_001: Homepage: Domestic vs. International Visitors — Co...
   ✅ Created 7 nodes, 6 relationships
2. Processing timber_mountain_002: AI Planner: Add Verified Star Ratings — Trust & Ad...
   ✅ Created 14 nodes, 11 relationships
3. Processing timber_mountain_003: Checkout: Unified Booking.com Bundle Flow — Seamle...
   ✅ Created 10 nodes, 9 relationships
4. Processing timber_mountain_004: Site-wide CTA Copy: “Learn More” vs. “Explore More...
   ✅ Created 12 nodes, 11 relationships
5. Processing timber_mountain_005: Homepage: Special Offers Carousel — Merchandising ...
   ✅ Created 13 nodes, 12 relationships

💾 POPULATING NEO4J DATABASE:
----------------------------------------
✅ Successfully populated Neo4j database!
📊 Database population verified:
   • Total nodes in database: 58
   • Total relationships 

In [18]:
# ===================================================================
# STEP 10: GRAPH VALIDATION AND VERIFICATION QUERIES
# ===================================================================
# Inspects what is actually stored in Neo4j (labels, relationship types,
# sample nodes) and then runs a few sample queries built from that structure.
# Uses `neo4j_graph` from an earlier cell.


def _cypher_identifier(name):
    """Return `name` backtick-quoted for safe use as a Cypher label/type.

    Backticks inside the name are escaped by doubling them, so labels that
    contain spaces or other special characters do not break the query.
    """
    return "`" + str(name).replace("`", "``") + "`"


print("\n🔍 VALIDATING KNOWLEDGE GRAPH STRUCTURE")
print("-" * 40)

# First, let's discover what actually exists in the database
discovery_queries = {
    "Database Overview": "MATCH (n) RETURN count(n) as total_nodes, count(distinct labels(n)) as unique_label_combinations",
    "All Node Labels": "MATCH (n) UNWIND labels(n) as label RETURN distinct label, count(*) as count ORDER BY count DESC",
    "All Relationship Types": "MATCH ()-[r]->() RETURN distinct type(r) as relationship_type, count(*) as count ORDER BY count DESC",
    "Sample Nodes": "MATCH (n) RETURN labels(n) as labels, keys(n) as properties, n LIMIT 10"
}

print("🔎 DISCOVERING ACTUAL GRAPH STRUCTURE:")
print("-" * 40)

discovered_labels = set()
discovered_relationships = set()

for query_name, query in discovery_queries.items():
    print(f"\n📊 {query_name}:")
    try:
        results = neo4j_graph.query(query)

        if results:
            for result in results:
                if query_name == "Database Overview":
                    print(f"  Total nodes: {result['total_nodes']}")
                    print(f"  Unique label combinations: {result['unique_label_combinations']}")

                elif query_name == "All Node Labels":
                    label = result['label']
                    count = result['count']
                    discovered_labels.add(label)
                    print(f"  {label}: {count} nodes")

                elif query_name == "All Relationship Types":
                    rel_type = result['relationship_type']
                    count = result['count']
                    discovered_relationships.add(rel_type)
                    print(f"  {rel_type}: {count} relationships")

                elif query_name == "Sample Nodes":
                    labels = result['labels']
                    props = result['properties']
                    node = result['n']
                    print(f"  Labels: {labels}")
                    print(f"  Properties: {props}")
                    # Show a few key properties if they exist
                    if hasattr(node, 'get'):
                        id_prop = node.get('id', node.get('name', 'N/A'))
                        print(f"  Sample ID/Name: {id_prop}")
                    print("  ---")
        else:
            print("  No results found")

    except Exception as e:
        print(f"  ❌ Error executing query: {e}")

# Now create adaptive queries based on what we actually found
print(f"\n🎯 ADAPTIVE SAMPLE QUERIES (Based on Discovered Structure):")
print("-" * 40)

if discovered_labels:
    print(f"Available node labels: {sorted(discovered_labels)}")
    print(f"Available relationships: {sorted(discovered_relationships)}")

    # Create adaptive queries based on what actually exists
    adaptive_queries = []

    # Look for any document-related nodes.
    # Sorted so the chosen label is the same on every run; iterating a set of
    # strings directly has no stable order between kernel sessions.
    doc_labels = sorted(
        label for label in discovered_labels
        if any(term in label.lower() for term in ['document', 'test', 'ab'])
    )
    if doc_labels:
        adaptive_queries.append({
            "name": f"Find all {doc_labels[0]} nodes with their properties",
            "query": f"MATCH (n:{_cypher_identifier(doc_labels[0])}) RETURN n LIMIT 5"
        })

    # Look for any relationship patterns
    if discovered_relationships:
        # Alphabetically first type, again for a reproducible choice.
        rel_type = sorted(discovered_relationships)[0]
        adaptive_queries.append({
            "name": f"Find nodes connected by {rel_type} relationships",
            "query": f"MATCH (a)-[r:{_cypher_identifier(rel_type)}]->(b) RETURN labels(a)[0] as from_type, labels(b)[0] as to_type, count(*) as connections LIMIT 5"
        })

    # Generic content search.
    # toStringOrNull() yields null for values toString() cannot convert
    # (e.g. list properties such as stored embeddings), where toString()
    # would fail the whole query.
    adaptive_queries.append({
        "name": "Search for nodes containing 'test' in any property",
        "query": "MATCH (n) WHERE any(prop in keys(n) WHERE toStringOrNull(n[prop]) CONTAINS 'test') RETURN labels(n) as node_labels, n LIMIT 5"
    })

    # Show nodes with most properties (likely the enriched ones)
    adaptive_queries.append({
        "name": "Find nodes with the most properties (likely enriched data)",
        "query": "MATCH (n) RETURN labels(n) as labels, size(keys(n)) as property_count, keys(n) as properties ORDER BY property_count DESC LIMIT 5"
    })

else:
    # If no labels found, the database might be empty
    adaptive_queries = [{
        "name": "Check if database is empty",
        "query": "MATCH (n) RETURN count(n) as total_nodes"
    }]

# Execute adaptive queries
for sample in adaptive_queries:
    print(f"\n📋 {sample['name']}:")
    try:
        results = neo4j_graph.query(sample['query'])
        if results:
            for i, result in enumerate(results):
                print(f"  {i+1}. {result}")
        else:
            print("  No results found")
    except Exception as e:
        print(f"  ❌ Query error: {e}")

# Troubleshooting section
print(f"\n🔧 TROUBLESHOOTING INFORMATION:")
print("-" * 40)

if not discovered_labels:
    print("⚠️  No node labels found in database!")
    print("   Possible issues:")
    print("   1. Step 9 (graph transformation) wasn't executed")
    print("   2. LLMGraphTransformer failed to create nodes")
    print("   3. Database connection issues")
    print("   4. Empty source documents")

    # Check if variables from previous steps exist.
    # At cell level locals() is globals(), and a membership test cannot
    # raise, so no try/except is needed here.
    if 'all_graph_documents' in globals():
        print(f"   ✅ Graph documents variable exists")
    else:
        print(f"   ⚠️  Graph documents variable not found - run Step 9 first")
else:
    print("✅ Graph structure discovered successfully!")
    print(f"   • Found {len(discovered_labels)} node types")
    print(f"   • Found {len(discovered_relationships)} relationship types")

# Final success confirmation
print(f"\n🎉 KNOWLEDGE GRAPH VALIDATION COMPLETE!")
print("=" * 60)
if discovered_labels:
    print("✅ Your Timber Mountain A/B test knowledge graph contains data!")
    print("🔗 Graph structure has been analyzed and validated")
    print("🤖 Ready for GraphRAG-powered chatbot queries")
    print("📊 Use the discovered labels and relationships for your queries")
else:
    print("⚠️  Graph appears to be empty - please run Step 9 first")
    print("🔧 Check the troubleshooting information above")

print(f"\n📋 NEXT STEPS FOR CHATBOT DEVELOPMENT:")
print("-" * 40)
print("1. 🔍 Build GraphRAG query system with discovered node types")
print("2. 🤖 Create conversation chain for natural language queries")
print("3. 🌐 Develop Streamlit frontend interface")
print("4. 🚀 Deploy publicly accessible chatbot")
print("5. 📈 Test with complex A/B testing questions")

print(f"\n🌲 Timber Mountain AI Chatbot - Knowledge Graph Phase Complete! 🌲")


🔍 VALIDATING KNOWLEDGE GRAPH STRUCTURE
----------------------------------------
🔎 DISCOVERING ACTUAL GRAPH STRUCTURE:
----------------------------------------

📊 Database Overview:
  Total nodes: 58
  Unique label combinations: 11

📊 All Node Labels:
  __Entity__: 53 nodes
  Segment: 17 nodes
  Feature: 6 nodes
  Metric: 6 nodes
  Document: 5 nodes
  Abtest: 5 nodes
  Page: 5 nodes
  Hypothesis: 5 nodes
  Result: 5 nodes
  Insight: 3 nodes
  Recommendation: 2 nodes

📊 All Relationship Types:
  MENTIONS: 56 relationships
  TARGETS: 19 relationships
  TESTED_ON: 6 relationships
  PRODUCED: 6 relationships
  VALIDATES: 6 relationships
  INDICATES: 6 relationships
  HAS_VARIANT: 3 relationships
  SUGGESTS: 2 relationships
  RELATES_TO: 1 relationships

📊 Sample Nodes:
  Labels: ['Document']
  Properties: ['id', 'text', 'country', 'test_end', 'test_launch', 'source_pdf', 'document_id', 'text_length', 'test_hypothesis', 'target_segment', 'test_result', 'page_placement', 'test_name', 'page_c

In [ ]:
# ===================================================================
# STEP 11: CREATE RAG RETRIEVER WITH NEO4J VECTOR STORE
# ===================================================================
# Step 3: Build Intelligent Semantic Search System
# This creates a RAG retriever that combines vector similarity search
# with the rich graph structure for intelligent A/B testing queries.
#
# Uses `neo4j_graph` from an earlier cell and the OPENAI_API_KEY / NEO4J_*
# environment variables. Produces `embeddings` and `vector_store`.

from langchain_neo4j import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

print("\n🔍 BUILDING RAG RETRIEVER SYSTEM")
print("=" * 60)
print("🎯 Step 3: Transform knowledge graph into intelligent semantic search")

# Configure OpenAI embeddings for semantic search
print("\n⚡ CONFIGURING SEMANTIC EMBEDDINGS:")
print("-" * 40)

try:
    embeddings = OpenAIEmbeddings(
        model="text-embedding-ada-002",  # High-quality embedding model
        openai_api_key=os.getenv('OPENAI_API_KEY')
    )

    # Test embeddings configuration (performs one real embedding request)
    test_embedding = embeddings.embed_query("A/B test conversion improvement")
    print(f"✅ OpenAI embeddings configured successfully")
    print(f"📊 Embedding dimension: {len(test_embedding)}")
    print(f"🧠 Model: text-embedding-ada-002")

except Exception as e:
    print(f"❌ ERROR configuring embeddings: {e}")
    raise

# Discover existing graph structure for vector store setup
print("\n🔎 ANALYZING GRAPH FOR VECTOR STORE SETUP:")
print("-" * 40)

# Check what text-rich nodes exist in the graph.
# toStringOrNull() is used instead of toString() because toString() fails on
# list-valued properties; once this cell has run, nodes carry an `embedding`
# list, which would make a re-run of this query error out.
nodes_query = "MATCH (n) WHERE any(prop in keys(n) WHERE size(toStringOrNull(n[prop])) > 100) RETURN labels(n) as labels, keys(n) as properties, count(*) as count ORDER BY count DESC"

available_labels = []
discovered_properties = []

try:
    results = neo4j_graph.query(nodes_query)

    print("📋 Nodes with substantial text content:")
    for result in results:
        labels = result['labels']
        properties = result['properties']
        count = result['count']
        # Keep first-seen order but skip duplicates: the query returns one row
        # per (labels, keys) combination, so labels can repeat across rows.
        available_labels.extend(label for label in labels if label not in available_labels)
        discovered_properties.extend(prop for prop in properties if prop not in discovered_properties)
        print(f"  {labels}: {count} nodes with properties {properties}")

except Exception as e:
    print(f"❌ Error analyzing nodes: {e}")

# Use comprehensive text-rich properties from Document nodes
print(f"\n📝 Using comprehensive text properties for rich embeddings:")
# Include ALL meaningful text fields for comprehensive semantic search
text_properties = [
    "text",              # Primary document content
    "test_hypothesis",   # Test hypothesis description  
    "test_result",       # Test results and findings
    "target_segment",    # User segment being tested
    "test_name",         # Descriptive test name
    "page_placement"     # Page location description
]

print(f"📊 Selected {len(text_properties)} text properties for embeddings:")
for prop in text_properties:
    print(f"  • {prop}")

# Properties excluded (non-text or existing embeddings):
excluded_props = ["id", "document_id", "country", "test_end", "test_launch", "source_pdf", "text_length", "page_count", "embedding"]
print(f"\n🚫 Excluded {len(excluded_props)} non-text properties: {excluded_props}")

# Create Neo4j Vector Store from existing graph
print(f"\n🏗️  CREATING NEO4J VECTOR STORE:")
print("-" * 40)

# Determine best configuration based on discovered structure.
# "Document" is the default so the printed configuration matches what is
# actually passed to the vector store when discovery finds nothing.
node_label = "Document"
text_node_props = text_properties  # Use comprehensive text properties

# Use discovered labels and properties
if available_labels:
    # Prefer Document or similar labels, fall back to first available.
    # "Abtest" and "__Entity__" are the spellings reported by the Step 10
    # label listing; the earlier spellings are kept for compatibility.
    preferred_labels = ["Document", "ABTest", "Abtest", "Test", "Entity", "__Entity__"]
    node_label = next((label for label in preferred_labels if label in available_labels), available_labels[0])

print(f"🎯 Vector store configuration:")
print(f"  • Target node label: {node_label}")
print(f"  • Text properties: {text_node_props}")
print(f"  • Embedding strategy: Combine all text fields for rich semantic search")

# True only when the fallback store had to be used.
vector_store_is_fallback = False

try:
    # Create vector store from existing graph
    vector_store = Neo4jVector.from_existing_graph(
        embedding=embeddings,
        url=os.getenv('NEO4J_URI'),
        username=os.getenv('NEO4J_USERNAME'),
        password=os.getenv('NEO4J_PASSWORD'),
        index_name="timber_mountain_embeddings",  # Unique index name
        node_label=node_label,  # Primary node type for embeddings
        text_node_properties=text_node_props,  # Properties containing embeddable text
        embedding_node_property="embedding",  # Where to store embeddings
    )

    print(f"✅ Neo4j Vector Store created successfully!")
    print(f"📊 Index name: timber_mountain_embeddings")
    print(f"🔗 Connected to existing knowledge graph")
    print(f"🧠 Embeddings created from {len(text_node_props)} text properties")

except Exception as e:
    print(f"❌ ERROR creating vector store: {e}")
    print(f"🔧 Troubleshooting tips:")
    print(f"   • Ensure Neo4j database contains nodes with text content")
    print(f"   • Check that Step 9 (graph population) was completed successfully")
    print(f"   • Verify Neo4j connection credentials")

    # Fallback: Create empty vector store for testing
    print(f"\n⚠️  Creating fallback vector store for testing...")
    try:
        vector_store = Neo4jVector(
            embedding=embeddings,
            url=os.getenv('NEO4J_URI'),
            username=os.getenv('NEO4J_USERNAME'),
            password=os.getenv('NEO4J_PASSWORD'),
            index_name="timber_mountain_embeddings_fallback"
        )
        vector_store_is_fallback = True
        print(f"✅ Fallback vector store created")
    except Exception as fallback_error:
        print(f"❌ Fallback also failed: {fallback_error}")
        raise

if vector_store_is_fallback:
    # Do not report success when no embeddings were created from the graph.
    print(f"\n⚠️  RAG RETRIEVER SETUP INCOMPLETE!")
    print("Only the fallback vector store is available; it was not built from the knowledge graph.")
    print("Resolve the error above and re-run this cell before testing retrieval.")
else:
    print(f"\n🎯 RAG RETRIEVER SETUP COMPLETE!")
    print("Your knowledge graph now has comprehensive semantic search capabilities!")
    print("Ready for intelligent A/B testing queries and chatbot integration.")

In [None]:
# ===================================================================
# STEP 12: CONFIGURE RETRIEVER AND TEST SEMANTIC SEARCH
# ===================================================================
# Builds a retriever on top of `vector_store` (Step 11), optionally wraps it
# with LLM-based contextual compression, and runs a set of sample queries.
# Produces `primary_retriever`, `retriever_type` and `retrieval_results`.

# Retrieval settings kept in one place so the code and the printed messages
# cannot disagree.
SCORE_THRESHOLD = 0.7   # Minimum similarity score (0-1)
MAX_RESULTS = 5         # Maximum number of results
FETCH_K = 20            # Number of documents to fetch before filtering

print("\n🔧 CONFIGURING INTELLIGENT RETRIEVER")
print("-" * 40)

# Create base retriever from vector store
try:
    base_retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",  # Only return highly relevant results
        search_kwargs={
            "score_threshold": SCORE_THRESHOLD,
            "k": MAX_RESULTS,
            "fetch_k": FETCH_K,
        }
    )

    print(f"✅ Base retriever configured:")
    print(f"  • Search type: similarity_score_threshold")
    print(f"  • Score threshold: {SCORE_THRESHOLD} (high relevance)")
    print(f"  • Max results: {MAX_RESULTS}")
    print(f"  • Fetch pool: {FETCH_K} documents")

    # Create enhanced retriever with compression for better context
    try:
        llm_for_compression = ChatOpenAI(
            model="gpt-4o-mini",
            temperature=0,
            api_key=os.getenv('OPENAI_API_KEY')
        )

        compressor = LLMChainExtractor.from_llm(llm_for_compression)

        enhanced_retriever = ContextualCompressionRetriever(
            base_compressor=compressor,
            base_retriever=base_retriever
        )

        print(f"✅ Enhanced retriever with contextual compression enabled")
        print(f"  • Uses GPT-4o-mini for intelligent result compression")
        print(f"  • Removes irrelevant content, keeps essential context")

        # Use enhanced retriever as primary
        primary_retriever = enhanced_retriever
        retriever_type = "Enhanced (with compression)"

    except Exception as compression_error:
        # Compression is optional: fall back to the plain vector retriever.
        print(f"⚠️  Compression setup failed: {compression_error}")
        print(f"✅ Using base retriever without compression")
        primary_retriever = base_retriever
        retriever_type = "Base (without compression)"

except Exception as e:
    print(f"❌ ERROR configuring retriever: {e}")
    raise

# Test retriever with A/B testing domain queries
print(f"\n🧪 TESTING SEMANTIC SEARCH WITH A/B TESTING QUERIES")
print("-" * 40)

# Define test queries that represent typical user questions
test_queries = [
    "homepage conversion improvements",
    "mobile user behavior tests",
    "international visitor experiences", 
    "booking flow optimization",
    "trust and credibility features",
    "personalization and targeting",
    "CTA button effectiveness",
    "special offers and promotions"
]

print(f"🎯 Testing {len(test_queries)} domain-specific queries:")
print(f"Retriever type: {retriever_type}")

retrieval_results = {}

for i, query in enumerate(test_queries, 1):
    print(f"\n📋 Query {i}: '{query}'")

    try:
        # Retrieve relevant documents.
        # invoke() is the current retriever entry point;
        # get_relevant_documents() is deprecated in LangChain.
        docs = primary_retriever.invoke(query)

        if docs:
            print(f"  ✅ Found {len(docs)} relevant documents")

            # Show summary of results
            for j, doc in enumerate(docs, 1):
                content_preview = doc.page_content[:150] + "..." if len(doc.page_content) > 150 else doc.page_content
                metadata_summary = {k: v for k, v in doc.metadata.items() if k in ['test_name', 'document_id', 'target_segment']}

                print(f"    {j}. Content: {content_preview}")
                print(f"       Metadata: {metadata_summary}")

            retrieval_results[query] = {
                'found': len(docs),
                'documents': docs
            }
        else:
            print(f"  ⚠️  No documents found")
            retrieval_results[query] = {'found': 0, 'documents': []}

    except Exception as e:
        print(f"  ❌ Error retrieving for '{query}': {e}")
        retrieval_results[query] = {'found': 0, 'error': str(e)}

# Analyze retrieval performance
print(f"\n📊 RETRIEVAL PERFORMANCE ANALYSIS:")
print("-" * 40)

successful_queries = [q for q, r in retrieval_results.items() if r.get('found', 0) > 0]
total_results = sum(r.get('found', 0) for r in retrieval_results.values())
avg_results = total_results / len(test_queries) if test_queries else 0

print(f"Successful queries: {len(successful_queries)}/{len(test_queries)}")
print(f"Total documents retrieved: {total_results}")
print(f"Average results per query: {avg_results:.1f}")

if successful_queries:
    print(f"\n✅ Top performing queries:")
    # Rank by number of documents found so "top performing" is accurate;
    # sorted() is stable, so ties keep their original query order.
    top_queries = sorted(
        successful_queries,
        key=lambda q: retrieval_results[q]['found'],
        reverse=True,
    )[:3]
    for query in top_queries:
        count = retrieval_results[query]['found']
        print(f"  • '{query}': {count} documents")
else:
    print(f"\n⚠️  No successful retrievals found")
    print(f"Possible issues:")
    print(f"  • Vector store may be empty (run Step 11 successfully first)")
    print(f"  • Embedding similarity threshold too high ({SCORE_THRESHOLD})")
    print(f"  • Graph content doesn't match test queries")
    print(f"  • Text properties not properly embedded")

# Advanced retrieval testing with graph context
print(f"\n🔗 TESTING GRAPH-ENHANCED RETRIEVAL:")
print("-" * 40)

if successful_queries:
    # Test hybrid retrieval combining semantic search with graph traversal
    sample_query = successful_queries[0]
    print(f"Using sample query: '{sample_query}'")

    try:
        # Reuse the documents already retrieved for this query above instead
        # of issuing the same retrieval (and LLM compression) a second time.
        semantic_docs = retrieval_results[sample_query]['documents']

        if semantic_docs and semantic_docs[0].metadata:
            # Extract node identifiers for graph enhancement
            sample_metadata = semantic_docs[0].metadata

            print(f"🔍 Semantic search found: {len(semantic_docs)} documents")
            print(f"📊 Sample metadata: {sample_metadata}")

            # Could add graph traversal here to find related nodes
            # This would combine semantic similarity with graph relationships
            print(f"🌐 Graph enhancement: Ready for implementation")
            print(f"   (Would traverse relationships from retrieved nodes)")

        else:
            print(f"⚠️  No metadata available for graph enhancement")

    except Exception as e:
        print(f"❌ Graph enhancement test failed: {e}")

print(f"\n🎉 RAG RETRIEVER TESTING COMPLETE!")
print("=" * 60)
if successful_queries:
    print(f"✅ Semantic search system operational")
    print(f"🔍 Vector embeddings enable intelligent query understanding")
    print(f"🧠 Ready for natural language A/B testing questions")
    print(f"🚀 Foundation complete for Streamlit chatbot development")
else:
    # Avoid claiming the system works when every test query came back empty.
    print(f"⚠️  Semantic search returned no results for any test query")
    print(f"🔧 Review the possible issues listed above before building on this retriever")

print(f"\n📋 NEXT STEPS:")
print("-" * 30)
print("1. 🤖 Build conversational AI chain with retrieved context")
print("2. 🌐 Create Streamlit chatbot interface")
print("3. 🔗 Integrate retriever with chat responses")
print("4. 🚀 Deploy public chatbot for A/B testing queries")

print(f"\n🌲 Timber Mountain RAG Retriever - Complete! 🌲")