In [None]:
#pip install -U sentence-transformers torch

In [1]:
from content_tree import *
import requests
import json

In [2]:
def test():
    tree = ContentTree()
    
    # Build the textbook tree from markdown files
    md_directory = "/Users/chemxai/GenAI/AI_Tutor/mcp_kb/md_files"
    tree.build_textbook_tree(md_directory)
    
    # Print tree structure
    print("Textbook Structure:")
    tree.print_tree_structure()
    
    # Get all nodes
    all_nodes = tree.tree_node_iterator()
    print(f"\nTotal nodes: {len(all_nodes)}")
    
    # Find a specific chapter
    chapter1 = tree.find_node_by_header("Chapter 1 - Essential Ideas")
    if chapter1:
        print(f"\nFound: {chapter1}")
        print(f"Content preview: {chapter1.content_text[:200]}...")
    
    # Get content from a node and its children
    if chapter1:
        section_content = tree.content_retriever(chapter1)
        print(f"\nChapter 1 total content length: {len(section_content)} characters")
    
    # Test: Print headers of all child nodes to verify order
    print("\n" + "="*60)
    print("ORDER VERIFICATION: All Root Child Nodes")
    print("="*60)
    print(f"Total root children: {len(tree.root.child_nodes)}")
    print("\nOrder of all child nodes:")
    for i, child in enumerate(tree.root.child_nodes, 1):
        print(f"{i:2d}. [{child.node_id:4d}] Level {child.header_level}: {child.header}")
    
    # Verify expected order
    expected_order = ["Preface"] + [f"Chapter {i} -" for i in range(1, 22)] + [f"Appendix {chr(65+i)}" for i in range(13)]
    print(f"\nExpected count: Preface(1) + Chapters(21) + Appendices(13) = 35 total")
    print(f"Actual count: {len(tree.root.child_nodes)}")
    
    # Check if order matches expected pattern
    order_correct = True
    for i, child in enumerate(tree.root.child_nodes):
        if i == 0 and not child.header.startswith("Preface"):
            order_correct = False
            print(f"❌ Position {i+1}: Expected Preface, got '{child.header}'")
        elif 1 <= i <= 21 and not child.header.startswith(f"Chapter {i}"):
            order_correct = False
            print(f"❌ Position {i+1}: Expected Chapter {i}, got '{child.header}'")
        elif i > 21 and not child.header.startswith(f"Appendix {chr(65+i-22)}"):
            order_correct = False
            print(f"❌ Position {i+1}: Expected Appendix {chr(65+i-22)}, got '{child.header}'")
    
    if order_correct:
        print("✅ Order verification: All nodes are in correct order!")
    else:
        print("❌ Order verification: Some nodes are out of order.")

def test1():
    # Create a ContentTree instance
    tree = ContentTree()

    # Build the textbook tree from markdown files
    md_directory = "/Users/chemxai/GenAI/AI_Tutor/mcp_kb/md_files"
    tree.build_textbook_tree(md_directory)

    # Rename repeating headers to make them unique
    tree.rename_repeating_headers()

    # Print tree structure
    print("Textbook Structure:")
    tree.print_tree_structure()

    # Generate summaries and keywords for all nodes
    print("\n" + "="*60)
    print("GENERATING SUMMARIES AND KEYWORDS")
    print("="*60)
    #tree.generate_all_summaries_and_keywords()
    tree.process_all_content()

    # Print tree structure with summaries and keywords
    print("\n" + "="*60)
    print("TREE STRUCTURE WITH SUMMARIES AND KEYWORDS")
    print("="*60)
    tree.print_tree_structure(show_summary=True, show_keywords=True)

    # Get all nodes
    all_nodes = tree.tree_node_iterator()
    print(f"\nTotal nodes: {len(all_nodes)}")

    # Find a specific chapter and show its enhanced information
    chapter1 = tree.find_node_by_header("Chapter 1 - Essential Ideas")
    if chapter1:
        print(f"\nFound: {chapter1}")
        print(f"Content preview: {chapter1.content_text[:200]}...")
        print(f"Summary: {chapter1.summary}")
        print(f"Keywords: {', '.join(chapter1.keywords)}")

    # Get content from a node and its children
    if chapter1:
        section_content = tree.content_retriever(chapter1)
        print(f"\nChapter 1 total content length: {len(section_content)} characters")

    # Test: Print headers of all child nodes to verify order
    print("\n" + "="*60)
    print("ORDER VERIFICATION: All Root Child Nodes")
    print("="*60)
    print(f"Total root children: {len(tree.root.child_nodes)}")
    print("\nOrder of all child nodes:")
    for i, child in enumerate(tree.root.child_nodes, 1):
        print(f"{i:2d}. [{child.node_id:4d}] Level {child.header_level}: {child.header}")

    # Verify expected order
    expected_order = ["Preface"] + [f"Chapter {i} -" for i in range(1, 22)] + [f"Appendix {chr(65+i)}" for i in range(13)]
    print(f"\nExpected count: Preface(1) + Chapters(21) + Appendices(13) = 35 total")
    print(f"Actual count: {len(tree.root.child_nodes)}")

    # Check if order matches expected pattern
    order_correct = True
    for i, child in enumerate(tree.root.child_nodes):
        if i == 0 and not child.header.startswith("Preface"):
            order_correct = False
            print(f"❌ Position {i+1}: Expected Preface, got '{child.header}'")
        elif 1 <= i <= 21 and not child.header.startswith(f"Chapter {i}"):
            order_correct = False
            print(f"❌ Position {i+1}: Expected Chapter {i}, got '{child.header}'")
        elif i > 21 and not child.header.startswith(f"Appendix {chr(65+i-22)}"):
            order_correct = False
            print(f"❌ Position {i+1}: Expected Appendix {chr(65+i-22)}, got '{child.header}'")

    if order_correct:
        print("✅ Order verification: All nodes are in correct order!")
    else:
        print("❌ Order verification: Some nodes are out of order.")

    # Show some examples of generated summaries and keywords
    print("\n" + "="*60)
    print("EXAMPLES OF GENERATED SUMMARIES AND KEYWORDS")
    print("="*60)
    content_nodes = [node for node in all_nodes if node.header_level > 0 and node.content_text.strip()]
    for i, node in enumerate(content_nodes[:5]):  # Show first 5 content nodes
        print(f"\nNode {i+1}: {node.header}")
        print(f"Content length: {len(node.content_text)} characters")
        print(f"Summary: {node.summary}")
        print(f"Keywords: {', '.join(node.keywords)}")
        print("-" * 40)
    return tree

In [None]:
# Run the full test1() pipeline and keep the ContentTree it returns in a notebook-level variable.
content_tree = test1()

In [5]:
from content_tree import ContentTree

def test_unified_search():
    """Test the unified search methods."""
    print("="*80)
    print("TESTING UNIFIED SEARCH FUNCTIONALITY")
    print("="*80)
    
    # Create a ContentTree instance
    tree = ContentTree()
    
    # Build the textbook tree from markdown files (limited to 2 files for testing)
    md_directory = "/Users/chemxai/GenAI/AI_Tutor/mcp_kb/md_files"
    print(f"Building tree from: {md_directory}")
    tree.build_textbook_tree(md_directory)
    
    # Rename repeating headers to make them unique
    tree.rename_repeating_headers()
    
    # Process content to create search indexes
    print("\nProcessing content and creating search indexes...")
    tree.process_tree_content(
        max_summary_words=20,
        max_keywords=5,
        generate_embeddings=False,  # Disabled for faster testing
        create_inverse_index=True
    )
    
    print("\n" + "="*80)
    print("TESTING SEARCH METHODS")
    print("="*80)
    
    test_queries = [
        "chemistry atoms",
        "measurements accuracy",
        "density volume",
        "scientific method"
    ]
    
    for query in test_queries:
        print(f"\n{'='*50}")
        print(f"Testing query: '{query}'")
        print(f"{'='*50}")
        
        # Test main search_content method with n-grams
        print("\n1. search_content(use_ngrams=True):")
        try:
            results_ngrams = tree.search_content(query, max_results=3, use_ngrams=True)
            print(f"   Found {len(results_ngrams)} results:")
            for i, (node_id, score) in enumerate(results_ngrams, 1):
                node = next((n for n in tree.tree_node_iterator() if n.node_id == node_id), None)
                if node:
                    print(f"     {i}. [{node_id}] {node.header} (score: {score:.3f})")
                else:
                    print(f"     {i}. [Node {node_id} not found] (score: {score:.3f})")
        except Exception as e:
            print(f"   ❌ Error: {e}")
        
        # Test main search_content method without n-grams
        print("\n2. search_content(use_ngrams=False):")
        try:
            results_simple = tree.search_content(query, max_results=3, use_ngrams=False)
            print(f"   Found {len(results_simple)} results:")
            for i, (node_id, score) in enumerate(results_simple, 1):
                node = next((n for n in tree.tree_node_iterator() if n.node_id == node_id), None)
                if node:
                    print(f"     {i}. [{node_id}] {node.header} (score: {score})")
                else:
                    print(f"     {i}. [Node {node_id} not found] (score: {score})")
        except Exception as e:
            print(f"   ❌ Error: {e}")
        
        # Test enhanced search
        print("\n3. enhanced_search():")
        try:
            enhanced_results = tree.enhanced_search(query, max_results=3)
            print(f"   Found {len(enhanced_results)} results:")
            for i, (node_id, score) in enumerate(enhanced_results, 1):
                node = next((n for n in tree.tree_node_iterator() if n.node_id == node_id), None)
                if node:
                    print(f"     {i}. [{node_id}] {node.header} (score: {score:.3f})")
                else:
                    print(f"     {i}. [Node {node_id} not found] (score: {score:.3f})")
        except Exception as e:
            print(f"   ❌ Error: {e}")
        
        # Test search_content_tree (returns actual nodes)
        print("\n4. search_content_tree():")
        try:
            node_results = tree.search_content_tree(query, max_results=3, use_ngrams=True)
            print(f"   Found {len(node_results)} nodes:")
            for i, node in enumerate(node_results, 1):
                print(f"     {i}. [{node.node_id}] {node.header}")
        except Exception as e:
            print(f"   ❌ Error: {e}")
        
        # Test deprecated method (should show warning)
        print("\n5. search_inverse_index() [DEPRECATED]:")
        try:
            deprecated_results = tree.search_inverse_index(query, max_results=3)
            print(f"   Found {len(deprecated_results)} results:")
            for i, (node_id, score) in enumerate(deprecated_results, 1):
                node = next((n for n in tree.tree_node_iterator() if n.node_id == node_id), None)
                if node:
                    print(f"     {i}. [{node_id}] {node.header} (score: {score})")
                else:
                    print(f"     {i}. [Node {node_id} not found] (score: {score})")
        except Exception as e:
            print(f"   ❌ Error: {e}")
    
    print("\n" + "="*80)
    print("TESTING SEARCH INDEX AVAILABILITY")
    print("="*80)
    
    # Test search indexes
    print(f"Has inverse_index_builder: {hasattr(tree, 'inverse_index_builder') and tree.inverse_index_builder is not None}")
    print(f"Has basic inverse_index: {hasattr(tree, 'inverse_index') and len(tree.inverse_index) > 0}")
    
    if hasattr(tree, 'inverse_index_builder') and tree.inverse_index_builder:
        print(f"N-gram indexes available:")
        print(f"  - Monograms: {len(tree.inverse_index_builder.monogram_index)} terms")
        print(f"  - Bigrams: {len(tree.inverse_index_builder.bigram_index)} terms")
        print(f"  - Trigrams: {len(tree.inverse_index_builder.trigram_index)} terms")
    
    print("\n" + "="*80)
    print("✅ UNIFIED SEARCH TESTING COMPLETE!")
    print("="*80)
    return True

# Run the unified-search smoke test when this code executes as the main
# program, then print a one-line verdict chosen by its boolean result.
if __name__ == "__main__":
    success = test_unified_search()
    print("\n🎉 All search methods tested successfully!" if success else "\n❌ Some tests failed!")

TESTING UNIFIED SEARCH FUNCTIONALITY
Building tree from: /Users/chemxai/GenAI/AI_Tutor/mcp_kb/md_files

Processing content and creating search indexes...
COMPREHENSIVE CONTENT TREE PROCESSING
Processing 55 content nodes...
LLM Model: qwen2.5vl:32b
Embedding Model: text-embedding-3-large
Generate Embeddings: False
Create Inverse Index: True
--------------------------------------------------------------------------------

[1/55] Processing node 2: Preface
Content length: 205 characters
Process node content ........
Generating summary...
Generating keywords...
  ✓ Summary: 106 chars
  ✓ Keywords: 4 items
  ✓ Chunks: 1 items
  ✓ Sentences: 2 items

[2/55] Processing node 3: About OpenStax
Content length: 668 characters
Process node content ........
Generating summary...
Generating keywords...
  ✓ Summary: 151 chars
  ✓ Keywords: 4 items
  ✓ Chunks: 1 items
  ✓ Sentences: 4 items

[3/55] Processing node 4: About OpenStax resources Customization
Content length: 944 characters
Process node co

In [2]:
from content_tree import ContentTree

def test_normalized_scoring():
    """Test the normalized scoring functionality."""
    print("="*80)
    print("TESTING NORMALIZED N-GRAM SCORING")
    print("="*80)
    
    # Create a ContentTree instance
    tree = ContentTree()
    
    # Build the textbook tree from markdown files (limited to 2 files for testing)
    md_directory = "/Users/chemxai/GenAI/AI_Tutor/mcp_kb/md_files"
    print(f"Building tree from: {md_directory}")
    tree.build_textbook_tree(md_directory)
    
    # Rename repeating headers to make them unique
    tree.rename_repeating_headers()
    
    # Process content to create search indexes
    print("\nProcessing content and creating search indexes...")
    tree.process_tree_content(
        max_summary_words=20,
        max_keywords=5,
        generate_embeddings=False,  # Disabled for faster testing
        create_inverse_index=True
    )
    
    print("\n" + "="*80)
    print("TESTING SCORE NORMALIZATION")
    print("="*80)
    
    test_queries = [
        "chemistry",
        "chemistry atoms",
        "chemistry atoms molecules structure bonds",
        "measurements accuracy precision uncertainty",
        "scientific method observation hypothesis"
    ]
    
    for query in test_queries:
        print(f"\n{'='*60}")
        print(f"Testing query: '{query}'")
        print(f"Query length: {len(query.split())} words")
        print(f"{'='*60}")
        
        # Test with InverseIndexBuilder directly
        if hasattr(tree, 'inverse_index_builder') and tree.inverse_index_builder:
            builder = tree.inverse_index_builder
            
            # Calculate raw scores
            raw_scores = builder.calculate_lexical_similarity(query)
            
            # Calculate normalized scores
            normalized_scores = builder.calculate_normalized_lexical_similarity(query)
            
            # Get top 3 results for comparison
            top_raw = sorted(raw_scores.items(), key=lambda x: x[1], reverse=True)[:3]
            top_normalized = sorted(normalized_scores.items(), key=lambda x: x[1], reverse=True)[:3]
            
            print("\nRAW SCORES (unbounded):")
            max_raw_score = max(raw_scores.values()) if raw_scores else 0
            print(f"  Max raw score: {max_raw_score:.3f}")
            for i, (node_id, score) in enumerate(top_raw, 1):
                node = next((n for n in tree.tree_node_iterator() if n.node_id == node_id), None)
                header = node.header if node else f"Node {node_id}"
                print(f"    {i}. [{node_id}] {header} (score: {score:.3f})")
            
            print("\nNORMALIZED SCORES (0-1.0 range):")
            max_normalized_score = max(normalized_scores.values()) if normalized_scores else 0
            print(f"  Max normalized score: {max_normalized_score:.3f}")
            for i, (node_id, score) in enumerate(top_normalized, 1):
                node = next((n for n in tree.tree_node_iterator() if n.node_id == node_id), None)
                header = node.header if node else f"Node {node_id}"
                print(f"    {i}. [{node_id}] {header} (score: {score:.3f})")
            
            # Calculate maximum possible score for this query
            query_tokens = builder._tokenize_and_clean(query)
            query_monograms = query_tokens
            query_bigrams = [f"{query_tokens[i]} {query_tokens[i+1]}" 
                            for i in range(len(query_tokens)-1)]
            query_trigrams = [f"{query_tokens[i]} {query_tokens[i+1]} {query_tokens[i+2]}" 
                             for i in range(len(query_tokens)-2)]
            
            max_possible = builder._calculate_max_possible_score(query_monograms, query_bigrams, query_trigrams)
            print(f"\nMax possible score for this query: {max_possible:.3f}")
            print(f"Normalization factor: {max_raw_score/max_possible:.3f}" if max_possible > 0 else "N/A")
    
    print("\n" + "="*80)
    print("TESTING ENHANCED SEARCH WITH NORMALIZED SCORES")
    print("="*80)
    
    # Test enhanced search which should now use normalized scores
    test_query = "chemistry atoms molecules"
    print(f"\nTesting enhanced_search with query: '{test_query}'")
    
    enhanced_results = tree.enhanced_search(test_query, max_results=5)
    print(f"\nEnhanced search results (using normalized lexical scores):")
    for i, (node_id, score) in enumerate(enhanced_results, 1):
        node = next((n for n in tree.tree_node_iterator() if n.node_id == node_id), None)
        header = node.header if node else f"Node {node_id}"
        print(f"  {i}. [{node_id}] {header} (combined score: {score:.3f})")
    
    print("\n" + "="*80)
    print("TESTING SEMANTIC + LEXICAL SCORE BALANCE")
    print("="*80)
    
    # Demonstrate the score balance issue and solution
    print("With normalized scores, semantic (0-1.0) and lexical (0-1.0) are now balanced!")
    print("Example weights: semantic_weight=0.6, lexical_weight=0.4")
    print("Max combined score would be: 0.6 * 1.0 + 0.4 * 1.0 = 1.0")
    
    # Show some example calculations
    semantic_weight = 0.6
    lexical_weight = 0.4
    
    print(f"\nExample score combinations:")
    print(f"  High semantic (0.9), High lexical (0.8): {semantic_weight * 0.9 + lexical_weight * 0.8:.3f}")
    print(f"  Medium semantic (0.5), High lexical (0.9): {semantic_weight * 0.5 + lexical_weight * 0.9:.3f}")
    print(f"  High semantic (0.8), Medium lexical (0.4): {semantic_weight * 0.8 + lexical_weight * 0.4:.3f}")
    
    return True

# Run the normalized-scoring test when this code executes as the main program
# and frame the verdict between two 80-character rules.
if __name__ == "__main__":
    success = test_normalized_scoring()
    print(
        "\n" + "="*80,
        "✅ NORMALIZED SCORING TEST COMPLETE!" if success else "❌ SOME TESTS FAILED!",
        "="*80,
        sep="\n",
    )

TESTING NORMALIZED N-GRAM SCORING
Building tree from: /Users/chemxai/GenAI/AI_Tutor/mcp_kb/md_files

Processing content and creating search indexes...
COMPREHENSIVE CONTENT TREE PROCESSING
Processing 55 content nodes...
LLM Model: qwen2.5vl:32b
Embedding Model: text-embedding-3-large
Generate Embeddings: False
Create Inverse Index: True
--------------------------------------------------------------------------------

[1/55] Processing node 2: Preface
Content length: 205 characters
Process node content ........
Generating summary...
Generating keywords...
  ✓ Summary: 106 chars
  ✓ Keywords: 4 items
  ✓ Chunks: 1 items
  ✓ Sentences: 2 items

[2/55] Processing node 3: About OpenStax
Content length: 668 characters
Process node content ........
Generating summary...
Generating keywords...
  ✓ Summary: 151 chars
  ✓ Keywords: 4 items
  ✓ Chunks: 1 items
  ✓ Sentences: 4 items

[3/55] Processing node 4: About OpenStax resources Customization
Content length: 944 characters
Process node conte

In [19]:
#pip install -U sentence-transformers torch

Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.7.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn->sentence-transformers)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence-transformers)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
Downloading scikit_learn-1.7.1-cp312-cp312-macosx_12_0_arm64.whl (8.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m17.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:15[0m
[?25hDownloading joblib-1.5.1-py3-none-any.whl (307 kB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn, sentence-transformers
[