In [1]:
from content_tree import *
import requests
import json
import pickle
import time

In [None]:
# --- Build, index, and persist the textbook content tree -------------------

# Directory holding the textbook markdown files.
# NOTE(review): an earlier comment said the build was "limited to 2 files for
# testing"; nothing in this cell enforces such a limit -- confirm.
md_directory = "/Users/chemxai/GenAI/AI_Tutor/mcp_kb/md_files"

# Start from an empty tree and populate it from the markdown directory.
tree = ContentTree()
print(f"Building tree from: {md_directory}")
tree.build_textbook_tree(md_directory)

# Repeated headers are renamed so that every header is unique.
tree.rename_repeating_headers()

# Summaries, keywords, embeddings and the inverse index are produced here.
print("\nProcessing content and creating search indexes...")
processing_options = {
    "max_summary_words": 20,
    "max_keywords": 5,
    "generate_embeddings": True,
    "create_inverse_index": True,
}
tree.process_tree_content(**processing_options)

# Persist the processed tree so later cells can reload it instead of rebuilding.
with open('content_tree2.pkl', 'wb') as pickle_file:
    pickle.dump(tree, pickle_file)

Building tree from: /Users/chemxai/GenAI/AI_Tutor/mcp_kb/md_files

Processing content and creating search indexes...
COMPREHENSIVE CONTENT TREE PROCESSING
Processing 1121 content nodes...
LLM Model: qwen2.5vl:32b
Embedding Model: text-embedding-3-large
Generate Embeddings: True
Create Inverse Index: True
--------------------------------------------------------------------------------

[1/1121] Processing node 2: Preface
Content length: 205 characters
Process node content ........
Generating summary...
Generating keywords...
Generating embeddings for 1 texts in 1 batches...
Generating embeddings for 1 texts in 1 batches...
Generating embeddings for 1 texts in 1 batches...
Generating embeddings for 2 texts in 1 batches...
  ✓ Summary: 106 chars
  ✓ Keywords: 4 items
  ✓ Chunks: 1 items
  ✓ Sentences: 2 items
  ✓ Embeddings: Header, Summary, Chunks, Sentences

[2/1121] Processing node 3: About OpenStax
Content length: 668 characters
Process node content ........
Generating summary...
Gene

In [2]:
# Sanity check: reload the saved tree and print one node's header embedding.
# NOTE: pickle.load can execute arbitrary code -- only load files you created.
with open('content_tree2.pkl', 'rb') as pickle_file:
    tree = pickle.load(pickle_file)

# Second child of the root's second child; printed to confirm that the
# embedding survived the save/load round trip.
sample_node = tree.root.child_nodes[1].child_nodes[1]
print(sample_node.header_embedding)

[-0.00229232  0.00762639 -0.01105725 ...  0.01253245  0.00987979
  0.00668239]


In [4]:
def test_rag_functionality():
    """Test the RAG function with various queries."""
    print("="*80)
    print("TESTING RAG FUNCTIONALITY")
    print("="*80)

    print("Load content tree .....")
    with open('content_tree2.pkl', 'rb') as f:
        tree = pickle.load(f)
    
    print("\n" + "="*80)
    print("RAG QUERY TESTING")
    print("="*80)
    
    # Test queries - from basic to complex
    test_queries = [
        "What is chemistry?",
        "What are the phases of matter?",
        "How do you measure density?",
        "What is the scientific method?",
        "How do you calculate significant figures?",
        "What are atoms and molecules?",
        "What is temperature measurement?",
        "What is the difference between accuracy and precision?",
        "How do you perform dimensional analysis?",
        "What are the domains of chemistry?",
        "What is quantum mechanics?",  # Should not be found
        "How do you build a rocket?",  # Should not be found
    ]
    
    for i, query in enumerate(test_queries, 1):
        print(f"\n{'='*60}")
        print(f"Query {i}: {query}")
        print(f"{'='*60}")
        
        try:
            # Test with top-1 node (default)
            answer = tree.rag_query(query, top_k=1)
            print(f"\nAnswer (top-1 node):")
            print(f"{answer}")
            
            # For some queries, also test with top-3 nodes
            if i <= 5:  # Only for first 5 queries to save time
                print(f"\n{'-'*40}")
                print(f"Testing with top-3 nodes:")
                answer_top3 = tree.rag_query(query, top_k=3)
                print(f"\nAnswer (top-3 nodes):")
                print(f"{answer_top3}")
        
        except Exception as e:
            print(f"Error processing query '{query}': {e}")
    
    print("\n" + "="*80)
    print("RAG FUNCTIONALITY TEST COMPLETE!")
    print("="*80)

# Run the query batch only when this code is executed as the main program,
# not when it is imported from another module.
if __name__ == "__main__":
    test_rag_functionality()

TESTING RAG FUNCTIONALITY
Load content tree .....

RAG QUERY TESTING

Query 1: What is chemistry?
Generating embeddings for 1 texts in 1 batches...


KeyboardInterrupt: 

In [3]:
# Reload the persisted tree for this analysis.
# NOTE: pickle.load can execute arbitrary code -- only load files you created.
print("Load content tree .....")
with open('content_tree2.pkl', 'rb') as pickle_file:
    tree = pickle.load(pickle_file)

# Query under analysis (an alternative is kept commented out for quick switching).
#query = "What are the phases of matter?"
query = "What is the major elements of earth crust and air? What is the weight percentage of oxygen  on earth?"
print(f"\nAnalyzing query: {query}")

# Step 1: run the search on its own and describe the best-matching node.
search_results = tree.enhanced_search(query, max_results=1)
print("\nSearch results (top 1):")

# Lookup table from node id to node, used to resolve the ids in the results.
all_nodes = tree.tree_node_iterator()
node_map = {}
for tree_node in all_nodes:
    node_map[tree_node.node_id] = tree_node

for rank, (node_id, score) in enumerate(search_results, start=1):
    if node_id not in node_map:
        continue  # result ids that are not in the map are skipped silently
    node = node_map[node_id]
    print(f"  {rank}. [Node {node_id}] {node.header}")
    print(f"     Score: {score:.4f}")
    print(f"     Content preview: {node.content_text[:100]}...")
    print()

# Step 2: run the full RAG pipeline for the same query and show its answer.
answer_rule = "=" * 60
print(answer_rule)
print("RAG Answer:")
print(answer_rule)
answer = tree.rag_query(query, top_k=1)
print(answer)

closing_rule = "=" * 80
print("\n" + closing_rule)
print("SEARCH ANALYSIS COMPLETE!")
print(closing_rule)

Load content tree .....

Analyzing query: What is the major elements of earth crust and air? What is the weight percentage of oxygen  on earth?
Generating embeddings for 1 texts in 1 batches...

Search results (top 1):
  1. [Node 937] Chapter 18 18.9 Occurrence, Preparation, and Compounds of Oxygen
     Score: 0.3894
     Content preview: By the end of this section, you will be able to:

- Describe the properties, preparation, and compou...

RAG Answer:
Generating embeddings for 1 texts in 1 batches...
No information in the provided content for your query.

SEARCH ANALYSIS COMPLETE!


In [6]:
# Test customized weights for scoring
from parameters import (
    DEFAULT_PARAMETERS, 
    SEMANTIC_FOCUSED_PARAMETERS,
    LEXICAL_FOCUSED_PARAMETERS,
    create_custom_parameters
)

# Custom scoring configuration. The five semantic weights are equal (0.2 each),
# longer n-grams are weighted more heavily, and the combined score uses 0.6
# semantic and 0.4 lexical.
# NOTE(review): whether the weights must sum to 1.0 is not visible here -- confirm
# against the `parameters` module.
custom_config = create_custom_parameters(
    # Semantic similarity weights
    semantic_header=0.2,
    semantic_summary=0.2,
    semantic_content=0.2,
    semantic_chunks=0.2,
    semantic_sentences=0.2,
    # N-gram weights  
    ngram_monogram=1.0,
    ngram_bigram=2.0,
    ngram_trigram=4.0,
    # Combined search weights
    combined_semantic=0.6,
    combined_lexical=0.4
)

# Exactly one query is active; the alternatives stay commented out so the
# active one is never silently overwritten by a later assignment.
#query = "Give me an example of heterogeneous mixture in daily life"
#query = "What is the major elements of earth crust and air? What is the weight percentage of oxygen  on earth?"
#query = "What is periodic table? Who discovered it?"
query = "Show me the figure of the periodic table"

# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time; unlike time.time() it is unaffected by system clock changes.
start_time = time.perf_counter()
answer = tree.rag_query(query, top_k=2, custom_params=custom_config, debug=True)
print(answer)
print("Time = ", time.perf_counter() - start_time)
    

Generating embeddings for 1 texts in 1 batches...

🔍 DEBUG: Search results for 'Show me the figure of the periodic table':
Total search results: 5
  1. [Node 1135] Score: 0.5628 | Header: 'Appendix A'
     Content preview: ![Image](Appendix_A_images/img-0.jpeg)

Figure A1. The Periodic Table...
  2. [Node 75] Score: 0.5532 | Header: 'Chapter 2 2.5 The Periodic Table'
     Content preview: By the end of this section, you will be able to:

- State the periodic law and explain the organizat...
  3. [Node 290] Score: 0.5188 | Header: 'The Aufbau Principle'
     Content preview: To determine the electron configuration for any particular atom, we can "build" the structures in th...
  4. [Node 36] Score: 0.5020 | Header: '1.3 Physical and Chemical Properties Chemistry in Everyday Life'
     Content preview: **Hazard Diamond**

You may have seen the symbol shown in Figure 1.21 on containers of chemicals in ...
  5. [Node 292] Score: 0.4282 | Header: 'Electron Configurations and the Periodic Ta

In [8]:
# Query passed to rag_query in the copied-tree run.
query = "Show me the figure of the periodic table"

def content_tree_copy(source_tree, target_tree):
    """Make ``target_tree`` refer to the same tree state as ``source_tree``.

    The attributes ``root``, ``_node_counter``, ``inverse_index`` and
    ``inverse_index_builder`` are assigned by reference; nothing is
    duplicated, so mutable objects are shared between both trees afterwards.

    Args:
        source_tree: Object providing the four attributes listed above.
        target_tree: Object that receives those attributes.

    Returns:
        None. ``target_tree`` is modified in place.
    """
    transferred_attributes = (
        "root",
        "_node_counter",
        "inverse_index",
        "inverse_index_builder",
    )
    for attribute_name in transferred_attributes:
        setattr(target_tree, attribute_name, getattr(source_tree, attribute_name))

# Create a fresh ContentTree and point it at the already-loaded tree's data
# (root, node counter, inverse index and its builder) without reprocessing.
target_tree = ContentTree()
content_tree_copy(tree, target_tree)

In [9]:
# Re-run the query against the copied tree and time it. This relies on
# `query`, `custom_config` and `target_tree` having been defined by earlier cells.
# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time; unlike time.time() it is unaffected by system clock changes.
start_time = time.perf_counter()
answer = target_tree.rag_query(query, top_k=2, custom_params=custom_config, debug=True)
print("\n\nFinal answer")
print(answer)
print("Time = ", time.perf_counter() - start_time)

Generating embeddings for 1 texts in 1 batches...

🔍 DEBUG: Search results for 'Show me the figure of the periodic table':
Total search results: 5
  1. [Node 1135] Score: 0.5628 | Header: 'Appendix A'
     Content preview: ![Image](Appendix_A_images/img-0.jpeg)

Figure A1. The Periodic Table...
  2. [Node 75] Score: 0.5532 | Header: 'Chapter 2 2.5 The Periodic Table'
     Content preview: By the end of this section, you will be able to:

- State the periodic law and explain the organizat...
  3. [Node 290] Score: 0.5188 | Header: 'The Aufbau Principle'
     Content preview: To determine the electron configuration for any particular atom, we can "build" the structures in th...
  4. [Node 36] Score: 0.5020 | Header: '1.3 Physical and Chemical Properties Chemistry in Everyday Life'
     Content preview: **Hazard Diamond**

You may have seen the symbol shown in Figure 1.21 on containers of chemicals in ...
  5. [Node 292] Score: 0.4282 | Header: 'Electron Configurations and the Periodic Ta

The figure of the periodic table is shown in **Figure A1** in the provided content. Here is the relevant information:

- **Figure A1**: The Periodic Table
- **Image Link**:
![Image](Appendix_A_images/img-0.jpeg)

This figure is described as "The Periodic Table," a fundamental chart in chemistry organizing elements by atomic structure and properties. The image link provided in the content is the visual representation of the periodic table. 

**Answer**: The figure of the periodic table is **Figure A1**, and its image link is ![Image](Appendix_A_images/img-0.jpeg).