In [1]:
from content_tree import *
import requests
import json
import pickle
import time

In [None]:
# --- Build the content tree from the markdown sources and cache it on disk ---
# NOTE(review): the original comment said the build is "limited to 2 files for
# testing", but no such limit is passed below - confirm inside build_textbook_tree.
md_directory = "/Users/chemxai/GenAI/AI_Tutor/mcp_kb/md_files"
tree = ContentTree()

print(f"Building tree from: {md_directory}")
tree.build_textbook_tree(md_directory)

# Repeated headers are renamed so that every header is unique
tree.rename_repeating_headers()

# Options handed to process_tree_content when building the search indexes
index_options = {
    "max_summary_words": 20,
    "max_keywords": 5,
    "generate_embeddings": True,
    "create_inverse_index": True,
}
print("\nProcessing content and creating search indexes...")
tree.process_tree_content(**index_options)

# Persist the processed tree so later cells can reload it instead of rebuilding
with open('content_tree2.pkl', 'wb') as pickle_out:
    pickle.dump(tree, pickle_out)

In [3]:
# Sanity check: reload the pickled tree and show one stored header embedding.
# NOTE: pickle.load can execute arbitrary code - only load files you created.
with open('content_tree2.pkl', 'rb') as pickle_in:
    tree = pickle.load(pickle_in)

# Second child of the root's second child (index 1 at both levels)
sample_node = tree.root.child_nodes[1].child_nodes[1]
print(sample_node.header_embedding)

[-0.00228252  0.00768171 -0.01102511 ...  0.01256146  0.00989485
  0.00670711]


In [4]:
def test_rag_functionality():
    """Test the RAG function with various queries."""
    print("="*80)
    print("TESTING RAG FUNCTIONALITY")
    print("="*80)

    print("Load content tree .....")
    with open('content_tree2.pkl', 'rb') as f:
        tree = pickle.load(f)
    
    print("\n" + "="*80)
    print("RAG QUERY TESTING")
    print("="*80)
    
    # Test queries - from basic to complex
    test_queries = [
        "What is chemistry?",
        "What are the phases of matter?",
        "How do you measure density?",
        "What is the scientific method?",
        "How do you calculate significant figures?",
        "What are atoms and molecules?",
        "What is temperature measurement?",
        "What is the difference between accuracy and precision?",
        "How do you perform dimensional analysis?",
        "What are the domains of chemistry?",
        "What is quantum mechanics?",  # Should not be found
        "How do you build a rocket?",  # Should not be found
    ]
    
    for i, query in enumerate(test_queries, 1):
        print(f"\n{'='*60}")
        print(f"Query {i}: {query}")
        print(f"{'='*60}")
        
        try:
            # Test with top-1 node (default)
            answer = tree.rag_query(query, top_k=1)
            print(f"\nAnswer (top-1 node):")
            print(f"{answer}")
            
            # For some queries, also test with top-3 nodes
            if i <= 5:  # Only for first 5 queries to save time
                print(f"\n{'-'*40}")
                print(f"Testing with top-3 nodes:")
                answer_top3 = tree.rag_query(query, top_k=3)
                print(f"\nAnswer (top-3 nodes):")
                print(f"{answer_top3}")
        
        except Exception as e:
            print(f"Error processing query '{query}': {e}")
    
    print("\n" + "="*80)
    print("RAG FUNCTIONALITY TEST COMPLETE!")
    print("="*80)

if __name__ == "__main__":
    test_rag_functionality()

TESTING RAG FUNCTIONALITY
Load content tree .....

RAG QUERY TESTING

Query 1: What is chemistry?
Generating embeddings for 1 texts in 1 batches...

Answer (top-1 node):
Chemistry is the study of the composition, properties, and interactions of matter. It is often referred to as "the central science" because it is interconnected with a wide range of other STEM disciplines, including biology, medicine, materials science, and environmental science. Chemistry plays a vital role in understanding and explaining changes in matter that are essential to daily life, such as digesting food, synthesizing polymers for various materials, and refining crude oil into gasoline. The practice of chemistry involves understanding the principles and laws governing these changes in matter and their energy transformations.

----------------------------------------
Testing with top-3 nodes:
Generating embeddings for 1 texts in 1 batches...

Answer (top-3 nodes):
Chemistry is the study of the composition, prop

In [4]:
# Reload the cached tree so this cell starts from the pickled state.
# NOTE: pickle.load can execute arbitrary code - only load files you created.
print("Load content tree .....")
with open('content_tree2.pkl', 'rb') as pickle_in:
    tree = pickle.load(pickle_in)

# Query under analysis (an earlier candidate is kept commented out for reference)
#query = "What are the phases of matter?"
query = "What is the major elements of earth crust and air? What is the weight percentage of oxygen  on earth?"
print(f"\nAnalyzing query: {query}")

# Step 1: ranked (node_id, score) pairs from the tree's enhanced search
search_results = tree.enhanced_search(query, max_results=5)
print("\nSearch results (top 5):")

# Index every node by its id so each hit can be resolved to header/content
all_nodes = tree.tree_node_iterator()
node_map = {}
for tree_node in all_nodes:
    node_map[tree_node.node_id] = tree_node

for rank, (node_id, score) in enumerate(search_results, start=1):
    if node_id not in node_map:
        # Hits whose id is not present in the tree are skipped
        continue
    hit = node_map[node_id]
    print(f"  {rank}. [Node {node_id}] {hit.header}")
    print(f"     Score: {score:.4f}")
    print(f"     Content preview: {hit.content_text[:100]}...")
    print()

# Step 2: answer the same query through the RAG pipeline using the top-1 node
divider = '=' * 60
print(divider)
print("RAG Answer:")
print(divider)
answer = tree.rag_query(query, top_k=1)
print(answer)

closing_banner = "=" * 80
print("\n" + closing_banner)
print("SEARCH ANALYSIS COMPLETE!")
print(closing_banner)

Load content tree .....

Analyzing query: What is the major elements of earth crust and air? What is the weight percentage of oxygen  on earth?
Generating embeddings for 1 texts in 1 batches...

Search results (top 5):
  1. [Node 31] Classifying Matter
     Score: 0.3284
     Content preview: Matter can be classified into several categories. Two broad categories are mixtures and pure substan...

  2. [Node 45] Density
     Score: 0.1821
     Content preview: We use the mass and volume of a substance to determine its density. Thus, the units of density are d...

  3. [Node 26] 1.1 Chemistry in Context
     Score: 0.1740
     Content preview: By the end of this module, you will be able to:

- Outline the historical development of chemistry
-...

  4. [Node 34] 1.2 Phases and Classification of Matter Chemistry in Everyday Life
     Score: 0.1724
     Content preview: **Chemistry of Cell Phones**

Imagine how different your life would be without cell phones (Figure 1...

  5. [Node 32] Ato

In [8]:
# Test customized weights for scoring
from parameters import (
    DEFAULT_PARAMETERS, 
    SEMANTIC_FOCUSED_PARAMETERS,
    LEXICAL_FOCUSED_PARAMETERS,
    create_custom_parameters
)

# Weight overrides, grouped by the scoring stage their names refer to.
# The five semantic weights below add up to 1.0, as do the two combined
# weights - NOTE(review): confirm whether create_custom_parameters requires that.
semantic_weights = {
    "semantic_header": 0.3,
    "semantic_summary": 0.3,
    "semantic_content": 0.1,
    "semantic_chunks": 0.2,
    "semantic_sentences": 0.1,
}
ngram_weights = {
    "ngram_monogram": 0.5,
    "ngram_bigram": 2.0,
    "ngram_trigram": 4.5,
}
combined_weights = {
    "combined_semantic": 0.7,
    "combined_lexical": 0.3,
}

custom_config = create_custom_parameters(
    **semantic_weights,
    **ngram_weights,
    **combined_weights,
)

# Re-run the query from the earlier cell (relies on `tree` and `query` already
# being defined in the kernel) with the custom scoring weights applied.
answer = tree.rag_query(query, top_k=1, custom_params=custom_config)
print(answer)
    

Generating embeddings for 1 texts in 1 batches...
### Answer:

#### Major Elements of Earth's Crust:
The major elements of Earth's crust, based on the provided content, are:
1. **Oxygen (O)**: 49.20% by mass
2. **Silicon (Si)**: 25.67% by mass
3. **Aluminum (Al)**: 7.50% by mass
4. **Iron (Fe)**: 4.71% by mass
5. **Calcium (Ca)**: 3.39% by mass
6. **Sodium (Na)**: 2.63% by mass
7. **Potassium (K)**: 2.40% by mass
8. **Magnesium (Mg)**: 1.93% by mass
9. **Hydrogen (H)**: 0.87% by mass
10. **Titanium (Ti)**: 0.58% by mass

#### Major Elements of Air:
The content does not explicitly list the major elements of air. However, it does mention that **oxygen (O)** is a significant component of the Earth's atmosphere.

#### Weight Percentage of Oxygen on Earth:
The weight percentage of oxygen on Earth, as provided in the content, is **49.20%** by mass.

### Final Answer:
- **Major Elements of Earth's Crust**: Oxygen, Silicon, Aluminum, Iron, Calcium, Sodium, Potassium, Magnesium, Hydrogen, Titan