In [1]:
from content_tree import *
import requests
import json
import pickle
import time
import re
from typing import Dict, List, Tuple

# Matches a "term line": optional leading whitespace, an optional `-` or `*`
# bullet, then a bold span `**term**` (group 1, matched non-greedily), an
# optional single separator (`:`, `-`, en dash or em dash), and finally the
# remainder of the line (group 2), which may be empty.
TERM_LINE_RE = re.compile(
    r'^\s*(?:[-*]\s*)?\*\*(.+?)\*\*\s*(?:[:\-–—])?\s*(.*)\s*$'
)

def _clean(text: str) -> str:
    # Strip surrounding bold markers if any, normalize whitespace
    text = text.strip()
    if text.startswith("**") and text.endswith("**") and len(text) >= 4:
        text = text[2:-2].strip()
    # Collapse whitespace, keep LaTeX and symbols intact
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\s*\n\s*', ' ', text)
    return text.strip(' |')

def _parse_table_block(lines: List[str], start: int) -> Tuple[Dict[str, str], int]:
    """
    Parse a markdown table whose header row is at index `start`.

    The header row is skipped, as are any separator rows directly below it
    (rows made only of `|`, `-`, `:` and spaces). Every following line that
    starts with `|` is treated as a data row: its first cell is the term and
    its second cell is the definition; additional cells are ignored. Cells are
    split on unescaped pipes only, so an escaped pipe (`\\|`) inside a cell is
    kept as a literal `|` instead of cutting the cell short.

    When a term appears more than once, the longer definition is kept.

    Returns (rows_dict, next_index_after_table). If `start` is past the end of
    `lines`, returns ({}, start).
    """
    out: Dict[str, str] = {}
    if start >= len(lines):
        # Nothing to parse; avoid indexing past the end of the input.
        return out, start

    # consume header
    i = start + 1
    # optional separator row(s)
    while i < len(lines) and lines[i].lstrip().startswith('|') and set(lines[i].replace('|', '').strip()) <= set('-: '):
        i += 1

    # consume data rows
    while i < len(lines) and lines[i].lstrip().startswith('|'):
        row = lines[i].strip().strip('|')
        # Split on pipes that are not preceded by a backslash, then unescape.
        cells = [c.replace('\\|', '|').strip() for c in re.split(r'(?<!\\)\|', row)]
        if len(cells) >= 2:
            term = _clean(cells[0])
            definition = _clean(cells[1])
            if term:
                # prefer longer definition if duplicate
                if term not in out or len(definition) > len(out[term]):
                    out[term] = definition
        i += 1
    return out, i

def _collect_until(next_breaks: set, i: int, lines: List[str], start_at: int) -> Tuple[str, int]:
    """
    Gather the definition text that begins at `start_at`.

    Lines are taken one by one until an index contained in `next_breaks` is
    reached or the input ends. Blank lines at either edge of the gathered span
    are dropped, the rest is joined with newlines and passed through `_clean`.

    `i` is accepted for signature compatibility and is not used.

    Returns (definition, next_index), where next_index is the first index that
    was not consumed.
    """
    total = len(lines)

    # Locate the first index at which collection has to stop.
    stop = start_at
    while stop < total and stop not in next_breaks:
        stop += 1

    collected = [lines[k].rstrip() for k in range(start_at, stop)]

    # Narrow [first, last) so it excludes blank lines at both ends.
    first, last = 0, len(collected)
    while first < last and not collected[first].strip():
        first += 1
    while last > first and not collected[last - 1].strip():
        last -= 1

    return _clean('\n'.join(collected[first:last])), stop

def parse_key_terms(text: str) -> Dict[str, str]:
    """
    Extract key terms and definitions from heterogeneous markdown-like samples.
    Supports:
      - Markdown tables with 'Term' and 'Definition' columns
      - Bulleted lines like: - **term**  definition
      - Bold term on a line; definition on same or following lines

    A line is a table header when it starts with `|` and contains both 'Term'
    and 'Definition' (case-sensitive). A term line is any non-table line that
    matches TERM_LINE_RE. When a term line carries no inline definition, the
    following lines up to the next term line or table header form its
    definition. If a term is seen more than once, the longer definition wins.

    Returns a dict mapping term -> definition; empty for empty or None input.
    """
    # Nodes without content yield no terms instead of raising on .splitlines().
    if not text:
        return {}

    lines = text.splitlines()
    n = len(lines)
    result: Dict[str, str] = {}

    def _keep_longest(term: str, definition: str) -> None:
        # Single place for the "prefer longer definition on duplicates" rule.
        if term not in result or len(definition) > len(result[term]):
            result[term] = definition

    # First pass: detect table blocks (by header); they are parsed in the main scan.
    table_headers = {
        idx for idx, line in enumerate(lines)
        if line.strip().startswith('|') and 'Term' in line and 'Definition' in line
    }

    # Precompute term-line indices (table lines are handled by the table parser).
    term_line_indices = {
        idx for idx, line in enumerate(lines)
        if idx not in table_headers
        and not line.lstrip().startswith('|')
        and TERM_LINE_RE.match(line)
    }

    # Indices that end a multi-line definition. Collection always starts after
    # the current line, so indices at or before it can never be hit; the full
    # set can therefore be built once instead of being filtered per term.
    break_indices = term_line_indices | table_headers

    # Main scan
    i = 0
    while i < n:
        # Table?
        if i in table_headers:
            rows, i = _parse_table_block(lines, i)
            for term, definition in rows.items():
                _keep_longest(term, definition)
            continue

        # Term line?
        m = TERM_LINE_RE.match(lines[i])
        if not m:
            i += 1
            continue

        term = _clean(m.group(1))
        definition = _clean(m.group(2)) if m.group(2) else ''
        if definition:
            # Inline definition: only this line is consumed.
            i += 1
        else:
            # Definition lives on the following lines.
            definition, i = _collect_until(break_indices, i, lines, i + 1)
        if term and definition:
            _keep_longest(term, definition)

    return result

In [2]:
# Build the content tree from the markdown sources.
# Tip: copy only one or a few md files into this test folder to run a quick,
# small-scale test.
md_directory = "./md_files"
tree = ContentTree()
print(f"Building tree from: {md_directory}")
tree.build_textbook_tree(md_directory, max_level=4)

# Headers that repeat are renamed so that each one is unique
tree.rename_repeating_headers()

nodes = tree.tree_node_iterator()
print(len(nodes))

# Snapshot the freshly built tree to disk
with open('genchem.pkl','wb') as pkl_out:
    pickle.dump(tree, pkl_out)

Building tree from: ./md_files
1218


In [3]:
# Build key term dictionary for the content tree
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('genchem.pkl', 'rb') as f:
    tree = pickle.load(f)

nodes = tree.tree_node_iterator()

key_term_dict = {}
for node in nodes:
    if " Key Terms" in node.header:
        print(node.header)
        terms = parse_key_terms(node.content_text)
        print("Key terms = ", len(terms))
        # The same term can be listed in several chapters. Keep the longer
        # definition, matching the duplicate rule used inside parse_key_terms,
        # instead of letting the last chapter silently overwrite earlier ones.
        for term, definition in terms.items():
            if term not in key_term_dict or len(definition) > len(key_term_dict[term]):
                key_term_dict[term] = definition
# Report the size of the merged dictionary, then persist it
print("Total terms = ", len(key_term_dict))
with open('key_term_dict.pkl','wb') as f:
    pickle.dump(key_term_dict, f)

Chapter 1 Key Terms
Key terms =  52
Chapter 2 Key Terms
Key terms =  61
Chapter 3 Key Terms
Key terms =  20
Chapter 4 Key Terms
Key terms =  49
Chapter 5 Key Terms
Key terms =  32
Chapter 6 Key Terms
Key terms =  50
Chapter 7 Key Terms
Key terms =  39
Chapter 8 Key Terms
Key terms =  28
Chapter 9 Key Terms
Key terms =  32
Chapter 10 Key Terms
Key terms =  56
Chapter 11 Key Terms
Key terms =  44
Chapter 12 Key Terms
Key terms =  29
Solution Key Terms
Key terms =  8
Chapter 14 Key Terms
Key terms =  31
Chapter 15 Key Terms
Key terms =  14
Chapter 16 Key Terms
Key terms =  12
Chapter 17 Key Terms
Key terms =  31
Chapter 18 Key Terms
Key terms =  42
Chapter 19 Key Terms
Key terms =  41
Chapter 20 Key Terms
Key terms =  21
Chapter 21 Key Terms
Key terms =  67
Total terms =  747


In [15]:
# Process content for all tree nodes.
# Load the partially processed content tree (no embeddings yet).
# NOTE(review): no cell visible in this notebook writes 'genchem_no_embedding.pkl';
# it must exist before this cell runs — confirm where it is produced.
with open('genchem_no_embedding.pkl', 'rb') as f:
    tree = pickle.load(f)

# NOTE(review): the original line `if (node.node_id in [20, 30, 50])` had no colon
# and no body, which is a SyntaxError. It is treated here as a list of node ids to
# skip, which matches the recorded output (ids 25-28 are processed) — confirm intent.
SKIP_NODE_IDS = {20, 30, 50}

# Nodes whose header contains any of these markers are not processed.
SKIP_HEADER_MARKERS = (
    " Key Terms",
    " Summary",
    " Exercises",
    "Root",
    "Preface",
    " Key Equations",
)

nodes = tree.tree_node_iterator()
for node in nodes:
    if node.node_id in SKIP_NODE_IDS:
        continue
    if any(marker in node.header for marker in SKIP_HEADER_MARKERS):
        continue
    print("Node ID = ",node.node_id)
    node.process_content(llm_type="openai",llm_model="gpt-4o", llm_api_url="", generate_embeddings=True)

Node ID =  25
Process node content ........
Generating embeddings for 1 texts in 1 batches...
Generating embeddings for 1 texts in 1 batches...
Generating embeddings for 6 texts in 1 batches...
Generating embeddings for 13 texts in 1 batches...
Generating embeddings for 10 texts in 1 batches...
Node ID =  26
Process node content ........
Generating embeddings for 1 texts in 1 batches...
Generating embeddings for 1 texts in 1 batches...
Generating embeddings for 5 texts in 1 batches...
Generating embeddings for 19 texts in 1 batches...
Generating embeddings for 13 texts in 1 batches...
Node ID =  27
Process node content ........
Generating embeddings for 1 texts in 1 batches...
Generating embeddings for 1 texts in 1 batches...
Generating embeddings for 3 texts in 1 batches...
Generating embeddings for 17 texts in 1 batches...
Generating embeddings for 13 texts in 1 batches...
Node ID =  28
Process node content ........
Generating embeddings for 1 texts in 1 batches...
Generating embeddi

In [16]:
# Write the processed content tree to disk so later cells can reload it
with open('genchem_with_embedding.pkl','wb') as pkl_out:
    pickle.dump(tree, pkl_out)

In [7]:
# Reload the saved tree and spot-check two nodes by printing their main fields
with open('genchem_with_embedding.pkl', 'rb') as pkl_in:
    tree = pickle.load(pkl_in)
nodes = tree.tree_node_iterator()

for node in nodes[28:30]:
    # One print per field, in the same order as before
    for field_name in ("header", "summary", "content_text", "questions"):
        print(getattr(node, field_name))
    

The Domains of Chemistry
Chemists study matter and energy in macroscopic, microscopic, and symbolic domains, using observations, imagination, and specialized language to describe and interpret chemical behavior.
Chemists study and describe the behavior of matter and energy in three different domains: macroscopic, microscopic, and symbolic. These domains provide different ways of considering and describing chemical behavior.

Macro is a Greek word that means "large." The macroscopic domain is familiar to us: It is the realm of everyday things that are large enough to be sensed directly by human sight or touch. In daily life, this includes the food you eat and the breeze you feel on your face. The macroscopic domain includes everyday and laboratory chemistry, where we observe and measure physical and chemical properties such as density, solubility, and flammability.

Micro comes from Greek and means "small." The microscopic domain of chemistry is often visited in the imagination. Some as