# AM-Guided Disambiguation Demo

This notebook demonstrates the new adjacency-matrix-guided (AM-guided) disambiguation algorithm.

In [1]:
import sys
sys.path.insert(0, '../src')
   
from hllset_swarm.ingest import CorpusState
from hllset_swarm.disambiguate import disambiguate_with_am 
import torch



Loading HLLSet kernel from: None


## Example 1: Simple Text

In [4]:
# Example 1 works on a corpus containing a single short text.
corpus = ["人工智能"]

# Build a fresh corpus state and run the text through ingestion.
state = CorpusState(P=10)
state.ingest_corpus(corpus)

# Collect what the disambiguation step consumes: the adjacency matrix plus
# its token -> index mapping, and the HLLSet stored for text 0.
adj, token_to_idx = state.get_adjacency_matrix()
text_hll = state.get_hllset_for_text(0)

# One-shot summary of what was built (one item per output line).
print(
    f"Original text: {corpus[0]}",
    f"HLL cardinality: {text_hll.count():.0f}",
    f"AM size: {adj.shape}, {adj._nnz()} edges",
    sep="\n",
)


=== Ingesting 1 texts ===
Text 1: ⊢人工智能⊣
  Text HLL cardinality: 13

Global state:
  Total unique tokens: 13
  Total HLLSets: 1
  Total edges: 5
  Master HLL cardinality: 13
Original text: 人工智能
HLL cardinality: 13
AM size: torch.Size([4, 4]), 5 edges


In [5]:
# Disambiguate: ask the AM-guided algorithm for candidate token sequences
# for the text's HLLSet. `sequences` is the collection of candidates that
# came back; `best` is the selected one and is falsy when nothing was found
# (handled explicitly in the prints below).
sequences, best = disambiguate_with_am(
    text_hll, adj, token_to_idx, state.lut, max_paths=10
)

# NOTE: "\n" (a real newline), not "\\n" -- the doubled backslash printed the
# two literal characters `\n` in the output instead of a blank line.
print(f"\n{'='*50}")
print(f"Found {len(sequences)} sequences:")

# List every candidate, numbered from 1, joined back into a plain string.
for i, seq in enumerate(sequences, 1):
    print(f"  {i}. {''.join(seq)}")

print(f"\nBest sequence: {''.join(best) if best else 'None'}")
print(f"Original:      {corpus[0]}")
# The original text is compared character by character against `best`;
# an empty/None `best` is reported as a non-match.
print(f"Match: {best == list(corpus[0]) if best else False}")


=== Starting AM-guided disambiguation ===
START symbol: '⊢'
END symbol: '⊣'

[Path 1] Current: '⊢', Collected: []
  Valid 2-grams: []
  ✗ Dead end (no valid 2-grams)

=== Disambiguation complete ===
Explored 1 paths
Found 0 valid sequences
Found 0 sequences:
\nBest sequence: None
Original:      人工智能
Match: False


## Example 2: Multiple Texts

In [None]:
# Example 2: three short texts ingested into one shared corpus state.
corpus2 = ["人工智能", "机器学习", "深度学习"]

state2 = CorpusState(P=10)
state2.ingest_corpus(corpus2)

# Adjacency matrix and token -> index mapping covering the whole corpus.
adj2, token_to_idx2 = state2.get_adjacency_matrix()

# Corpus-level summary, one item per output line.
print(
    f"Corpus: {corpus2}",
    f"Total tokens: {len(token_to_idx2)}",
    f"Total edges: {adj2._nnz()}",
    f"Master HLL: {state2.master_hll.count():.0f}",
    sep="\n",
)

In [None]:
# Disambiguate each text of the corpus against the shared adjacency matrix
# and report whether the original character sequence was recovered.
for i, original_text in enumerate(corpus2):
    # HLLSet stored for the i-th ingested text.
    hll = state2.get_hllset_for_text(i)
    sequences, best = disambiguate_with_am(
        hll, adj2, token_to_idx2, state2.lut, max_paths=20
    )

    # A falsy `best` means no sequence was selected for this text.
    recovered = ''.join(best) if best else 'FAILED'
    match = '✓' if best == list(original_text) else '✗'

    # "\n" (a real newline), not "\\n", which printed a literal backslash-n.
    print(f"\nText {i+1}:")
    print(f"  Original:  {original_text}")
    print(f"  Recovered: {recovered} {match}")
    # `sequences` holds the candidate sequences that were found, which is a
    # different number from the paths explored (the library log reports the
    # two separately), so it is labelled accordingly.
    print(f"  Sequences found: {len(sequences)}")

## Example 3: Performance Comparison

In [None]:
# NOTE(review): `time` is not used in the lines visible here -- presumably it
# is needed by the timing code of this section; consider moving the import to
# the notebook's top import cell.
import time

# Create larger corpus: four full sentences instead of the short phrases
# used in Examples 1 and 2.
large_corpus = [
    "人工智能是计算机科学的一个分支",
    "机器学习是实现人工智能的一种方法",
    "深度学习是机器学习的一个子领域",
    "神经网络是深度学习的核心技术"
]

# NOTE(review): P=12 here versus P=10 in the earlier examples -- presumably a
# larger precision setting for the bigger corpus; confirm against CorpusState.
state3 = CorpusState(P=12)
state3.ingest_corpus(large_corpus)

# Adjacency matrix and token -> index mapping for the larger corpus.
adj3, token_to_idx3 = state3.get_adjacency_matrix()