In [None]:
from hllset_swarm.trajectory import SwarmProgram
from hllset_swarm.io.env import Environment

# Names this cell intends to expose: the two classes imported just above.
# NOTE(review): `__all__` only governs `from <module> import *`; inside a
# notebook cell it has no visible effect unless the notebook is imported as a
# module. Looks carried over from a package __init__ -- confirm it is needed.
__all__ = ["SwarmProgram", "Environment"]

In [None]:
from hllset_swarm.hllset_wrapper import HllSet, HllHashInfo

# Quick smoke demo of the HllSet wrapper: add one token, then a batch, and
# print whatever hash information the wrapper hands back.
# P=6 is presumably the HLL precision (number of register-index bits) --
# TODO confirm against HllSet.
hll = HllSet(P=6)

# Add single element
# add() returns an object exposing the fields printed below:
# token, hash_value, bin, idx and hll_pair.
hash_info = hll.add("人")
print(f"Token: {hash_info.token}, Hash: {hash_info.hash_value}, Bin: {hash_info.bin}, Idx: {hash_info.idx}")
print(f"HLL Pair: {hash_info.hll_pair}")  # (bin, idx)

# Add batch
# add_batch() returns an iterable; presumably one HllHashInfo per input
# token, in input order -- TODO confirm.
results = hll.add_batch(["工", "智", "能"])
for info in results:
    print(f"Token hash info: {info}")

# Demo

In [None]:
import torch
import hashlib
from typing import List, Dict, Tuple, Set, Optional
from collections import defaultdict

### Lookup Table Class

In [None]:
class LookupTable:
    """
    Hash-based lookup table for tokens with collision handling.

    Tokens are grouped by the HLL pair reported in their hash info. Several
    tokens may land on the same pair (a collision); all of them are kept.

    Structure:
        table:
            hll_pair (reg, run) -> {
                'tokens': set of tokens with this (reg, run),
                'hashes': set of original hash values
            }
        token_to_pair:
            token -> (reg, run) recorded by the most recent add_token() call
            for that token
    """
    def __init__(self):
        self.table: Dict[Tuple[int, int], Dict] = {}
        self.token_to_pair: Dict[str, Tuple[int, int]] = {}

    def add_token(self, token: str, hash_info: HllHashInfo) -> Tuple[int, int]:
        """
        Add token to LUT using HllHashInfo.

        Adding the same token with the same hash info again is a no-op,
        because both per-pair containers are sets.

        Args:
            token: Token string
            hash_info: HllHashInfo from HllSet.add(); only its ``hll_pair``
                and ``hash_value`` attributes are read here

        Returns:
            (reg, run) pair
        """
        pair = hash_info.hll_pair  # (bin, idx)

        # Create the bucket on first sight of this pair, then reuse it.
        bucket = self.table.setdefault(pair, {'tokens': set(), 'hashes': set()})
        bucket['tokens'].add(token)
        bucket['hashes'].add(hash_info.hash_value)
        self.token_to_pair[token] = pair

        return pair

    def get_tokens_by_hll(self, reg: int, run: int) -> List[str]:
        """
        Get all tokens that map to this (reg, run) pair.

        Returns:
            A new list of tokens in sorted order, so the result is stable
            across runs (plain set iteration order is not). Empty list when
            the pair is unknown.
        """
        bucket = self.table.get((reg, run))
        if bucket is None:
            return []
        return sorted(bucket['tokens'])

    def get_hll_pair(self, token: str) -> Optional[Tuple[int, int]]:
        """Get the (reg, run) pair recorded for token, or None if never added."""
        return self.token_to_pair.get(token)

    def get_collision_count(self) -> int:
        """Return number of (reg, run) pairs with multiple tokens"""
        return sum(1 for data in self.table.values() if len(data['tokens']) > 1)

    def get_collisions(self) -> Dict[Tuple[int, int], Set[str]]:
        """
        Return all (reg, run) pairs that have collisions.

        Returns:
            Mapping pair -> set of the tokens sharing that pair, restricted to
            pairs holding more than one token. Each set is a copy, so callers
            can modify the result without corrupting the table.
        """
        return {
            pair: set(data['tokens'])
            for pair, data in self.table.items()
            if len(data['tokens']) > 1
        }

### Ingest Corpus

In [None]:
def ingest_corpus(corpus: List[str], hll: HllSet) -> Tuple[torch.Tensor, Dict[str, int], LookupTable]:
    """
    Turn a corpus into an n-token graph, a node-id map and a lookup table.

    Every text is stripped, split into single characters and framed as
    [START, c1, ..., cN, END]. A window of three consecutive tokens (a, b, c)
    then slides over that sequence one position at a time and contributes the
    chain a -> ab -> abc:
        - the three sequences become nodes the first time they are seen
          (added to `hll`, recorded in the lookup table, given the next id)
        - the edges (a, ab) and (ab, abc) each get their count raised by one

    Because only the first token of a window is registered as a unigram, a
    character that occurs solely in the last position of a text never gets a
    unigram node of its own.

    Args:
        corpus: List of text strings; texts that are empty after strip()
            are skipped
        hll: HllSet instance; every newly seen sequence is added to it

    Returns:
        adj: Coalesced sparse COO float32 tensor of size (N, N), N being the
            number of nodes; each value is an edge frequency
        tok_id: token sequence -> node id (START is 0, END is 1)
        lut: LookupTable filled with every registered sequence
    """
    START = "⊢"  # boundary marker placed before each text
    END = "⊣"    # boundary marker placed after each text

    lut = LookupTable()
    tok_id = {}
    edge_freq = defaultdict(int)

    # Boundary markers go in first so they always own node ids 0 and 1.
    boundary_infos = [(marker, hll.add(marker)) for marker in (START, END)]
    for node_id, (marker, info) in enumerate(boundary_infos):
        lut.add_token(marker, info)
        tok_id[marker] = node_id

    for raw_text in corpus:
        stripped = raw_text.strip()
        if not stripped:
            continue

        framed = [START, *stripped, END]

        # zip over three staggered views == every window of three neighbours
        for first, second, third in zip(framed, framed[1:], framed[2:]):
            chain = (first, first + second, first + second + third)

            for sequence in chain:
                if sequence in tok_id:
                    continue
                lut.add_token(sequence, hll.add(sequence))
                tok_id[sequence] = len(tok_id)

            uni_id, bi_id, tri_id = (tok_id[sequence] for sequence in chain)
            edge_freq[(uni_id, bi_id)] += 1
            edge_freq[(bi_id, tri_id)] += 1

    # Unpack the edge counter into parallel source / target / weight lists.
    node_count = len(tok_id)
    sources = [src for src, _ in edge_freq]
    targets = [dst for _, dst in edge_freq]
    weights = [float(count) for count in edge_freq.values()]

    adj = torch.sparse_coo_tensor(
        indices=torch.tensor([sources, targets], dtype=torch.long),
        values=torch.tensor(weights, dtype=torch.float32),
        size=(node_count, node_count)
    ).coalesce()

    return adj, tok_id, lut

In our case, n-grams are really n-token sequences. In Chinese each character is a token, so `<START>` and `<END>` are likewise treated as single characters (tokens) in that sense.

So in the context of our system:

- **Token** = atomic unit (in Chinese: a character; in English: could be a word/subword)
- **n-gram** = sequence of n tokens
- **START** and **END** = special single-character tokens

In [None]:

def test_ingest():
    """
    Walk through ingest_corpus on a two-text sample and print what it built.

    The walkthrough prints five steps: ingest summary, registered sequences,
    START/END presence plus the first window of the first text, a sample of
    adjacency edges, and any lookup-table collisions. The only hard checks
    are the two assertions that START and END were registered.

    Returns:
        (adj, tok_id, lut, hll) so the results can be inspected afterwards.
    """
    print("=== Test Ingest Function ===\n")

    # Chinese sample: every character is one token.
    sample_texts = ["人工智能", "机器学习"]
    hll = HllSet(P=10)

    print("Step 1: Ingest corpus")
    adj, tok_id, lut = ingest_corpus(sample_texts, hll)

    print(f"  Total token sequences in AM: {len(tok_id)}")
    print(f"  Total edges: {adj._nnz()}")
    print(f"  HLL cardinality: {hll.count():.0f}")

    print("\nStep 2: Verify token sequence registration")
    print("  Sample token sequences:")
    # First 15 sequences in registration order.
    for seq in list(tok_id)[:15]:
        pair = lut.get_hll_pair(seq)
        print(f"    '{seq}' (len={len(seq)}) -> Node ID: {tok_id[seq]}, HLL pair: {pair}")

    print("\nStep 3: Verify extended text processing")
    START, END = "⊢", "⊣"
    assert START in tok_id, "START token not found"
    assert END in tok_id, "END token not found"
    print(f"  ✓ START ('{START}') and END ('{END}') tokens registered")

    # Rebuild the framed token sequence of the first text by hand.
    text1 = sample_texts[0]
    tokens = [START, *text1, END]
    print(f"\n  Original text: '{text1}'")
    print(f"  Token sequence: {tokens}")
    print(f"  Total tokens: {len(tokens)}")

    # The first window and the three sequences it should have produced.
    tok_a, tok_b, tok_c = tokens[:3]
    unigram = tok_a
    bigram = unigram + tok_b
    trigram = bigram + tok_c

    print(f"\n  First 3-token window: [{tok_a}, {tok_b}, {tok_c}]")
    print(f"    Unigram (1-token): '{unigram}' (Node ID: {tok_id.get(unigram)})")
    print(f"    Bigram (2-token): '{bigram}' (Node ID: {tok_id.get(bigram)})")
    print(f"    Trigram (3-token): '{trigram}' (Node ID: {tok_id.get(trigram)})")

    print("\nStep 4: Verify adjacency matrix structure")
    coalesced = adj.coalesce()
    edge_index = coalesced.indices()
    edge_weight = coalesced.values()

    # Reverse map so node ids can be shown as their sequences.
    node_to_seq = {node: sequence for sequence, node in tok_id.items()}

    print("  Sample edges (source -> target: weight):")
    shown_edges = zip(
        edge_index[0, :15].tolist(),
        edge_index[1, :15].tolist(),
        edge_weight[:15].tolist(),
    )
    for u, v, w in shown_edges:
        src_seq = node_to_seq[u]
        dst_seq = node_to_seq[v]
        print(f"    '{src_seq}' (len={len(src_seq)}) -> '{dst_seq}' (len={len(dst_seq)}): {w}")

    print("\nStep 5: Verify LUT hash collisions")
    collisions = lut.get_collisions()

    if not collisions:
        print("  No hash collisions detected")
    else:
        print(f"  Found {len(collisions)} collision(s):")
        for pair, token_set in collisions.items():
            print(f"    {pair}: {token_set}")

    print("\n=== Test Complete ===")

    return adj, tok_id, lut, hll

In [37]:
# Run the ingest walkthrough and keep its results as notebook-level names.
# NOTE(review): this rebinds `hll`, replacing the HllSet(P=6) demo instance
# created in an earlier cell with the P=10 instance built inside test_ingest().
adj, tok_id, lut, hll = test_ingest()

# Plain-text dump of the sparse adjacency tensor returned by test_ingest().
print(adj)

=== Test Ingest Function ===

Step 1: Ingest corpus
  Total token sequences in AM: 24
  Total edges: 16
  HLL cardinality: 25

Step 2: Verify token sequence registration
  Sample token sequences:
    '⊢' (len=1) -> Node ID: 0, HLL pair: (736, 3)
    '⊣' (len=1) -> Node ID: 1, HLL pair: (415, 1)
    '⊢人' (len=2) -> Node ID: 2, HLL pair: (290, 1)
    '⊢人工' (len=3) -> Node ID: 3, HLL pair: (740, 3)
    '人' (len=1) -> Node ID: 4, HLL pair: (235, 1)
    '人工' (len=2) -> Node ID: 5, HLL pair: (285, 2)
    '人工智' (len=3) -> Node ID: 6, HLL pair: (60, 1)
    '工' (len=1) -> Node ID: 7, HLL pair: (208, 1)
    '工智' (len=2) -> Node ID: 8, HLL pair: (346, 3)
    '工智能' (len=3) -> Node ID: 9, HLL pair: (511, 1)
    '智' (len=1) -> Node ID: 10, HLL pair: (291, 3)
    '智能' (len=2) -> Node ID: 11, HLL pair: (71, 2)
    '智能⊣' (len=3) -> Node ID: 12, HLL pair: (386, 1)
    '⊢机' (len=2) -> Node ID: 13, HLL pair: (447, 2)
    '⊢机器' (len=3) -> Node ID: 14, HLL pair: (349, 4)

Step 3: Verify extended text proces