# Demo

In [1]:
import torch
import hashlib
from typing import List, Dict, Tuple, Set, Optional
from collections import defaultdict

# from hllset_swarm.hllset_wrapper import HllSet, HllHashInfo
from hllset_swarm.persistence import PersistentLookupTable, PersistentAdjacencyMatrix, CorpusStateManager
from hllset_swarm.ingest import LookupTable, ingest_corpus

Loading HLLSet kernel from: None


### Lookup Table Class

### Ingest Corpus

In our case, n-grams are really sequences of n tokens. In Chinese, each character is a token, so the `<START>` and `<END>` markers are likewise treated as single characters (tokens) in that sense.

So in the context of our system:

- **Token** = atomic unit (in Chinese: a character; in English: could be a word/subword)
- **n-gram** = sequence of n tokens
- **START** and **END** = special single-character tokens

In [2]:
from hllset_swarm.hllset_wrapper import HllSet, HllHashInfo

def test_ingest(CORPUS: List[str]) -> Tuple[torch.Tensor, Dict[str, int], LookupTable, HllSet]:
    """Ingest ``CORPUS`` into a fresh ``HllSet`` and print a five-step sanity report.

    The report covers: summary counts, a sample of registered token
    sequences with their HLL pairs, the START/END markers and the first
    3-token window of the first text, a sample of adjacency edges, and the
    hash collisions reported by the lookup table.

    Args:
        CORPUS: Texts to ingest. Each character of a text is treated as one
            token when the first text is decomposed below.

    Returns:
        ``(adj, tok_id, lut, hll)`` where ``adj`` is whatever
        ``ingest_corpus`` returns as its adjacency structure (used here
        through the torch sparse-tensor API: ``_nnz``, ``coalesce``),
        ``tok_id`` maps token sequences to node ids, ``lut`` is the lookup
        table, and ``hll`` is the ``HllSet`` that was filled during ingest.

    Raises:
        AssertionError: If the START or END marker is missing from ``tok_id``.
    """
    # Upper bound on how many sample rows are printed in steps 2 and 4.
    SAMPLE_LIMIT = 15

    print("=== Test Ingest Function ===\n")

    # Initialize HllSet
    hll = HllSet(P=10)

    print("Step 1: Ingest corpus")
    adj, tok_id, lut = ingest_corpus(CORPUS, hll)

    print(f"  Total token sequences in AM: {len(tok_id)}")
    print(f"  Total edges: {adj._nnz()}")
    print(f"  HLL cardinality: {hll.count():.0f}")

    print("\nStep 2: Verify token sequence registration")
    # Show some token sequences and their HLL pairs
    print("  Sample token sequences:")
    for seq in list(tok_id)[:SAMPLE_LIMIT]:
        pair = lut.get_hll_pair(seq)
        print(f"    '{seq}' (len={len(seq)}) -> Node ID: {tok_id[seq]}, HLL pair: {pair}")

    print("\nStep 3: Verify extended text processing")
    # Check that START and END are registered
    START, END = "⊢", "⊣"
    assert START in tok_id, "START token not found"
    assert END in tok_id, "END token not found"
    print(f"  ✓ START ('{START}') and END ('{END}') tokens registered")

    # Verify first text decomposition (skipped when there is no first text).
    if not CORPUS:
        print("\n  Corpus is empty; skipping first-text decomposition")
    else:
        text1 = CORPUS[0]
        tokens = [START] + list(text1) + [END]
        print(f"\n  Original text: '{text1}'")
        print(f"  Token sequence: {tokens}")
        print(f"  Total tokens: {len(tokens)}")

        # Check 3-token decomposition for first window. An empty first text
        # yields only [START, END], which is too short to form a window.
        if len(tokens) < 3:
            print("\n  Fewer than 3 tokens; skipping first 3-token window")
        else:
            tok_a, tok_b, tok_c = tokens[:3]
            unigram = tok_a
            bigram = tok_a + tok_b
            trigram = tok_a + tok_b + tok_c

            print(f"\n  First 3-token window: [{tok_a}, {tok_b}, {tok_c}]")
            print(f"    Unigram (1-token): '{unigram}' (Node ID: {tok_id.get(unigram)})")
            print(f"    Bigram (2-token): '{bigram}' (Node ID: {tok_id.get(bigram)})")
            print(f"    Trigram (3-token): '{trigram}' (Node ID: {tok_id.get(trigram)})")

    print("\nStep 4: Verify adjacency matrix structure")
    # Coalesce once and reuse the result for both indices and values.
    coalesced = adj.coalesce()
    indices = coalesced.indices()
    values = coalesced.values()

    # Reverse map: node id -> token sequence, for readable edge output.
    id_to_tok = {node_id: seq for seq, node_id in tok_id.items()}

    print("  Sample edges (source -> target: weight):")
    n_show = min(SAMPLE_LIMIT, indices.shape[1])
    sources = indices[0, :n_show].tolist()
    targets = indices[1, :n_show].tolist()
    weights = values[:n_show].tolist()
    for u, v, w in zip(sources, targets, weights):
        src_seq = id_to_tok[u]
        dst_seq = id_to_tok[v]
        print(f"    '{src_seq}' (len={len(src_seq)}) -> '{dst_seq}' (len={len(dst_seq)}): {w}")

    print("\nStep 5: Verify LUT hash collisions")
    collisions = lut.get_collisions()

    if len(collisions) == 0:
        print("  No hash collisions detected")
    else:
        print(f"  Found {len(collisions)} collision(s):")
        for pair, token_set in collisions.items():
            print(f"    {pair}: {token_set}")

    print("\n=== Test Complete ===")

    return adj, tok_id, lut, hll

In [3]:
# Two short Chinese samples; every character counts as one token.
CORPUS = ["人工智能", "机器学习"]

# Run the ingest sanity report and keep its results for later cells.
adj, tok_id, lut, hll = test_ingest(CORPUS)

print(adj)

=== Test Ingest Function ===

Step 1: Ingest corpus
  Total token sequences in AM: 24
  Total edges: 16
  HLL cardinality: 25

Step 2: Verify token sequence registration
  Sample token sequences:
    '⊢' (len=1) -> Node ID: 0, HLL pair: (736, 3)
    '⊣' (len=1) -> Node ID: 1, HLL pair: (415, 1)
    '⊢人' (len=2) -> Node ID: 2, HLL pair: (290, 1)
    '⊢人工' (len=3) -> Node ID: 3, HLL pair: (740, 3)
    '人' (len=1) -> Node ID: 4, HLL pair: (235, 1)
    '人工' (len=2) -> Node ID: 5, HLL pair: (285, 2)
    '人工智' (len=3) -> Node ID: 6, HLL pair: (60, 1)
    '工' (len=1) -> Node ID: 7, HLL pair: (208, 1)
    '工智' (len=2) -> Node ID: 8, HLL pair: (346, 3)
    '工智能' (len=3) -> Node ID: 9, HLL pair: (511, 1)
    '智' (len=1) -> Node ID: 10, HLL pair: (291, 3)
    '智能' (len=2) -> Node ID: 11, HLL pair: (71, 2)
    '智能⊣' (len=3) -> Node ID: 12, HLL pair: (386, 1)
    '⊢机' (len=2) -> Node ID: 13, HLL pair: (447, 2)
    '⊢机器' (len=3) -> Node ID: 14, HLL pair: (349, 4)

Step 3: Verify extended text proces

## Complete Persistence System with Incremental Merging

In [4]:
import torch
import duckdb
import pickle
import zstandard as zstd
from pathlib import Path
from typing import Dict, Tuple, List, Optional, Set
from dataclasses import dataclass
from collections import defaultdict

In [5]:

# Usage example for SGS.ai real-time processing
def demo_realtime_workflow(state_dir: str = "./sgs_corpus_state"):
    """Simulate the SGS.ai real-time workflow in three scenarios.

    1. Ingest a first corpus and merge it via ``CorpusStateManager``.
    2. Ingest a second corpus with the same manager and ``HllSet``
       (incremental merge).
    3. Build a separate ``HllSet`` from the characters of a test text and
       call ``retrieve_for_restoration`` with it.

    Args:
        state_dir: Path handed to ``CorpusStateManager``. Defaults to the
            directory the demo has always used.

    NOTE(review): state under ``state_dir`` appears to survive between runs
    (the recorded output shows a 43-token AM being loaded already during
    scenario 1), so printed counts depend on what is on disk — confirm, and
    point ``state_dir`` at a fresh location for a reproducible run.
    """
    # Initialize state manager
    state_mgr = CorpusStateManager(state_dir)

    # try/finally guarantees close() runs even if an ingest or retrieve step raises.
    try:
        # Initialize HLL for this session
        hll = HllSet(P=10)

        # === Scenario 1: New data arrives ===
        print("=" * 60)
        print("SCENARIO 1: Ingest new sensory data")
        print("=" * 60)

        new_corpus_1 = ["人工智能", "机器学习"]
        state_mgr.ingest_and_merge(new_corpus_1, hll)

        # === Scenario 2: More data arrives (incremental) ===
        print("\n" + "=" * 60)
        print("SCENARIO 2: Ingest more data (incremental merge)")
        print("=" * 60)

        new_corpus_2 = ["深度学习", "神经网络"]
        state_mgr.ingest_and_merge(new_corpus_2, hll)

        # === Scenario 3: Restore from HLLSet ===
        print("\n" + "=" * 60)
        print("SCENARIO 3: Restore text from HLLSet")
        print("=" * 60)

        # Create HLLSet from some text (simulating self-generation loop output).
        # Only single characters are added here, no START/END markers or n-grams.
        restoration_hll = HllSet(P=10)
        test_text = "人工智能"
        for char in test_text:
            restoration_hll.add(char)

        # Retrieve data for restoration
        pruned_adj, pruned_tok_id, tokens_by_pair = state_mgr.retrieve_for_restoration(restoration_hll)

        print("\nReady for order reconstruction:")
        print(f"  Pruned AM: {pruned_adj.shape}")
        print(f"  Token candidates: {sum(len(v) for v in tokens_by_pair.values())}")
    finally:
        # Close connections
        state_mgr.close()

    print("\n" + "=" * 60)
    print("Demo complete!")
    print("=" * 60)


# if __name__ == "__main__":
#     demo_realtime_workflow()

In [6]:
# Run the three-scenario demo; it opens a CorpusStateManager on "./sgs_corpus_state".
demo_realtime_workflow()

SCENARIO 1: Ingest new sensory data

=== Ingesting new corpus ===
Memory: 24 tokens, 16 edges

=== Merging with persistent storage ===
Merged 24 token entries into persistent LUT
Loaded AM: torch.Size([43, 43]), 30 edges
Merging adjacency matrices...
Saved AM: torch.Size([43, 43]), 30 edges, compressed to 0.7 KB
Merge complete: 43 tokens, 30 edges
Merge complete!


SCENARIO 2: Ingest more data (incremental merge)

=== Ingesting new corpus ===
Memory: 24 tokens, 16 edges

=== Merging with persistent storage ===
Merged 24 token entries into persistent LUT
Loaded AM: torch.Size([43, 43]), 30 edges
Merging adjacency matrices...
Saved AM: torch.Size([43, 43]), 30 edges, compressed to 0.9 KB
Merge complete: 43 tokens, 30 edges
Merge complete!


SCENARIO 3: Restore text from HLLSet

=== Retrieving data for restoration ===
HLLSet contains 4 unique (reg, run) pairs
Retrieved token candidates for 0 pairs
Loaded AM: torch.Size([43, 43]), 30 edges
Disambiguated to 2 tokens
Pruned AM: 0 edges, 2 no