# Ingest new data and merge it into existing SGS.ai structures

In [1]:
import torch
import hashlib
from typing import List, Dict, Tuple, Set, Optional
from collections import defaultdict
from hllset_swarm.hllset_wrapper import HllSet, HllHashInfo

from hllset_swarm.ingest import CorpusState
from hllset_swarm.persistence import PersistenceManager



Loading HLLSet kernel from: None


## Usage Examples
### Example 1: Single Ingestion

In [2]:
corpus1 = ["人工智能", "机器学习"]
corpus_state = CorpusState(P=10)
corpus_state.ingest_corpus(corpus1)

print("\nAdjacency Matrix:")
adj_matrix, token_to_idx = corpus_state.get_adjacency_matrix()
print(adj_matrix)


=== Ingesting 2 texts ===
Text 1: ⊢人工智能⊣
  Text HLL cardinality: 13
Text 2: ⊢机器学习⊣
  Text HLL cardinality: 14

Global state:
  Total unique tokens: 24
  Total HLLSets: 2
  Total edges: 9
  Master HLL cardinality: 25

Adjacency Matrix:
tensor(indices=tensor([[1, 1, 1, 1, 2, 2, 2, 3, 3],
                       [1, 2, 3, 4, 1, 3, 4, 1, 2]]),
       values=tensor([1., 3., 2., 1., 3., 1., 1., 2., 2.]),
       size=(5, 5), nnz=9, layout=torch.sparse_coo)


### Example 2: Incremental Merge (The Key Feature!)

In [3]:
# First ingestion
corpus1 = ["人工智能", "机器学习"]
corpus_state.ingest_corpus(corpus1)

print(f"After corpus1: {corpus_state.hllsets} hllsets, {corpus_state.edge_freq} edges")

# Second ingestion - MERGES with first!
corpus2 = ["深度学习", "人工智能"]  # Note: "人工智能" overlaps
corpus_state.ingest_corpus(corpus2)

print(f"After corpus1: {corpus_state.hllsets} hllsets, {corpus_state.edge_freq} edges")

# Check that "人工智能" has increased frequency
print(f"Frequency of '人': {corpus_state.lut.token_frequency['人']}")  # Should be 2

master_hll = corpus_state.get_master_hll()
print(f"Master HLL cardinality: {master_hll.count():.0f}")  # Should reflect unique tokens across both corpora


=== Ingesting 2 texts ===
Text 1: ⊢人工智能⊣
  Text HLL cardinality: 13
Text 2: ⊢机器学习⊣
  Text HLL cardinality: 14

Global state:
  Total unique tokens: 24
  Total HLLSets: 4
  Total edges: 9
  Master HLL cardinality: 25
After corpus1: [HllSet(P=10, count=13, tau=0.700, rho=0.210), HllSet(P=10, count=14, tau=0.700, rho=0.210), HllSet(P=10, count=13, tau=0.700, rho=0.210), HllSet(P=10, count=14, tau=0.700, rho=0.210)] hllsets, defaultdict(<class 'int'>, {(3, 1): 4, (1, 3): 4, (1, 2): 6, (2, 1): 6, (3, 2): 4, (2, 4): 2, (2, 3): 2, (1, 4): 2, (1, 1): 2}) edges

=== Ingesting 2 texts ===
Text 1: ⊢深度学习⊣
  Text HLL cardinality: 12
Text 2: ⊢人工智能⊣
  Text HLL cardinality: 13

Global state:
  Total unique tokens: 32
  Total HLLSets: 6
  Total edges: 9
  Master HLL cardinality: 32
After corpus1: [HllSet(P=10, count=13, tau=0.700, rho=0.210), HllSet(P=10, count=14, tau=0.700, rho=0.210), HllSet(P=10, count=13, tau=0.700, rho=0.210), HllSet(P=10, count=14, tau=0.700, rho=0.210), HllSet(P=10, count=12, 

### Example 3: Retrieve Specific Text's HLLSet

In [4]:
# Get HLLSet for 3rd text
text_3_hll = corpus_state.get_hllset_for_text(1)  # 0-indexed
print(f"Text 3 HLL cardinality: {text_3_hll.count():.0f}")

Text 3 HLL cardinality: 14


### What Gets Stored

Component |	Scope |	Purpose
----------|-------|---------
text_hll |	Per-text |	Self-generation loop input
master_hll |	Global |	Ensures consistent hash_idx
token_to_idx |	Global |	Unified adjacency matrix
edge_freq |	Global |	Accumulates across ingestions
lut	| Global|	Token → (reg, run) mapping
state.hllsets |	Collection	| All text HLLSets

**This design supports both**:

✅ Self-generation: Each text has its own HLLSet

✅ Global structure: Unified adjacency matrix across all texts

✅ Incremental merge: Edge frequencies accumulate properly

In [5]:
import importlib
import hllset_swarm.persistence
importlib.reload(hllset_swarm.persistence)
from hllset_swarm.persistence import PersistenceManager

# Now re-run your code
pm = PersistenceManager("./sgs_state")

# ===== Iteration 1 =====
corpus1 = ["人工智能", "机器学习"]
corpus_state.ingest_corpus(corpus1)

# Save after iteration completes
pm.save(corpus_state)

# ===== Iteration 2 (incremental) =====
corpus2 = ["深度学习", "神经网络"]
corpus_state.ingest_corpus(corpus2)

# Save again (overwrites with updated state)
pm.save(corpus_state)

# ===== Restore from disk =====
restored_state = pm.load(P=10)
if restored_state:
    print(f"Restored: {len(restored_state.hllsets)} texts")
    
    # Continue from where we left off
    corpus3 = ["计算机视觉"]
    corpus_state.ingest_corpus(corpus3)

# Check storage stats
stats = pm.get_stats()
print(stats)

pm.close()


=== Ingesting 2 texts ===
Text 1: ⊢人工智能⊣
  Text HLL cardinality: 13
Text 2: ⊢机器学习⊣
  Text HLL cardinality: 14

Global state:
  Total unique tokens: 32
  Total HLLSets: 8
  Total edges: 9
  Master HLL cardinality: 32

=== Saving corpus state ===
Saved 32 token entries to LUT
Saved AM: torch.Size([5, 5]), 9 edges, compressed to 0.5 KB
Saved 8 HLLSets, compressed to 0.4 KB
Save complete!


=== Ingesting 2 texts ===
Text 1: ⊢深度学习⊣
  Text HLL cardinality: 12
Text 2: ⊢神经网络⊣
  Text HLL cardinality: 14

Global state:
  Total unique tokens: 43
  Total HLLSets: 10
  Total edges: 10
  Master HLL cardinality: 44

=== Saving corpus state ===
Saved 43 token entries to LUT
Saved AM: torch.Size([5, 5]), 10 edges, compressed to 0.5 KB
Saved 10 HLLSets, compressed to 0.5 KB
Save complete!


=== Loading corpus state ===
Loaded AM: torch.Size([5, 5]), 10 edges
Loaded 10 HLLSets
Loaded state: 10 texts, 43 tokens
Load complete!

Restored: 10 texts

=== Ingesting 1 texts ===
Text 1: ⊢计算机视觉⊣
  Text HLL cardi