You’re right – the **current inverted-index scaffold** is **over-engineered** for the **ordering task**.  
We only need:

1. **Look-up table** `<hash> → token` (1-,2-,3-gram) – **O(1)** cover extraction  
2. **TF-based adjacency matrix** `A_M` – **weights for ordering** (ordering ≠ swarm)  
3. **BSS lattice** `BSS_L` – **swarm trajectory only** (never used for order)

Below is a **refactored pipeline** that **drops the inverted indices entirely** and **uses only**:

- **hash table** for **cover extraction**  
- **TF-weighted adjacency** for **ordering**  
- **BSS lattice** stays **private to the swarm** – **never touched by ordering code**

---

## 1.  Data we *actually* need

| Name | Structure | Purpose | Built when? |
|---|---|---|---|
| **HashTable** | `Dict[int, str]` | `<hash> → token` | **ingestion** |
| **TF Matrix** | `sparse float32` | `weight = log(TF + 1)` | **ingestion** |
| **BSS Lattice** | `sparse float32` | **swarm trajectory only** | **separate build** |

**No inverted indices**, **no register/run slots**, **no bit-vectors for ordering**.

---

## 2.  Ingestion (new, 30 lines)

In [1]:
import torch
import hashlib
from typing import List, Dict, Tuple, Set
from collections import defaultdict

# HyperLogLog parameters
P = 10  # 2^10 = 1024 registers

def hash32(s: str) -> int:
    """Compute 32-bit hash of string"""
    return int(hashlib.md5(s.encode('utf-8')).hexdigest()[:8], 16)

def slot(h: int) -> Tuple[int, int]:
    """Extract (register, run_of_zeros) from hash"""
    reg = h & ((1 << P) - 1)
    run = (h >> P).bit_length() if h >> P else 0
    return reg, run

In [2]:
class LookupTable:
    """
    Hash-based lookup table for tokens with collision handling.
    
    Structure:
        hash -> {
            'tokens': [list of tokens with this hash],
            'hll_pair': (reg_num, run_zero)
        }
    """
    def __init__(self):
        self.table: Dict[int, Dict] = {}
        self.token_to_hash: Dict[str, int] = {}
    
    def add_token(self, token: str) -> Tuple[int, int]:
        """
        Add token to LUT, return its HLLSet representation.
        
        Returns:
            (reg_num, run_zero) pair
        """
        h = hash32(token)
        reg, run = slot(h)
        
        if h not in self.table:
            self.table[h] = {
                'tokens': [],
                'hll_pair': (reg, run)
            }
        
        if token not in self.table[h]['tokens']:
            self.table[h]['tokens'].append(token)
            self.token_to_hash[token] = h
        
        return reg, run
    
    def get_tokens_by_hll(self, reg: int, run: int) -> List[str]:
        """Get all tokens that map to this (reg, run) pair"""
        result = []
        for h, data in self.table.items():
            if data['hll_pair'] == (reg, run):
                result.extend(data['tokens'])
        return result
    
    def get_hll_pair(self, token: str) -> Tuple[int, int]:
        """Get HLLSet representation for token"""
        h = self.token_to_hash.get(token)
        if h is not None:
            return self.table[h]['hll_pair']
        return None

In [3]:
def build_hllset(lut: LookupTable) -> Set[Tuple[int, int]]:
    """
    Build HLLSet from all registered tokens in LUT.
    
    Returns:
        Set of (reg, run) pairs
    """
    hllset = set()
    for h, data in lut.table.items():
        hllset.add(data['hll_pair'])
    return hllset

In [4]:
def ingest_corpus(corpus: List[str]) -> Tuple[torch.Tensor, Dict[str, int], LookupTable]:
    """
    Ingest corpus and build:
    1. Lookup Table (LUT) with hash collisions and HLLSet pairs
    2. Adjacency Matrix (AM) with frequency-based edges
    
    Returns:
        adj: Sparse adjacency matrix
        tok_id: token → AM node ID mapping
        lut: LookupTable instance with hash and HLL data
    """
    START = "<START>"
    END = "<END>"
    
    lut = LookupTable()
    tok_id = {START: 0, END: 1}
    edge_freq = defaultdict(int)
    
    # Register special tokens
    lut.add_token(START)
    lut.add_token(END)
    
    for line in corpus:
        text = line.strip()
        if len(text) == 0:
            continue
        
        # === START connection ===
        first_char = text[0]
        if first_char not in tok_id:
            tok_id[first_char] = len(tok_id)
        lut.add_token(first_char)
        
        start_2gram = START + first_char
        if start_2gram not in tok_id:
            tok_id[start_2gram] = len(tok_id)
        lut.add_token(start_2gram)
        
        edge_freq[(tok_id[START], tok_id[start_2gram])] += 1
        edge_freq[(tok_id[start_2gram], tok_id[first_char])] += 1
        
        # === Sliding window with step=2 ===
        i = 0
        while i < len(text):
            g1 = text[i] if i < len(text) else None
            g2 = text[i:i+2] if i+1 < len(text) else None
            g3 = text[i:i+3] if i+2 < len(text) else None
            
            # Register all n-grams in LUT and AM
            for gram in [g1, g2, g3]:
                if gram and gram not in tok_id:
                    tok_id[gram] = len(tok_id)
                    lut.add_token(gram)
            
            # Build edges: 1g → 2g → 3g → next_1g
            if g1 and g2:
                edge_freq[(tok_id[g1], tok_id[g2])] += 1
                
                if g3:
                    edge_freq[(tok_id[g2], tok_id[g3])] += 1
                    
                    if i+2 < len(text):
                        next_g1 = text[i+2]
                        if next_g1 not in tok_id:
                            tok_id[next_g1] = len(tok_id)
                            lut.add_token(next_g1)
                        edge_freq[(tok_id[g3], tok_id[next_g1])] += 1
            
            i += 2
        
        # === END connection ===
        last_char = text[-1]
        if last_char not in tok_id:
            tok_id[last_char] = len(tok_id)
            lut.add_token(last_char)
        
        end_2gram = last_char + END
        if end_2gram not in tok_id:
            tok_id[end_2gram] = len(tok_id)
        lut.add_token(end_2gram)
        
        edge_freq[(tok_id[last_char], tok_id[end_2gram])] += 1
        edge_freq[(tok_id[end_2gram], tok_id[END])] += 1
    
    # Build sparse adjacency matrix
    N = len(tok_id)
    adj_u, adj_v, adj_w = [], [], []
    
    for (u, v), freq in edge_freq.items():
        adj_u.append(u)
        adj_v.append(v)
        adj_w.append(float(freq))
    
    adj = torch.sparse_coo_tensor(
        indices=torch.tensor([adj_u, adj_v], dtype=torch.long),
        values=torch.tensor(adj_w, dtype=torch.float32),
        size=(N, N)
    ).coalesce()
    
    return adj, tok_id, lut

In [5]:
def decompose_ngrams(tokens: List[str], n: int) -> Set[str]:
    """
    Decompose n-grams into 1-grams.
    
    Args:
        tokens: List of n-gram tokens
        n: n-gram size (2 or 3)
    
    Returns:
        Set of 1-gram characters
    """
    result = set()
    for token in tokens:
        if len(token) == n:
            for char in token:
                result.add(char)
    return result

## 3.  Cover extraction (hash table only)

In [6]:
def unambiguate_tokens(hllset: Set[Tuple[int, int]], lut: LookupTable, tok_id: Dict[str, int]) -> Dict[str, int]:
    """
    Disambiguate tokens using intersection of decomposed n-grams.
    
    Algorithm:
    1. T_1 = {all 1-gram tokens matching HLLSet pairs}
    2. T_2 = {1-grams from decomposed 2-grams matching HLLSet}
    3. T_3 = {1-grams from decomposed 3-grams matching HLLSet}
    4. Return intersection T_1 ∩ T_2 ∩ T_3
    
    Returns:
        Disambiguated token → node_id mapping
    """
    # Collect tokens by n-gram type that match HLLSet
    tokens_1g = []
    tokens_2g = []
    tokens_3g = []
    
    for reg, run in hllset:
        matching_tokens = lut.get_tokens_by_hll(reg, run)
        for token in matching_tokens:
            if len(token) == 1:
                tokens_1g.append(token)
            elif len(token) == 2:
                tokens_2g.append(token)
            elif len(token) == 3:
                tokens_3g.append(token)
    
    # Build sets
    T_1 = set(tokens_1g)
    T_2 = decompose_ngrams(tokens_2g, n=2)
    T_3 = decompose_ngrams(tokens_3g, n=3)
    
    # Intersection
    disambiguated_1g = T_1 & T_2 & T_3
    
    # Now rebuild full token set with disambiguated 1-grams
    result = {}
    
    # Add disambiguated 1-grams
    for token in disambiguated_1g:
        if token in tok_id:
            result[token] = tok_id[token]
    
    # Add 2-grams and 3-grams that are composed only of disambiguated 1-grams
    for token in tokens_2g:
        if len(token) == 2 and all(c in disambiguated_1g for c in token):
            if token in tok_id:
                result[token] = tok_id[token]
    
    for token in tokens_3g:
        if len(token) == 3 and all(c in disambiguated_1g for c in token):
            if token in tok_id:
                result[token] = tok_id[token]
    
    # Always include START and END
    result["<START>"] = tok_id["<START>"]
    result["<END>"] = tok_id["<END>"]
    
    return result

→ **O(cover size)** – **no bit-vector scan**, **no register/run**.

---

## 4.  Adjacency weights for ordering (TF-based)


In [7]:
def prune_adjacency(adj: torch.Tensor, keep_token_ids: Dict[str, int], tok_id: Dict[str, int]) -> Tuple[torch.Tensor, Dict[int, int], Dict[int, str]]:
    """
    Prune adjacency matrix to only keep disambiguated tokens.
    """
    keep_ids = set(keep_token_ids.values())
    keep_list = sorted(keep_ids)
    old_to_new = {old_id: new_id for new_id, old_id in enumerate(keep_list)}
    
    id_to_tok = {v: k for k, v in tok_id.items()}
    new_to_tok = {new_id: id_to_tok[old_id] for old_id, new_id in old_to_new.items()}
    
    adj = adj.coalesce()
    indices = adj.indices()
    values = adj.values()
    
    mask = torch.tensor([
        u.item() in keep_ids and v.item() in keep_ids
        for u, v in zip(indices[0], indices[1])
    ], dtype=torch.bool)
    
    new_indices = indices[:, mask]
    new_values = values[mask]
    
    remapped = torch.tensor([
        [old_to_new[u.item()] for u in new_indices[0]],
        [old_to_new[v.item()] for v in new_indices[1]]
    ], dtype=torch.long)
    
    pruned = torch.sparse_coo_tensor(
        indices=remapped,
        values=new_values,
        size=(len(keep_ids), len(keep_ids)),
        dtype=torch.float32
    ).coalesce()
    
    return pruned, old_to_new, new_to_tok

def reconstruct_order(pruned_adj: torch.Tensor, new_to_tok: Dict[int, str], start_id: int, end_id: int, 
                     total_tokens: int, k_threshold: float = 1.2, beam: int = 5, max_len: int = 100) -> List[List[str]]:
    """
    Reconstruct token order using beam search.
    
    Key changes:
    - Allows duplicate tokens (no visited set)
    - Stops when path length > k_threshold * total_tokens OR reaches END
    - Open-ended parsing strategy
    
    Args:
        pruned_adj: Pruned adjacency matrix
        new_to_tok: Node ID → token mapping
        start_id: START node ID
        end_id: END node ID
        total_tokens: Total disambiguated tokens count
        k_threshold: Multiplier for max path length (default 1.2)
        beam: Beam width for search
        max_len: Hard limit on path length
    
    Returns:
        List of candidate token sequences
    """
    # Build adjacency dict
    indices = pruned_adj.coalesce().indices()
    values = pruned_adj.coalesce().values()
    
    adj_dict = defaultdict(list)
    for i in range(indices.shape[1]):
        u, v, w = indices[0, i].item(), indices[1, i].item(), values[i].item()
        adj_dict[u].append((v, w))
    
    # Calculate stopping threshold
    length_threshold = int(k_threshold * total_tokens)
    
    # Beam search - no visited set, allow duplicates
    candidates = [([start_id], 0.0)]
    
    for step in range(max_len):
        next_cand = []
        
        for path, score in candidates:
            current = path[-1]
            
            # Stopping conditions:
            # 1. Reached END token
            # 2. Path length exceeds threshold
            if current == end_id or len(path) > length_threshold:
                next_cand.append((path, score))
                continue
            
            # Explore neighbors (allow revisiting nodes for duplicates)
            for next_node, weight in adj_dict.get(current, []):
                new_path = path + [next_node]
                new_score = score + weight
                next_cand.append((new_path, new_score))
        
        if not next_cand:
            break
        
        # Keep top-k candidates
        candidates = sorted(next_cand, key=lambda x: x[1], reverse=True)[:beam]
        
        # Stop if all candidates reached stopping condition
        if all(p[-1] == end_id or len(p) > length_threshold for p, _ in candidates):
            break
    
    # Convert to token sequences
    results = []
    for path, score in candidates:
        # Extract tokens, excluding START and END
        tokens = []
        for node_id in path:
            if node_id not in (start_id, end_id):
                token = new_to_tok[node_id]
                tokens.append(token)
        results.append((tokens, score))
    
    return results

def greedy_reconstruct(pruned_adj: torch.Tensor, new_to_tok: Dict[int, str], start_id: int, end_id: int,
                       total_tokens: int, k_threshold: float = 1.5) -> Tuple[List[str], int]:
    """
    Greedy reconstruction: always pick highest weight edge.
    Allows duplicates, stops when length > k*total_tokens or END reached.
    
    Returns:
        (token_sequence, path_length)
    """
    # Build adjacency dict
    indices = pruned_adj.coalesce().indices()
    values = pruned_adj.coalesce().values()
    
    adj_dict = defaultdict(list)
    for i in range(indices.shape[1]):
        u, v, w = indices[0, i].item(), indices[1, i].item(), values[i].item()
        adj_dict[u].append((v, w))
    
    length_threshold = int(k_threshold * total_tokens)
    
    path = [start_id]
    current = start_id
    
    for step in range(length_threshold * 2):  # Hard limit
        # Stop if reached END or exceeded threshold
        if current == end_id or len(path) > length_threshold:
            break
        
        neighbors = adj_dict.get(current, [])
        if not neighbors:
            break
        
        # Pick highest weight neighbor (allow revisiting)
        next_node, weight = max(neighbors, key=lambda x: x[1])
        path.append(next_node)
        current = next_node
    
    # Extract tokens
    tokens = [new_to_tok[i] for i in path if i not in (start_id, end_id)]
    return tokens, len(path)


→ **uses TF weights** – **never touches BSS lattice**.

---

## 5.  Demo (same corpus, new pipeline)

In [9]:
CORPUS = [
    "人工智能正在改变世界",
    "机器学习让代码更聪明",
    "深度学习是未来的钥匙"
]

print("=== Step 1: Ingest Corpus ===")
adj, tok_id, lut = ingest_corpus(CORPUS)

print(adj)
print(tok_id)
print(lut.table)

print(f"Total tokens in AM: {len(tok_id)}")
print(f"Total edges: {adj._nnz()}")

print("\n=== Step 2: Build HLLSet ===")
hllset = build_hllset(lut)
print(f"HLLSet size: {len(hllset)} unique (reg, run) pairs")

print("\n=== Step 3: Disambiguate Tokens ===")
disambiguated = unambiguate_tokens(hllset, lut, tok_id)
print(f"Disambiguated tokens: {len(disambiguated)}")

print("\n=== Step 4: Prune Adjacency Matrix ===")
pruned_adj, old_to_new, new_to_tok = prune_adjacency(adj, disambiguated, tok_id)
start_id = old_to_new[tok_id["<START>"]]
end_id = old_to_new[tok_id["<END>"]]
print(f"Pruned edges: {pruned_adj._nnz()}")
print(f"Pruned nodes: {len(new_to_tok)}")

print("\n=== Step 5A: Greedy Reconstruction ===")
tokens_greedy, path_len = greedy_reconstruct(
    pruned_adj, new_to_tok, start_id, end_id, 
    total_tokens=len(disambiguated),
    k_threshold=1.5
)
print(f"Path length: {path_len}")
print(f"1-grams only: {''.join([t for t in tokens_greedy if len(t) == 1])}")
print(f"Full sequence: {' '.join(tokens_greedy[:50])}")  # First 50 tokens

print("\n=== Step 5B: Beam Search Reconstruction ===")
candidates = reconstruct_order(
    pruned_adj, new_to_tok, start_id, end_id,
    total_tokens=len(disambiguated),
    k_threshold=1.5,
    beam=3
)

for i, (tokens, score) in enumerate(candidates[:3], 1):
    text_1g = ''.join([t for t in tokens if len(t) == 1])
    print(f"\nCandidate {i} (score={score:.2f}):")
    print(f"  1-grams: {text_1g[:100]}")  # First 100 chars
    print(f"  Length: {len(tokens)} tokens")
    print(f"  Sample: {' '.join(tokens[:20])}")

=== Step 1: Ingest Corpus ===
tensor(indices=tensor([[ 0,  0,  0,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,
                        13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 24, 25, 26,
                        27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41,
                        42, 43, 44, 45, 46, 47, 49, 50],
                       [ 3, 20, 37,  4,  2,  5,  6,  7,  8,  9, 10, 11, 12, 13,
                        14, 15, 16, 18,  1, 21, 19, 22, 23, 24, 25, 40, 26, 27,
                        28, 29, 30, 31, 32, 33, 35,  1, 38, 36, 39, 23, 41, 42,
                        43, 44, 45, 46, 47, 48, 50,  1]]),
       values=tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
                      1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1.,
                      1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
                      1., 1., 1., 1., 1., 1., 1., 1.]),
       size=(51, 51), nnz=50, layout=torch.sparse_coo)
{'<START>': 0, '<END>': 1, '人