# String predictor

Tablog encodes column name strings in headers and we want to compress them at least a bit -- the strings should not be the dominant part of the dataset, but still saving few bytes is always good.

This file contains experiments that should help determining an appropriate compression algorithm.

Goals for this part are (in order of importance):
1. fast and light
2. simple-ishto implement
3. compresses well enough to make me happy

In [1]:
import os
import re
import collections
import math
import string
import heapq
import itertools
import string
import gzip

In [2]:
import requests
from tqdm.notebook import tqdm, trange

## Manual test data
These are just a couple of strings that I feel will be representative of the intended use of Tablog.
Used mostly to get an intuitive feel of how the compression behaves.

In [3]:
manual_test_data = [
    "deadbeef", "hello_world", "voltage", "current", "left", "column58", "left_motor_speed", "kP", "right_motor_kD",
    "POWER!", "timestamp", "AverageTicks", "temperature", "MCU-power", "alpha", "beta", "omega", "velocityX",
    "!@#$%^&*()", ""
]

## Harvesting test data

In [4]:
def regex_from_example_directories(regex):
    """Recursively go through all source-looking files in the
    directories and return a counter of words matching regex."""
    
    extensions = (".cpp", ".hpp", ".c", ".h", ".py", ".sh")
    directories = [
        "/home/cube/development/c/tablog/encoder",
        "/home/cube/development/c/tablog/decoder",
        "/home/cube/development/c/tablog/integration_tests",
        "/usr/src/linux"
    ]

    regex = re.compile(regex)
    
    ret = collections.Counter()
    
    for directory in tqdm(directories, desc="Looking through directories"):
        for root, dirs, files in os.walk(directory):
            for file in files:
                if not file.endswith(extensions):
                    continue
                path = os.path.join(root, file)
                with open(path, "r") as fp:
                    for line in fp:
                        for match in regex.finditer(line):
                            ret[match.group(0)] += 1

    return ret

Load strings that look like identifiers from Tablog sources and  from Linux kernel sources.

In [5]:
#data = regex_from_example_directories(r"(?<![a-zA-Z0-9_])[a-zA-Z_][a-zA-Z0-9_]*")

## English words

In [6]:
def google_books_1grams_words():
    ret = collections.Counter()
    good_string_re = re.compile(r"^([a-zA-Z]*)\t[^\t]*\t([0-9]*)")
    for letter in tqdm(string.ascii_lowercase):
        r = requests.get(f"http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-{letter}.gz", stream=True)
        file_size = int(r.headers.get('Content-Length', 0))
        with tqdm.wrapattr(r.raw, "read", total=file_size, desc=letter) as compressed_response_file:
            with gzip.open(compressed_response_file, "rt") as response_file:
                for line in response_file:
                    if match := good_string_re.match(line):
                        ret[match[1]] += int(match[2])
                        
    return ret

In [7]:
english_words = google_books_1grams_words()

  0%|          | 0/26 [00:00<?, ?it/s]

a:   0%|          | 0/343487895 [00:00<?, ?it/s]

b:   0%|          | 0/247964820 [00:00<?, ?it/s]

c:   0%|          | 0/393712861 [00:00<?, ?it/s]

d:   0%|          | 0/239389478 [00:00<?, ?it/s]

e:   0%|          | 0/204996476 [00:00<?, ?it/s]

f:   0%|          | 0/185305580 [00:00<?, ?it/s]

g:   0%|          | 0/160767731 [00:00<?, ?it/s]

h:   0%|          | 0/183438682 [00:00<?, ?it/s]

i:   0%|          | 0/203553215 [00:00<?, ?it/s]

j:   0%|          | 0/64839808 [00:00<?, ?it/s]

k:   0%|          | 0/109307817 [00:00<?, ?it/s]

l:   0%|          | 0/188008415 [00:00<?, ?it/s]

m:   0%|          | 0/289750659 [00:00<?, ?it/s]

n:   0%|          | 0/141931087 [00:00<?, ?it/s]

o:   0%|          | 0/139934943 [00:00<?, ?it/s]

p:   0%|          | 0/357808228 [00:00<?, ?it/s]

q:   0%|          | 0/26608642 [00:00<?, ?it/s]

r:   0%|          | 0/216873218 [00:00<?, ?it/s]

s:   0%|          | 0/444315601 [00:00<?, ?it/s]

t:   0%|          | 0/265071292 [00:00<?, ?it/s]

u:   0%|          | 0/88124330 [00:00<?, ?it/s]

v:   0%|          | 0/108848534 [00:00<?, ?it/s]

w:   0%|          | 0/118835349 [00:00<?, ?it/s]

x:   0%|          | 0/14649720 [00:00<?, ?it/s]

y:   0%|          | 0/26595711 [00:00<?, ?it/s]

z:   0%|          | 0/27324007 [00:00<?, ?it/s]

In [8]:
english_words.most_common(30)

[('the', 23688414489),
 ('of', 15342397280),
 ('and', 11021132912),
 ('to', 9494905988),
 ('in', 7611765281),
 ('a', 7083003595),
 ('is', 4139526351),
 ('that', 3870260345),
 ('for', 3021925527),
 ('The', 2763139452),
 ('was', 2737725870),
 ('as', 2626386982),
 ('with', 2504225400),
 ('be', 2392576396),
 ('by', 2249978492),
 ('not', 2208497726),
 ('it', 2204134503),
 ('on', 2143313303),
 ('I', 1873234887),
 ('are', 1826889845),
 ('or', 1805149769),
 ('his', 1672173489),
 ('from', 1651527506),
 ('at', 1562321315),
 ('which', 1557215562),
 ('he', 1529423270),
 ('this', 1423199366),
 ('have', 1372997478),
 ('had', 1298386780),
 ('an', 1266190915)]

In [9]:
def all_substring_compositions(s):
    """Yield all sequences of substring that concatenate to s. If len(s) >= 1 then the first item is always [s]"""
    if not len(s):
        yield []
    for i in range(len(s)):
        for substring in all_substring_compositions(s[:i]):
            yield substring + [s[i:]]

In [10]:
def ngrams(data, n):
    """Generate a counter with all length n substrings of strings in the input counter"""
    counts = collections.Counter()
    for s, count in data.items():
        for i in range(len(s) - n + 1):
            counts[s[i:i + n]] += count
    return counts

In [11]:
def ngrams_up_to(data, max_n):
    """Generate a list of counters with all length 1-maxn substrings of strings in the input counter"""
    ret = collections.Counter()
    for s, count in tqdm(data.items(), desc=f"Collecting (1..{max_n})-grams"):
        for j in range(1, max_n + 1):
            for i in range(len(s) - j + 1):
                ret[s[i:i + j]] += count
    return ret

In [12]:
english_words_ngrams = ngrams_up_to(english_words, 5)

Collecting (1..5)-grams:   0%|          | 0/6857277 [00:00<?, ?it/s]

## Dictionary extras
Padding the dictionary with ngrams by stuff that we expect to appear in the inputs, but which is not contained in the english words dataset.

The weights of different symbols are manually estimated and may be significantly non-optimal.

### Numbers

In [13]:
def generate_numbers_ngrams(p, max_value, max_ngram_length):
    ret = collections.Counter()
    for i in range(max_value):
        weight = p * (1 - p)**i
        s = str(i)
        for n in range(1, max_ngram_length + 1):
            for k in range(len(s) - n + 1):
                ret[s[k:k + n]] += weight
    for i in range(int(math.log2(max_value))):
        weight =  p * (1 - p)**i
        s = str(2**i)
        for n in range(1, max_ngram_length + 1):
            for k in range(len(s) - n + 1):
                ret[s[k:k + n]] += weight
    for i in range(int(math.log10(max_value))):
        weight = 2 * p * (1 - p)**i
        s = str(10**i)
        for n in range(1, max_ngram_length + 1):
            for k in range(len(s) - n + 1):
                ret[s[k:k + n]] += weight
    return ret

In [14]:
number_ngrams = generate_numbers_ngrams(p=0.25, max_value=10000, max_ngram_length=5)

### Symbols

In [15]:
def simple_ngrams(*weighted_strings):
    """ Generate a length-separated list with ngrams directly provided. Argument is (weight, ngram) tuples. """
    ret = []
    for weight, ngram in weighted_strings:
        while len(ret) <= len(ngram):
            ret.append(collections.Counter())
        
        ret[len(ngram)][ngram] += weight
    return ret

In [16]:
symbols_ngrams = collections.Counter([
    ("_", 1), (" ", .5), ("-", .2),
    (".", .2), (.5, ","), (", ", .2), (". ", .2), (.1, ",\n"), (.2, ";\n"),
    ("http", .1), ("@", .05)
])

### Capitals

In [17]:
def map_ngrams(ngrams, func):
    """ Map all keys of ngrams """
    ret = collections.Counter()
    for ngram, count in tqdm(ngrams.items()):
        ret[func(ngram)] += count
    return ret

In [18]:
caps_english_words_ngrams = map_ngrams(english_words_ngrams, lambda x: x.upper())

  0%|          | 0/3266865 [00:00<?, ?it/s]

## Merging ngrams

In [19]:
def merge_ngrams(*weighted_ngrams):
    """ Merge tuples of (weight, ngrams) to a single ngram block.
    Weights within each ngram block are normalized first, so that they sum up to 1. """
    ret = collections.Counter()
    for weight, ngrams in tqdm(weighted_ngrams):
        factor = weight / sum(ngrams.values())

        for ngram, count in tqdm(ngrams.items()):
            ret[ngram] += factor * count
    return ret

In [20]:
def print_ngrams(data):
    total_count = 0
    for ngrams, total in data:
        for ngram, count in ngrams.most_common():
            total_count += 1
            print(f"{ngram!r}: {100 * count / total:.1f}%")
    print(f"Total {total_count} different ngrams")

In [21]:
merged_ngrams = merge_ngrams(
    (1, english_words_ngrams),
    (0.02, caps_english_words_ngrams),
    (0.02, number_ngrams),
    (0.05, symbols_ngrams)
)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3266865 [00:00<?, ?it/s]

  0%|          | 0/1657513 [00:00<?, ?it/s]

  0%|          | 0/10110 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

In [28]:
def split_counter_by_length(counter):
    ret = []
    for s, count in tqdm(counter.items(), desc="Splitting by length"):
        while len(s) > len(ret):
            ret.append(collections.Counter())
        
        ret[len(s) - 1][s] += count
        
    return ret

In [44]:
def filter_ngrams(input_ngrams, min_goodness=0, max_n=float("inf")):
    def ngram_goodness(ngram_length, ngram_probability):
        """Return an estimate of how much this ngram will help with making the encoding smaller."""
        # TODO: This is based just on my feeling on how the filtering should behave...
        encoded_length = -math.log2(ngram_probability)
        return ngram_probability * ngram_length / encoded_length
    
    max_n = min(max_n, max(len(key) for key in input_ngrams.keys()))
    total = sum(count for count in input_ngrams.values())
    
    split = split_counter_by_length(input_ngrams)
    
    for n in trange(2, max_n + 1):
        current_ngrams = split[n - 1]
        current_ngrams_to_keep = collections.Counter()
        current_ngram_count = len(current_ngrams)
                
        for ngram, count in tqdm(current_ngrams.items(), desc=f"Filtering {n}-tuples"):
            ngram_probability = count / total
            if ngram_goodness(n, ngram_probability) < min_goodness:
                continue
                
            
            # Find shorter ngrams that would be combined to create this one
            i = 0
            alternative_ngrams = []
            while i < n:
                for j in reversed(range(1, n)):
                    if ngram[i:i+j] in split[j]:
                        alternative_ngrams.append(ngram[i:i+j])
                        i += j
                        break
                else:
                    i += 1
                
                
                
                
            composition_probability = 0  # How probable is it that we find replicate this ngram as a combination of shorter ones
            for composition in itertools.islice(all_substring_compositions(ngram), 1, None):
                this_composition_probability = 1
                try:
                    for substring in composition:
                        substring_probability = split[len(substring) - 1][substring] / totals[len(substring) - 1]
                        this_composition_probability *= substring_probability
                except KeyError:  # The substring of current ngram might not be in the list if it was previously pruned
                    pass
                else:
                    composition_probability += this_composition_probability
                    if composition_probability > ngram_probability:
                        break

            if ngram_probability > composition_probability:
                current_ngrams_to_keep[ngram] = count

        print(f"Keeping {len(current_ngrams_to_keep)} ({100 * len(current_ngrams_to_keep) / current_ngram_count:.0f}%) {n}-grams")
        split[n] = current_ngrams_to_keep
    
        for sub_ngram, sub_ngram_count in ngrams_up_to(split[n - 1], n - 1).items():
            split[len(sub_ngram) - 1][sub_ngram] -= sub_ngram_count
    
    for i, (current_ngrams, current_total) in enumerate(zip(filtered_ngrams, totals)):
        for ngram, count in current_ngrams.items():
            ngram_probability = count / current_total
            if ngram_goodness(n, ngram_probability) < min_goodness:
                continue
    
    return [(ngrams, total) for ngrams, total in zip(filtered_ngrams, totals)]

In [45]:
filtered_merged_ngrams = filter_ngrams(merged_ngrams, min_goodness=0.01, max_n=5)

Splitting by length:   0%|          | 0/4421739 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Filtering 2-tuples:   0%|          | 0/2815 [00:00<?, ?it/s]

Keeping 0 (0%) 2-grams


Collecting (1..1)-grams:   0%|          | 0/2815 [00:00<?, ?it/s]

Filtering 3-tuples: 0it [00:00, ?it/s]

ZeroDivisionError: division by zero

In [None]:
print_ngrams(filtered_merged_ngrams)

# Analysing the data

In [None]:
def minmax(iterable, key=lambda x: x):
    it =  iter(iterable)
    try:
        min_v = next(it)
    except StopIteration:
        return None, None
    max_v = min_v
    min_k = key(min_v)
    max_k = min_k
    
    for v in it:
        k = key(v)
        if k < min_k:
            min_v = v
            min_k = k
        if k > max_k:
            max_v = v
            max_k = k
    
    return min_v, max_v

## Variable length Huffman coding

In [None]:
def huffman_tree(symbols):
    """Build a huffman tree from a dict of symbol: probability. """
    
    heap = [(x[1], True, x[0]) for x in symbols.items()]
    # Heap elements look like: probability, leaf flag, (leaf data | children).
    heapq.heapify(heap)
    
    while len(heap) > 1:
        first = heapq.heappop(heap)
        second = heap[0]
        
        if first > second:
            (first, second) = (second, first)
        
        new = (first[0] + second[0], False, (first, second))
        
        heapq.heapreplace(heap, new)
        
    encoding = {}
        
    def flatten_tree(element, current_encoding=""):
        if element[1]:  # is leaf
            encoding[element[2]] = current_encoding
        else:
            flatten_tree(element[2][0], current_encoding + "0")
            flatten_tree(element[2][1], current_encoding + "1")
    
    flatten_tree(heap[0])
    
    return encoding

In [None]:
def pruned_huffman_tree(symbols, max_bits, require_full_trie, max_items=float("inf")):
    all_symbols = symbols
    symbols = all_symbols.copy()
    
    while True:
        encoding = huffman_tree(symbols)
                
        max_bits_used = max(len(x) for x in encoding.values())
        
        required_symbols = set()
        if require_full_trie:
            for symbol in symbols:
                required_symbols |= prefixes(symbol)
            
        def expected_shortening(x):
            """Return expected decrease of size of the output if we include this item"""
            return (8 * len(x) - len(encoding[x])) * symbols[x]
        
        def victim_key(x):
            """Use probability of the symbol as a secondary key"""
            return (expected_shortening(x), symbols[x]) 

        worst_sym, best_sym = minmax(encoding.keys(), key=victim_key)
        
        def format_symbol(x):
            return f"{b(x)} ({len(encoding[x])}b, p = {100 * symbols[x]:.2f}%, CR = {100 * len(encoding[x]) / (8 * len(x)):.2f}%, expected shortening = {expected_shortening(x):.3f}b)"
        
        print(f"Encoded {len(encoding)} symbols, max code length {max_bits_used}, {len(required_symbols)} required symbols")
        print("  Best symbol", format_symbol(best_sym))
        print("  Worst symbol", format_symbol(worst_sym))
        
        victim = min(symbols.keys() - required_symbols, key=victim_key)
        
        if \
            expected_shortening(victim) > 0 and \
            max_bits_used <= max_bits and \
            len(encoding) <= max_items:
            return encoding

        print("Removing symbol", format_symbol(victim))
        
        del symbols[victim]

In [None]:
encoding = pruned_huffman_tree(symbols, 12, require_full_trie=False)
#encoding = huffman_tree(symbols)
print(len(encoding))
for tup, enc in sorted(encoding.items(), key=lambda x: (symbols[x[0]]), reverse=True):
    print(f"{b(tup)} -> {enc} ({len(enc)}b, p = {100 * symbols[tup]:.2f}%, expected_shortening = {symbols[tup] * (8 * len(tup) - len(enc)):.3f}b)")

Make sure that the encoding is actually a prefix code:

In [None]:
for tup1, enc1 in encoding.items():
    for tup2, enc2 in encoding.items():
        if tup1 == tup2:
            continue
        if enc1.startswith(enc2):
            print(f"{b(tup1)} -> {enc1} starts with {b(tup2)} -> {enc2}")
        assert not enc1.startswith(enc2)

In [None]:
encoding_dict = {"".join(chr(x) for x in key).encode("ascii"): e for key, e in encoding.items()}
encoding_dict_key_length = max(len(key) for key in encoding_dict)

### Encoding using a trie

In [None]:
def build_trie(encoding):
    trie = {}
    for tup, enc in sorted(encoding.items(), key=lambda x: x[0]):
        current = trie
        for c in tup[:-1]:
            current = current.setdefault(c, (None, {}))[1]
        if tup[-1] in current:
            assert current[tup[-1]][0] is None
            current[tup[-1]] = (enc, current[tup[-1]][1])
        else:
            current[tup[-1]] = (enc, {})
    
    return trie

In [None]:
def print_trie(trie, indent=""):
    for c, (enc, children) in trie.items():
        print(f"{indent}{b(c)}: {enc}")
        print_trie(children, indent + "  ")

In [None]:
trie = build_trie(encoding)
print_trie(trie)

### Encoding using perfect hashing
This uses ideas from gperf (https://www.dre.vanderbilt.edu/~schmidt/PDF/gperf.pdf).

In [None]:
def construct_perfect_hash(encoding):
    min_c, max_c = minmax(c for tup in encoding.keys() for c in tup)
    
    print("total key characters", sum(len(key) for key in encoding.keys()))
    
    key_lengths = collections.Counter(len(key) for key in encoding.keys())
    print(key_lengths)
    
    print(sum(key_lengths.values()))
    
    asso_size = max_c - min_c + 2  # + extra item for out of range
    print(max_c - min_c)
    asso = [0] * asso_size
        

In [None]:
construct_perfect_hash(encoding)

In [None]:
72**3

## Some predictor ideas

In [None]:
def encoded_error_size(e):
    """ Return size in bits of given error encoded using tablog's stream encoder, assuming perfect adaptation """
    if e == 0:
         return 1
    else:
         return 2 + 1 + math.floor(1 + math.log2(abs(e)))

In [None]:
def encoded_number_size(n):
    return 2 * math.floor(math.log2(n + 1)) + 1

In [None]:
def evaluate_predictor(method):
    print(f"Evaluating {method.__name__}")
    strings = 0
    log_bits_per_char_sum = 0
    log_bits_per_char_unique_sum = 0
    for s, count in tqdm(data.items()):
        string_bits = method(s)
        bits_per_char = string_bits / len(s)
        
        log_bits_per_char_sum += count * math.log(string_bits / len(s))
        log_bits_per_char_unique_sum += math.log(string_bits / len(s))
        
        strings += count

    print(f"    geometric mean {math.exp(log_bits_per_char_sum / strings):.2f} bits/char")
    print(f"    geometric mean (unique) {math.exp(log_bits_per_char_unique_sum / len(data)):.2f} bits/char")

In [None]:
def e_predictor(s):
    ord_e = ord("e")
    
    return \
        encoded_number_size(len(s)) + \
        sum(encoded_error_size(c - ord_e) for c in s)
evaluate_predictor(e_predictor)

In [None]:
def same_char_predictor(s):
    first_prediction = ord("e")
    
    return \
        encoded_number_size(len(s)) + \
        encoded_error_size(s[0] - first_prediction) + \
        sum(encoded_error_size(c - prev) for c, prev in zip(s[1:], s[:-1]))
evaluate_predictor(same_char_predictor)

In [None]:
def reflection_predictor(s):
    two_c = 215
    first_prediction = ord("s")
    
    return \
        encoded_number_size(len(s)) + \
        encoded_error_size(s[0] - first_prediction) + \
        sum(encoded_error_size(c - (two_c - prev)) for c, prev in zip(s[1:], s[:-1]))
evaluate_predictor(reflection_predictor)

In [None]:
def c_style(s):
    return 8 + 8 * len(s)
evaluate_predictor(c_style)

In [None]:
def small_freq_table(s):
    #freq_table = b"etisarnodlc_puf"
    freq_table = b"etisarnodlc_pufmbghyvxTwkERICSz"
    
    ret = encoded_number_size(len(s))
    
    for c in s:
        try:
            index = freq_table.index(c)
        except ValueError:
            ret += 1 + 8
        else:
            ret += encoded_number_size(index + 1)
            
    return ret
evaluate_predictor(small_freq_table)

In [None]:
def huffman_dict_verb_lengths(s, printing=False):
    ret = encoded_number_size(len(s))

    verbatim_length = 0
    verbatim_size = 0
    i = 0
    while True:
        encoded = None
        for j in reversed(range(1, 1 + min(encoding_dict_key_length, len(s) - i))):
            try:
                encoded = encoding_dict[s[i:i+j]]
            except KeyError:
                continue

            if verbatim_length > 0:
                encoded_length = 1 + encoded_number_size(verbatim_length - 1) + verbatim_size
                if printing:
                    print(f"{verbatim_length} verbatim characters: {encoded_length}b, {encoded_length/verbatim_length:.2f}b/char")
                ret += encoded_length
                verbatim_length = 0
                verbatim_size = 0

            encoded_length = 1 + len(encoded)
            if printing:
                print(f"{s[i:i+j]} in dict: {encoded_length}b, {encoded_length/j:.2f}b/char")
            ret += encoded_length
            i += j
            break
            
        if encoded is None:
            i += 1
            verbatim_length += 1
            verbatim_size += 8
    
        if i >= len(s):
            break
            
    if verbatim_length > 0:
        encoded_length = 1 + encoded_number_size(verbatim_length - 1) + verbatim_size
        if printing:
            print(f"(flush) {verbatim_length} verbatim characters: {encoded_length}b, {encoded_length/verbatim_length:.2f}b/char")
        ret += 1 + encoded_number_size(verbatim_length) + verbatim_size
        verbatim_length = 0
        verbatim_size = 0
    
    if printing:
        print(f"{ret / len(s):.2f} bits/char")
    
    return ret
evaluate_predictor(huffman_dict_verb_lengths)
#Evaluating huffman_dict_verb_lengths
#    geometric mean 6.99 bits/char
#    geometric mean (unique) 6.57 bits/char

In [None]:
def huffman_dict(s, printing=False):
    ret = encoded_number_size(len(s))

    i = 0
    while True:
        encoded = None
        for j in reversed(range(1, 1 + min(encoding_dict_key_length, len(s) - i))):
            try:
                encoded = encoding_dict[s[i:i+j]]
            except KeyError:
                continue

            encoded_length = 1 + len(encoded)
            if printing:
                print(f"{s[i:i+j]} in dict: {encoded_length}b, {encoded_length/j:.2f}b/char")
            ret += encoded_length
            i += j
            break
            
        if encoded is None:
            encoded_length = 1 + 8
            if printing:
                print(f"{b(s[i])} verbatim: {encoded_length}b")
            ret += encoded_length
            i += 1
    
        if i >= len(s):
            break
    
    if printing:
        print(f"{ret / len(s):.2f} bits/char")
    
    return ret
evaluate_predictor(huffman_dict)
#Evaluating huffman_dict
#    geometric mean 6.95 bits/char
#    geometric mean (unique) 6.52 bits/char

In [None]:
def huffman_dict_diff_verb(s, printing=False):
    ret = encoded_number_size(len(s))

    i = 0
    previous = ord('e')
    while True:
        encoded = None
        for j in reversed(range(1, 1 + min(encoding_dict_key_length, len(s) - i))):
            try:
                encoded = encoding_dict[s[i:i+j]]
            except KeyError:
                continue

            encoded_length = 1 + len(encoded)
            if printing:
                print(f"{s[i:i+j]} in dict: {encoded_length}b, {encoded_length/j:.2f}b/char")
            ret += encoded_length
            previous = s[i + j - 1]
            i += j
            break
            
        if encoded is None:
            encoded_length = 1
            error = s[i] - previous
            if error == 0:
                encoded_length += 1
            else:
                abs_error = abs(error) - 1
                fixed_bits = 4
                encoded_length += encoded_number_size(abs_error >> fixed_bits) + fixed_bits
            if printing:
                print(f"{b(s[i])} not in dict: {encoded_length}b")
            ret += encoded_length
            previous = s[i]
            i += 1
    
        if i >= len(s):
            break
    
    if printing:
        print(f"{ret / len(s):.2f} bits/char")
    
    return ret
evaluate_predictor(huffman_dict_diff_verb)
#Evaluating huffman_dict_diff_verb
#    geometric mean 6.90 bits/char
#    geometric mean (unique) 6.40 bits/char

In [None]:
def huffman_dict_grouped_hits(s, printing=False):
    global encoding_dict, encoding_dict_key_length
    
    def match_at(i):
        """Return number of characters of the string at position i that match the dictionary and the encoded length"""
        for j in reversed(range(1, 1 + min(encoding_dict_key_length, len(s) - i))):
            try:
                encoded = encoding_dict[s[i:i+j]]
            except KeyError:
                continue
            else:
                return j, len(encoded)
        return (0, None)
    
    def count_matches(i):
        """Return number of successive dictionary matches/mismatches starting at i and whether we finished at the end of the string"""
        count = 0
        while i < len(s):
            match_length, _ = match_at(i)
            if match_length:
                count += 1
                i += match_length
            else:
                return count, False
        return count, True
    
    def count_mismatches(i):
        """Return number of successive dictionary matches/mismatches starting at i and whether we finished at the end of the string"""
        count = 0
        while i < len(s):
            match_length, _ = match_at(i)
            if not match_length:
                count += 1
                i += 1
            else:
                return count, False
        return count, True
    
    def encoded_matches(i):
        length = 0
        encoded_size = 0
        while i < len(s):
            match_length, match_encoded_size = match_at(i)
            if match_length:
                if printing:
                    print(f"  {s[i:i+match_length]}: {match_encoded_size}b, {match_encoded_size/match_length:.2f}b/char")
                i += match_length
                length += match_length
                encoded_size += match_encoded_size
            else:
                break
        return length, encoded_size
    
    def encoded_mismatches(i, prev):
        length = 0
        encoded_size = 0
        while i < len(s):
            match_length, _ = match_at(i)
            if not match_length:
                error = s[i] - prev
                prev = s[i]
                if error == 0:
                    e = 1
                else:
                    abs_error = abs(error) - 1
                    fixed_bits = 3
                    e = encoded_number_size(abs_error >> fixed_bits) + fixed_bits
                
                if printing:
                    print(f"  {bytes([s[i]])}: {e}b")
                i += 1
                length += 1
                encoded_size += e
            else:
                break
        return length, encoded_size
    
    ret = 0
    i = 0
    
    # Handle a first block separately, we allow zero number of matches in it
    block_length, end = count_matches(i)
    if printing:
        print(f"First match block: {block_length}")
    ret += encoded_number_size(block_length)
        # Might be zero, if:
        # a) Starting with a non-match character
        # b) Processing an empty string
    length, encoded_size = encoded_matches(i)
    i += length
    ret += encoded_size
    
    if i > 0:
        prev = s[i - 1]
    else:
        prev = ord("e")
    
    while True:
        block_length, end = count_mismatches(i)
        if printing:
            print(f"Mismatch block: {block_length}")
        assert block_length > 0 or end
        ret += encoded_number_size(block_length)
        if not block_length:
            break
        length, encoded_size = encoded_mismatches(i, prev)
        i += length
        ret += encoded_size
        
        block_length, end = count_matches(i)
        if printing:
            print(f"Match block: {block_length}")
        assert block_length > 0 or end
        ret += encoded_number_size(block_length)
        if not block_length:
            break
        length, encoded_size = encoded_matches(i)
        i += length
        ret += encoded_size
        prev = s[i - 1]
    
    if printing:
        if len(s) > 0:
            print(f"{ret}b, {ret / len(s):.2f} bits/char")
        else:
            print(f"{ret}b")
            
    
    return ret
evaluate_predictor(huffman_dict_grouped_hits)

In [None]:
sum_log_bits_per_char = 0
count = 0
for s in manual_test_data:
    print(s)
    bits = huffman_dict_grouped_hits(s, printing=True)
    if bits > 0 and len(s) > 0:
        sum_log_bits_per_char += math.log(bits / len(s))
        count += 1
    print()
    
print(f"Geometric mean {math.exp(sum_log_bits_per_char / count):.2f}b/char")

In [None]:
smaz_dict = ["\002s,\266", "\003had\232\002leW", "\003on \216", "", "\001yS",
"\002ma\255\002li\227", "\003or \260", "", "\002ll\230\003s t\277",
"\004fromg\002mel", "", "\003its\332", "\001z\333", "\003ingF", "\001>\336",
"\001 \000\003   (\002nc\344", "\002nd=\003 on\312",
"\002ne\213\003hat\276\003re q", "", "\002ngT\003herz\004have\306\003s o\225",
"", "\003ionk\003s a\254\002ly\352", "\003hisL\003 inN\003 be\252", "",
"\003 fo\325\003 of \003 ha\311", "", "\002of\005",
"\003 co\241\002no\267\003 ma\370", "", "", "\003 cl\356\003enta\003 an7",
"\002ns\300\001\"e", "\003n t\217\002ntP\003s, \205",
"\002pe\320\003 we\351\002om\223", "\002on\037", "", "\002y G", "\003 wa\271",
"\003 re\321\002or*", "", "\002=\"\251\002ot\337", "\003forD\002ou[",
"\003 toR", "\003 th\r", "\003 it\366",
"\003but\261\002ra\202\003 wi\363\002</\361", "\003 wh\237", "\002  4",
"\003nd ?", "\002re!", "", "\003ng c", "",
"\003ly \307\003ass\323\001a\004\002rir", "", "", "", "\002se_", "\003of \"",
"\003div\364\002ros\003ere\240", "", "\002ta\310\001bZ\002si\324", "",
"\003and\a\002rs\335", "\002rt\362", "\002teE", "\003ati\316", "\002so\263",
"\002th\021", "\002tiJ\001c\034\003allp", "\003ate\345", "\002ss\246",
"\002stM", "", "\002><\346", "\002to\024", "\003arew", "\001d\030",
"\002tr\303", "", "\001\n1\003 a \222", "\003f tv\002veo", "\002un\340", "",
"\003e o\242", "\002a \243\002wa\326\001e\002", "\002ur\226\003e a\274",
"\002us\244\003\n\r\n\247", "\002ut\304\003e c\373", "\002we\221", "", "",
"\002wh\302", "\001f,", "", "", "", "\003d t\206", "", "", "\003th \343",
"\001g;", "", "", "\001\r9\003e s\265", "\003e t\234", "", "\003to Y",
"\003e\r\n\236", "\002d \036\001h\022", "", "\001,Q", "\002 a\031", "\002 b^",
"\002\r\n\025\002 cI", "\002 d\245", "\002 e\253", "\002 fh\001i\b\002e \v",
"", "\002 hU\001-\314", "\002 i8", "", "", "\002 l\315", "\002 m{",
"\002f :\002 n\354", "\002 o\035", "\002 p}\001.n\003\r\n\r\250", "",
"\002 r\275", "\002 s>", "\002 t\016", "", "\002g \235\005which+\003whi\367",
"\002 w5", "\001/\305", "\003as \214", "\003at \207", "", "\003who\331", "",
"\001l\026\002h \212", "", "\002, $", "", "\004withV", "", "", "", "\001m-", "",
"", "\002ac\357", "\002ad\350", "\003TheH", "", "", "\004this\233\001n\t",
"", "\002. y", "", "\002alX\003e, \365", "\003tio\215\002be\\",
"\002an\032\003ver\347", "", "\004that0\003tha\313\001o\006", "\003was2",
"\002arO", "\002as.", "\002at'\003the\001\004they\200\005there\322\005theird",
"\002ce\210", "\004were]", "", "\002ch\231\002l \264\001p<", "", "",
"\003one\256", "", "\003he \023\002dej", "\003ter\270", "\002cou", "",
"\002by\177\002di\201\002eax", "", "\002ec\327", "\002edB", "\002ee\353", "",
"", "\001r\f\002n )", "", "", "", "\002el\262", "", "\003in i\002en3", "",
"\002o `\001s\n", "", "\002er\033", "\003is t\002es6", "", "\002ge\371",
"\004.com\375", "\002fo\334\003our\330", "\003ch \301\001t\003", "\002hab", "",
"\003men\374", "", "\002he\020", "", "", "\001u&", "\002hif", "",
"\003not\204\002ic\203", "\003ed @\002id\355", "", "", "\002ho\273",
"\002r K\001vm", "", "", "", "\003t t\257\002il\360", "\002im\342",
"\003en \317\002in\017", "\002io\220", "\002s \027\001wA", "", "\003er |",
"\003es ~\002is%", "\002it/", "", "\002iv\272", "",
"\002t #\ahttp://C\001x\372", "\002la\211", "\001<\341", "\003, a\224"]

In [None]:
sum(len(x) + 1 for x in smaz_dict)

In [None]:
241 * 3 + 141*2