# Mock Data for Compressed Matrices

In [3]:
import numpy as np
import constriction
from tqdm import tqdm

## Preparation

### Create Random Quantized Matrices

In [4]:
np.random.seed(202404151)
matrices = np.random.randn(100, 1024, 1024)

In [5]:
quantized_matrices = np.round(matrices * 8).astype(np.int8)
print(f'mins: {quantized_matrices.min(axis=(1, 2))}')
print(f'maxs: {quantized_matrices.max(axis=(1, 2))}')

mins: [-41 -38 -40 -38 -44 -38 -37 -41 -41 -37 -39 -40 -39 -37 -39 -40 -37 -41
 -40 -40 -39 -41 -38 -42 -39 -39 -42 -39 -41 -41 -38 -39 -40 -38 -38 -38
 -42 -41 -37 -44 -38 -38 -37 -39 -39 -36 -39 -42 -40 -40 -41 -38 -36 -41
 -40 -41 -42 -38 -38 -38 -42 -37 -39 -39 -38 -36 -39 -37 -38 -40 -43 -36
 -37 -37 -41 -39 -41 -37 -40 -42 -38 -37 -38 -37 -38 -42 -41 -42 -38 -42
 -40 -46 -39 -37 -42 -41 -37 -40 -38 -42]
maxs: [38 39 36 36 39 37 36 38 36 39 38 37 39 40 37 40 37 38 39 41 38 40 37 38
 36 36 39 39 36 39 37 37 37 38 39 39 41 41 37 36 37 38 35 37 37 40 42 36
 41 37 37 41 40 39 38 40 42 43 38 41 37 36 41 37 37 38 38 39 40 36 38 39
 39 40 39 38 39 37 38 40 39 43 43 44 36 42 44 39 43 39 37 37 38 43 40 38
 40 44 39 39]


### Create Entropy Models

In [6]:
values = []
counts = []
counts_12bit = []
entropies = []
cross_entropies_12bit = []
for quantized_matrix in tqdm(quantized_matrices):
    v, c = np.unique(quantized_matrix, return_counts=True)
    order = np.argsort(c)[::-1]
    v = v[order]
    c = c[order]
    values.append(v)
    counts.append(c)
    entropies.append(np.log2(quantized_matrix.size) - c @ np.log2(c) / quantized_matrix.size)
    c12bit = np.maximum(np.round(c / 256).astype(np.int32), 1)
    excess = sum(c12bit) - (1 << 12)
    assert excess >= 0 and excess <= len(c)
    if excess != 0:
        assert c12bit[excess - 1] > 1
    c12bit[:excess] -= 1
    counts_12bit.append(c12bit)
    cross_entropies_12bit.append(12 - c12bit @ np.log2(c12bit) / (1 << 12))
entropies = np.array(entropies)
cross_entropies_12bit = np.array(cross_entropies_12bit)
overheads = cross_entropies_12bit - entropies
print(f'Maximum absolute overhead: {overheads.max():.4f}')
print(f'Maximum relative overhead: {(overheads * 100 / entropies).max():.2f} %')
entropies, cross_entropies_12bit

100%|██████████| 100/100 [00:02<00:00, 37.27it/s]

Maximum absolute overhead: 0.0471
Maximum relative overhead: 0.93 %





(array([5.04873129, 5.04657062, 5.04677307, 5.04884595, 5.0473163 ,
        5.0486622 , 5.04661374, 5.04849787, 5.04695842, 5.04865761,
        5.04702978, 5.04732894, 5.04798469, 5.04782882, 5.04794853,
        5.04891135, 5.04778863, 5.0474892 , 5.04845118, 5.04870344,
        5.04692647, 5.0466256 , 5.04761184, 5.04772784, 5.04902663,
        5.04909297, 5.0483021 , 5.0477679 , 5.04680325, 5.0470743 ,
        5.04774866, 5.04810709, 5.04799223, 5.04751116, 5.04805406,
        5.04894296, 5.04914948, 5.04813692, 5.04773768, 5.04766498,
        5.04820142, 5.04674641, 5.04612653, 5.04954575, 5.04798425,
        5.04911074, 5.04806961, 5.04883531, 5.04730741, 5.04820349,
        5.0475805 , 5.04728574, 5.0476444 , 5.04874619, 5.04859174,
        5.04892564, 5.04605025, 5.04941888, 5.04747965, 5.04812066,
        5.04752634, 5.04696448, 5.04841835, 5.0466188 , 5.04660691,
        5.04749418, 5.04658043, 5.04970472, 5.04751697, 5.0466669 ,
        5.0487524 , 5.04709681, 5.0476797 , 5.04

In [7]:
inv_vocabs = [{v: i for i, v in enumerate(vs)} for vs in values]

### Entropy Coder

In [8]:
class AnsCoder:
    def __init__(self, precision, word_size, compressed=[]):
        self.precision = precision
        self.word_size = word_size
        self.word_mask = (1 << word_size) - 1
        self.quantile_mask = (1 << precision) - 1
        self.bulk = compressed.copy()
        self.head = 0
        while len(self.bulk) != 0 and (self.head >> word_size) == 0:
            self.head = (self.head << word_size) | self.bulk.pop()

    def push(self, symbol, m):
        if (self.head >> (2 * self.word_size - self.precision)) >= m[symbol]:
            self.bulk.append(self.head & self.word_mask)
            self.head >>= self.word_size

        z = self.head % m[symbol] + sum(m[0:symbol])
        self.head //= m[symbol]
        self.head = (self.head << self.precision) | z

    def pop(self, m):
        z = self.head & self.quantile_mask
        self.head >>= self.precision
        for symbol, m_symbol in enumerate(m):
            if z >= m_symbol:
                z -= m_symbol
            else:
                break
        self.head = self.head * m_symbol + z
        if (self.head >> self.word_size) == 0 and len(self.bulk) != 0:
            self.head = (self.head << self.word_size) | self.bulk.pop()
        return symbol

    def get_compressed(self):
        compressed = self.bulk.copy()
        head = self.head
        while head != 0:
            compressed.append(head & self.word_mask)
            head >>= self.word_size
        return compressed

## Compression 1: independently compressed matrix entries, with non-interleaved matrices

In [None]:
def compress_entries(row, col):
    coder = AnsCoder(12, 16, [1, 1])
    for i in reversed(range(len(quantized_matrices))):
        entry = quantized_matrices[i, row, col]
        symbol_id = inv_vocabs[i][entry]
        coder.push(symbol_id, counts_12bit[i])
    return np.array(coder.get_compressed(), dtype=np.uint16)[::-1]

compressed_entries = [[
        compress_entries(row, col)
        for row in range(1024)
    ] for col in tqdm(range(1024))
]

  0%|          | 0/1024 [00:00<?, ?it/s]

100%|██████████| 1024/1024 [24:52<00:00,  1.46s/it]


In [None]:
lengths = np.array([[len(e) for e in es] for es in compressed_entries])
lengths.sum() * 16 / quantized_matrices.size, cross_entropies_12bit.mean()

(5.293472442626953, 5.087297199350456)

In [None]:
lengths.sum() / 1024

33878.2236328125

In [17]:
compressed_stream = []
coder_offsets = np.zeros((len(matrices), matrices[0].size), dtype=np.uint32)
coders = [AnsCoder(12, 16, [1, 1]) for _ in range(matrices[0].size)]
for i in tqdm(reversed(range(len(matrices)))):
    model = counts_12bit[i]
    inv_vocab = inv_vocabs[i]
    for j, entry, coder in zip(range(quantized_matrices[i].size - 1, -1, -1), reversed(quantized_matrices[i].ravel()), coders):
        symbol_id = inv_vocab[entry]
        coder.push(symbol_id, model)
        if len(coder.bulk) == 1:
            compressed_stream.append(coder.bulk[0])
            coder.bulk = []
        coder_offsets[i, j] = len(compressed_stream)
    print(f'Encoded matrix {i}; words emitted so far: {coder_offsets[i, -1]}')

1it [00:36, 36.04s/it]

Encoded matrix 99; words emitted so far: 0


2it [00:53, 25.03s/it]

Encoded matrix 98; words emitted so far: 0


3it [01:11, 21.76s/it]

Encoded matrix 97; words emitted so far: 6442


4it [01:29, 20.54s/it]

Encoded matrix 96; words emitted so far: 256628


5it [01:49, 20.37s/it]

Encoded matrix 95; words emitted so far: 1048779


6it [02:09, 20.24s/it]

Encoded matrix 94; words emitted so far: 1061698


7it [02:32, 20.99s/it]

Encoded matrix 93; words emitted so far: 1274851


8it [02:56, 21.83s/it]

Encoded matrix 92; words emitted so far: 2035984


9it [03:20, 22.66s/it]

Encoded matrix 91; words emitted so far: 2114211


10it [03:45, 23.42s/it]

Encoded matrix 90; words emitted so far: 2294396


11it [04:09, 23.38s/it]

Encoded matrix 89; words emitted so far: 2910784


12it [04:20, 19.81s/it]

Encoded matrix 88; words emitted so far: 3164586


13it [04:33, 17.83s/it]

Encoded matrix 87; words emitted so far: 3316701


14it [05:01, 20.80s/it]

Encoded matrix 86; words emitted so far: 3820261


15it [05:16, 19.14s/it]

Encoded matrix 85; words emitted so far: 4203805


16it [05:34, 18.64s/it]

Encoded matrix 84; words emitted so far: 4343773


17it [05:59, 20.62s/it]

Encoded matrix 83; words emitted so far: 4762724


18it [06:26, 22.37s/it]

Encoded matrix 82; words emitted so far: 5207027


19it [06:53, 23.83s/it]

Encoded matrix 81; words emitted so far: 5373774


20it [07:18, 24.27s/it]

Encoded matrix 80; words emitted so far: 5727253


21it [07:39, 23.31s/it]

Encoded matrix 79; words emitted so far: 6184446


22it [08:02, 23.23s/it]

Encoded matrix 78; words emitted so far: 6404355


23it [08:21, 21.84s/it]

Encoded matrix 77; words emitted so far: 6708662


24it [08:40, 21.15s/it]

Encoded matrix 76; words emitted so far: 7152625


25it [09:02, 21.30s/it]

Encoded matrix 75; words emitted so far: 7428623


26it [09:21, 20.66s/it]

Encoded matrix 74; words emitted so far: 7701463


27it [09:48, 22.49s/it]

Encoded matrix 73; words emitted so far: 8120841


28it [10:19, 25.05s/it]

Encoded matrix 72; words emitted so far: 8441861


29it [10:48, 26.18s/it]

Encoded matrix 71; words emitted so far: 8702378


30it [11:33, 31.84s/it]

Encoded matrix 70; words emitted so far: 9093002


31it [11:58, 29.80s/it]

Encoded matrix 69; words emitted so far: 9444019


32it [12:38, 32.80s/it]

Encoded matrix 68; words emitted so far: 9709118


33it [13:13, 33.50s/it]

Encoded matrix 67; words emitted so far: 10069516


34it [13:54, 35.91s/it]

Encoded matrix 66; words emitted so far: 10437339


35it [14:37, 37.87s/it]

Encoded matrix 65; words emitted so far: 10716957


36it [15:02, 33.95s/it]

Encoded matrix 64; words emitted so far: 11052762


37it [15:27, 31.30s/it]

Encoded matrix 63; words emitted so far: 11425707


38it [15:50, 28.89s/it]

Encoded matrix 62; words emitted so far: 11722859


39it [16:16, 27.92s/it]

Encoded matrix 61; words emitted so far: 12041996


40it [16:48, 29.34s/it]

Encoded matrix 60; words emitted so far: 12410758


41it [17:15, 28.47s/it]

Encoded matrix 59; words emitted so far: 12726112


42it [17:37, 26.63s/it]

Encoded matrix 58; words emitted so far: 13035503


43it [18:01, 25.82s/it]

Encoded matrix 57; words emitted so far: 13395641


44it [18:24, 25.05s/it]

Encoded matrix 56; words emitted so far: 13725241


45it [18:46, 24.07s/it]

Encoded matrix 55; words emitted so far: 14032251


46it [19:09, 23.63s/it]

Encoded matrix 54; words emitted so far: 14382221


47it [19:31, 23.27s/it]

Encoded matrix 53; words emitted so far: 14721415


48it [19:56, 23.66s/it]

Encoded matrix 52; words emitted so far: 15030574


49it [20:21, 24.29s/it]

Encoded matrix 51; words emitted so far: 15370722


50it [20:48, 24.92s/it]

Encoded matrix 50; words emitted so far: 15714206


51it [21:22, 27.62s/it]

Encoded matrix 49; words emitted so far: 16029057


52it [21:45, 26.31s/it]

Encoded matrix 48; words emitted so far: 16360831


53it [22:16, 27.62s/it]

Encoded matrix 47; words emitted so far: 16706084


54it [23:03, 33.60s/it]

Encoded matrix 46; words emitted so far: 17026550


55it [23:23, 29.47s/it]

Encoded matrix 45; words emitted so far: 17353269


56it [23:58, 31.06s/it]

Encoded matrix 44; words emitted so far: 17697070


57it [24:17, 27.60s/it]

Encoded matrix 43; words emitted so far: 18023450


58it [24:32, 23.72s/it]

Encoded matrix 42; words emitted so far: 18346813


59it [24:55, 23.42s/it]

Encoded matrix 41; words emitted so far: 18687525


60it [25:15, 22.51s/it]

Encoded matrix 40; words emitted so far: 19018870


61it [25:31, 20.49s/it]

Encoded matrix 39; words emitted so far: 19341558


62it [25:46, 18.79s/it]

Encoded matrix 38; words emitted so far: 19679064


63it [26:01, 17.68s/it]

Encoded matrix 37; words emitted so far: 20013156


64it [26:13, 15.99s/it]

Encoded matrix 36; words emitted so far: 20336871


65it [26:26, 15.08s/it]

Encoded matrix 35; words emitted so far: 20670225


66it [26:40, 14.69s/it]

Encoded matrix 34; words emitted so far: 21006190


67it [26:54, 14.51s/it]

Encoded matrix 33; words emitted so far: 21332326


68it [27:07, 14.17s/it]

Encoded matrix 32; words emitted so far: 21663189


69it [27:24, 14.98s/it]

Encoded matrix 31; words emitted so far: 21999555


70it [27:37, 14.52s/it]

Encoded matrix 30; words emitted so far: 22327112


71it [27:49, 13.76s/it]

Encoded matrix 29; words emitted so far: 22655948


72it [28:02, 13.50s/it]

Encoded matrix 28; words emitted so far: 22991946


73it [28:15, 13.22s/it]

Encoded matrix 27; words emitted so far: 23321389


74it [28:28, 13.09s/it]

Encoded matrix 26; words emitted so far: 23649218


75it [28:40, 12.86s/it]

Encoded matrix 25; words emitted so far: 23984877


76it [28:57, 14.00s/it]

Encoded matrix 24; words emitted so far: 24315655


77it [29:07, 13.05s/it]

Encoded matrix 23; words emitted so far: 24644003


78it [29:19, 12.49s/it]

Encoded matrix 22; words emitted so far: 24977315


79it [29:33, 12.96s/it]

Encoded matrix 21; words emitted so far: 25309429


80it [29:46, 13.17s/it]

Encoded matrix 20; words emitted so far: 25637461


81it [29:59, 13.17s/it]

Encoded matrix 19; words emitted so far: 25969725


82it [30:14, 13.61s/it]

Encoded matrix 18; words emitted so far: 26302481


83it [30:26, 13.18s/it]

Encoded matrix 17; words emitted so far: 26632286


84it [30:43, 14.36s/it]

Encoded matrix 16; words emitted so far: 26963759


85it [30:57, 14.28s/it]

Encoded matrix 15; words emitted so far: 27295998


86it [31:12, 14.27s/it]

Encoded matrix 14; words emitted so far: 27625989


87it [31:28, 14.77s/it]

Encoded matrix 13; words emitted so far: 27956976


88it [31:40, 14.17s/it]

Encoded matrix 12; words emitted so far: 28289041


89it [31:52, 13.36s/it]

Encoded matrix 11; words emitted so far: 28620245


90it [32:04, 13.04s/it]

Encoded matrix 10; words emitted so far: 28950100


91it [32:22, 14.56s/it]

Encoded matrix 9; words emitted so far: 29282314


92it [32:42, 16.06s/it]

Encoded matrix 8; words emitted so far: 29613850


93it [32:53, 14.64s/it]

Encoded matrix 7; words emitted so far: 29943586


94it [33:05, 13.79s/it]

Encoded matrix 6; words emitted so far: 30275836


95it [33:16, 13.02s/it]

Encoded matrix 5; words emitted so far: 30607618


96it [33:29, 13.06s/it]

Encoded matrix 4; words emitted so far: 30937587


97it [33:43, 13.35s/it]

Encoded matrix 3; words emitted so far: 31268963


98it [34:02, 14.79s/it]

Encoded matrix 2; words emitted so far: 31600868


99it [34:19, 15.52s/it]

Encoded matrix 1; words emitted so far: 31931008


100it [34:38, 20.79s/it]

Encoded matrix 0; words emitted so far: 32261991





In [18]:
compressed_stream_copy = compressed_stream.copy()

In [25]:
for coder in coders:
    assert len(coder.bulk) == 0
    compressed = coder.get_compressed()
    assert len(compressed) == 2
    compressed_stream += compressed
compressed_stream = np.array(compressed_stream, dtype=np.uint16)

In [43]:
bitrate = len(compressed_stream) * 16 / quantized_matrices.size
len(compressed_stream), bitrate, bitrate - cross_entropies_12bit.mean()

(34691301, 5.293472442626953, 0.2061752432764976)

In [31]:
import pickle

In [45]:
with open("100-compressed-matrices.pickle", "wb") as f:
    pickle.dump({"compressed_stream": compressed_stream, "coder_offsets": coder_offsets}, f)

In [46]:
!ls -lh *.pickle

-rw-rw-r-- 1 robamler robamler 467M Apr 21 22:45 100-compressed-matrices.pickle


In [None]:
test = np.array([i % 100 for i in range(1024**2)]).reshape(1024, 1024)
rowsums = (test.sum(axis=1) % 10000)
sumsums = np.array([rowsums[:i+1].sum() for i in range(1024)])
rowsums, sumsums, sumsums[31::32]

(array([9776,  352,  928, ..., 9872,  448, 1024]),
 array([   9776,   10128,   11056, ..., 1932128, 1932576, 1933600]),
 array([  60928,  112880,  175856,  247056,  298880,  351728,  423200,
         484896,  537616,  599360,  670928,  723520,  775536,  846976,
         909440,  961728, 1023040, 1095376, 1147936, 1199120, 1271328,
        1334160, 1385216, 1447296, 1510400, 1571328, 1623280, 1686256,
        1757456, 1809280, 1862128, 1933600]))

In [None]:
(8*4) % 32

0

In [None]:
[(i, sum(range(8 * i - (8 * i) % 32, 8 * (i + 1)))) for i in range(10)]

[(0, 28),
 (1, 120),
 (2, 276),
 (3, 496),
 (4, 284),
 (5, 632),
 (6, 1044),
 (7, 1520),
 (8, 540),
 (9, 1144)]

In [None]:
[(i, sum(range(32 * (i + 1)))) for i in range(10)]

[(0, 496),
 (1, 2016),
 (2, 4560),
 (3, 8128),
 (4, 12720),
 (5, 18336),
 (6, 24976),
 (7, 32640),
 (8, 41328),
 (9, 51040)]

In [None]:
freqs = np.maximum(np.round(counts / 256).astype(np.int64), 1)
too_much = sum(freqs) - 2**12
print(too_much, 1024**2 - sum(counts))
freqs[:too_much] -= 1 # very hacky but good enough for now
print(2**12 - sum(freqs), 1024**2 - sum(counts))
freqs

19 0
0 0


array([203, 202, 201, 197, 196, 190, 189, 180, 179, 168, 167, 153, 151,
       139, 139, 123, 123, 108, 106,  94,  92,  80,  79,  67,  66,  55,
        54,  45,  44,  36,  35,  28,  28,  22,  21,  17,  16,  12,  12,
         9,   9,   6,   6,   5,   5,   3,   3,   2,   2,   2,   2,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1])

In [None]:
cross_entropy12bit = 12 - counts @ np.log2(freqs) / (1024**2)
cross_entropy12bit, entropy20bit

(5.054123235441871, 5.048731287507302)

In [None]:
cdf = np.add.accumulate(np.concatenate([np.array([0]), freqs]))
values_pad = np.concatenate([values, np.array([0])])
model_specification = np.stack((cdf, values_pad + 2**15), axis=1).astype(np.uint16).ravel()[:-1]
model_specification, len(model_specification), 2 * len(freqs), 1024**2

(array([    0, 32768,   203, 32767,   405, 32769,   606, 32766,   803,
        32770,   999, 32771,  1189, 32765,  1378, 32772,  1558, 32764,
         1737, 32773,  1905, 32763,  2072, 32774,  2225, 32762,  2376,
        32761,  2515, 32775,  2654, 32776,  2777, 32760,  2900, 32759,
         3008, 32777,  3114, 32778,  3208, 32758,  3300, 32779,  3380,
        32757,  3459, 32756,  3526, 32780,  3592, 32781,  3647, 32755,
         3701, 32754,  3746, 32782,  3790, 32753,  3826, 32783,  3861,
        32752,  3889, 32784,  3917, 32751,  3939, 32785,  3960, 32786,
         3977, 32750,  3993, 32749,  4005, 32787,  4017, 32788,  4026,
        32748,  4035, 32747,  4041, 32789,  4047, 32790,  4052, 32746,
         4057, 32745,  4060, 32791,  4063, 32792,  4065, 32744,  4067,
        32793,  4069, 32743,  4071, 32742,  4072, 32794,  4073, 32741,
         4074, 32795,  4075, 32796,  4076, 32740,  4077, 32797,  4078,
        32739,  4079, 32798,  4080, 32737,  4081, 32738,  4082, 32799,
      

In [None]:
inv_vocab = {value: i for i, value in enumerate(values)}

In [None]:
distribution = constriction.stream.model.Categorical(freqs.astype(np.float64))
# fake_next_data = np.array([1, 1], dtype=np.uint32) # pretend that there's more data to come (for the next matrix) so that decoding doesn't interfere with next row
def compress_row(row):
    row = np.array([inv_vocab[value] for value in row], dtype=np.int32)
    # coder = constriction.stream.stack.AnsCoder(fake_next_data)
    coder = AnsCoder(12, 16, [1, 1])
    for elem in row[::-1]:
        coder.push(elem, freqs)
    # coder.encode_reverse(row.astype(np.int32), distribution)
    return np.array(coder.get_compressed(), dtype=np.uint16)[::-1]
    # return coder.get_compressed()

In [None]:
compressed = [compress_row(row) for row in quantized_matrix]

In [None]:
len(compressed[0]) * 16 / 1024, cross_entropy12bit

(5.109375, 5.054123235441871)

In [None]:
offsets = np.zeros((2 * 1024,), dtype=np.uint16)
pos = len(model_specification) + len(offsets)
mask = (1 << 16) - 1
for i in range(1024):
    # little endian encoding
    offsets[2 * i] = pos & mask
    offsets[2 * i + 1] = pos >> 16
    pos += len(compressed[i])
offsets, len(offsets)

(array([2201,    0, 2528, ...,    5, 6956,    5], dtype=uint16), 2048)

In [None]:
serialized = np.concatenate([offsets, model_specification.astype(np.uint16)] + compressed)

In [None]:
len(offsets) / 256

8.0

In [None]:
[serialized[i] for i in offsets]

[20827,
 2201,
 273,
 2201,
 3819,
 2201,
 1188,
 2201,
 2,
 2201,
 1935,
 2201,
 11,
 2201,
 4584,
 2201,
 46,
 2201,
 192,
 2201,
 11852,
 2201,
 88,
 2201,
 2,
 2201,
 6161,
 2201,
 1,
 2201,
 61,
 2201,
 129,
 2201,
 83,
 2201,
 77,
 2201,
 21074,
 2201,
 145,
 2201,
 1504,
 2201,
 990,
 2201,
 48,
 2201,
 15750,
 2201,
 2074,
 2201,
 5,
 2201,
 116,
 2201,
 32,
 2201,
 4109,
 2201,
 630,
 2201,
 260,
 2201,
 6,
 2201,
 31833,
 2201,
 684,
 2201,
 34983,
 2201,
 3,
 2201,
 12,
 2201,
 1052,
 2201,
 12148,
 2201,
 24,
 2201,
 714,
 2201,
 46,
 2201,
 2,
 2201,
 42,
 2201,
 10,
 2201,
 8,
 2201,
 16,
 2201,
 1,
 2201,
 749,
 2201,
 20962,
 2201,
 19080,
 2201,
 4698,
 2201,
 51,
 2201,
 1807,
 2201,
 54159,
 2201,
 18108,
 2201,
 9,
 2201,
 3024,
 2201,
 1044,
 2201,
 74,
 2201,
 333,
 2201,
 250,
 2201,
 378,
 2201,
 20,
 2201,
 1417,
 2201,
 30,
 2201,
 25,
 2201,
 185,
 2201,
 1311,
 2201,
 3512,
 2201,
 1897,
 2201,
 4,
 2201,
 34,
 2201,
 13,
 2201,
 1777,
 2201,
 10330,
 2201,


In [None]:
model_specification

array([    0, 32768,   203, 32767,   405, 32769,   606, 32766,   803,
       32770,   999, 32771,  1189, 32765,  1378, 32772,  1558, 32764,
        1737, 32773,  1905, 32763,  2072, 32774,  2225, 32762,  2376,
       32761,  2515, 32775,  2654, 32776,  2777, 32760,  2900, 32759,
        3008, 32777,  3114, 32778,  3208, 32758,  3300, 32779,  3380,
       32757,  3459, 32756,  3526, 32780,  3592, 32781,  3647, 32755,
        3701, 32754,  3746, 32782,  3790, 32753,  3826, 32783,  3861,
       32752,  3889, 32784,  3917, 32751,  3939, 32785,  3960, 32786,
        3977, 32750,  3993, 32749,  4005, 32787,  4017, 32788,  4026,
       32748,  4035, 32747,  4041, 32789,  4047, 32790,  4052, 32746,
        4057, 32745,  4060, 32791,  4063, 32792,  4065, 32744,  4067,
       32793,  4069, 32743,  4071, 32742,  4072, 32794,  4073, 32741,
        4074, 32795,  4075, 32796,  4076, 32740,  4077, 32797,  4078,
       32739,  4079, 32798,  4080, 32737,  4081, 32738,  4082, 32799,
        4083, 32800,

In [None]:
# pad to a multiple of 1024 entries for now so it's easier to read in
pad_len = (1024 - len(serialized) % 1024) % 1024
serialized = np.concatenate([serialized, np.zeros((pad_len,), dtype=np.uint16)])

In [None]:
serialized.tofile('mock-matrix.bin')

In [None]:
serialized.shape, len(offsets), len(model_specification)

((335872,), 2048, 153)

In [None]:
!ls -l mock-matrix.bin

-rw-rw-r-- 1 robamler robamler 671744 Apr 21 15:32 mock-matrix.bin


In [None]:
670720 / 2 / 512

655.0

In [None]:
word1 = serialized[offsets[0]]
word2 = serialized[offsets[0] + 1]
quantile = (word2 % (2**24)) >> 8
print(f'quantile = {quantile}')
symbol_id = (cdf <= quantile).sum() - 1
cdf[symbol_id-1:symbol_id+2], symbol_id, values[symbol_id]

quantile = 194


(array([], dtype=int64), 0, 0)

In [None]:
quantized_matrix[0, 0]

-2

In [None]:
len(offsets) / 512

4.0

In [None]:
offsets[1::2][200:300]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=uint16)

In [None]:
offsets[::2][1000:]

array([65010, 65335,   127,   452,   774,  1098,  1421,  1748,  2076,
        2402,  2729,  3055,  3379,  3704,  4032,  4359,  4686,  5012,
        5337,  5660,  5983,  6307,  6629,  6956], dtype=uint16)

In [None]:
original_offsets = np.array([offsets[2*i] + (offsets[2*i+1] << 16) for i in range(1024)])
original_offsets

array([  2201,   2528,   2850, ..., 333987, 334309, 334636])

In [None]:
np.array([serialized[i+1] for i in original_offsets][:100])

array([49878, 31450, 43115, 33592, 18477, 37607,   617, 20837, 18140,
         777, 29705, 32476, 54692, 61970,  8413, 56750, 41112, 35941,
       59411, 29014, 51298, 34952, 29823, 27221,  6038, 16029, 56187,
       28383, 34615, 12035, 36892, 45747, 61146,  4349,  8142, 14792,
       30269, 15191,  1630, 44158, 46566, 16718, 63925, 63123,  9758,
       61283, 36444, 33762,  7186, 22353, 13345, 54457, 52447, 32507,
       12906, 25571, 11489, 24612, 18984, 19395, 59093, 14584, 50986,
         821, 19552, 42200, 50218, 54110, 10002, 41743, 33211, 47796,
       49983, 49960, 51074, 26130, 12662, 13385, 61763, 46585, 12404,
        1704, 11640, 64241, 13902, 53453, 33513, 15027, 65340,  6777,
       47872, 53306,  1701, 30464, 25630, 34004, 14576, 61048,   832,
       54095], dtype=uint16)

In [None]:
coder = AnsCoder(12, 16, list(serialized[original_offsets[0]:][::-1]))

In [None]:
np.all(serialized[original_offsets[0]:original_offsets[1]] == compressed[0])

True

In [None]:
gt_first_symbols = np.array([inv_vocab[quantized_matrix[i, 0]] for i in range(1024)])
gt_first_symbols

array([ 3, 16, 11, ..., 17, 12,  2])

In [None]:
js_first_symbols = np.array([3,16,11,4,11,3,3,1,9,3,5,29,7,2,1,23,0,19,10,1,11,11,5,14,10,27,17,29,9,30,0,3,29,1,42,13,8,17,8,19,7,1,13,8,8,34,26,4,18,9,5,6,20,30,3,4,20,0,14,18,9,12,9,4,19,6,5,4,9,3,2,15,4,4,10,7,1,5,1,7,0,8,22,16,8,1,3,15,32,15,16,0,8,9,5,6,12,27,4,4,2,15,16,11,26,6,15,2,25,15,13,8,0,21,9,17,9,22,39,3,31,20,14,25,5,15,3,30,23,23,7,18,6,26,12,3,13,5,21,35,27,2,20,11,16,8,8,43,6,6,3,12,14,33,11,14,19,41,9,8,18,12,6,9,31,0,23,30,5,0,27,1,14,1,11,11,6,22,3,1,14,40,22,5,23,18,7,7,43,2,0,9,23,40,16,44,6,2,16,12,25,4,3,11,25,28,13,14,26,15,9,12,0,23,3,22,5,10,18,13,7,26,5,9,5,5,26,8,10,4,23,5,7,26,1,11,9,9,15,21,22,26,8,10,21,16,0,9,3,2,5,29,5,21,14,3,0,7,5,2,4,2,10,13,6,16,19,28,8,33,9,14,25,2,34,8,18,11,11,9,4,17,7,12,13,18,2,2,9,13,4,10,15,16,19,20,2,0,17,3,13,7,6,18,7,8,7,16,23,0,4,9,6,3,5,10,3,11,10,11,19,5,14,19,7,29,1,2,10,2,8,18,28,3,1,7,5,20,16,14,11,2,1,11,39,16,38,34,1,9,12,18,8,7,13,19,20,9,18,7,26,2,9,1,2,30,11,11,4,29,1,6,6,26,6,22,43,9,27,7,3,7,15,30,5,16,18,11,8,33,9,6,14,0,5,14,25,9,8,16,13,21,17,38,0,22,34,4,20,14,20,5,32,14,0,8,34,12,27,5,14,4,13,35,12,25,6,5,6,12,9,3,15,19,16,10,16,6,21,16,15,22,13,2,14,30,0,12,11,4,0,3,13,17,30,29,24,3,9,12,19,2,15,17,3,26,3,10,0,12,26,24,15,10,5,14,9,27,1,20,1,4,28,1,17,2,21,1,7,5,16,3,32,4,6,5,0,27,1,29,6,21,18,0,17,2,19,14,9,9,20,8,8,1,22,20,9,1,26,4,14,9,0,38,7,9,13,0,6,0,8,14,0,15,22,0,10,0,19,11,30,21,5,9,0,3,0,26,55,13,26,13,5,5,3,21,4,0,4,3,27,4,13,9,28,7,23,10,14,6,7,23,1,4,15,21,20,8,16,25,22,4,13,2,20,18,21,0,22,5,10,0,7,11,19,3,6,14,17,6,6,1,23,6,18,21,6,11,17,20,9,47,6,20,2,3,17,4,22,9,31,9,1,11,10,4,21,7,0,5,11,9,5,6,3,9,2,18,21,7,13,11,15,13,34,15,6,0,24,19,41,33,32,10,16,5,12,3,12,1,9,4,9,7,14,4,4,22,8,41,14,0,5,8,19,14,4,10,0,29,21,22,23,18,25,1,17,6,11,15,16,10,7,22,13,3,4,2,14,13,16,5,21,18,16,10,0,12,0,5,29,11,9,11,9,21,13,27,23,21,9,7,1,18,10,10,4,19,4,19,18,42,11,18,12,14,32,1,1,1,19,13,6,3,0,12,14,23,39,4,0,6,25,24,10,24,1,10,17,17,7,8,5,10,0,23,4,14,12,26,14,13,4,1,6,23,5,20,0,2,6,9,7,6,1,7,0,5,23,17,12,25,1,7,4,6,0,10,21,18,6,9,16,17,9,0,3,17,9,5,9,6,4,6,21,6,14,6,0,11,4,9,2,7,1,6,24,17,8,24,27,21,21,9,9,3,0,6,0,31,21,21,7,11,7,15,21,19,10,7,3,15,4,7,7,4,7,13,29,3,17,4,17,14,10,19,29,7,5,5,3,1,3,20,1,18,22,7,8,18,0,3,4,17,0,11,7,30,11,13,11,12,27,9,26,2,16,28,12,12,16,24,21,6,6,3,10,5,2,6,12,8,0,13,4,2,17,23,31,5,20,14,23,6,29,1,23,4,25,30,9,4,10,8,24,0,24,1,21,6,10,18,2,11,22,12,4,1,19,12,10,1,5,3,17,5,10,2,2,8,3,24,29,24,2,18,0,3,0,14,1,2,34,11,11,18,16,4,4,10,5,19,35,3,9,25,7,7,20,0,10,22,34,10,10,5,22,5,6,11,11,25,14,11,0,9,11,2,20,4,17,9,8,6,0,0,4,2,25,5,7,17,12,2])
np.all(gt_first_symbols == js_first_symbols)

True

In [None]:
quantized_matrix[:, 0] % 2**16

array([65534, 65528,     6, ..., 65527, 65530,     1], dtype=int32)

In [None]:
quantile = [freqs[i] for i in gt_first_symbols][-10:]
quantile

[203, 203, 196, 201, 55, 190, 180, 108, 151, 201]

In [None]:
cdf

array([   0,  203,  405,  606,  803,  999, 1189, 1378, 1558, 1737, 1905,
       2072, 2225, 2376, 2515, 2654, 2777, 2900, 3008, 3114, 3208, 3300,
       3380, 3459, 3526, 3592, 3647, 3701, 3746, 3790, 3826, 3861, 3889,
       3917, 3939, 3960, 3977, 3993, 4005, 4017, 4026, 4035, 4041, 4047,
       4052, 4057, 4060, 4063, 4065, 4067, 4069, 4071, 4072, 4073, 4074,
       4075, 4076, 4077, 4078, 4079, 4080, 4081, 4082, 4083, 4084, 4085,
       4086, 4087, 4088, 4089, 4090, 4091, 4092, 4093, 4094, 4095, 4096])

In [None]:
len(cdf)

77

In [None]:
np.array([inv_vocab[quantized_matrix[i, 0]] for i in range(1024)])

array([ 3, 16, 11, ..., 17, 12,  2])

In [None]:
serialized[2*1024 + 153]

20827

In [None]:
len(model_specification)

153

In [None]:
compressed[0][:10]

array([20827, 49878, 32399, 30639, 50865, 53134,  1796, 27504,  2233,
       14518], dtype=uint16)

In [None]:
[AnsCoder(12, 16, list(compressed[i][::-1])).head % (1<<12) for i in range(1024)]
# coder.pop(freqs)
# coder.head

[726,
 2778,
 2155,
 824,
 2093,
 743,
 617,
 357,
 1756,
 777,
 1033,
 3804,
 1444,
 530,
 221,
 3502,
 152,
 3173,
 2067,
 342,
 2146,
 2184,
 1151,
 2645,
 1942,
 3741,
 2939,
 3807,
 1847,
 3843,
 28,
 691,
 3802,
 253,
 4046,
 2504,
 1597,
 2903,
 1630,
 3198,
 1510,
 334,
 2485,
 1683,
 1566,
 3939,
 3676,
 994,
 3090,
 1873,
 1057,
 1209,
 3295,
 3835,
 618,
 995,
 3297,
 36,
 2600,
 3011,
 1749,
 2296,
 1834,
 821,
 3168,
 1240,
 1066,
 862,
 1810,
 783,
 443,
 2740,
 831,
 808,
 1922,
 1554,
 374,
 1097,
 323,
 1529,
 116,
 1704,
 3448,
 2801,
 1614,
 205,
 745,
 2739,
 3900,
 2681,
 2816,
 58,
 1701,
 1792,
 1054,
 1236,
 2288,
 3704,
 832,
 847,
 598,
 2682,
 2875,
 2169,
 3650,
 1280,
 2683,
 491,
 3596,
 2682,
 2451,
 1730,
 197,
 3314,
 1754,
 2981,
 1781,
 3455,
 4025,
 682,
 3861,
 3239,
 2642,
 3611,
 1156,
 2772,
 649,
 3847,
 3470,
 3487,
 1460,
 3033,
 1240,
 3690,
 2236,
 747,
 2426,
 1176,
 3343,
 3971,
 3729,
 545,
 3222,
 2195,
 2895,
 1729,
 1703,
 4049,
 1349,

In [None]:
gt_first_symbols

array([ 3, 16, 11, ..., 17, 12,  2])

In [None]:
quantized_matrix[:, 1023] % (2**16)

array([   11,    17,     5, ...,     9, 65532, 65535], dtype=int32)

In [None]:
1364968150 % (2**12)

726

In [None]:
(1364968150 >> 12) * 197 + 726

65649794

In [None]:
gt_checksum = (quantized_matrix.astype(np.uint16).astype(np.uint32)**2).sum(axis=1).astype(np.uint32)
gt_checksum

array([3836677573, 3860787223, 3898272600, ..., 3862884404, 3859483710,
       3872849774], dtype=uint32)

In [None]:
js_checksum = np.array([3836677573,3860787223,3898272600,3873900219,3881497336,3860791599,3890934752,3867872476,3871011871,3876916429,3879795958,3862888370,3852537666,3880056720,3850832077,3869835243,3888186726,3869967731,3869181512,3861313528,3863152026,3864853711,3891334524,3885565165,3874417693,3844279456,3851098689,3886350394,3855807536,3873635431,3871404419,3910730173,3905744531,3899586419,3877307966,3869310218,3871149755,3872454990,3898014554,3869442911,3873502470,3872851945,3884643441,3880319127,3873374112,3887660864,3910461944,3874290061,3865640414,3861706414,3852796673,3860531120,3862229372,3839820983,3887396842,3872062835,3864722643,3891850695,3853323402,3881763097,3874945733,3860529602,3853585312,3876907586,3827896291,3862366182,3875206230,3828553118,3825929177,3855414781,3891194734,3827499310,3850961846,3841395294,3889498758,3875469143,3862097146,3874817122,3858303261,3855027793,3860132429,3871278978,3869180893,3882810193,3865382927,3901817929,3862759527,3860919552,3888842618,3849387670,3856208410,3871543068,3868393137,3859480626,3852928497,3909021093,3896047866,3866821293,3901681269,3875215428,3857772391,3895655704,3876777800,3851224066,3872717329,3859876191,3863286372,3879931745,3887921281,3861318370,3865901957,3854497040,3846247731,3882417393,3856600893,3855545346,3866690753,3856075480,3861055298,3847297236,3902342580,3897884783,3881369132,3877967171,3884249978,3876125341,3890672865,3887924819,3896966109,3854367537,3881891745,3839948851,3847818883,3872588263,3862234913,3871802234,3870355430,3864070755,3876648933,3857651050,3857511460,3860925807,3820293577,3832612136,3864462061,3851093444,3881368678,3874028485,3854498080,3882550277,3869966301,3842179136,3859613473,3865510448,3844803480,3884516022,3893036281,3847686787,3847034090,3884256082,3857908441,3837592536,3863023895,3881231379,3855547095,3853062293,3869706187,3834191279,3874292941,3907449319,3883856709,3889889898,3858303287,3888705129,3864854260,3864066332,3869708465,3856725533,3856597886,3847821229,3889627485,3848471214,3862889651,3869576371,3887529657,3859608992,3865637958,3834450136,3822260889,3886741496,3850826565,3844540018,3818856465,3868783089,3853454235,3845324687,3872975546,3915047084,3847163882,3882290954,3907579714,3883725049,3861314066,3895126553,3890936718,3869837904,3864069495,3874420087,3892375312,3857777521,3875729247,3892637259,3854371052,3860006256,3819899183,3863547693,3847949822,3865643507,3898800503,3826056598,3842972926,3890933471,3875474857,3885039229,3858694247,3856467323,3836417865,3895921648,3872846780,3883991417,3867865483,3841918445,3875468576,3831690859,3846636026,3854364841,3875338301,3865772445,3862361887,3856467154,3885563788,3836548546,3879531162,3864854082,3894218417,3881630846,3870750666,3870751914,3906011253,3877173268,3864853746,3902078093,3854366765,3858037717,3845328441,3875734605,3846372942,3864986834,3862494058,3861313246,3894212956,3879270390,3869046819,3873108678,3881240492,3858693550,3877439167,3856470545,3886481602,3859088860,3836546915,3861840163,3861577427,3840086334,3836547630,3869175953,3894211526,3846237753,3843750366,3892115780,3885694972,3845068627,3866684923,3883859018,3862104714,3842838697,3870625176,3867086897,3887791715,3855544487,3860133616,3862100641,3873638794,3903130123,3867873431,3878742732,3856463862,3827892787,3892117970,3873505496,3856862592,3861317437,3853976030,3855681376,3919246714,3885038941,3889496422,3867998042,3831960722,3838256574,3843883643,3862760322,3861184747,3854889354,3839820140,3845454361,3845065099,3865773744,3876783307,3874157269,3888443565,3848467615,3866164633,3855025531,3876519378,3850046041,3871147010,3843365362,3893553575,3868657842,3860402359,3894738619,3846240438,3829604703,3848599278,3854107745,3856599343,3898014423,3892511010,3872715870,3876522180,3879795963,3839692977,3864724153,3864598433,3850305368,3863806703,3877301649,3879536000,3848342438,3823834245,3898796651,3881629188,3858694459,3866949738,3814393762,3889102587,3866032782,3858690732,3854374023,3858433521,3876129580,3872849022,3910857872,3833394291,3853323993,3872979672,3859218352,3837465163,3838904539,3863412037,3914394162,3881239317,3850176894,3855552873,3826846797,3855685657,3861052666,3864333231,3824094448,3877174103,3862760468,3861841851,3856594816,3870885685,3855287047,3879799246,3860659376,3867865758,3864202589,3853060329,3867609489,3885564977,3869575883,3875205031,3857123818,3873900164,3847557035,3861315001,3834188888,3872849414,3879403532,3868525804,3841131598,3897750101,3852924600,3865900970,3876783193,3878222589,3889100439,3875863772,3898802086,3884389134,3857258527,3827371308,3848082224,3884646954,3814787895,3902859445,3879536198,3855549943,3867611452,3854107434,3865380654,3864199739,3889361424,3881501018,3847812351,3860924298,3897226546,3880581362,3875080437,3842702914,3865772429,3863939079,3828682853,3880450331,3909679747,3898275156,3845983003,3881627193,3889892384,3869702159,3864857878,3885039979,3851873793,3891324681,3879533998,3851091195,3871409021,3863937417,3820946964,3892637558,3868390469,3853322075,3867083510,3867083409,3868131166,3829731646,3859481219,3851879495,3885038989,3874550859,3852667776,3871013231,3879267673,3849913291,3858035180,3871143885,3892900973,3875997240,3862234040,3851745269,3870885755,3867733257,3884119695,3856336898,3900767250,3830129258,3860531511,3867603078,3867475306,3886872360,3858827624,3858960318,3868919639,3890282426,3869966374,3893693869,3846638927,3901552497,3868914839,3862494133,3891460966,3923574750,3887265238,3871408055,3831436749,3811770342,3875076719,3873111023,3838115987,3859741978,3887794052,3864460140,3857644318,3878090044,3862363423,3851483291,3883596463,3863284913,3870491736,3863544145,3837200442,3874685425,3861317129,3833397542,3827107264,3843619557,3880718364,3861711409,3855411834,3857118576,3847949409,3859089989,3882417054,3844540076,3844411676,3846505479,3858163814,3865906301,3867476451,3860270254,3873373045,3878876679,3873237087,3898145747,3887791755,3849522310,3883863746,3836546820,3864727532,3867209515,3906142876,3923176387,3831824047,3828682881,3828682537,3855025801,3808498484,3879794237,3874026739,3841920679,3839559177,3850566920,3866692625,3875205015,3871801312,3883861606,3847157504,3896444310,3904824505,3832614614,3864588418,3850697870,3896047350,3878226378,3864987111,3851093678,3880319951,3860003988,3857123004,3843229751,3875076809,3873902740,3879144350,3848998076,3827629542,3882940529,3857121313,3860922596,3848339612,3888315971,3890276913,3889230659,3893947893,3839692141,3874554877,3835886437,3880582743,3842441103,3816753186,3888187868,3856987511,3858436689,3879532924,3867475152,3872720550,3889624339,3864195793,3869570832,3911382347,3841523246,3850567178,3854629918,3839954073,3852927962,3897488263,3854893428,3874551514,3850178322,3866949796,3902206990,3848734401,3859083680,3920949634,3882419148,3887922768,3850172841,3863938048,3866298260,3877175116,3890802197,3890150038,3843363244,3876126762,3885956924,3865116853,3867865643,3839952960,3857121943,3869443064,3846899206,3831826944,3872850490,3879663192,3864991086,3876915239,3896049687,3886349490,3852670174,3861455674,3873897736,3885434713,3838773595,3858557541,3878485920,3887528250,3883988158,3861185290,3870358224,3884513700,3868000657,3825277837,3907191434,3820551593,3844144814,3888970972,3884250861,3891984456,3842966190,3840483096,3888448007,3863413025,3872325621,3881500851,3863808061,3885696811,3857907911,3861972994,3870094941,3921082397,3870885430,3862887473,3896572825,3842836633,3853583584,3856204000,3873893900,3871928856,3847028856,3876389654,3883070868,3910073154,3826586427,3857515833,3868261754,3900638022,3863016274,3858169421,3876126098,3837070972,3896570329,3879401838,3878221627,3871536707,3855414543,3861447586,3859873656,3874947389,3889622390,3825793199,3903514012,3878356294,3870621301,3868792371,3874549101,3844671216,3906533324,3898536377,3871534495,3852009035,3844144443,3879141275,3889497083,3849388920,3837730577,3874034207,3868132837,3882415826,3879141213,3879924794,3832745074,3830645990,3917934388,3847553974,3875603489,3853716475,3859743270,3865906991,3852401217,3849263062,3864593086,3882544798,3873893211,3868264968,3883987984,3854764971,3848079835,3872586680,3892117690,3888968601,3874554570,3854232431,3859352350,3839038665,3882681255,3858826214,3857383519,3872583947,3861180690,3893423878,3880449122,3889623233,3877571405,3873374142,3860008059,3848599615,3880842421,3880582130,3868654249,3838774237,3865642820,3885433067,3870882583,3863804696,3875727182,3860266961,3869834799,3858170199,3866958289,3898668479,3855156315,3887792463,3845460973,3875732074,3855022256,3895264163,3877435737,3883856630,3865118639,3869310806,3847552536,3879666383,3866554684,3859741664,3885437426,3831432438,3855549389,3882678691,3879797554,3872062702,3834578258,3852270477,3890802289,3863543234,3851354632,3871410422,3865646905,3848602597,3851095399,3887917096,3880709362,3875867221,3889753868,3855939916,3854242710,3867084641,3841789286,3868785151,3882291277,3821865052,3897490297,3881371031,3862235536,3913739899,3901553073,3866556958,3894345855,3846244607,3840603024,3877702118,3885040257,3830643333,3833794081,3872455461,3855155963,3852924696,3838381431,3853060824,3852669156,3841527063,3891196279,3856074883,3862497055,3851747004,3879661044,3885561036,3837723912,3820690547,3870362616,3858033850,3888053585,3870357513,3900500488,3848601499,3829202578,3877567646,3858038807,3849781860,3902214095,3827635790,3868913864,3823437830,3855287889,3853453310,3908496871,3871013586,3869180557,3855412704,3868387852,3859350742,3872981025,3883071883,3835498309,3856337102,3848867099,3838905598,3853453125,3862489292,3881497367,3866298885,3904175069,3879535442,3875474008,3890147258,3878486138,3863144226,3865247519,3860663021,3855287536,3898670863,3862232756,3896707594,3858694842,3877433297,3903516744,3860659779,3881499886,3860531644,3895257915,3876391543,3861573895,3851878227,3896703976,3841262565,3925539032,3850830554,3860399497,3842445431,3863941185,3854893484,3855157814,3874815691,3885429388,3839295238,
     3883728688,3829600541,3848729309,3872061248,3887401540,3878221738,3864201169,3846637760,3875991174,3864333784,3862105863,3866554523,3848338001,3906140052,3871010925,3854761951,3887657887,3833530625,3837592644,3870622706,3852011686,3887794036,3833920156,3848472510,3880187906,3888579043,3884381020,3846635912,3857907527,3871149651,3855940956,3838120381,3879796915,3852408131,3865116814,3872064254,3911122642,3861836616,3900109573,3868129467,3856467204,3876912209,3879663232,3839037558,3906794079,3869702312,3876254869,3884123803,3826583501,3825269717,3863415236,3874415559,3875995054,3866166219,3867609213,3879925851,3882418661,3862891176,3839558749,3884907974,3886866097,3848994665,3863541446,3853715501,3860136954,3857512432,3856338419,3881110269,3879272525,3844280702,3856863977,3851484021,3868002324,3879005781,3846764262,3825403891,3847031538,3817280271,3871543407,3849785512,3883725190,3874556727,3898010854,3875861392,3904700301,3889496947,3841787308,3842966726,3876127275,3840867973,3860398998,3842054780,3898139045,3855681501,3863937121,3866432085,3864592647,3895651905,3849386139,3865507865,3857123669,3829731253,3862497232,3868921492,3874030854,3845585629,3883729123,3855946131,3859483766,3839823035,3890939346,3888447765,3841521075,3848860450,3867342936,3862884404,3859483710,3883070957])

In [None]:
(3836677573 + quantized_matrix[0].astype(np.uint16).astype(np.uint32).sum() * 25) % (2**32)

380561252.0

In [None]:
3836677573

3836677573

In [None]:
np.sum(gt_checksum != js_checksum)

1

In [None]:
np.log2(3836677573)

31.837210382706186

In [None]:
quantized_matrix[:, -1][1000:]

array([ 11,  -8,  -2,   1,   4,  10,   9,  -1,  -1,  -5,   8, -14,  11,
         4,   3, -12,  -7,  -3,  12,  10, -11,   9,  -4,  -1], dtype=int8)

In [None]:
2**15

32768

In [None]:
quantized_matrix @ (quantized_matrix @ np.arange(1024) % 1024) % 1024

array([654, 125, 355, ..., 136, 676, 492])

In [None]:
np.array([[inv_vocab[x] for x in row] for row in quantized_matrix])

array([[ 3, 14, 27, ...,  5, 15, 21],
       [16,  7,  7, ..., 30,  6, 34],
       [11, 22,  0, ..., 31, 11,  9],
       ...,
       [17,  6, 12, ..., 19,  4, 18],
       [12, 12, 12, ...,  7,  6,  8],
       [ 2,  8, 11, ...,  7,  9,  1]])

In [None]:
quantized_matrix.sum(axis=1)

array([-393, -311,  172, ..., -224,   28,  106])