In [None]:
import sys
sys.path.append('MET_IBF')
from collections import Counter
from random import sample

import numpy as np
import pandas as pd
import plotly.express as px
px.defaults.height = 500

from MET_IBF import met
import partial_decoding as prdc

In [None]:
rng = np.random.default_rng()

## Arbitrary distribution

In [None]:
pool_size = 1000
log_u = 32
p = rng.beta(2, 5, size=pool_size)
# p = rng.uniform(size=pool_size)
load = 1.12
expected_set_size = p.sum()
expected_set_size

In [None]:
m = int(expected_set_size * load)
m

In [None]:
x = np.array(sorted(sample(range(2**log_u), len(p))), dtype=int)

In [None]:
pp = p/p.sum()
cs = np.cumsum(pp)

In [None]:
# Target share of the total (normalised) probability mass for each of the four
# key groups; compared against the achieved split a few cells below.
desired_p = np.array([0.046, 0.427, 0.398, 0.129])
# Degree matrix handed to met.METIBF.  Presumably one row per cell type and one
# column per key group (its 4 columns match the 4 entries of desired_p) --
# TODO confirm against MET_IBF.
deg = np.array([
    [ 6, 3, 1, 4,],
    [14, 0, 2, 6,],
])

# Split the cell budget `m` over the len(deg) rows, after subtracting whatever
# prdc.extra_cells reports for this matrix size.
m_cells = prdc.partition_m(m - prdc.extra_cells(deg.size, len(deg)), len(deg))

In [None]:
idx = np.searchsorted(cs, np.cumsum(desired_p), side='right')[:-1]
idx

In [None]:
# Verify that the resulting probability is close enough
# max distance between entries:
np.abs(np.array([sum(a) for a in np.split(p, idx)])/p.sum() - desired_p).max()

In [None]:
# Boundaries taken from the true inclusion probabilities: the pool keys located
# at the split indices `idx` computed from the cumulative probabilities.
x_boundries_by_p = x[idx]
# Now try to split the space without knowing the probabilities: cut the key
# universe [0, 2**log_u) proportionally to desired_p.  Only the inner cut
# points are kept, like for the two other variants, so a key can never be
# mapped past the last group even if desired_p does not sum to exactly 1.
x_boundries_by_u = np.cumsum(desired_p)[:-1] * 2**log_u
# Same, but based on the local set: cut the sorted pool at proportional ranks.
x_boundries_by_x = x[(np.cumsum(desired_p)[:-1] * len(x)).astype(int)]

def gen_key2type_by_bounds(bounds):
    """Build a key -> type-index function from sorted boundary keys.

    Type ``i`` covers the keys in ``[bounds[i-1], bounds[i])``: a key equal to
    a boundary opens the *next* type.  This is the convention of
    ``np.split(p, idx)``, where element ``idx[i]`` is the first member of
    group ``i + 1``; the default ``side='left'`` would put that boundary key
    into the lower group instead.

    Args:
        bounds: ascending 1-D sequence of boundary keys.

    Returns:
        A callable accepting a scalar key or an array of keys and returning
        the matching type index / indices in ``0 .. len(bounds)``.
    """
    def key2type(key):
        return np.searchsorted(bounds, key, side='right')

    return key2type

In [None]:
def simulate_once():
    """Run one trial on the arbitrary-distribution pool.

    Every pool key ``x[i]`` joins the set independently with probability
    ``p[i]``.  The set is inserted into each table under test, the table is
    peeled, and the number of recovered keys is recorded.

    Returns:
        dict with ``'size'`` (number of selected keys) followed by one entry
        per table name holding ``len(peel())``.
    """
    # Now choose elements by their probabilities
    members = x[rng.random(len(p)) < p]

    # MET tables differ only in how keys are mapped to types.
    bounds_by_name = (
        ('met_p', x_boundries_by_p),
        ('met_x', x_boundries_by_x),
        ('met_u', x_boundries_by_u),
    )
    tables = {
        name: met.METIBF(deg, m_cells, gen_key2type_by_bounds(bounds))
        for name, bounds in bounds_by_name
    }
    tables['ibf+0.0'] = met.IBF.create(m=m, k=3)
    # An 'ibf+8.5' baseline (m=int(m*1.085), k=3) is currently disabled.

    outcome = {'size': len(members)}
    for name, table in tables.items():
        table.insert_from(members)
        decoded = table.peel()
        # Every decoded key must be one that was actually inserted.
        assert all(key in members for key in decoded)
        outcome[name] = len(decoded)

    return outcome

In [None]:
%%time
n_sim = 500

df = pd.DataFrame.from_records(
    (simulate_once() for _ in range(n_sim)),
)

In [None]:
df.mean().sort_values(ascending=False)

In [None]:
df.median().sort_values(ascending=False)

In [None]:
# Per-table fraction of the inserted set that was recovered.
# Columns already ending in '_r' are skipped so that re-running this cell does
# not pile up '_r_r' columns (df is extended in place).
cols = pd.Index([col for col in df.columns[1:] if not col.endswith('_r')])
ratio_cols = cols + '_r'
for col, ratio_col in zip(cols, ratio_cols):
    df[ratio_col] = df[col]/df['size']
df[ratio_cols].mean().sort_values(ascending=False)

In [None]:
px.ecdf(df[ratio_cols], marginal='rug')

## Fixed distribution

In [None]:
# Number of candidate keys per probability level.
rep = 1000
# Keys are drawn from range(2**log_u).
log_u = 32
# Inclusion probability -> number of candidate keys having that probability.
dist = Counter({
    .05: rep,
    # .25: rep,
    # .50: rep,
    # .75: rep,
    .95: rep,
})
# Parallel arrays: p[i] is a probability level, c[i] its candidate count.
p = np.fromiter(dist.keys(), dtype=float)
c = np.fromiter(dist.values(), dtype=int)
# Multiplier applied to the expected set size to obtain the table size m.
load = 1.23

In [None]:
expected_set_size = p@c
expected_set_size

In [None]:
m = int(expected_set_size * load)
m

In [None]:
def met_config1(c, p, n, m, key2p):
    """Constructor arguments for met.METIBF, layout #1 (3 x 5 degree matrix).

    Args:
        c, n: accepted for a uniform signature; not used here.
        p: array of probability levels.
        m: total cell budget.
        key2p: mapping key -> probability level of that key.

    Returns:
        ``(deg, m_cells, key2type)`` where ``key2type(key)`` is the index of
        the first entry of ``p`` that is >= the key's probability.
    """
    deg = np.array([
        [ 5,  3,  1,  1,  0,],
        [ 5,  2,  2,  1,  1,],
        [ 5,  2,  1,  1,  1,],
    ])
    n_rows = len(deg)

    # Spend part of the budget on the extra cells, partition the remainder.
    usable = m - prdc.extra_cells(deg.size, n_rows)
    m_cells = prdc.partition_m(usable, n_rows)

    def key2type(x):
        prob = key2p[x]
        return np.argmax(p >= prob)

    return deg, m_cells, key2type

def met_config2(c, p, n, m, key2p):
    """Constructor arguments for met.METIBF, layout #2 (3 x 5 degree matrix).

    Args:
        c, n: accepted for a uniform signature; not used here.
        p: array of probability levels.
        m: total cell budget.
        key2p: mapping key -> probability level of that key.

    Returns:
        ``(deg, m_cells, key2type)`` where ``key2type(key)`` is the index of
        the first entry of ``p`` that is >= the key's probability.
    """
    deg = np.array([
        [ 5,  3,  1,  2,  0,],
        [ 9,  1,  2,  0,  1,],
        [ 1,  3,  1,  1,  1,],
    ])
    n_rows = len(deg)

    # Spend part of the budget on the extra cells, partition the remainder.
    usable = m - prdc.extra_cells(deg.size, n_rows)
    m_cells = prdc.partition_m(usable, n_rows)

    def key2type(x):
        prob = key2p[x]
        return np.argmax(p >= prob)

    return deg, m_cells, key2type

def met_config3(c, p, n, m, key2p):
    """Constructor arguments for met.METIBF, layout #3 (3 x 2 degree matrix).

    Unlike the other layouts the per-row cell counts are hard-coded, so the
    budget ``m`` is ignored.

    Args:
        c, n, m: accepted for a uniform signature; not used here.
        p: array of probability levels.
        key2p: mapping key -> probability level of that key.

    Returns:
        ``(deg, m_cells, key2type)`` where ``key2type(key)`` is the index of
        the first entry of ``p`` that is >= the key's probability.
    """
    deg = np.array([[5, 1], [1, 1], [5, 1]])
    m_cells = np.array([467, 399, 363])

    def key2type(x):
        prob = key2p[x]
        return np.argmax(p >= prob)

    return deg, m_cells, key2type


def construct_tables(c, p, n, m, key2p) -> dict[str, met.InvertibleBloomFilterAPI]:
    """Create the tables compared in one fixed-distribution trial.

    Returns a name -> table mapping holding the MET table built from
    ``met_config3`` and a plain 3-hash IBF with ``m`` cells, in that order.
    """
    tables = {}
    # The met_config1 / met_config2 layouts are currently disabled.
    tables['met3'] = met.METIBF(*met_config3(c, p, n, m, key2p))
    tables['ibf3'] = met.IBF.create(m, 3)
    # A 4-hash baseline (met.IBF.create(m, 4)) is disabled as well.
    return tables

def simulate_once(c, p, m):
    """Run one trial for the fixed-distribution experiment.

    NOTE: this redefines the zero-argument ``simulate_once`` from the
    arbitrary-distribution section of the notebook.

    Args:
        c: number of candidate keys per probability level.
        p: probability levels (parallel to ``c``).
        m: cell budget passed on to ``construct_tables``.

    Returns:
        dict with ``'size'`` (number of distinct inserted keys) followed by
        one entry per table name holding ``len(peel())``.
    """
    # How many keys of each level made it into the set.
    counts = rng.binomial(c, p)

    # Draw that many distinct keys and hand them out level by level.
    keys = sample(range(2**log_u), counts.sum())
    key2p = {}
    for prob, level_keys in zip(p, np.array_split(keys, counts.cumsum()[:-1])):
        key2p.update((key, prob) for key in level_keys)

    tables = construct_tables(c, p, counts, m, key2p)

    outcome = {'size': len(key2p)}
    for name, table in tables.items():
        table.insert_from(keys)
        decoded = table.peel()
        # Every decoded key must be one that was actually inserted.
        assert all(key in keys for key in decoded)
        outcome[name] = len(decoded)

    return outcome

## Simulation

In [None]:
%%time
n_sim = 500

df = pd.DataFrame.from_records(
    (simulate_once(c, p, m) for _ in range(n_sim)),
)

In [None]:
df.mean().sort_values(ascending=False)

In [None]:
df.median().sort_values(ascending=False)

In [None]:
# Per-table fraction of the inserted set that was recovered.
# Columns already ending in '_r' are skipped so that re-running this cell does
# not pile up '_r_r' columns (df is extended in place).
cols = pd.Index([col for col in df.columns[1:] if not col.endswith('_r')])
ratio_cols = cols + '_r'
for col, ratio_col in zip(cols, ratio_cols):
    df[ratio_col] = df[col]/df['size']
df[ratio_cols].mean().sort_values(ascending=False)

In [None]:
px.ecdf(df[ratio_cols], marginal='rug')

# Optimize deg given $p$

In [None]:
from scipy.optimize import basinhopping, OptimizeResult

def noop_optimizer(fun, x0, args=(), **options):
    """Custom "minimizer" for scipy that performs no local search.

    It evaluates the objective once at ``x0`` and reports that point as the
    result, so a global strategy using it only ever compares the points it
    proposes itself.

    Args:
        fun: objective, called as ``fun(x, *args)``.
        x0: point to evaluate.
        args: extra positional arguments forwarded to ``fun``.
        **options: remaining keywords supplied by scipy (jac, bounds, ...);
            ignored.

    Returns:
        OptimizeResult with ``x=x0``, ``fun=fun(x0, *args)``, ``success=True``
        and ``nfev=1``.
    """
    return OptimizeResult(x=x0, fun=fun(x0, *args), success=True, nfev=1)

def accept_test(f_new, x_new, f_old, x_old):
    """Acceptance rule for basinhopping: accept while ``f_old / f_new > 0.7``.

    A new cost of exactly zero is handled explicitly instead of dividing by
    it: the step is accepted when the old cost was positive and rejected
    otherwise (the limits NumPy floats would give, without the warning or the
    ZeroDivisionError raised for plain Python floats).

    Args:
        f_new, f_old: objective values of the proposed and the current point.
        x_new, x_old: the corresponding points; not used.

    Returns:
        bool, True to accept the step.
    """
    if f_new == 0:
        return bool(f_old > 0)
    return bool(f_old / f_new > 0.7)

def callback(x, f, accept):
    """Progress reporter passed to basinhopping.

    Counts its invocations in the function attribute ``callback.n`` (which
    must be initialised before the run), prints the unpacked point and its
    cost for accepted steps, and prints a heartbeat every 30 calls.
    """
    callback.n = callback.n + 1

    if accept:
        print(prdc.unpack(x))
        print(f'cost={f}')

    if not callback.n % 30:
        print(f'called #{callback.n} times')

In [None]:
# Reset the invocation counter kept on the callback function.
callback.n = 0

# Global search over the packed (deg, m_cells) vector.  The local minimiser is
# noop_optimizer, so each iteration only evaluates prdc.cost at the point
# proposed by the custom step object.
res = basinhopping(
    func=prdc.cost,
    # Start from the configuration stored in the prdc module.
    x0=prdc.pack(prdc.deg, prdc.m_cells),
    # niter=5,
    callback=callback,
    accept_test=accept_test,
    take_step=prdc.MyTakeStep(
        prdc.n_cell_types,
        prdc.n_data_types,
        stepsize=.25
    ),
    minimizer_kwargs=dict(method=noop_optimizer),
)

In [None]:
prdc.unpack(np.array([  5.,   1.,   1.,   1.,   5.,   1., 467., 399., 363.]))

In [None]:
array([  5.,   1.,   1.,   1.,   5.,   1., 467., 399., 363.])
cost=0.021485157675628953

(array([[4., 1.],
        [0., 1.],
        [3., 1.]]),
 array([461., 315., 453.]))


# Old

In [None]:
from iblt.pyblt import PYBLT as Iblt

In [None]:
iblt = Iblt(value_size=0, num_hashes=4, m=32)

In [None]:
iblt.insert(15)

In [None]:
iblt.insert(152)

In [None]:
iblt.erase(356)

In [None]:
iblt.list_entries()

In [None]:
iblt.peel()

In [None]:
from random import getrandbits

N = 20
iterations = 1000

def once():
    """Insert N random 16-bit keys into a fresh 32-cell, 4-hash IBLT and
    return how many entries ``peel()`` yields."""
    table = Iblt(value_size=0, num_hashes=4, m=32)
    for key in (getrandbits(16) for _ in range(N)):
        table.insert(key)
    recovered = table.peel()
    return len(recovered)

In [None]:
import itertools

In [None]:
import numpy as np

In [None]:
sum(map(lambda x: once(), range(iterations)))/iterations

In [None]:
import fpfz
import numpy as np

In [None]:
uc = fpfz.UniverseSizeCalculator()
mc = fpfz.MemoryCalculator(uc=uc)
mg = fpfz.MatrixGenerator(uc=uc, mc=mc)

rng = np.random.default_rng()

In [None]:
# n: number of distinct rows kept in `mx` (and the model's output width below).
# m: passed to mg.random_iblt and used as the model's input width below.
#    NOTE(review): this rebinds the `m` used as table size in the simulation
#    sections earlier in the notebook.
# k: passed to mg.random_iblt.
# d: number of rows of `mx` summed per training example.
n=1000
m=30
k=4
d=3

# Generate 1.5*n rows and de-duplicate; retry until at least n unique rows remain.
while len(mx := np.unique(mg.random_iblt(m=m, n=3*n//2, k=k), axis=0)) < n:
    continue

# Keep exactly n of the unique rows, in random order.
mx = rng.choice(mx, size=n, replace=False)

In [None]:
def generate_examples(mx, d, n_examples, generator=None):
    """Draw random training examples: sums of ``d`` distinct rows of ``mx``.

    Args:
        mx: 2-D array whose rows are the candidate vectors.
        d: number of distinct rows combined per example.
        n_examples: number of examples to draw.
        generator: optional ``numpy.random.Generator``; the notebook-level
            ``rng`` is used when omitted.  Pass a seeded generator for
            reproducible batches.

    Returns:
        ``(X, Y, label)`` where ``label`` (n_examples, d) holds the chosen row
        indices, ``X`` is the sum of the chosen rows for each example and
        ``Y`` (n_examples, mx.shape[0]) is the multi-hot encoding of
        ``label``.
    """
    gen = rng if generator is None else generator
    # Size the one-hot target from mx itself rather than from a notebook global.
    n_rows = mx.shape[0]

    label = np.array([
        gen.choice(n_rows, d, replace=False, shuffle=False)
        for _ in range(n_examples)
    ])
    X = mx[label].sum(1)
    Y = np.zeros((n_examples, n_rows))
    np.put_along_axis(Y, label, 1, axis=1)
    return X, Y, label

In [None]:
import torch

In [None]:
class NeuralNetwork(torch.nn.Module):
    """Fully connected network: m inputs -> 1024 -> 1024 -> n outputs.

    The hidden layers use ReLU; the output is passed through a softmax over
    dim 1, so every output row sums to one.
    """

    def __init__(self, m, n):
        super().__init__()
        nn = torch.nn
        hidden = 1024
        layers = [
            nn.Linear(m, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n),
            nn.Softmax(dim=1),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to a (batch, m) input."""
        scores = self.linear_relu_stack(x)
        return scores

In [None]:
model = NeuralNetwork(m=m, n=n)

In [None]:
loss_fn = torch.nn.MultiLabelMarginLoss()
optimizer = torch.optim.Adagrad(model.parameters())

In [None]:
model.train()

In [None]:
# Train on 1024 freshly generated batches of 2**10 examples each.
for _ in range(1024):
    X, Y, label = generate_examples(mx, d, 2**10)
    X = torch.from_numpy(X).float()
    Y = torch.from_numpy(Y)
    label = torch.from_numpy(label)
    # Build the integer target: the first d columns hold the true row indices,
    # column d is set to -1 and the rest stay 0.  In the format documented for
    # torch.nn.MultiLabelMarginLoss the first -1 terminates the target list.
    y = torch.zeros_like(Y, dtype=torch.long)
    y[:, 0:d] = label
    y[:, d] = -1

    pred = model(X)
    loss = loss_fn(pred, y)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # One line of output per batch.
    print(loss.item())

In [None]:
mx

In [None]:
generate_examples(mx, 3, 1)

In [None]:
mx[[7, 5, 3]]

In [None]:
# Query the network with the cell-wise sum of rows 7, 5 and 3 of `mx`, built the
# same way as the training inputs.  A hand-written 6-entry vector does not match
# the m input features the model was constructed with and fails on a fresh run.
r = model(torch.from_numpy(mx[[7, 5, 3]].sum(0, keepdims=True)).float()).detach()

In [None]:
r.sort().indices[0, -3:]