In [None]:
from pathlib import Path

import pandas as pd
import numpy as np

import fpfz
import primes

# Directory that is globbed for cache files on load and written to on dump.
dicts_dir = Path('.')
# Cache filename template; the `{}` placeholder is filled with `*` when
# globbing and with a timestamp when a new file is written.
dict_format = 'calc_m_{}.pkl'

def load_dict():
    """Merge every cache file found in ``dicts_dir`` into a single dict.

    Files matching ``dict_format`` (with ``*`` in place of the placeholder) are
    read with ``fpfz.read_from_disk`` and merged via ``dict.update``, so for a
    duplicate key the file loaded last wins.  Files are visited in sorted
    filename order: ``Path.glob`` yields entries in arbitrary order, which
    would otherwise make the winning file depend on the filesystem.

    Returns:
        dict: the merged contents; empty when no file matches.
    """
    # NOTE(review): the `.pkl` suffix suggests `fpfz.read_from_disk` unpickles;
    # if so, only load cache files that come from a trusted source.
    d = {}
    for path in sorted(dicts_dir.glob(dict_format.format('*'))):
        d.update(fpfz.read_from_disk(path))
        print(f'loaded {path}, |d| = {len(d)}')
    print('done')
    return d

def dump_dict(d):
    """Write ``d`` to a new, timestamp-named cache file inside ``dicts_dir``.

    The filename is ``dict_format`` filled with the current Unix time expressed
    in hundredths of a second and truncated to an integer.  The actual writing
    is delegated to ``fpfz.write_to_disk``.
    """
    from time import time

    stamp = int(time()*100)
    target = dicts_dir / dict_format.format(stamp)
    fpfz.write_to_disk(target, d)

In [None]:
# Helper objects used throughout the notebook; the prime generator `pg` is
# handed to each of the other helpers.
pg     = primes.Generator()
pp     = primes.Power(pg)
calc_u = fpfz.UniverseSizeCalculator(pg)
# `rec_cache` is pre-filled from the cache files on disk (see load_dict);
# presumably it memoizes `calc_m.recursive` -- TODO confirm.
calc_m = fpfz.MemoryCalculator(pg, calc_u, rec_cache=load_dict())

In [None]:
# dump_dict(calc_m.rec_cache)

In [None]:
# Multiplier applied to log_n to obtain the size of one table cell; used for
# both the LFFZ cell size and the IBLT table size.
cell_factor = 3

## Build data for LFFZ

In [None]:
# Empirical hash-collision statistics, reduced to one failure probability per
# (n, N) index pair.
hc = pd.read_csv('csv/hash_collision.csv')
hc['fail_prob'] = hc['n_fail'] / hc['reps']
# pivot_table's default aggregation (mean) collapses repeated (n, N) rows;
# only the fail_prob column is kept.
hc = hc.pivot_table(index=['n', 'N'])['fail_prob']

In [None]:
def hash_collision_prob(n_items, n_buckets):
    """Return the hash-collision probability for ``(n_items, n_buckets)``.

    The empirical table ``hc`` is consulted first.  When it has no entry for
    the pair, a message is printed and ``fpfz.hash_collision_prob`` is used as
    the fallback.

    Args:
        n_items: first component of the ``hc`` lookup key.
        n_buckets: second component of the ``hc`` lookup key.

    Returns:
        The value stored in ``hc``, or the result of the ``fpfz`` fallback.
    """
    try:
        return hc[n_items, n_buckets]
    except KeyError:
        # The message used to reference an undefined name (`log_items`), so a
        # missing entry raised NameError instead of reaching the fallback.
        print(f'not found {n_items=} {n_buckets=}')
        return fpfz.hash_collision_prob(n_items, n_buckets)

In [None]:
def compute_method_for_phis_range(method_n, p, n, phis, cell_size):
    """Expected size for one ``n`` across every penalty in ``phis``.

    Args:
        method_n: callable giving the number of cells for ``n``.
        p: callable giving the failure probability for ``n``.
        n: the value both callables are evaluated at.
        phis: penalty value(s); an array yields one result per entry.
        cell_size: size of a single cell.

    Returns:
        ``method_n(n) * cell_size + p(n) * phis``.
    """
    table_cost = method_n(n)*cell_size
    expected_penalty = p(n)*phis
    return table_cost + expected_penalty

def find_best_lffz_per_phi(method_n, p, phis, ns, cell_size):
    """Tabulate the expected size for every ``(n, phi)`` combination.

    Despite the name, no minimum is taken here; the caller reduces the table.

    Returns:
        An integer array of shape ``(len(ns), len(phis))``.  Values are stored
        into an ``int`` array, so fractional parts are truncated.
    """
    grid = np.empty((len(ns), len(phis)), dtype=int)
    # Iterating a 2-D array yields row views, so writing into `row` fills `grid`.
    for row, n in zip(grid, ns):
        row[:] = compute_method_for_phis_range(method_n, p, n, phis, cell_size)
    return grid

def find_best_lffz_for_N(method_nd, N, phis, log_n, cell_factor, density=2**10):
    """Smallest expected size per phi over all candidate ``n`` values for ``N``.

    Candidates are the ``n`` values present in the ``hc`` index that satisfy
    ``N <= n <= 2**log_n``.  For ``n == 2**log_n`` the failure probability is
    taken to be 0; otherwise it comes from ``hash_collision_prob``.

    Args:
        method_nd: callable ``(n=..., d=...)`` returning a cell count.
        N: passed to ``method_nd`` as ``d`` and used as the lower bound on ``n``.
        phis: penalty values.
        log_n: the upper bound on ``n`` is ``2**log_n``; the cell size is
            ``cell_factor * log_n``.
        cell_factor: multiplier for the cell size.
        density: accepted but not used.

    Returns:
        One value per entry of ``phis`` (minimum over the candidate ``n``).
    """
    universe = 2**log_n
    candidates = hc.index.unique(level='n')
    in_range = (N <= candidates) & (candidates <= universe)
    candidates = candidates[in_range]

    def cells_for(n):
        return method_nd(n=n, d=N)

    def failure_prob(n):
        if n == universe:
            return 0
        # NOTE(review): `n` goes in as n_items and `N` as n_buckets -- confirm
        # this matches the meaning of the columns in the collision table.
        return hash_collision_prob(n_items=n, n_buckets=N)

    per_candidate = find_best_lffz_per_phi(
        method_n=cells_for,
        p=failure_prob,
        phis=phis,
        ns=candidates,
        cell_size=cell_factor*log_n,
    )
    return per_candidate.min(axis=0)

def find_best_lffz_for_method(method_nd, Ns, phis, log_n, cell_factor, cb):
    """Run ``find_best_lffz_for_N`` for every ``N`` and stack the rows.

    Args:
        method_nd: cell-count callable forwarded to ``find_best_lffz_for_N``.
        Ns: sequence of ``N`` values, one output row each.
        phis: penalty values, one output column each.
        log_n, cell_factor: forwarded unchanged.
        cb: progress callback, invoked as ``cb(row_index, len(Ns))`` after each
            row has been filled.

    Returns:
        An integer array of shape ``(len(Ns), len(phis))``.
    """
    total = len(Ns)
    table = np.empty((total, len(phis)), dtype=int)
    for row, N in enumerate(Ns):
        table[row] = find_best_lffz_for_N(method_nd, N, phis, log_n, cell_factor)
        cb(row, total)

    return table

In [None]:
# Cell-count calculators keyed by method name; each is called with the
# keywords `n` and `d`.
# NOTE(review): OLS and EGH receive d-1 while REC receives d unchanged --
# confirm the difference is intended.
methods = {
    'OLS': lambda n, d: calc_m.ols(n=n, d=d-1),
    'EGH': lambda n, d: calc_m.egh(n=n, d=d-1),
    'REC': calc_m.recursive,
}

In [None]:
# Parameter grid for the sweep: N values, penalties (powers of two) and the
# log_n values to evaluate.
Ns     = np.arange(3, 20)
# Pin a 64-bit dtype: where NumPy's default integer is 32-bit (e.g. Windows
# with NumPy < 2) the powers from 2**31 upwards would silently wrap.
phis   = 2**np.arange(5, 35, dtype=np.int64)
log_ns = [
    # 8,
    16,
]

In [None]:
from ipywidgets import IntProgress
# Progress widget updated by the `progress` callback during the LFFZ sweep.
bar = IntProgress()

def progress(i, size):
    """Show on ``bar`` that step ``i`` (0-based) out of ``size`` has finished.

    Args:
        i: zero-based index of the step that just completed.
        size: total number of steps; becomes the bar's maximum.
    """
    done = i + 1
    # Raise the upper bound first: the bounded widget clamps `value` to the
    # current `max`, so assigning the value first would cap it at the old
    # maximum whenever `size` is larger than that.
    bar.max   = size
    bar.value = done
    bar.description = f'{done/size:3.3%}'

bar

In [None]:
# Collect one (len(log_ns), len(Ns), len(phis)) array per method; they are
# stacked into a single array at the end of the cell.
lffz_res = []

for method, f in methods.items():
    res = np.empty((len(log_ns), len(Ns), len(phis)), dtype=int)

    print(method)

    for i,log_n in enumerate(log_ns):
        # This value is replaced as soon as the `progress` callback fires,
        # because the callback also assigns bar.value.
        bar.value = i
        res[i] = find_best_lffz_for_method(f, Ns, phis, log_n, cell_factor, cb=progress)

    lffz_res.append(res)

# Resulting axes: (method, log_n, N, phi).
lffz_res = np.stack(lffz_res)

### Create DataFrame from result array

In [None]:
# Flatten the 4-D result array into one record per (method, log_n, N, phi).
# `df` is only a temporary list here; later cells rebind the name.
df = []
for i, method in enumerate(methods):
    for j, log_n in enumerate(log_ns):
        for k, N in enumerate(Ns):
            for l, phi in enumerate(phis):
                df.append((method, log_n, N, phi, lffz_res[i, j, k, l]))

lffz_df = pd.DataFrame.from_records(df, columns=('method', 'log_n', 'N', 'phi', 'expected_size'))
lffz_df

## Build DataFrame for IBLT

#### Basic properties

In [None]:
# IBLT decoding statistics: derive the failure probability and the table size,
# then discard the raw counters.
df = pd.read_csv('csv/IBLT_decoding_stats.csv')
df['p_fail'] = 1 - df['n_success'] / df['reps']
df['table_size'] = df['m'] * df['log_n'] * cell_factor
df = df.drop(columns=['reps', 'n_success'])
df

#### size per phi

In [None]:
# Pair every IBLT configuration with every penalty value, then compute the
# expected size (rounded up).  The cell rebinds `df` from itself, so running
# it a second time would cross-join again.
df = pd.merge(df, pd.Series(phis, name='phi'), how='cross')
df['expected_size'] = np.ceil(df['table_size'] + df['p_fail']*df['phi']).astype(int)
df

#### Take the minimum per (log_n, N, phi) combination

In [None]:
# Smallest expected size for each (log_n, N, phi) combination.
iblt_df = df.groupby(['log_n', 'N', 'phi'])['expected_size'].min().reset_index()
iblt_df

## Combining all

### Using ratio from IBF

In [None]:
# Attach the IBLT expected size to every LFFZ row (left join, so LFFZ rows
# without an IBLT match are kept) and compute the LFFZ / IBLT size ratio.
df = lffz_df.merge(iblt_df, how='left', on=['log_n', 'N', 'phi'], suffixes=('', '_IBF'))
df['expected_size_ratio'] = df['expected_size'] / df['expected_size_IBF']
df

## Plot

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
sns.set_theme()

In [None]:
# Default height applied to every plotly express figure created below.
px.defaults.height = 700

In [None]:
# Rows for log_n == 16 where the LFFZ expected size beats the IBLT one.
# Optional extra filter: (df.phi<=10**10)
sub = df[(df['log_n'] == 16) & (df['expected_size_ratio'] < 1)]

In [None]:
# 3-D view of the size ratio over (N, phi), one colour/symbol per method;
# the phi and ratio axes are logarithmic.
fig = px.scatter_3d(sub, x='N', y='phi', z='expected_size_ratio', symbol='method', color='method', log_z=True, log_y=True)
fig.show()

In [None]:
# One line plot of ratio vs. phi per N (three facets per row, independent
# y-axes), with both axes on a log scale.
g = sns.relplot(
    data=sub,
    kind="line",
    x="phi",
    y="expected_size_ratio",
    hue="method",
    style="method",
    col="N",
    col_wrap=3,
    facet_kws=dict(sharey=False),
)
g.set(xscale="log", yscale="log")

In [None]:
# Zoom in on N == 12 at log_n == 16, keeping rows whose ratio is at most 1.3.
sub = df[(df.expected_size_ratio <= 1.3) & (df.log_n==16) & (df.N==12)]

fig = px.scatter(sub, x='phi', y='expected_size_ratio', symbol='method', color='method', log_x=True, log_y=True)
fig.show()

#### Sanity check for good case

FPFZ total cost:

In [None]:
# Hand-picked configuration for the sanity check; phi equals 2**28.
# NOTE(review): the check below uses n = 65536 = 2**16, which is larger than
# 2**log_n = 4096 -- confirm that log_n = 12 (rather than 16) is intended.
log_n = 12
phi = 268435456
N = 16

In [None]:
phis

In [None]:
# 65536 == 2**16
n = 65536

In [None]:
# Table size for the recursive method: cell count times the per-cell size
# (cell_factor * log_n).
fpfz_table_size = calc_m.recursive(n=n, d=N) * cell_factor * log_n 
print(f'{fpfz_table_size:,}b')

The penalty in terms of elements:

In [None]:
f'{phi // log_n:,}'

The expected size:

In [None]:
# Expected size = table size + penalty weighted by the collision probability.
fpfz_expected_size = fpfz_table_size + phi * hash_collision_prob(n_buckets=N, n_items=n)
print(f'{fpfz_expected_size:,}b')

In [None]:
iblt_df[(iblt_df.log_n==log_n) & (iblt_df.N==N) & (iblt_df.phi==phi)]

#### Sanity check

In [None]:
lffz_df.iloc[[2128]]

In [None]:
sub = df[(df.log_n==16) & (df.N==3) & (df.phi==8192)]

In [None]:
sub.iloc[[sub.expected_size.argmin()]]

In [None]:
433/507

In [None]:
iblt_df[(iblt_df.log_n==16) & (iblt_df.N==13) & (iblt_df.phi==1024)]

In [None]:
df[(df.N==13) & (df.log_n==16) & (df.phi==1024) & (df.expected_size==1389.)]

In [None]:
lffz_df[(lffz_df.N==13) & (lffz_df.log_n==16) & (lffz_df.phi==1024)]

# Backyard

In [None]:
# Scratch: OLS expected size over a geometric range of n for the current
# N / log_n / phis.
# `cell_size` only exists as a function parameter elsewhere in the notebook,
# so it is derived here the same way find_best_lffz_for_N derives it;
# without this the cell raised NameError.
cell_size = cell_factor * log_n
for n in np.geomspace(N, 2**log_n, num=100, dtype=int):
    # NOTE(review): d=N here, whereas the 'OLS' entry of `methods` passes d-1;
    # and the argument order (N, n) is the reverse of the (n, N) order used by
    # the hash_collision_prob wrapper's fallback -- confirm which is intended.
    I = calc_m.ols(n, d=N) * cell_size
    p = fpfz.hash_collision_prob(N, n)
    res = I+p*phis

# Only the result for the last n survives the loop.
print(res)

In [None]:
# Distinct (log_n, N, m) configurations in the IBLT statistics.
# By this point `df` has been rebound to the merged LFFZ/IBLT frame, which has
# no `m` column, so the lookup raised KeyError on a top-to-bottom run; read
# the stats file directly instead.
pd.read_csv('csv/IBLT_decoding_stats.csv')[['log_n', 'N', 'm']].value_counts().index

In [None]:
# NOTE(review): every other call passes `n` and `d`; this bare call looks like
# leftover signature probing and presumably raises -- consider removing.
calc_m.ols()

In [None]:
dots = sns.load_dataset('dots')

In [None]:
dots

In [None]:
# Scratch example on seaborn's `dots` sample dataset; not part of the analysis.
sns.relplot(
    data=dots, kind="line",
    x="time", y="firing_rate", col="align",
    hue="choice", size="coherence", style="choice",
    facet_kws=dict(sharex=False),
)


### Plotly

In [None]:
import plotly.express as px
# Scratch example: a minimal plotly bar chart with a title set after creation.
fig = px.bar(x=["a", "b", "c"], y=[1, 3, 2])
fig.layout.title = 'what'
fig.show()

In [None]:
fig.layout.title = 'asdfa'

**TODO**: add LFFZ to the table, with a column holding the method tag. Try choosing the best LFFZ?