In [1]:
from IPython.display import display
import pandas as pd 
import numpy as np
from scipy.stats import gmean
import itertools
import math

# Lift pandas' display limits so printed frames are never truncated:
# None means "no cap" for both the row count and the column count.
pd.options.display.max_rows = None
pd.options.display.max_columns = None


In [2]:
# Load the experiment summary and, in the same expression, keep only the
# rows whose C_size is strictly greater than log2(nbytes).
df = (
    pd.read_csv("collisions_summary.csv")
    .loc[lambda runs: runs['C_size'] > runs['log2(nbytes)']]
)

# Distinct C_size values that remain after the filter
C_sizes = df["C_size"].unique()

# Quick sanity print: which sizes we have and what the columns are called
print(f"C_sizes are {C_sizes}")
print(f"Column Names are {df.columns.values.tolist()}")

C_sizes are [18 20 22 24 16]
Column Names are ['C_size', 'A_size', 'log2(nbytes)', 'difficulty', '#points', '#distinguished_points', 'log2(#distinguished_points)', '#collisions', 'log2(#collisions)', '#updates', 'time(sec)']


In [3]:
# Keep only the experiments whose memory (log2(nbytes)) is strictly smaller
# than the problem size (C_size). The same condition is already applied when
# the CSV is loaded, so running this cell again leaves `df` unchanged.
df = df.loc[df['log2(nbytes)'] < df['C_size']]
# Distinct C_size values present in the filtered frame
C_sizes = df["C_size"].unique()


In [4]:
# Report, for every (C_size, difficulty, memory) combination present in `df`,
# how many points a run needed, expressed relative to the sqrt(n^3/mem) scale.
print("avg stats for a normal run...\n"
     "scale: sqrt(n^3/mem)\n"
     "----------------------\n")

for c in C_sizes:
    print(f"C_size = {c}, scale = 2^{c*3/2}/sqrt(mem)\n")
    df_C = df[df["C_size"] == c]

    difficulties = df_C["difficulty"].unique()
    rams = df_C["log2(nbytes)"].unique()

    for d in difficulties:
        df_C_d = df_C[df_C['difficulty'] == d]

        for m in rams:
            df_C_d_m = df_C_d[df_C_d['log2(nbytes)'] == m]

            # difficulties x rams is a cross product, so some pairs have no
            # runs at all; skip them before computing any statistic on an
            # empty selection.
            if df_C_d_m.empty:
                continue

            # Geometric means over all runs of this configuration
            ndist_avg = gmean(df_C_d_m["#distinguished_points"])
            npoints_avg = gmean(df_C_d_m["#points"])
            collisions_avg = gmean(df_C_d_m["#collisions"])

            # skip entries that don't have values
            if math.isnan(ndist_avg) or math.isnan(collisions_avg):
                continue

            # in the paper they have divided by sqrt(n^3/w);
            # log2 of that factor is (3*c - m)/2 since n = 2^c and mem = 2^m
            lg2_scale_factor = (c*3 - m)/2
            scale_factor = 2**lg2_scale_factor
            npoints_scaled = npoints_avg / scale_factor

            # Spread of #points measured in the log2 domain (pairs with the
            # geometric mean printed below)
            log2_npoints_std = np.log2(df_C_d_m["#points"]).std()

            # Arithmetic statistics, printed next to the geometric ones
            npoints_mean = df_C_d_m["#points"].mean()
            npoints_std = df_C_d_m["#points"].std()
            nupdates_avg = df_C_d_m["#updates"].mean()

            print(f"- diff = {d}, mem=2^{m:.03f}, nruns = {df_C_d_m.shape[0]}, #npoints/#ndist={npoints_avg/ndist_avg:.2f} should be {2**d}")
            # lg2_scale_factor already is log2(sqrt(n^3/mem)); no need to
            # exponentiate and take the log again.
            print(f"- #points= {npoints_scaled:.03f}*sqrt(n^3/mem) where sqrt(n^3/mem) = 2^{lg2_scale_factor:.2f}.")
            print("* Note: The paper simulation always have G*sqrt(n^3/mem) where G < 10")
            print("Geometric mean:")
            print(f"- #points = 2^{np.log2(npoints_avg):.2f}, σ(lg2(npoints)) = {log2_npoints_std:.2f}")
            print("Average by summation")
            # ':.2f' (not ': .2f'): the space sign flag inserted a blank after '2^'
            print(f"- #points = 2^{np.log2(npoints_mean):.2f}, σ(#points) = 2^{np.log2(npoints_std):.2f}")
            print(f"- #function_updates = {nupdates_avg:.2f}, in theory it should be {2**c / (2*collisions_avg):.2f}")
        # Separator after each difficulty level
        print("-"*40)

    # Separator after each C_size
    print("="*40 + "\n")

avg stats for a normal run...
scale: sqrt(n^3/mem)
----------------------

C_size = 18, scale = 2^27.0/sqrt(mem)

- diff = 6, mem=2^13.322, nruns = 3642, #npoints/#ndist=59.83 should be 64
- #points= 36.202*sqrt(n^3/mem) where sqrt(n^3/mem) = 2^20.34.
* Note: The paper simulation always have G*sqrt(n^3/mem) where G < 10
Geometric mean:
- #points = 2^25.52, σ(lg2(npoints)) = 2.01
Average by summation
- #points = 2^ 26.58, σ(#points) = 2^ 26.88
- #function_updates = 2650.94, in theory it should be 3.09
----------------------------------------

C_size = 20, scale = 2^30.0/sqrt(mem)

- diff = 7, mem=2^14.322, nruns = 1726, #npoints/#ndist=120.80 should be 128
- #points= 34.422*sqrt(n^3/mem) where sqrt(n^3/mem) = 2^22.84.
* Note: The paper simulation always have G*sqrt(n^3/mem) where G < 10
Geometric mean:
- #points = 2^27.94, σ(lg2(npoints)) = 1.93
Average by summation
- #points = 2^ 28.87, σ(#points) = 2^ 29.04
- #function_updates = 3229.02, in theory it should be 4.59
-------------------