<a href="https://colab.research.google.com/github/byatrsa/bloomfilter/blob/master/DCOMB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Library

In [None]:
!pip install bitarray

import math
import time
import hashlib
import pandas as pd

from bitarray import bitarray
from google.colab import files

Collecting bitarray
  Downloading bitarray-2.3.5.tar.gz (88 kB)
[?25l[K     |███▊                            | 10 kB 21.9 MB/s eta 0:00:01[K     |███████▍                        | 20 kB 23.1 MB/s eta 0:00:01[K     |███████████▏                    | 30 kB 25.3 MB/s eta 0:00:01[K     |██████████████▉                 | 40 kB 26.6 MB/s eta 0:00:01[K     |██████████████████▌             | 51 kB 21.2 MB/s eta 0:00:01[K     |██████████████████████▎         | 61 kB 16.8 MB/s eta 0:00:01[K     |██████████████████████████      | 71 kB 13.4 MB/s eta 0:00:01[K     |█████████████████████████████▋  | 81 kB 14.7 MB/s eta 0:00:01[K     |████████████████████████████████| 88 kB 5.1 MB/s 
[?25hBuilding wheels for collected packages: bitarray
  Building wheel for bitarray (setup.py) ... [?25l[?25hdone
  Created wheel for bitarray: filename=bitarray-2.3.5-cp37-cp37m-linux_x86_64.whl size=171997 sha256=ae035ea891334fdfb3e75894765af1388a49c58c0ba3671c85c5263a41898986
  Stored in directo

# CONFIG: Set constants' value

In [None]:
 # n: expected number of items passed to every fixed-capacity ("static") Bloom filter
n = 20

 # p: default false positive probability used when sizing each Bloom filter
p = 0.001

 # a: length of the pk-code slice (group) that each UBF filter stores / checks
a = 1000

# number of blocks (leading rows of the block dataset) that will be used
block_count = 1000

# DATASET: Load from CSV
Load dataset from prepared csv files.

In [None]:
## Read csv dataset
# All three CSV files are fetched over HTTPS from one pinned gist revision,
# so running this cell needs network access.
# source_block_df: one row per block (a 'block_id' column is read later on).
source_block_df = pd.read_csv('https://gist.github.com/alanmsmxyz/24cfbeb098ed165892505aafeaea548b/raw/b6054da3ab0fc03c4c0e4a471543ff94494d45e1/bf_block.csv')
# source_index_df: one row per index entry ('block_id' and 'index' columns are read later on).
source_index_df = pd.read_csv('https://gist.github.com/alanmsmxyz/24cfbeb098ed165892505aafeaea548b/raw/b6054da3ab0fc03c4c0e4a471543ff94494d45e1/bf_index.csv')
# keys_df: loaded here but not referenced by any other cell visible in this notebook.
keys_df = pd.read_csv('https://gist.github.com/alanmsmxyz/24cfbeb098ed165892505aafeaea548b/raw/b6054da3ab0fc03c4c0e4a471543ff94494d45e1/bf_keys.csv')

In [None]:
# Keep only the first `block_count` rows of the block table.
block_df = source_block_df.head(block_count)

# block_id of the last kept row; the lookup is label-based and therefore
# relies on the default 0..N-1 row labels produced by read_csv.
last_row_label = len(block_df) - 1
latest_block = block_df['block_id'][last_row_label]

# Keep every index entry whose block_id does not exceed that last block id.
block_id_in_range = source_index_df['block_id'] <= latest_block
index_df = source_index_df.loc[block_id_in_range]

print(index_df)

       block_id                                              index
0             0  57ce88c29d3090cc656de5fcc4a901b7f3e1ac8ff94e9f...
1             0  51c6fd564056fcfbc694f4790b6cac2893736989512ee0...
2             0  3fe7b323ed8cd89fa6c2e8a21b89b49309c5699f015686...
3             0  e6d04db9125c80d94525f2cbde09a5b3afdaed9cd94e02...
4             0  44989af264f3528fc7012772c79b59cbbc223d7951326e...
...         ...                                                ...
27631       999  51c6fd564056fcfbc694f4790b6cac289373698935bc88...
27632       999  3fe7b323ed8cd89fa6c2e8a21b89b49309c5699f4facc1...
27633       999  e6d04db9125c80d94525f2cbde09a5b3afdaed9cdf9021...
27634       999  44989af264f3528fc7012772c79b59cbbc223d7937b8fc...
27635       999  e3c7f9f7913aef3942ae3b839c347ff04e3d7fd3470bd6...

[27636 rows x 2 columns]


# CLASS: Bloom Filter

Class: __Bloom Filter__
> Consists of:
- Function: `__init__`: initializes the filter (computes the bit-array size and the hash count, then allocates a zeroed bit array)
- Function: (classmethod) `from_bit_array`: rebuilds a filter from a string of `0`/`1` characters
- Function: `add`: inserts the specified element into the bit array
- Function: `check`: checks whether the specified element may exist in the bit array
- Function: (classmethod) `get_size`: calculates the size of the bit array
- Function: (classmethod) `get_hash_count`: calculates the number of hash functions applied to each element

In [None]:
class BloomFilter(object):
  '''
  Bloom filter backed by a bitarray, using salted SHA-256 digests as its
  k hash functions.
  '''

  def __init__(self, items_count, fp_prob):
    '''
    Create an empty filter (all bits zero) sized for `items_count` items at a
    target false positive probability of `fp_prob`.

    Glossary:
      n = items_count : int
        Number of items expected to be stored in the filter (must be > 0).
      p = fp_prob : float
        Target false positive probability, strictly between 0 and 1.
      m = size : int
        Length of the bit array. It is derived from n and p by get_size()
        and is not meant to be chosen by hand: an undersized array raises
        the collision (false positive) rate.
      k = hash_count : int
        Number of hash functions, derived from m and n by get_hash_count().

    Raises:
      ValueError: if items_count is not positive or fp_prob is not in (0, 1).
    '''
    # Reject inputs that would otherwise surface later as ZeroDivisionError,
    # a math domain error, or a zero-length bit array.
    if items_count <= 0:
      raise ValueError('items_count must be positive, got %r' % (items_count,))
    if not 0 < fp_prob < 1:
      raise ValueError('fp_prob must be strictly between 0 and 1, got %r' % (fp_prob,))

    # False positive probability in decimal
    self.fp_prob = fp_prob

    # Size (m) of the bit array
    self.size = self.get_size(items_count, fp_prob)

    # Number of hash functions (k)
    self.hash_count = self.get_hash_count(self.size, items_count)

    # Bit array of the computed size, with every bit cleared
    self.bit_array = bitarray(self.size)
    self.bit_array.setall(0)

  @classmethod
  def from_bit_array(cls, text, n, fp_prob):
    '''
    Rebuild a filter from `text`, a sequence of '0'/'1' characters (or 0/1
    integers), for example a bit array previously exported as a string.

    n and fp_prob are forwarded to the constructor to derive the hash count.
    The filter size is then taken from the loaded bits, so that bit
    positions computed by add()/check() always fall inside the loaded array.

    Raises:
      ValueError: if text is empty, or if n / fp_prob are rejected by the
        constructor.
    '''
    if len(text) == 0:
      raise ValueError('text must contain at least one bit')

    ba = bitarray(len(text))
    for position, bit in enumerate(text):
      ba[position] = int(bit)

    bf = cls(n, fp_prob)
    bf.bit_array = ba
    # Keep the modulus used for hashing in sync with the actual array length.
    bf.size = len(ba)

    return bf

  def _bit_positions(self, item):
    '''
    Yield the hash_count bit positions that belong to `item` (a str).

    Each probe hashes the item together with its probe number. Hashing the
    bare item k times would give k identical positions, i.e. one effective
    hash function instead of k.
    '''
    encoded_item = item.encode()
    for probe in range(self.hash_count):
      salted = str(probe).encode() + b':' + encoded_item
      digest = hashlib.sha256(salted).digest()
      yield int.from_bytes(digest, 'big') % self.size

  def add(self, item):
    '''
    Encode and insert an item (a str) into the filter.
    '''
    for position in self._bit_positions(item):
      self.bit_array[position] = True

  def check(self, item):
    '''
    Check for existence of an item (a str) in the filter.

    Returns False when the item is definitely absent (at least one of its
    bits is unset) and True when it is probably present.
    '''
    for position in self._bit_positions(item):
      if not self.bit_array[position]:
        return False
    return True

  @classmethod
  def get_size(cls, n, p):
    '''
    Return the size of the bit array (m), using the formula
    m = -(n * ln(p)) / (ln(2)^2), rounded down and never below 1.

    n : int
      number of items expected to be stored in filter
    p : float
      False Positive probability in decimal
    '''
    m = -(n * math.log(p)) / (math.log(2) ** 2)
    # A zero-length array cannot be indexed, so keep at least one bit.
    return max(1, int(m))

  @classmethod
  def get_hash_count(cls, m, n):
    '''
    Return the number of hash functions (k), using the formula
    k = (m / n) * ln(2), rounded down and never below 1.

    m : int
      size of bit array
    n : int
      number of items expected to be stored in filter
    '''
    k = (m / n) * math.log(2)
    # With zero hash functions add() would store nothing and check() would
    # accept everything, so keep at least one.
    return max(1, int(k))

# FUNCTION: create_list


In [None]:
def create_list(array_count, element_count, fp_prob):
  '''
  Build a list of `array_count` new, independent BloomFilter objects.

  Every filter is constructed with the same expected item count
  (`element_count`) and the same false positive probability (`fp_prob`).
  '''
  return [BloomFilter(element_count, fp_prob) for _ in range(array_count)]

  # (note carried over from the original, translated) read data from csv, insert.

# DCOMB: Layer 1

## Static

In [None]:
def create_layer_1_static(layer_1_static_list):
  '''
  Append one new Bloom filter, sized with the global capacity `n` and the
  global false positive probability `p`, and return the same list.
  '''
  new_filter = BloomFilter(n, p)
  layer_1_static_list.append(new_filter)
  return layer_1_static_list

In [None]:
def add_layer_1_static(layer_1_static_list, block_id, block_indexes_df):
  '''
  Insert every value of block_indexes_df['index'] into the filter stored at
  position `block_id` of `layer_1_static_list`.

  The list is modified in place and also returned.
  '''
  for index in block_indexes_df['index']:
    layer_1_static_list[block_id].add(index)

  return layer_1_static_list

In [None]:
def query_layer_1_static(layer_1_static_list, block_id, index):
  '''
  Return the result of check(index) on the filter stored at position
  `block_id` of `layer_1_static_list`.
  '''
  block_filter = layer_1_static_list[block_id]
  return block_filter.check(index)

## Dynamic

In [None]:
def create_layer_1_dynamic(layer_1_dynamic_list, block_indexes_df):
  '''
  Append one new Bloom filter whose capacity equals the number of rows in
  `block_indexes_df` (with the global false positive probability `p`) and
  return the same list.
  '''
  expected_items = len(block_indexes_df.index)
  layer_1_dynamic_list.append(BloomFilter(expected_items, p))
  return layer_1_dynamic_list

In [None]:
def add_layer_1_dynamic(layer_1_dynamic_list, block_id, block_indexes_df):
  '''
  Insert every value of block_indexes_df['index'] into the filter stored at
  position `block_id` of `layer_1_dynamic_list`; the list is returned.
  '''
  for entry in block_indexes_df['index']:
    target_filter = layer_1_dynamic_list[block_id]
    target_filter.add(entry)
  return layer_1_dynamic_list

In [None]:
def query_layer_1_dynamic(layer_1_dynamic_list, block_id, index):
  '''
  Return the result of check(index) on the filter stored at position
  `block_id` of `layer_1_dynamic_list`.
  '''
  block_filter = layer_1_dynamic_list[block_id]
  return block_filter.check(index)

# DCOMB: Layer 2

In [None]:
def get_pk_from_index(index):
  '''
  Return the pk part of an index string.

  An index is laid out as: stream head hash (40 characters) + cipher hash
  (40 characters) + pk, so the pk is everything after the first 80
  characters. Shorter inputs yield an empty string.
  '''
  hash_prefix_length = 40 + 40
  return index[hash_prefix_length:]

In [None]:
def pk_to_pk_code(pk):
  '''
  Convert a pk string into its pk code: the binary digits of each UTF-8
  byte, concatenated in order.

  Each byte is written without leading zeros, so bytes contribute a
  varying number of digits (e.g. '1' gives 6 digits, 'a' gives 7).
  '''
  bit_groups = [bin(byte)[2:] for byte in bytearray(pk, 'UTF-8')]
  return ''.join(bit_groups)

In [None]:
def get_longest_pk(block_index_df):
  '''
  Return the pk with the most characters among block_index_df['index'].

  On a tie the first such pk wins; an empty string is returned when there
  are no entries.
  '''
  block_pks = (get_pk_from_index(entry) for entry in block_index_df['index'])
  return max(block_pks, key=len, default='')

In [None]:
def get_padded_pk_code_list(block_index_df, block_longest_pk_code):
  '''
  Return the pk code of every entry in block_index_df['index'], each one
  right-padded with '0' up to the length of `block_longest_pk_code`.

  Codes that are already at least that long are left unchanged (ljust
  never truncates).
  '''
  target_length = len(block_longest_pk_code)

  return [
    pk_to_pk_code(get_pk_from_index(entry)).ljust(target_length, '0')
    for entry in block_index_df['index']
  ]

## 2.1. UBF1

In [None]:
def create_ubf1(ubf1_list, block_id, block_longest_pk_code):
  '''
  Create the UBF1 filters of one block and append them, as a list, to
  `ubf1_list`, which is then returned.

  One filter is created per start offset of a slice of length `a` (global)
  inside a pk code of length f = len(block_longest_pk_code), i.e.
  f - a + 1 filters. When the pk code is shorter than `a` that count would
  be zero or negative; a single filter is created instead, because the one
  slice pk_code[:a] then covers the whole code. Without it the block would
  have no filter, queries would accept every pk code, and reading
  ubf1_list[block_id][0] would fail.

  `block_id` is accepted for signature symmetry with the other helpers and
  is not used here.
  '''
  f = len(block_longest_pk_code)
  union_bits_count = max(1, f - a + 1)

  ubf1 = create_list(union_bits_count, n, p)

  ubf1_list.append(ubf1)

  return ubf1_list

In [None]:
def add_ubf1(ubf1_list, block_id, block_longest_pk_code, block_pk_code_list):
  '''
  Insert every pk code of `block_pk_code_list` into the UBF1 filters of
  block `block_id`.

  Filter number u receives the slice of (at most) `a` characters that
  starts at offset u of the pk code. `block_longest_pk_code` is not used.
  The list is modified in place and returned.
  '''
  for pk_code in block_pk_code_list:
    for offset, window_filter in enumerate(ubf1_list[block_id]):
      window_filter.add(pk_code[offset:offset + a])

  return ubf1_list

In [None]:
def query_ubf1(ubf1_list, block_id, pk_code):
  '''
  Return True when every UBF1 filter of block `block_id` accepts its slice
  of `pk_code` (filter u checks the `a` characters starting at offset u).

  No further filter is consulted after the first rejection; a block without
  filters yields True.
  '''
  return all(
    window_filter.check(pk_code[offset:offset + a])
    for offset, window_filter in enumerate(ubf1_list[block_id])
  )

## 2.1. UBF2

In [None]:
def create_ubf2(ubf2_list, block_id, block_longest_pk_code):
  '''
  Create the UBF2 filters of one block and append them, as a list, to
  `ubf2_list`, which is then returned.

  The number of filters is ceil(f / a), where f is the length of
  `block_longest_pk_code` and `a` is the global group size. `block_id` is
  not used here.
  '''
  f = len(block_longest_pk_code)
  full_groups, leftover_bits = divmod(f, a)
  # A partially filled trailing group still needs its own filter.
  union_bits_count = full_groups + 1 if leftover_bits else full_groups

  ubf2_list.append(create_list(union_bits_count, n, p))

  return ubf2_list

In [None]:
def add_ubf2(ubf2_list, block_id, block_longest_pk_code, block_pk_code_list):
  '''
  Insert every pk code of `block_pk_code_list` into the UBF2 filters of
  block `block_id`.

  The pk code is cut into consecutive, non-overlapping groups of `a`
  (global) characters; filter number u receives group number u.
  `block_longest_pk_code` is not used. The list is modified in place and
  returned.
  '''
  for pk_code in block_pk_code_list:
    for group_id, group_filter in enumerate(ubf2_list[block_id]):
      group_start = group_id * a
      group_filter.add(pk_code[group_start:group_start + a])

  return ubf2_list

In [None]:
def query_ubf2(ubf2_list, block_id, pk_code):
  '''
  Return True when every UBF2 filter of block `block_id` accepts its group
  of `pk_code` (filter u checks the u-th run of `a` characters).

  No further filter is consulted after the first rejection; a block without
  filters yields True.
  '''
  return all(
    group_filter.check(pk_code[group_id * a:group_id * a + a])
    for group_id, group_filter in enumerate(ubf2_list[block_id])
  )

## 2.2. DBF

In [None]:
def create_dbf(dbf_list, block_id, block_longest_pk_code):
  '''
  Create the DBF filters of one block, one per character of
  `block_longest_pk_code`, and append them, as a list, to `dbf_list`,
  which is then returned. `block_id` is not used here.
  '''
  filter_count = len(block_longest_pk_code)
  dbf_list.append(create_list(filter_count, n, p))

  return dbf_list

In [None]:
def add_dbf(dbf_list, block_id, block_longest_pk_code, block_indexes_df):
  '''
  Insert every value of block_indexes_df['index'] into each DBF filter of
  block `block_id` whose position holds a non-zero digit in
  `block_longest_pk_code`; filters at zero positions are left untouched.

  The list is modified in place and returned.
  '''
  for position, bit in enumerate(block_longest_pk_code):
    if int(bit) != 0:
      for entry in block_indexes_df['index']:
        dbf_list[block_id][position].add(entry)

  return dbf_list

In [None]:
def query_dbf(dbf_list, block_id, longest_pk_code, index):
  '''
  Return True when `index` is accepted by every DBF filter of block
  `block_id` whose position holds a non-zero digit in `longest_pk_code`.

  Positions holding '0' are skipped. No further filter is consulted after
  the first rejection. If no position is non-zero the result is True.

  The pk code is taken from the `longest_pk_code` argument only, so the
  function does not depend on a same-named notebook global being left over
  from another cell.
  '''
  return all(
    dbf_list[block_id][position].check(index)
    for position, bit in enumerate(longest_pk_code)
    if int(bit) != 0
  )

# GLOBAL VARIABLES
To store generated dcomb result

In [None]:
# Each list gets one entry per processed block, appended by the create_*
# helpers, and is later indexed with block_id.
# Layer 1: one Bloom filter per block.
layer_1_static_list = []
layer_1_dynamic_list = []
# Layer 2: one list of Bloom filters per block.
layer_2_ubf1_list = []
layer_2_ubf2_list = []
layer_2_dbf_list = []

# BENCHMARK: Insert
Insertion is done on a per-block basis.

In [None]:
# Start from empty structures: the filters are looked up by list position
# (== block_id), so re-running this cell on top of already filled lists
# would insert into stale filters and grow the lists without bound.
layer_1_static_list = []
layer_1_dynamic_list = []
layer_2_ubf1_list = []
layer_2_ubf2_list = []
layer_2_dbf_list = []

insert_time_columns = [
    'block_id',
    'layer_1_static',
    'layer_1_dynamic',
    'layer_2_ubf1',
    'layer_2_ubf2',
    'layer_2_dbf',
    'preparation', # time used to grab pk from index and convert it to pk code
]

# One dict per block. The frame is built once after the loop because
# DataFrame.append copied the whole frame on every call and no longer
# exists in pandas >= 2.0.
insert_time_records = []

# iterate through every block available in prepared data
for block_id in block_df['block_id']:
  # get all indexes for corresponding block
  block_indexes_df = index_df.loc[index_df['block_id'] == block_id]

  # preparation
  t0_prep = time.perf_counter()

  block_longest_pk = get_longest_pk(block_indexes_df)
  block_longest_pk_code = pk_to_pk_code(block_longest_pk)

  pk_code_list = get_padded_pk_code_list(block_indexes_df, block_longest_pk_code)

  t1_prep = time.perf_counter()
  tn_prep = t1_prep - t0_prep
  # endof preparation


  # insert to layer 1 static
  t0_l1s = time.perf_counter()

  layer_1_static_list = create_layer_1_static(layer_1_static_list)
  layer_1_static_list = add_layer_1_static(layer_1_static_list, block_id, block_indexes_df)

  t1_l1s = time.perf_counter()
  tn_l1s = t1_l1s - t0_l1s
  # endof insert to layer 1 static


  # insert to layer 1 dynamic
  t0_l1d = time.perf_counter()

  layer_1_dynamic_list = create_layer_1_dynamic(layer_1_dynamic_list, block_indexes_df)
  layer_1_dynamic_list = add_layer_1_dynamic(layer_1_dynamic_list, block_id, block_indexes_df)

  t1_l1d = time.perf_counter()
  tn_l1d = t1_l1d - t0_l1d
  # endof insert to layer 1 dynamic


  # insert to layer 2 ubf1
  t0_ubf1 = time.perf_counter()

  layer_2_ubf1_list = create_ubf1(layer_2_ubf1_list, block_id, block_longest_pk_code)
  layer_2_ubf1_list = add_ubf1(layer_2_ubf1_list, block_id, block_longest_pk_code, pk_code_list)

  t1_ubf1 = time.perf_counter()
  tn_ubf1 = t1_ubf1 - t0_ubf1
  # endof insert to layer 2 ubf1


  # insert to layer 2 ubf2
  t0_ubf2 = time.perf_counter()

  layer_2_ubf2_list = create_ubf2(layer_2_ubf2_list, block_id, block_longest_pk_code)
  layer_2_ubf2_list = add_ubf2(layer_2_ubf2_list, block_id, block_longest_pk_code, pk_code_list)

  t1_ubf2 = time.perf_counter()
  tn_ubf2 = t1_ubf2 - t0_ubf2
  # endof insert to layer 2 ubf2


  # insert to layer 2 dbf
  t0_dbf = time.perf_counter()

  layer_2_dbf_list = create_dbf(layer_2_dbf_list, block_id, block_longest_pk_code)
  layer_2_dbf_list = add_dbf(layer_2_dbf_list, block_id, block_longest_pk_code, block_indexes_df)

  t1_dbf = time.perf_counter()
  tn_dbf = t1_dbf - t0_dbf
  # endof insert to layer 2 dbf


  insert_time_records.append({
      'block_id': block_id,
      'layer_1_static': tn_l1s,
      'layer_1_dynamic': tn_l1d,
      'layer_2_ubf1': tn_ubf1,
      'layer_2_ubf2': tn_ubf2,
      'layer_2_dbf': tn_dbf,
      'preparation': tn_prep,
  })

# `columns=` keeps the column set and order even when no block was processed.
insert_time_df = pd.DataFrame(insert_time_records, columns=insert_time_columns)
insert_time_df['block_id'] = insert_time_df['block_id'].astype(int)

In [None]:
# export to csv
# The timings are written (without the row index) to a CSV file in the
# current working directory; the file is then passed to google.colab's
# files.download, which presumably triggers a browser download and is only
# available inside Colab.
insert_time_df.to_csv('insert_time_df.csv', index=False)
files.download('insert_time_df.csv')

# Plain-text preview of the per-block insert timings.
print(insert_time_df)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

     block_id  layer_1_static  ...  layer_2_dbf  preparation
0           0        0.000196  ...     0.181354     0.000939
1           1        0.000198  ...     0.177693     0.000873
2           2        0.000201  ...     0.204356     0.001001
3           3        0.000193  ...     0.185383     0.000910
4           4        0.000201  ...     0.175817     0.000879
..        ...             ...  ...          ...          ...
995       995        0.001017  ...     0.994705     0.004000
996       996        0.000985  ...     1.010114     0.003947
997       997        0.000959  ...     1.001142     0.004365
998       998        0.000971  ...     1.013230     0.004289
999       999        0.000964  ...     0.999831     0.003978

[1000 rows x 7 columns]


# BENCHMARK: Query

In [None]:
query_time_columns = [
    'block_id',
    'index',
    'result_layer_1_static',
    'time_layer_1_static',
    'result_layer_1_dynamic',
    'time_layer_1_dynamic',
    'result_layer_2_ubf1',
    'time_layer_2_ubf1',
    'result_layer_2_ubf2',
    'time_layer_2_ubf2',
    'result_layer_2_dbf',
    'time_layer_2_dbf',
    'preparation',
]

# One dict per queried index. The frame is built once after the loops
# because DataFrame.append copied the whole frame on every call (quadratic
# over tens of thousands of rows) and no longer exists in pandas >= 2.0.
query_time_records = []

# iterate through every block available in prepared data
for block_id in block_df['block_id']:
  block_indexes_df = index_df.loc[index_df['block_id'] == block_id]

  block_longest_pk = get_longest_pk(block_indexes_df)
  block_longest_pk_code = pk_to_pk_code(block_longest_pk)

  # every index that was inserted for this block is queried back
  for index in block_indexes_df['index']:
    # preparation: extract the pk, encode it and pad it to the block's code length
    t0_prep = time.perf_counter()

    pk = get_pk_from_index(index)
    pk_code = pk_to_pk_code(pk)
    pk_code = pk_code.ljust(len(block_longest_pk_code), '0')

    t1_prep = time.perf_counter()
    tn_prep = t1_prep - t0_prep
    # endof preparation


    # query to layer 1 static
    t0_l1s = time.perf_counter()

    r_l1s = layer_1_static_list[block_id].check(index)

    t1_l1s = time.perf_counter()
    tn_l1s = t1_l1s - t0_l1s
    # endof query to layer 1 static


    # query to layer 1 dynamic
    t0_l1d = time.perf_counter()

    r_l1d = layer_1_dynamic_list[block_id].check(index)

    t1_l1d = time.perf_counter()
    tn_l1d = t1_l1d - t0_l1d
    # endof query to layer 1 dynamic


    # query to layer 2 ubf1
    t0_ubf1 = time.perf_counter()

    r_ubf1 = query_ubf1(layer_2_ubf1_list, block_id, pk_code)

    t1_ubf1 = time.perf_counter()
    tn_ubf1 = t1_ubf1 - t0_ubf1
    # endof query to layer 2 ubf1


    # query to layer 2 ubf2
    t0_ubf2 = time.perf_counter()

    r_ubf2 = query_ubf2(layer_2_ubf2_list, block_id, pk_code)

    t1_ubf2 = time.perf_counter()
    tn_ubf2 = t1_ubf2 - t0_ubf2
    # endof query to layer 2 ubf2


    # query to layer 2 dbf
    t0_dbf = time.perf_counter()

    r_dbf = query_dbf(layer_2_dbf_list, block_id, block_longest_pk_code, index)

    t1_dbf = time.perf_counter()
    tn_dbf = t1_dbf - t0_dbf
    # endof query to layer 2 dbf


    query_time_records.append({
        'block_id': block_id,
        'index': index,
        'result_layer_1_static': r_l1s,
        'time_layer_1_static': tn_l1s,
        'result_layer_1_dynamic': r_l1d,
        'time_layer_1_dynamic': tn_l1d,
        'result_layer_2_ubf1': r_ubf1,
        'time_layer_2_ubf1': tn_ubf1,
        'result_layer_2_ubf2': r_ubf2,
        'time_layer_2_ubf2': tn_ubf2,
        'result_layer_2_dbf': r_dbf,
        'time_layer_2_dbf': tn_dbf,
        'preparation': tn_prep,
    })

# `columns=` keeps the column set and order even when nothing was queried.
query_time_df = pd.DataFrame(query_time_records, columns=query_time_columns)
query_time_df = query_time_df.astype({
    'block_id': int,
    'result_layer_1_static': bool,
    'result_layer_1_dynamic': bool,
    'result_layer_2_ubf1': bool,
    'result_layer_2_ubf2': bool,
    'result_layer_2_dbf': bool,
})

In [None]:
# export to csv
# The query results and timings are written (without the row index) to a CSV
# file in the current working directory; the file is then passed to
# google.colab's files.download, which presumably triggers a browser
# download and is only available inside Colab.
query_time_df.to_csv('query_time_df.csv', index=False)
files.download('query_time_df.csv')

# Plain-text preview of the per-index query results and timings.
print(query_time_df)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

       block_id  ... preparation
0             0  ...    0.000107
1             0  ...    0.000136
2             0  ...    0.000130
3             0  ...    0.000120
4             0  ...    0.000125
...         ...  ...         ...
27631       999  ...    0.000129
27632       999  ...    0.000127
27633       999  ...    0.000115
27634       999  ...    0.000130
27635       999  ...    0.000112

[27636 rows x 13 columns]


# BENCHMARK: FPP

In [None]:
fpp_columns = [
    'block_id',
    'layer_1_static',
    'layer_1_dynamic',
    'layer_2_ubf1',
    'layer_2_ubf2',
    'layer_2_dbf',
]

# One dict per block. The frame is built once after the loop because
# DataFrame.append no longer exists in pandas >= 2.0.
fpp_records = []

for block_id in block_df['block_id']:
  # indexes of this block, used for the dynamic n and for the pk code length
  block_indexes_df = index_df.loc[index_df['block_id'] == block_id]

  # calculate fpp for layer 1 static: (1 - (1 - 1/m)^(n*k))^k
  # m is read from the filter itself. A fixed 383 does not match the static
  # filter size derived from n and p.
  l1s_n = n # n for layer 1 static
  l1s_k = layer_1_static_list[block_id].hash_count # k for layer 1 static
  l1s_m = layer_1_static_list[block_id].size # m for layer 1 static

  l1s_fpp = pow(1 - (pow(1 - (1 / l1s_m), (l1s_n * l1s_k))), l1s_k)
  # endof calculate fpp for layer 1 static


  # calculate fpp for layer 1 dynamic: same formula, with this block's own
  # item count and filter size
  l1d_n = len(block_indexes_df.index) # n for layer 1 dynamic
  l1d_k = layer_1_dynamic_list[block_id].hash_count # k for layer 1 dynamic
  l1d_m = layer_1_dynamic_list[block_id].size # m for layer 1 dynamic

  l1d_fpp = pow(1 - (pow(1 - (1 / l1d_m), (l1d_n * l1d_k))), l1d_k)
  # endof calculate fpp for layer 1 dynamic


  block_longest_pk = get_longest_pk(block_indexes_df)
  block_longest_pk_code = pk_to_pk_code(block_longest_pk)

  f = len(block_longest_pk_code) # f for ubf1, ubf2, and dbf

  # NOTE(review): the UBF1/UBF2/DBF expressions below are kept exactly as
  # written; their derivation is not visible in this notebook - confirm
  # against the reference the scheme is based on.

  # calculate fpp for layer 2 ubf1

  # get constant from ubf_id 0 for the block
  # since all bf in same block are having the same properties
  ubf1_k = layer_2_ubf1_list[block_id][0].hash_count # k for ubf1
  ubf1_m = layer_2_ubf1_list[block_id][0].size # m for ubf1

  ubf1_fpp1 = (f - a + 2)
  ubf1_fpp2 = pow(1 - pow(math.e, -1 * ((n * ubf1_k) / ubf1_m)), ubf1_k)
  ubf1_fpp = ubf1_fpp1 * ubf1_fpp2
  # endof calculate fpp for layer 2 ubf1


  # calculate fpp for layer 2 ubf2

  # get constant from ubf_id 0 for the block
  # since all bf in same block are having the same properties
  ubf2_k = layer_2_ubf2_list[block_id][0].hash_count # k for ubf2
  ubf2_m = layer_2_ubf2_list[block_id][0].size # m for ubf2

  ubf2_fpp1 = -(f // -a) + 1
  ubf2_fpp2 = pow(1 - pow(math.e, -1 * ((n * ubf2_k) / ubf2_m)), ubf2_k)
  ubf2_fpp = ubf2_fpp1 * ubf2_fpp2
  # endof calculate fpp for layer 2 ubf2


  # calculate fpp for layer 2 dbf
  dbf_q = a

  # with p = 0.001 and a = 1000, pow(p, dbf_q) underflows to 0.0 in floating point
  dbf_fpp1 = pow(p, dbf_q)
  dbf_fpp2 = pow(1 - p, f - dbf_q + 2)
  dbf_fpp = dbf_fpp1 * dbf_fpp2
  # endof calculate fpp for layer 2 dbf

  fpp_records.append({
    'block_id': block_id,
    'layer_1_static': l1s_fpp,
    'layer_1_dynamic': l1d_fpp,
    'layer_2_ubf1': ubf1_fpp,
    'layer_2_ubf2': ubf2_fpp,
    'layer_2_dbf': dbf_fpp,
  })

# `columns=` keeps the column set and order even when no block was processed.
fpp_df = pd.DataFrame(fpp_records, columns=fpp_columns)
fpp_df['block_id'] = fpp_df['block_id'].astype(int)

In [None]:
# export to csv
# The per-block FPP values are written (without the row index) to a CSV file
# in the current working directory; the file is then passed to google.colab's
# files.download, which presumably triggers a browser download and is only
# available inside Colab.
fpp_df.to_csv('fpp_df.csv', index=False)
files.download('fpp_df.csv')

# Plain-text preview of the per-block FPP values.
print(fpp_df)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

     block_id  layer_1_static  ...  layer_2_ubf2  layer_2_dbf
0           0        0.000148  ...      0.004137          0.0
1           1        0.000148  ...      0.004137          0.0
2           2        0.000148  ...      0.004137          0.0
3           3        0.000148  ...      0.004137          0.0
4           4        0.000148  ...      0.004137          0.0
..        ...             ...  ...           ...          ...
995       995        0.000148  ...      0.004137          0.0
996       996        0.000148  ...      0.004137          0.0
997       997        0.000148  ...      0.004137          0.0
998       998        0.000148  ...      0.004137          0.0
999       999        0.000148  ...      0.004137          0.0

[1000 rows x 6 columns]


# BENCHMARK: Storage

In [None]:
# Total bit-array length over all layer 1 static filters (one per block).
layer_1_static_size = sum(block_bf.size for block_bf in layer_1_static_list)

print('layer 1 static:', layer_1_static_size, 'bits')

layer 1 static: 287000 bits


In [None]:
# Total bit-array length over all layer 1 dynamic filters (one per block).
layer_1_dynamic_size = sum(block_bf.size for block_bf in layer_1_dynamic_list)

print('layer 1 dynamic:', layer_1_dynamic_size, 'bits')

layer 1 dynamic: 396747 bits


In [None]:
# Total bit-array length over every UBF1 filter of every block.
layer_2_ubf1_size = sum(
  bf.size
  for block in layer_2_ubf1_list
  for bf in block
)

print('layer 2 ubf1:', layer_2_ubf1_size, 'bits')

layer 2 ubf1: 299260927 bits


In [None]:
# Total bit-array length over every UBF2 filter of every block.
layer_2_ubf2_size = sum(
  bf.size
  for block in layer_2_ubf2_list
  for bf in block
)

print('layer 2 ubf2:', layer_2_ubf2_size, 'bits')

layer 2 ubf2: 861000 bits


In [None]:
# Total bit-array length over every DBF filter of every block.
layer_2_dbf_size = sum(
  bf.size
  for block in layer_2_dbf_list
  for bf in block
)

print('layer 2 dbf:', layer_2_dbf_size, 'bits')

layer 2 dbf: 585973927 bits


# EXPORT: CSV

In [None]:
# One row per block: its position in layer_1_static_list and its bit array
# serialised as a string of '0'/'1' characters (bitarray.to01()).
# The rows are collected first and the frame is built once, because
# DataFrame.append copied the whole frame per call and no longer exists in
# pandas >= 2.0.
layer_1_static_records = [
  {'block_id': i, 'bit_array': block_bf.bit_array.to01()}
  for i, block_bf in enumerate(layer_1_static_list)
]

# `columns=` keeps the column set and order even when the list is empty.
layer_1_static_df = pd.DataFrame(layer_1_static_records, columns=['block_id', 'bit_array'])
layer_1_static_df['block_id'] = layer_1_static_df['block_id'].astype(int)

# print(layer_1_static_df)

In [None]:
# Write the serialised layer 1 static filters to a CSV file (row index
# omitted), then pass the file to google.colab's files.download
# (Colab-only; presumably triggers a browser download).
layer_1_static_df.to_csv('layer_1_static_df.csv', index=False)
files.download('layer_1_static_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# One row per block: its position in layer_1_dynamic_list and its bit array
# serialised as a string of '0'/'1' characters (bitarray.to01()).
# The rows are collected first and the frame is built once, because
# DataFrame.append copied the whole frame per call and no longer exists in
# pandas >= 2.0.
layer_1_dynamic_records = [
  {'block_id': i, 'bit_array': block_bf.bit_array.to01()}
  for i, block_bf in enumerate(layer_1_dynamic_list)
]

# `columns=` keeps the column set and order even when the list is empty.
layer_1_dynamic_df = pd.DataFrame(layer_1_dynamic_records, columns=['block_id', 'bit_array'])
layer_1_dynamic_df['block_id'] = layer_1_dynamic_df['block_id'].astype(int)

# print(layer_1_dynamic_df)

In [None]:
# Write the serialised layer 1 dynamic filters to a CSV file (row index
# omitted), then pass the file to google.colab's files.download
# (Colab-only; presumably triggers a browser download).
layer_1_dynamic_df.to_csv('layer_1_dynamic_df.csv', index=False)
files.download('layer_1_dynamic_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# One row per UBF1 filter: block position, filter position inside the block,
# and the bit array serialised as a string of '0'/'1' characters
# (bitarray.to01()).
# The rows are collected first and the frame is built once, because
# DataFrame.append copied the whole frame per call and no longer exists in
# pandas >= 2.0.
layer_2_ubf1_records = [
  {'block_id': i, 'ubf_id': j, 'bit_array': bf.bit_array.to01()}
  for i, block in enumerate(layer_2_ubf1_list)
  for j, bf in enumerate(block)
]

# `columns=` keeps the column set and order even when there are no filters.
layer_2_ubf1_df = pd.DataFrame(layer_2_ubf1_records, columns=['block_id', 'ubf_id', 'bit_array'])
layer_2_ubf1_df['block_id'] = layer_2_ubf1_df['block_id'].astype(int)
layer_2_ubf1_df['ubf_id'] = layer_2_ubf1_df['ubf_id'].astype(int)

In [None]:
# Write the serialised UBF1 filters to a CSV file (row index omitted), then
# pass the file to google.colab's files.download (Colab-only; presumably
# triggers a browser download).
layer_2_ubf1_df.to_csv('layer_2_ubf1_df.csv', index=False)
files.download('layer_2_ubf1_df.csv')

In [None]:
# One row per UBF2 filter: block position, filter position inside the block,
# and the bit array serialised as a string of '0'/'1' characters
# (bitarray.to01()).
# The rows are collected first and the frame is built once, because
# DataFrame.append copied the whole frame per call and no longer exists in
# pandas >= 2.0.
layer_2_ubf2_records = [
  {'block_id': i, 'ubf_id': j, 'bit_array': bf.bit_array.to01()}
  for i, block in enumerate(layer_2_ubf2_list)
  for j, bf in enumerate(block)
]

# `columns=` keeps the column set and order even when there are no filters.
layer_2_ubf2_df = pd.DataFrame(layer_2_ubf2_records, columns=['block_id', 'ubf_id', 'bit_array'])
layer_2_ubf2_df['block_id'] = layer_2_ubf2_df['block_id'].astype(int)
layer_2_ubf2_df['ubf_id'] = layer_2_ubf2_df['ubf_id'].astype(int)

In [None]:
# Write the serialised UBF2 filters to a CSV file (row index omitted), then
# pass the file to google.colab's files.download (Colab-only; presumably
# triggers a browser download).
layer_2_ubf2_df.to_csv('layer_2_ubf2_df.csv', index=False)
files.download('layer_2_ubf2_df.csv')

**IMPORTANT NOTE!**

**Exporting dbf to csv might cause runtime to crash (out of memory)**

**This is because each block holds ~1000 Bloom filters.**

In [None]:
# One row per DBF filter: block position, filter position inside the block,
# and the bit array serialised as a string of '0'/'1' characters
# (bitarray.to01()).
# The rows are collected first and the frame is built once, because
# DataFrame.append copied the whole, steadily growing frame on every call
# and no longer exists in pandas >= 2.0. The serialised strings themselves
# are still all held in memory at once.
layer_2_dbf_records = [
  {'block_id': i, 'dbf_id': j, 'bit_array': bf.bit_array.to01()}
  for i, block in enumerate(layer_2_dbf_list)
  for j, bf in enumerate(block)
]

# `columns=` keeps the column set and order even when there are no filters.
layer_2_dbf_df = pd.DataFrame(layer_2_dbf_records, columns=['block_id', 'dbf_id', 'bit_array'])
layer_2_dbf_df['block_id'] = layer_2_dbf_df['block_id'].astype(int)
layer_2_dbf_df['dbf_id'] = layer_2_dbf_df['dbf_id'].astype(int)

In [None]:
# Write the serialised DBF filters to a CSV file (row index omitted), then
# pass the file to google.colab's files.download (Colab-only; presumably
# triggers a browser download).
layer_2_dbf_df.to_csv('layer_2_dbf_df.csv', index=False)
files.download('layer_2_dbf_df.csv')