In [1]:
import csv
import gc
import math
import os
import random as random
import sys
import time
import timeit

from pyskiplist import SkipList as PySkipList
from sortedcontainers import SortedList as SortedcontainersSortedList, SortedDict as SortedcontainersSortedDict

from witchcraft.sortedlist import SplitList, MonoboundSplitList
from witchcraft.sorteddict import RoaringTeleportList, RoaringSplitList

In [3]:
# Smoke test: after inserting a single key, MonoboundSplitList must report
# that key as present and a key that was never inserted as absent.
smoke_list = MonoboundSplitList()
smoke_list.insert(4)

found_inserted, found_missing = smoke_list.lookup(4), smoke_list.lookup(5)

assert found_inserted == True
assert found_missing == False

print('Monobound works!')

# Drop the temporaries so they do not linger in the kernel namespace.
del smoke_list, found_inserted, found_missing

Monobound works!


# Welcome.

In this notebook there are 4 algorithms:

* Roaring Teleport List
* Roaring Split List
* Split List
* Monobound Split List

**Roaring** refers to the usage of [Roaring Bitmaps](https://arxiv.org/pdf/1603.06549.pdf) as indexes.

These are compared with `SortedList` and `SortedDict` from `sortedcontainers`, and with `SkipList` from `pyskiplist`.

## Invariants

All data structures have the following invariants:

* Geometrically distributed random heights that hold both the data and the indexes
* Clever use of min and max values in order to speed up lookups and inserts

# Official Benchmarks

We run comparisons for two abstract data structures:

`SortedList` - SplitList, MonoboundSplitList, SkipList from `pyskiplist`, SortedList from `sortedcontainers`

`SortedDict` - RoaringSplitList, RoaringTeleportList, SortedDict from `sortedcontainers`

## SortedList

In [3]:
# INSERT

def insert_sortedcontainers_list(lst, data):
    """Add every element of ``data`` to ``lst``, one ``lst.add`` call per element."""
    for item in data:
        lst.add(item)
        
def insert_sortedcontainers_dict(dct, data):
    """Register every element of ``data`` as a key of ``dct`` via ``dct.setdefault``."""
    for key in data:
        dct.setdefault(key)

def insert_our(lst, data):
    """Insert every element of ``data`` into ``lst``, one ``lst.insert`` call per element."""
    for item in data:
        lst.insert(item)
        
def insert_pyskiplist(lst, data):
    """Insert every element of ``data`` into ``lst``, using the element as both key and value."""
    for item in data:
        lst.insert(item, item)
        
# LOOKUP

def lookup_sortedcontainers_list(lst, data):
    """Run one membership test (``in``) against ``lst`` per element of ``data``; results are discarded."""
    for item in data:
        item in lst
        
def lookup_sortedcontainers_dict(dct, data):
    """Call ``dct.get`` once per element of ``data``; results are discarded."""
    for key in data:
        dct.get(key)

def lookup_our(lst, data):
    """Call ``lst.lookup`` once per element of ``data``; results are discarded."""
    for item in data:
        lst.lookup(item)
        
def lookup_pyskiplist(lst, data):
    """Call ``lst.search`` once per element of ``data``; results are discarded."""
    for item in data:
        lst.search(item)

In [100]:
# Benchmark driver: for every data size and seed, time insert / lookup on each
# implementation exactly once and append one row per measurement to
# 'benchmarks.csv'.

n_seeds = 100 # number of randomized trials for each configuration
n_elements_list = [int(el) for el in [1e1, 1e2, 1e3, 1e4, 1e5, 1e6]]
loads = [2_000, 10_000] # constructor argument tried for each split-list variant
element_max_size = int(2e6) # keys are drawn uniformly from [1, element_max_size]

# The rejection sampling of never-inserted keys (below) can only terminate
# while the inserted keys leave part of the key space unused.
assert max(n_elements_list) < element_max_size

total_n_trials = n_seeds * len(n_elements_list)
trial_count = 0


def _time_and_record(writer, abstract_type, implementation, n_elements, operation, func, container, data):
    """Run ``func(container, data)`` once and write its runtime as one CSV row.

    ``timeit.timeit(..., number=1)`` performs a single timed call with the
    garbage collector disabled, i.e. the same measurement the notebook
    previously took with ``%timeit -r 1 -n 1``. The row is written after the
    timed call, so CSV I/O is never part of the measurement.
    """
    runtime = timeit.timeit(lambda: func(container, data), number=1)
    writer.writerow([abstract_type, implementation, n_elements, f'{runtime:.8f}', operation])


def _benchmark_group(writer, n_elements, contenders, insert_data, not_inserted_data):
    """Benchmark a group of containers phase by phase.

    ``contenders`` is a list of
    ``(abstract_type, implementation, container, insert_func, lookup_func)``
    tuples. Every contender is first filled with ``insert_data``; then all of
    them are probed with the inserted keys, and finally with
    ``not_inserted_data``. Within each phase the contenders run in list order.
    """
    for abstract_type, implementation, container, insert_func, _ in contenders:
        _time_and_record(writer, abstract_type, implementation, n_elements,
                         'insert', insert_func, container, insert_data)

    lookup_phases = (
        ('lookup_inserted', insert_data),
        ('lookup_not_inserted', not_inserted_data),
    )
    for operation, lookup_data in lookup_phases:
        for abstract_type, implementation, container, _, lookup_func in contenders:
            _time_and_record(writer, abstract_type, implementation, n_elements,
                             operation, lookup_func, container, lookup_data)


with open('benchmarks.csv', 'w', newline='') as file:

    writer = csv.writer(file)
    writer.writerow(["Abstract Data Type", "Implementation", "Data Size", "Runtime", "Operation"])

    for n_elements in n_elements_list:

        for seed in range(n_seeds):
            trial_count += 1

            print(f'-- Trial {trial_count}/{total_n_trials}. Seed: {seed}. Data size: {n_elements}. Running...', end=' ')

            trial_start = time.time()

            random.seed(seed)
            trial_insert_data = [random.randint(1, element_max_size) for _ in range(n_elements)]

            # Keys for the "not inserted" lookups come from the same range as
            # the inserted keys, but any key that was actually inserted is
            # rejected. Without this filter a large share of these lookups
            # would be hits whenever n_elements is comparable to
            # element_max_size.
            random.seed(seed + 1)
            inserted_values = set(trial_insert_data)
            trial_lookup_not_inserted_data = []
            while len(trial_lookup_not_inserted_data) < n_elements:
                candidate = random.randint(1, element_max_size)
                if candidate not in inserted_values:
                    trial_lookup_not_inserted_data.append(candidate)
            del inserted_values

            # Implementations that take no load parameter.
            baseline_contenders = [
                ('SortedList', 'SortedContainersSortedList', SortedcontainersSortedList(),
                 insert_sortedcontainers_list, lookup_sortedcontainers_list),
                ('SortedList', 'PySkipList', PySkipList(),
                 insert_pyskiplist, lookup_pyskiplist),
                # RoaringTeleportList is the name imported from
                # witchcraft.sorteddict; a bare `TeleportList` is undefined.
                ('SortedDict', 'RoaringTeleportList', RoaringTeleportList(),
                 insert_our, lookup_our),
                ('SortedDict', 'SortedContainersSortedDict', SortedcontainersSortedDict(),
                 insert_sortedcontainers_dict, lookup_sortedcontainers_dict),
            ]
            _benchmark_group(writer, n_elements, baseline_contenders,
                             trial_insert_data, trial_lookup_not_inserted_data)

            for load in loads:

                # Implementations constructed with the load parameter.
                load_contenders = [
                    ('SortedList', f'SplitList-{load}', SplitList(load),
                     insert_our, lookup_our),
                    ('SortedList', f'MonoboundSplitList-{load}', MonoboundSplitList(load),
                     insert_our, lookup_our),
                    ('SortedDict', f'RoaringSplitList-{load}', RoaringSplitList(load),
                     insert_our, lookup_our),
                ]
                _benchmark_group(writer, n_elements, load_contenders,
                                 trial_insert_data, trial_lookup_not_inserted_data)

                # small cleanup
                del load_contenders

            # cleanup
            del trial_insert_data
            del trial_lookup_not_inserted_data
            del baseline_contenders

            gc.collect() # Sir garbage collector, please clean up this mess.

            trial_total = time.time() - trial_start

            print(f'Done: {trial_total:.2f}s --')
            

-- Trial 1/600. Seed: 0. Data size: 10. Running... Done: 0.12s --
-- Trial 2/600. Seed: 1. Data size: 10. Running... Done: 0.12s --
-- Trial 3/600. Seed: 2. Data size: 10. Running... Done: 0.12s --
-- Trial 4/600. Seed: 3. Data size: 10. Running... Done: 0.13s --
-- Trial 5/600. Seed: 4. Data size: 10. Running... Done: 0.12s --
-- Trial 6/600. Seed: 5. Data size: 10. Running... Done: 0.12s --
-- Trial 7/600. Seed: 6. Data size: 10. Running... Done: 0.13s --
-- Trial 8/600. Seed: 7. Data size: 10. Running... Done: 0.12s --
-- Trial 9/600. Seed: 8. Data size: 10. Running... Done: 0.12s --
-- Trial 10/600. Seed: 9. Data size: 10. Running... Done: 0.13s --
-- Trial 11/600. Seed: 10. Data size: 10. Running... Done: 0.12s --
-- Trial 12/600. Seed: 11. Data size: 10. Running... Done: 0.12s --
-- Trial 13/600. Seed: 12. Data size: 10. Running... Done: 0.12s --
-- Trial 14/600. Seed: 13. Data size: 10. Running... Done: 0.12s --
-- Trial 15/600. Seed: 14. Data size: 10. Running... Done: 0.12s --