In [74]:
from operator import attrgetter
from sortedcontainers import SortedList as SortedcontainersSortedList, SortedDict as SortedcontainersSortedDict
from pyskiplist import SkipList as PySkipList
import bisect
import math
import random as random
from pyroaring import BitMap
import sys
import time
import gc

# hack to add python modules from the src folder 
sys.path.append('../src')
from bisect_killer import cy_monobound

# Welcome.

In this notebook there are 4 algorithms:

* Roaring Teleport Lists
* Roaring Split List
* Split List
* Monobound Split List

**Roaring** refers to the usage of [Roaring Bitmaps](https://arxiv.org/pdf/1603.06549.pdf) as indexes.

These are compared against `sortedcontainers`' `SortedList`.

## Invariants

All data structures have the following invariants:

* Geometrically distributed random heights that hold both the data and the indexes
* Clever use of min and max values in order to speed up lookups and inserts

### Roaring MinMaxDict

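This is the container (`RoaringMinMaxBitmap`) that holds the key indexes within each "height", together with their cached minimum and maximum. The values themselves are stored in the dict that the enclosing list derives from.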

In [90]:
class RoaringMinMaxBitmap():
    """Bitmap index for one height, with its smallest and largest key cached.

    ``min`` and ``max`` mirror the extremes of ``indexes`` so that callers can
    reject out-of-range keys with two comparisons. While the bitmap is empty
    they hold ``+inf`` / ``-inf``, so every ``min <= key <= max`` check fails.
    """

    def __init__(self, *arg, **kwargs):
        # Positional and keyword arguments are accepted but not used.
        self.indexes = BitMap()
        self.max = float('-inf')
        self.min = float('inf')

    def insert(self, key, value=None):
        """Add ``key`` to the index and refresh the cached bounds.

        ``value`` is accepted so call sites can pass it through; it is not
        stored in this object.
        """
        self.indexes.add(key)
        self.max = self.indexes.max()
        self.min = self.indexes.min()

    ## Discards a key from the index and refreshes the cached bounds
    def discard(self, key):
        self.indexes.discard(key)
        if self.indexes:
            self.max = self.indexes.max()
            self.min = self.indexes.min()
        else:
            # Back to the "empty" sentinels so range checks reject everything.
            self.max = float('-inf')
            self.min = float('inf')

    def __lt__(self, other):
        """Return True when the smallest indexed key is below ``other``.

        The cached ``min`` is compared instead of calling ``indexes.min()``:
        insert/discard keep it in sync, and it stays well-defined (``+inf``,
        so the result is False) when the bitmap is empty.
        """
        return self.min < other

## Teleport List

This is the Teleport list, the fastest data structure we have for inserts and deletions.

In [91]:
class TeleportList(dict):
    """Dict whose keys are additionally indexed in per-height bitmaps.

    Values live in the dict itself. Each key is indexed in exactly one
    ``RoaringMinMaxBitmap``; the height is drawn from a geometric distribution
    when the key is first inserted.
    """

    def __init__(self):
        super().__init__()
        self.height = -1        # index of the highest level created so far
        self.subindexes = []    # one RoaringMinMaxBitmap per level

    def insert(self, key, value=None):
        """Store ``value`` under ``key``, indexing the key if it is new.

        Re-inserting an existing key only replaces its value. It is never
        indexed a second time, because a key present on two levels makes
        ``discard`` delete the dict entry twice.
        """
        if dict.__contains__(self, key):
            dict.__setitem__(self, key, value)
            return

        height = int(-(math.log2(random.random())))

        # Create any missing levels up to and including the drawn height.
        if self.height < height:
            for _ in range(height - self.height):
                self.subindexes.append(RoaringMinMaxBitmap())
            self.height = height

        self.subindexes[height].insert(key, value)
        dict.__setitem__(self, key, value)

    def lookup(self, key):
        """Return the value stored for ``key``, or False if it is not indexed.

        A key marked by ``delete`` still resolves, to the ``'<deleted>'``
        marker. A stored value of False cannot be told apart from a miss.
        """
        for level in self.subindexes:
            # Cheap range rejection before touching the bitmap.
            if level.min <= key <= level.max and key in level.indexes:
                return dict.__getitem__(self, key)
        return False

    def delete(self, key):
        """Replace the value of an indexed ``key`` with the ``'<deleted>'`` marker."""
        for level in self.subindexes:
            if level.min <= key <= level.max and key in level.indexes:
                dict.__setitem__(self, key, '<deleted>')
                return

    def discard(self, key):
        """Remove ``key`` from every level that indexes it and from the dict.

        Keys that are not indexed are left alone, and nothing is raised.
        """
        found = False
        for level in self.subindexes:
            if level.min <= key <= level.max and key in level.indexes:
                level.discard(key)
                found = True
        if found:
            # pop() with a default tolerates an entry that is already gone.
            dict.pop(self, key, None)

    def show_hedges(self):
        """Print the bitmap of every level."""
        for level in self.subindexes:
            print(level.indexes)

    def show_minmax(self):
        """Print the cached ``(min, max)`` pair of every level."""
        for level in self.subindexes:
            print(f'({level.min}, {level.max})')

### Roaring MaxDict

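This is the container (`RoaringMaxDict`) that holds the key indexes of one bucket within a "height", together with their cached maximum. The values themselves are stored in the dict that the enclosing list derives from.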

In [92]:
class RoaringMaxDict:
    """Bitmap bucket that keeps its largest key cached in ``max``."""

    def __init__(self):
        self.indexes = BitMap()
        self.max = float("-inf")

    def insert(self, key):
        """Add ``key`` and re-read the maximum from the bitmap."""
        bitmap = self.indexes
        bitmap.add(key)
        self.max = bitmap.max()

    def discard(self, key):
        """Drop ``key``; an emptied bucket falls back to ``-inf`` as its maximum."""
        bitmap = self.indexes
        bitmap.discard(key)
        if bitmap:
            self.max = bitmap.max()
        else:
            self.max = float('-inf')

    def __lt__(self, other):
        # Plain ints are compared directly; anything else is expected to
        # expose its own ``max``. This is what lets bisect search a list of
        # buckets by key.
        bound = other if isinstance(other, int) else other.max
        return self.max < bound

class SortableSubList:
    """One height of a RoaringSplitList: a plain holder for its list of buckets."""

    def __init__(self):
        # Starts empty; the owning list appends and bisects into it.
        self.sublists = list()

## Split List

This is the Split list, the most stable data structure we have.

In [93]:
def splitter_two(arr, load):
    """Move the first ``load // 2`` entries of ``arr`` into a new bitmap.

    ``arr`` is modified in place: the entries that are returned are removed
    from it, so afterwards the two bitmaps do not overlap.

    Returns the bitmap holding the removed entries.
    """
    half = load // 2
    zs = arr[0:half]
    # ``arr = arr.difference(zs)`` would only rebind the local name and leave
    # the caller's bitmap untouched; the removal has to happen in place.
    arr.difference_update(zs)
    return zs

# if overload is detected, splits and adds a new split into levellist
def Overload(blist, i, load):
    """Split bucket ``i`` of ``blist`` and file the split-off part as a new bucket.

    The part returned by ``splitter_two`` becomes the content of a fresh
    ``RoaringMaxDict``, which is then inserted into ``blist.sublists``.
    """
    overloaded = blist.sublists[i]
    fresh = RoaringMaxDict()
    fresh.indexes = splitter_two(overloaded.indexes, load)
    fresh.max = fresh.indexes.max()
    # insort_left positions the bucket through RoaringMaxDict.__lt__,
    # i.e. by its cached maximum.
    bisect.insort_left(blist.sublists, fresh)

class RoaringSplitList(dict):
    """Dict whose keys are indexed in per-height lists of bitmap buckets.

    Values live in the dict itself. Each height is a ``SortableSubList`` whose
    ``sublists`` are ``RoaringMaxDict`` buckets searched with ``bisect`` by
    their cached maximum. A bucket that reaches ``load`` keys is split by
    ``Overload``.
    """

    def __init__(self, load):
        super().__init__()
        self.height = -1    # index of the highest level created so far
        self.blists = []    # one SortableSubList per level
        self.load = load    # bucket size that triggers a split

    def lookup(self, key):
        """Return the value stored for ``key``, or False if no bucket indexes it.

        A stored value of False cannot be told apart from a miss.
        """
        for he in self.blists:
            buckets = he.sublists
            # Skip the whole level when the key lies beyond its largest bucket.
            if key <= buckets[-1].max:
                i = bisect.bisect_left(buckets, key)
                if i != len(buckets) and buckets[i].indexes[0] <= key:
                    if key in buckets[i].indexes:
                        return dict.__getitem__(self, key)

#         print(key) #can be used to detect if the search works or not
        return False

    def insert(self, key, value=None): # Nikita added `value=None` by default
        """Store ``value`` under ``key`` and index the key on a random height."""
        ## Getting the estimated geometric distribution
        height = int(-(math.log2(random.random())))
        ## Checking whether we need to add new edges
        if self.height < height:
            for _ in range(height - self.height):
                level = SortableSubList()
                level.sublists.append(RoaringMaxDict())
                self.blists.append(level)
            self.height = height

        ## Getting the to-be-added list
        blist = self.blists[height]
        buckets = blist.sublists
        count = len(buckets)
        i = bisect.bisect_left(buckets, key)

        # Pick the receiving bucket. ``i`` is the first bucket whose max is
        # >= key; a key that falls into the gap below that bucket's first
        # entry is added to the previous bucket instead.
        if i == 0 or count == 1:
            target = 0
        elif i == count:
            target = count - 1
        elif buckets[i].indexes[0] <= key:
            target = i
        else:
            target = i - 1

        bucket = buckets[target]
        bucket.indexes.add(key)
        dict.__setitem__(self, key, value)
        # Re-reading the maximum covers both cases: the key extended the
        # bucket upwards, or it landed inside the existing range.
        bucket.max = bucket.indexes.max()
        if len(bucket.indexes) == self.load:
            Overload(blist, target, self.load)

    def show_hedges(self):
        """Print, per level, the cached maxima of its buckets."""
        for level in self.blists:
            print([bucket.max for bucket in level.sublists])

    def show_edges(self):
        """Print, per level, a header with the bucket count and then every bucket."""
        for level in self.blists:
            print("--------" + str(len(level.sublists)) +"-----------")
            for bucket in level.sublists:
                print(bucket.indexes)

    def show_minmax(self):
        """Print the ``(min, max)`` key range of every level.

        The range is derived from the buckets, because the level objects
        themselves carry no ``min``/``max`` attributes. A level without keys
        prints ``(inf, -inf)``.
        """
        for level in self.blists:
            populated = [bucket for bucket in level.sublists if bucket.indexes]
            if populated:
                low = min(bucket.indexes.min() for bucket in populated)
                high = max(bucket.max for bucket in populated)
            else:
                low, high = float('inf'), float('-inf')
            print(f'({low}, {high})')

In [94]:
## Split List

In [95]:
#splits the overloaded list into two consecutive parts
def splitterSimple(arr, load):
    """Pop the last ``load // 2`` items off ``arr`` and return them in their original order."""
    count = load // 2
    # pop() hands the items back last-to-first, so they are flipped afterwards.
    tail = [arr.pop() for _ in range(count)]
    tail.reverse()
    return tail

def splitter2(arr, load):
    """Remove the last ``load // 2`` items from ``arr``; return them in original order."""
    count = load // 2
    tail = [None] * count
    # Fill the result back to front so that no reversal is needed afterwards.
    for slot in reversed(range(count)):
        tail[slot] = arr.pop()
    return tail

# if overload is detected, splits and adds a new split into levellist
def OverloadSimple(blist, i, load):
    """Split bucket ``i`` of ``blist``: its upper part moves into a new IntervalList.

    The donor keeps its lower entries and gets a new ``max``; the new bucket
    inherits the donor's old ``max`` and is inserted into ``blist.sublists``.
    """
    donor = blist.sublists[i]
    upper = IntervalList()
    upper.indexes = splitterSimple(donor.indexes, load)
    upper.min = upper.indexes[0]
    # The split-off tail ends where the donor used to end ...
    upper.max = donor.max
    # ... and the donor now ends at its own last remaining entry.
    donor.max = donor.indexes[-1]
    # Ordered through IntervalList.__lt__, i.e. by ``max``.
    bisect.insort_left(blist.sublists, upper)

class IntervalList:
    """Bucket of keys together with its cached smallest and largest key."""

    def __init__(self):
        self.indexes = list()
        # Sentinels for "no keys yet".
        self.max = float("-inf")
        self.min = float("inf")

    def __lt__(self, other):
        # Plain ints are compared directly; anything else is expected to
        # expose its own ``max``.
        bound = other if isinstance(other, int) else other.max
        return self.max < bound

class LevelList:
    """One height of a split list: its buckets plus a ``min``/``max`` pair."""

    def __init__(self):
        self.sublists = list()
        # Start at the "empty" sentinels.
        self.min, self.max = float("inf"), float("-inf")

    def __lt__(self, other):
        # NOTE(review): "less than" is answered with ``max > other``, the
        # opposite direction to IntervalList.__lt__ -- confirm this is intended.
        return self.max > other

class SplitList:# Rucy, rename it!
    """Collection of integers spread over randomly chosen levels.

    Every inserted number goes to one level, picked by a geometric draw. A
    level (``LevelList``) holds buckets (``IntervalList``) searched with
    ``bisect`` by their ``max``; each bucket keeps a sorted list of numbers
    plus cached ``min``/``max``. A bucket that reaches ``load`` entries is
    split by ``OverloadSimple``.
    """

    def __init__(self, load=2000):
        self.height = -1    # index of the highest level created so far
        self.blists = []    # one LevelList per level
        self.load = load    # bucket size that triggers a split

    def delete(self, nr): #not tested in this notebook
        """Remove one occurrence of ``nr``.

        Returns True if an occurrence was found and removed, False otherwise.
        """
        for he in self.blists:
            hee = he.sublists
            if len(hee) != 0:
                if not( nr > hee[-1].max or nr < hee[0].min): #skipping
                    i = bisect.bisect_left(hee, nr)

                    if i != len(hee) and not(hee[i].min > nr):
                        bucket = hee[i]
                        j = bisect.bisect_left(bucket.indexes, nr)
                        if j != len(bucket.indexes) and bucket.indexes[j] == nr:
                            del bucket.indexes[j]
                            if bucket.indexes:
                                bucket.max = bucket.indexes[-1]
                                bucket.min = bucket.indexes[0]
                            elif len(hee) > 1:
                                # An emptied bucket is dropped as long as the
                                # level keeps at least one other bucket.
                                del hee[i]
                            else:
                                # insert() indexes sublists[0] / sublists[-1]
                                # unconditionally, so the last bucket of a
                                # level is reset rather than removed.
                                bucket.max = float("-inf")
                                bucket.min = float("inf")
                            return True

#         print(nr) #can be used to detect if the search works or not
        return False

    def lookup(self, nr):
        """Return True if ``nr`` is stored on any level, False otherwise."""
        for he in self.blists:
            hee = he.sublists
            if len(hee) != 0:
                if not( nr > hee[-1].max or nr < hee[0].min): #skipping
                    i = bisect.bisect_left(hee, nr)

                    if i != len(hee) and not(hee[i].min > nr):
                        j = bisect.bisect_left(hee[i].indexes, nr)
                        if j != len(hee[i].indexes) and hee[i].indexes[j] == nr:#speed up
                            return True

#         print(nr)
        return False


    def insert(self, nr):
        """Add ``nr`` to a randomly chosen level, splitting its bucket when full."""
        ## Getting the estimated geometric distribution
        height = int(-(math.log2(random.random())))
        ## Checking whether we need to add new edges
        if self.height < height:
            for _ in range(height - self.height):
                level = LevelList()
                level.sublists.append(IntervalList())
                self.blists.append(level)
            self.height = height

        ## Getting the to-be-added list
        blist = self.blists[height]
        buckets = blist.sublists

        ## Doing the search to see which Intervallist it should be in
        L = len(buckets)
        i = bisect.bisect_left(buckets, nr)

        ## nr is not above the first bucket's max: insort it there
        if i == 0: #or L == 1:
            candid = buckets[0]
            bisect.insort_left(candid.indexes, nr)
            candid.max = candid.indexes[-1]
            candid.min = candid.indexes[0]

            if len(candid.indexes) == self.load:
                OverloadSimple(blist, 0, self.load)
        ## nr is above every bucket's max: append it to the last bucket
        elif i == L:
            candid = buckets[-1]
            candid.indexes.append(nr)
            candid.max = nr
            candid.min = candid.indexes[0]
            if len(candid.indexes) == self.load:
                OverloadSimple(blist, i-1, self.load)

        ## Else it belongs to bucket i or to the gap just below it
        else:
            candid = buckets[i]
            # inside the range of bucket i: insort it
            if candid.min <= nr:
                bisect.insort_left(candid.indexes, nr)
                candid.min = candid.indexes[0]
                if len(candid.indexes) == self.load:
                    OverloadSimple(blist, i, self.load)
            # below the min of bucket i but above the max of bucket i-1:
            # append it to bucket i-1
            else:
                candid = buckets[i-1]
                candid.indexes.append(nr)
                candid.max = nr

                if len(candid.indexes) == self.load:
                    OverloadSimple(blist, i-1, self.load)

    def show_hedges(self):
        """Print, per level, the cached maxima of its buckets."""
        for level in self.blists:
            print([bucket.max for bucket in level.sublists])

    def show_edges(self):
        """Print, per level, a header with the bucket count and then every bucket."""
        for level in self.blists:
            print("--------" + str(len(level.sublists)) +"-----------")
            for bucket in level.sublists:
                print(bucket.indexes)

    def show_minmax(self):
        """Print the ``(min, max)`` range of every level.

        The range is derived from the buckets: the level's own ``min``/``max``
        attributes are never updated by insert or delete. A level without
        numbers prints ``(inf, -inf)``.
        """
        for level in self.blists:
            buckets = level.sublists
            if buckets:
                low = min(bucket.min for bucket in buckets)
                high = max(bucket.max for bucket in buckets)
            else:
                low, high = float('inf'), float('-inf')
            print(f'({low}, {high})')


## MonoboundSplitList

SplitList with `bisect` replaced by monobound binary search implemented in Cython

In [96]:
from array import array

#splits the overloaded list into two consecutive parts
def monoboundSplitterSimple(arr, load):
    """Pop the last ``load // 2`` items off ``arr`` into a new ``array('l')``, order kept."""
    count = load // 2
    tail = array('l')
    # Items come off the end last-to-first; collect them, then flip once.
    for _ in range(count):
        tail.append(arr.pop())
    tail.reverse()
    return tail

# if overload is detected, splits and adds a new split into levellist
def monoboundOverloadSimple(blist, i, load):
    """Split bucket ``i`` of ``blist``: its upper part moves into a new MonoboundIntervalList.

    The donor keeps its lower entries and gets a new ``max``; the new bucket
    inherits the donor's old ``max`` and is inserted into ``blist.sublists``.
    """
    donor = blist.sublists[i]
    upper = MonoboundIntervalList()
    upper.indexes = monoboundSplitterSimple(donor.indexes, load)
    upper.min = upper.indexes[0]
    # The split-off tail ends where the donor used to end ...
    upper.max = donor.max
    # ... and the donor now ends at its own last remaining entry.
    donor.max = donor.indexes[-1]
    # Ordered through MonoboundIntervalList.__lt__, i.e. by ``max``.
    bisect.insort_left(blist.sublists, upper)

class MonoboundIntervalList:
    """Bucket backed by an ``array('l')``, with cached smallest and largest key."""

    def __init__(self):
        self.indexes = array('l', [])
        # Sentinels for "no keys yet".
        self.max = float("-inf")
        self.min = float("inf")

    def __lt__(self, other):
        # Plain ints are compared directly; anything else is expected to
        # expose its own ``max``.
        bound = other if isinstance(other, int) else other.max
        return self.max < bound

class MonoboundSplitList:# Rucy, rename it!
    """SplitList variant whose in-bucket search is ``cy_monobound.binary_search``.

    Every inserted number goes to one level, picked by a geometric draw. A
    level (``LevelList``) holds buckets (``MonoboundIntervalList``) searched
    with ``bisect`` by their ``max``; each bucket keeps its numbers in an
    ``array('l')`` plus cached ``min``/``max``. A bucket that reaches ``load``
    entries is split by ``monoboundOverloadSimple``.
    """

    def __init__(self, load=2000):
        self.height = -1    # index of the highest level created so far
        self.blists = []    # one LevelList per level
        self.load = load    # bucket size that triggers a split

    def delete(self, nr): #not tested in this notebook
        """Remove one occurrence of ``nr``.

        Returns True if an occurrence was found and removed, False otherwise.
        """
        for he in self.blists:
            hee = he.sublists
            if len(hee) != 0:
                if not( nr > hee[-1].max or nr < hee[0].min): #skipping
                    i = bisect.bisect_left(hee, nr)

                    if i != len(hee) and not(hee[i].min > nr):
                        bucket = hee[i]
                        j = cy_monobound.binary_search(bucket.indexes, len(bucket.indexes), nr)
                        # binary_search is used as "index of the hit, or a
                        # negative number on a miss", as in lookup().
                        if j >= 0:
                            del bucket.indexes[j]
                            if bucket.indexes:
                                bucket.max = bucket.indexes[-1]
                                bucket.min = bucket.indexes[0]
                            elif len(hee) > 1:
                                # An emptied bucket is dropped as long as the
                                # level keeps at least one other bucket.
                                del hee[i]
                            else:
                                # insert() indexes sublists[0] / sublists[-1]
                                # unconditionally, so the last bucket of a
                                # level is reset rather than removed.
                                bucket.max = float("-inf")
                                bucket.min = float("inf")
                            return True

#         print(nr) #can be used to detect if the search works or not
        return False

    def lookup(self, nr):
        """Return True if ``nr`` is stored on any level, False otherwise."""
        for he in self.blists:
            hee = he.sublists
            if len(hee) != 0:
                if not( nr > hee[-1].max or nr < hee[0].min): #skipping
                    i = bisect.bisect_left(hee, nr)

                    if i != len(hee) and not(hee[i].min > nr):
                        j = cy_monobound.binary_search(hee[i].indexes, len(hee[i].indexes), nr)

                        # this binary_search implementation returns either the found index or -1, so we only need one check
                        if j >= 0:#speed up
                            return True

        #         print(nr)
        return False


    def insert(self, nr):
        """Add ``nr`` to a randomly chosen level, splitting its bucket when full."""
        ## Getting the estimated geometric distribution
        height = int(-(math.log2(random.random())))
        ## Checking whether we need to add new edges
        if self.height < height:
            for _ in range(height - self.height):
                level = LevelList()
                level.sublists.append(MonoboundIntervalList())
                self.blists.append(level)
            self.height = height

        ## Getting the to-be-added list
        blist = self.blists[height]
        buckets = blist.sublists

        ## Doing the search to see which MonoboundIntervalList it should be in
        L = len(buckets)
        i = bisect.bisect_left(buckets, nr)

        ## nr is not above the first bucket's max: insort it there
        if i == 0: #or L == 1:
            candid = buckets[0]
            bisect.insort_left(candid.indexes, nr)
            candid.max = candid.indexes[-1]
            candid.min = candid.indexes[0]

            if len(candid.indexes) == self.load:
                monoboundOverloadSimple(blist, 0, self.load)
        ## nr is above every bucket's max: append it to the last bucket
        elif i == L:
            candid = buckets[-1]
            candid.indexes.append(nr)
            candid.max = nr
            candid.min = candid.indexes[0]
            if len(candid.indexes) == self.load:
                monoboundOverloadSimple(blist, i-1, self.load)

        ## Else it belongs to bucket i or to the gap just below it
        else:
            candid = buckets[i]
            # inside the range of bucket i: insort it
            if candid.min <= nr:
                bisect.insort_left(candid.indexes, nr)
                candid.min = candid.indexes[0]
                if len(candid.indexes) == self.load:
                    monoboundOverloadSimple(blist, i, self.load)
            # below the min of bucket i but above the max of bucket i-1:
            # append it to bucket i-1
            else:
                candid = buckets[i-1]
                candid.indexes.append(nr)
                candid.max = nr

                if len(candid.indexes) == self.load:
                    monoboundOverloadSimple(blist, i-1, self.load)

    def show_hedges(self):
        """Print, per level, the cached maxima of its buckets."""
        for level in self.blists:
            print([bucket.max for bucket in level.sublists])

    def show_edges(self):
        """Print, per level, a header with the bucket count and then every bucket."""
        for level in self.blists:
            print("--------" + str(len(level.sublists)) +"-----------")
            for bucket in level.sublists:
                print(bucket.indexes)

    def show_minmax(self):
        """Print the ``(min, max)`` range of every level.

        The range is derived from the buckets: the level's own ``min``/``max``
        attributes are never updated by insert or delete. A level without
        numbers prints ``(inf, -inf)``.
        """
        for level in self.blists:
            buckets = level.sublists
            if buckets:
                low = min(bucket.min for bucket in buckets)
                high = max(bucket.max for bucket in buckets)
            else:
                low, high = float('inf'), float('-inf')
            print(f'({low}, {high})')


# Benchmarks.

Let's do some simple ones.

Insert and lookup of 1 million elements.

In [73]:
random.seed(0)
load = 10_000 #need to find optimal load
tlist = TeleportList()
rslist = RoaringSplitList(load)
slist = SortedList()
splist = SplitList()
monobound_splist = MonoboundSplitList(load)

nr = 1_000_000

ten_thousand_integers = [random.randint(1, 2000000) for i in range(nr)]

def insert_tlist(tl):
    for i in range(nr):
        tl.insert(ten_thousand_integers[i])

def insert_rslist(rtl):
    for i in range(nr):
        rtl.insert(ten_thousand_integers[i])

def insert_slist(sl):
    for i in range(nr):
        sl.add(ten_thousand_integers[i])

def insert_nlist(novus):
    for i in range(nr):
        novus.insert(ten_thousand_integers[i])

%timeit -r 1 -n 1 insert_tlist(tlist)
%timeit -r 1 -n 1 insert_rslist(rslist)
%timeit -r 1 -n 1 insert_slist(slist)
%timeit -r 1 -n 1 insert_nlist(splist)
%timeit -r 1 -n 1 insert_nlist(monobound_splist)

1.88 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
2.35 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
2.62 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
3.71 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
4.06 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [12]:
# Lookup benchmark: query each structure for the same `nr` integers that were
# inserted by the insert benchmark cell. `nr` and `ten_thousand_integers` are
# read from the notebook namespace at call time.
def lookup_tlist(tl):
    """Call ``tl.lookup`` for each of the first `nr` benchmark integers."""
    for i in range(nr):
        tl.lookup(ten_thousand_integers[i])

def lookup_rslist(rtl):
    """Call ``rtl.lookup`` for each of the first `nr` benchmark integers."""
    for i in range(nr):
        rtl.lookup(ten_thousand_integers[i])

def lookup_slist(sl):
    """Run a membership test (``in``) on ``sl`` for each of the first `nr` integers."""
    for i in range(nr):
        ten_thousand_integers[i] in sl

def lookup_nlist(novus):
    """Call ``novus.lookup`` for each of the first `nr` benchmark integers."""
    for i in range(nr):
        novus.lookup(ten_thousand_integers[i])

# NOTE: the TeleportList run uses 10 repeats, the others 2.
%timeit -r 10 -n 1 lookup_tlist(tlist)
%timeit -r 2 -n 1 lookup_rslist(rslist)
%timeit -r 2 -n 1 lookup_slist(slist)
%timeit -r 2 -n 1 lookup_nlist(splist)
%timeit -r 2 -n 1 lookup_nlist(monobound_splist)

908 ms ± 9.4 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)
2 s ± 215 µs per loop (mean ± std. dev. of 2 runs, 1 loop each)
1.96 s ± 428 µs per loop (mean ± std. dev. of 2 runs, 1 loop each)
5.74 s ± 14.2 ms per loop (mean ± std. dev. of 2 runs, 1 loop each)
2.21 s ± 56.8 ms per loop (mean ± std. dev. of 2 runs, 1 loop each)


#### MonoboundSplitList vs SortedContainers, Different load factors:

2,000 : 30% slower insert, 30% slower lookup

10,000: 44% slower insert, 0 to 13 % slower lookup

25,000: 84% slower insert, 0.9% slower lookup

35,000: 119% slower insert, 6% **faster** lookup

50,000: 260% slower insert, 10% **faster** lookup

## Testing more loading factors

In [None]:
# Lookup benchmark over bucket sizes: for every `load`, build a
# MonoboundSplitList and a SplitList, fill both through insert_nlist and time
# lookup_nlist on each.
# NOTE: this cell is unexecuted (In [None]) and the same code appears again in
# a later, executed cell of this notebook.
# NOTE: the loop rebinds the notebook-level names `load` and `splist`.
loads = [i * 100 for i in range(1, 210, 20)] # from 100 to 20100

for load in loads:
    print(f'Testing new load: {load}...')
    # Fresh structures per load, so nothing carries over between iterations.
    splist_monobound = MonoboundSplitList(load=load)
#     splist_interpolated = SplitListInterpolated(load=load) # removed
    splist = SplitList(load=load)
    
    insert_nlist(splist_monobound)
#     insert_nlist(splist_interpolated) # removed
    insert_nlist(splist)
    
    print('Monobound:    ', end='')
    %timeit -r 5 -n 2 lookup_nlist(splist_monobound)   
#     print('Interpolated: ', end='') # removed
#     %timeit -r 5 -n 2 lookup_nlist(splist_interpolated) # removed
    print('Bisect:       ', end='')
    %timeit -r 5 -n 2 lookup_nlist(splist)
    
    print('\n')

In [None]:
# Lookup benchmark for MonoboundSplitList at a handful of hand-picked bucket sizes.
# NOTE: this cell is unexecuted (In [None]) and the same code appears again in
# a later, executed cell of this notebook.
# NOTE: the loop rebinds the notebook-level name `load`.
loads = [2_000, 10_000, 14_000, 16_000, 18_000] # more specific loads, finding the optimal binary search + bucket

for load in loads:
    print(f'Testing new load: {load}...')
    # Fresh structure per load, so nothing carries over between iterations.
    splist_monobound = MonoboundSplitList(load=load)
#     splist_interpolated = SplitListInterpolated(load=load) # removed
    
    insert_nlist(splist_monobound)
#     insert_nlist(splist_interpolated) # removed
    
    print('Monobound:    ', end='')
    %timeit -r 10 -n 5 lookup_nlist(splist_monobound)   
#     print('Interpolated: ', end='')
#     %timeit -r 10 -n 5 lookup_nlist(splist_interpolated) # removed
    
    print('\n')

In [None]:
# Lookup benchmark for MonoboundSplitList at very large bucket sizes. The
# largest value equals the number of integers generated by the insert benchmark cell.
# NOTE: this cell is unexecuted (In [None]) and the same code appears again in
# a later, executed cell of this notebook.
# NOTE: the loop rebinds the notebook-level name `load`.
loads = [50_000, 100_000, 200_000, 500_000, 1_000_000] # just some random huge bucket sizes

for load in loads:
    print(f'Testing new load: {load}...')
    # Fresh structure per load, so nothing carries over between iterations.
    splist_monobound = MonoboundSplitList(load=load)
#     splist_interpolated = SplitListInterpolated(load=load) # removed
    
    insert_nlist(splist_monobound)
#     insert_nlist(splist_interpolated) # removed
    
    print('Monobound:    ', end='')
    %timeit -r 10 -n 5 lookup_nlist(splist_monobound)   
#     print('Interpolated: ', end='')
#     %timeit -r 10 -n 5 lookup_nlist(splist_interpolated) # removed
    
    print('\n')

In [None]:
#maybe useful for benchmarking
#nr = 100000

# NOTE(review): this cell does not look runnable as written. The timing lines
# call ten_k_novus / ten_k_hg / ten_k_sl / ten_k_skeep with two arguments,
# while the helpers defined here are the one-argument ten_k_*_s variants. It
# also uses SkipList, SortedList, NovusList, np and plt, none of which are
# defined or imported in the cells visible above (the import cell binds
# PySkipList and SortedcontainersSortedList instead). Confirm the intended
# names before running.

def ten_k_novus_s(novus):
    """Call ``novus.lookup`` for each of the first `nr` integers."""
    for i in range(nr):
        novus.lookup(ten_thousand_integers[i])

def ten_k_hg_s(HG):
    """Call ``HG.lookup`` for each of the first `nr` integers."""
    for i in range(nr):
        HG.lookup(ten_thousand_integers[i])

def ten_k_sl_s(sl):
    """Run a membership test on ``sl`` for each of the first `nr` integers."""
    for i in range(nr):
        ten_thousand_integers[i] in sl

def ten_k_skeep_s(skeep):
    """Run a membership test on ``skeep`` for each of the first `nr` integers."""
    for i in range(nr):
        ten_thousand_integers[i] in skeep

# Accumulators for the per-size timings, summed over the `n` seeds and then averaged.
lon = []
los = []
loskip = []
lot = []
n = 1
for j in range(n):
    timesnovus = []
    timess = []
    timestele = []
    timesskip = []
    # Data sizes 100, 600, 1100, ... below 100000.
    for k in range(100, 100000, 500):
        #nlist = NovusList(i)
        # NOTE: rebinds the notebook-level `nr`, `slist` and `ten_thousand_integers`.
        nr = k
        hgraph = TeleportList()
        skeepFast = SkipList()
        slist = SortedList()
        nlist = NovusList(2000)
        random.seed(j)
        ten_thousand_integers = [random.randint(1, 1000000) for i in range(nr)]
        #ten_k_novus(nlist, ten_thousand_integers)  
        #ten_k_hg(hgraph, ten_thousand_integers)
        #ten_k_sl(slist, ten_thousand_integers)
        #ten_k_skeep(skeepFast, ten_thousand_integers)
        #ten_thousand_integers = [random.randint(1, 1000000) for i in range(nr)]
        t1 = %timeit -q -o -r 1 -n 1 ten_k_novus(nlist,ten_thousand_integers)
        t2 = %timeit -q -o -r 1 -n 1 ten_k_hg(hgraph,ten_thousand_integers)
        t3 = %timeit -q -o -r 1 -n 1 ten_k_sl(slist,ten_thousand_integers)
        t4 = %timeit -q -o -r 1 -n 1 ten_k_skeep(skeepFast,ten_thousand_integers)
        # NOTE: the list names do not match their content: `timess` receives
        # the TeleportList timing (t2) and `timestele` the SortedList timing
        # (t3). The plot labels further down ("tele" for los, "sc" for lot)
        # do match the data.
        timesnovus.append(t1.best)
        timess.append(t2.best)
        timestele.append(t3.best)
        timesskip.append(t4.best)
        
    if j == 0:
        lon = np.array(timesnovus)
        los = np.array(timess)
        lot = np.array(timestele)
        loskip = np.array(timesskip)
        
    else:
        lon += np.array(timesnovus)
        los += np.array(timess)
        lot += np.array(timestele)
        loskip += np.array(timesskip)
# Average over the seeds.
lon /= n
los /= n
lot /= n
loskip /= n
#print(sum(times)/n)
# The x axis repeats the data sizes used in the inner loop above.
plt.plot(np.arange(100,100000,500), lon, label="novus")
plt.plot(np.arange(100,100000,500), los,  label="tele")
plt.plot(np.arange(100,100000,500), lot,  label="sc")
plt.plot(np.arange(100,100000,500), loskip,  label="skip")
plt.legend()
plt.ylabel("time")
plt.xlabel("size")
plt.title("time effectivness")
plt.show()


## Testing more loading factors

In [80]:
# Lookup benchmark over bucket sizes: for every `load`, build a
# MonoboundSplitList and a SplitList, fill both through insert_nlist and time
# lookup_nlist on each.
# NOTE: the recorded output below still contains "Interpolated:" lines although
# the statements printing them are commented out here, so that output comes
# from an earlier version of this cell.
# NOTE: the loop rebinds the notebook-level names `load` and `splist`.
loads = [i * 100 for i in range(1, 210, 20)] # from 100 to 20100

for load in loads:
    print(f'Testing new load: {load}...')
    # Fresh structures per load, so nothing carries over between iterations.
    splist_monobound = MonoboundSplitList(load=load)
#     splist_interpolated = SplitListInterpolated(load=load) # removed
    splist = SplitList(load=load)
    
    insert_nlist(splist_monobound)
#     insert_nlist(splist_interpolated) # removed
    insert_nlist(splist)
    
    print('Monobound:    ', end='')
    %timeit -r 5 -n 2 lookup_nlist(splist_monobound)   
#     print('Interpolated: ', end='') # removed
#     %timeit -r 5 -n 2 lookup_nlist(splist_interpolated) # removed
    print('Bisect:       ', end='')
    %timeit -r 5 -n 2 lookup_nlist(splist)
    
    print('\n')

Testing new load: 100...
Monobound:    286 ms ± 5.23 ms per loop (mean ± std. dev. of 5 runs, 2 loops each)
Interpolated: 280 ms ± 1e+03 µs per loop (mean ± std. dev. of 5 runs, 2 loops each)
Bisect:       562 ms ± 5.54 ms per loop (mean ± std. dev. of 5 runs, 2 loops each)


Testing new load: 2100...
Monobound:    198 ms ± 2.67 ms per loop (mean ± std. dev. of 5 runs, 2 loops each)
Interpolated: 200 ms ± 2.36 ms per loop (mean ± std. dev. of 5 runs, 2 loops each)
Bisect:       424 ms ± 3.28 ms per loop (mean ± std. dev. of 5 runs, 2 loops each)


Testing new load: 4100...
Monobound:    184 ms ± 977 µs per loop (mean ± std. dev. of 5 runs, 2 loops each)
Interpolated: 186 ms ± 2.75 ms per loop (mean ± std. dev. of 5 runs, 2 loops each)
Bisect:       473 ms ± 70.6 ms per loop (mean ± std. dev. of 5 runs, 2 loops each)


Testing new load: 6100...
Monobound:    181 ms ± 4.33 ms per loop (mean ± std. dev. of 5 runs, 2 loops each)
Interpolated: 185 ms ± 1.55 ms per loop (mean ± std. dev. of 

In [82]:
# Lookup benchmark for MonoboundSplitList at a handful of hand-picked bucket sizes.
# NOTE: the recorded output below still contains "Interpolated:" lines although
# the statements printing them are commented out here, so that output comes
# from an earlier version of this cell.
# NOTE: the loop rebinds the notebook-level name `load`.
loads = [2_000, 10_000, 14_000, 16_000, 18_000] # more specific loads, finding the optimal binary search + bucket

for load in loads:
    print(f'Testing new load: {load}...')
    # Fresh structure per load, so nothing carries over between iterations.
    splist_monobound = MonoboundSplitList(load=load)
#     splist_interpolated = SplitListInterpolated(load=load) # removed
    
    insert_nlist(splist_monobound)
#     insert_nlist(splist_interpolated) # removed
    
    print('Monobound:    ', end='')
    %timeit -r 10 -n 5 lookup_nlist(splist_monobound)   
#     print('Interpolated: ', end='')
#     %timeit -r 10 -n 5 lookup_nlist(splist_interpolated) # removed
    
    print('\n')

Testing new load: 2000...
Monobound:    196 ms ± 1.68 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)
Interpolated: 197 ms ± 1.79 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)


Testing new load: 10000...
Monobound:    171 ms ± 3.31 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)
Interpolated: 173 ms ± 2.12 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)


Testing new load: 14000...
Monobound:    159 ms ± 3.72 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)
Interpolated: 163 ms ± 1.83 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)


Testing new load: 16000...
Monobound:    154 ms ± 1.42 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)
Interpolated: 162 ms ± 2.24 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)


Testing new load: 18000...
Monobound:    156 ms ± 2.5 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)
Interpolated: 161 ms ± 2.32 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)




In [85]:
# Lookup benchmark for MonoboundSplitList at very large bucket sizes. The
# largest value equals the number of integers generated by the insert benchmark cell.
# NOTE: the recorded output below still contains "Interpolated:" lines although
# the statements printing them are commented out here, so that output comes
# from an earlier version of this cell.
# NOTE: the loop rebinds the notebook-level name `load`.
loads = [50_000, 100_000, 200_000, 500_000, 1_000_000] # just some random huge bucket sizes

for load in loads:
    print(f'Testing new load: {load}...')
    # Fresh structure per load, so nothing carries over between iterations.
    splist_monobound = MonoboundSplitList(load=load)
#     splist_interpolated = SplitListInterpolated(load=load) # removed
    
    insert_nlist(splist_monobound)
#     insert_nlist(splist_interpolated) # removed
    
    print('Monobound:    ', end='')
    %timeit -r 10 -n 5 lookup_nlist(splist_monobound)   
#     print('Interpolated: ', end='')
#     %timeit -r 10 -n 5 lookup_nlist(splist_interpolated) # removed
    
    print('\n')

Testing new load: 50000...
Monobound:    131 ms ± 2.41 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)
Interpolated: 142 ms ± 4.99 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)


Testing new load: 100000...
Monobound:    135 ms ± 2.94 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)
Interpolated: 141 ms ± 20.5 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)


Testing new load: 200000...
Monobound:    160 ms ± 23.4 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)
Interpolated: 138 ms ± 2.31 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)


Testing new load: 500000...
Monobound:    134 ms ± 1.69 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)
Interpolated: 136 ms ± 1.64 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)


Testing new load: 1000000...
Monobound:    133 ms ± 3.75 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)
Interpolated: 136 ms ± 2.25 ms per loop (mean ± std. dev. of 10 runs, 5 loops each)




# Official Benchmarks

We run comparisons for two abstract data structures:

`SortedList` - SplitList, MonoboundSplitList, SkipList from `pyskiplist`, SortedList from `sortedcontainers`

`SortedDict` - RoaringSplitList, RoaringTeleportList, SortedDict from `sortedcontainers`

In [69]:
import csv

## SortedList

In [97]:
# INSERT

def insert_sortedcontainers_list(lst, data):
    """Pass every element of ``data`` to ``lst.add``, one call per element."""
    for item in data:
        lst.add(item)
        
def insert_sortedcontainers_dict(dct, data):
    """Call ``dct.setdefault`` once for every element of ``data``."""
    for item in data:
        dct.setdefault(item)

def insert_our(lst, data):
    """Pass every element of ``data`` to ``lst.insert``, one call per element."""
    for item in data:
        lst.insert(item)
        
def insert_pyskiplist(lst, data):
    """Call ``lst.insert(item, item)`` for every item: each one is both key and value."""
    for item in data:
        lst.insert(item, item)
        
# LOOKUP

def lookup_sortedcontainers_list(lst, data):
    """Run one ``in`` membership test against ``lst`` per element of ``data``.

    The outcome of each test is discarded.
    """
    for item in data:
        item in lst
        
def lookup_sortedcontainers_dict(dct, data):
    """Call ``dct.get`` once per element of ``data``; the results are discarded."""
    for item in data:
        dct.get(item)

def lookup_our(lst, data):
    """Call ``lst.lookup`` once per element of ``data``; the results are discarded."""
    for item in data:
        lst.lookup(item)
        
def lookup_pyskiplist(lst, data):
    """Lookup workload for containers exposing ``search(key)``.

    Calls ``lst.search`` once per element of ``data``, in iteration order,
    and discards the result. Returns ``None``.
    """
    for key in data:
        lst.search(key)

In [100]:
n_seeds = 100 # number of randomized trials for each configuration
n_elements_list = [int(el) for el in [1e1, 1e2, 1e3, 1e4, 1e5, 1e6]]
loads = [2_000, 10_000]
element_max_size = int(2e6)

total_n_trials = n_seeds * len(n_elements_list)
trial_count = 0

with open('benchmarks.csv', 'w', newline='') as file:
    
    writer = csv.writer(file)
    writer.writerow(["Abstract Data Type", "Implementation", "Data Size", "Runtime", "Operation"])

    for n_elements in n_elements_list:

            for seed in range(n_seeds):
                trial_count += 1

                print(f'-- Trial {trial_count}/{total_n_trials}. Seed: {seed}. Data size: {n_elements}. Running...', end=' ')

                trial_start = time.time()
                
                random.seed(seed)
                trial_insert_data = [random.randint(1, element_max_size) for i in range(n_elements)] 

                random.seed(seed+1)
                trial_lookup_not_inserted_data = [random.randint(1, element_max_size) for i in range(n_elements)] 

                # SortedList
                sortedcontainers_list = SortedcontainersSortedList()
                pyskiplist = PySkipList()
                
                # SortedDict
                our_teleport_list = TeleportList()
                sortedcontainers_sorted_dict = SortedcontainersSortedDict()

                # INSERT

                t = %timeit -q -o -r 1 -n 1 insert_sortedcontainers_list(sortedcontainers_list, trial_insert_data)
                writer.writerow(['SortedList', f'SortedContainersSortedList', n_elements, f'{t.average:.8f}', 'insert'])
                
                t = %timeit -q -o -r 1 -n 1 insert_pyskiplist(pyskiplist, trial_insert_data)
                writer.writerow(['SortedList', f'PySkipList', n_elements, f'{t.average:.8f}', 'insert'])
                
                t = %timeit -q -o -r 1 -n 1 insert_our(our_teleport_list, trial_insert_data)
                writer.writerow(['SortedDict', f'RoaringTeleportList', n_elements, f'{t.average:.8f}', 'insert'])
                
                t = %timeit -q -o -r 1 -n 1 insert_sortedcontainers_dict(sortedcontainers_sorted_dict, trial_insert_data)
                writer.writerow(['SortedDict', f'SortedContainersSortedDict', n_elements, f'{t.average:.8f}', 'insert'])

                # LOOKUP
                
                # lookup data that was inserted
                t = %timeit -q -o -r 1 -n 1 lookup_sortedcontainers_list(sortedcontainers_list, trial_insert_data)
                writer.writerow(['SortedList', f'SortedContainersSortedList', n_elements, f'{t.average:.8f}', 'lookup_inserted'])
                
                t = %timeit -q -o -r 1 -n 1 lookup_pyskiplist(pyskiplist, trial_insert_data)
                writer.writerow(['SortedList', f'PySkipList', n_elements, f'{t.average:.8f}', 'lookup_inserted'])
                
                t = %timeit -q -o -r 1 -n 1 lookup_our(our_teleport_list, trial_insert_data)
                writer.writerow(['SortedDict', f'RoaringTeleportList', n_elements, f'{t.average:.8f}', 'lookup_inserted'])
                
                t = %timeit -q -o -r 1 -n 1 lookup_sortedcontainers_dict(sortedcontainers_sorted_dict, trial_insert_data)
                writer.writerow(['SortedDict', f'SortedContainersSortedDict', n_elements, f'{t.average:.8f}', 'lookup_inserted'])

                # lookup data that was NOT inserted
                t = %timeit -q -o -r 1 -n 1 lookup_sortedcontainers_list(sortedcontainers_list, trial_lookup_not_inserted_data)
                writer.writerow(['SortedList', f'SortedContainersSortedList', n_elements, f'{t.average:.8f}', 'lookup_not_inserted'])
                
                t = %timeit -q -o -r 1 -n 1 lookup_pyskiplist(pyskiplist, trial_lookup_not_inserted_data)
                writer.writerow(['SortedList', f'PySkipList', n_elements, f'{t.average:.8f}', 'lookup_not_inserted'])
                
                t = %timeit -q -o -r 1 -n 1 lookup_our(our_teleport_list, trial_lookup_not_inserted_data)
                writer.writerow(['SortedDict', f'RoaringTeleportList', n_elements, f'{t.average:.8f}', 'lookup_not_inserted'])
                
                t = %timeit -q -o -r 1 -n 1 lookup_sortedcontainers_dict(sortedcontainers_sorted_dict, trial_lookup_not_inserted_data)
                writer.writerow(['SortedDict', f'SortedContainersSortedDict', n_elements, f'{t.average:.8f}', 'lookup_not_inserted'])
                
                for load in loads:
                    
                    # SortedList
                    our_splitlist = SplitList(load)
                    our_mono_splist = MonoboundSplitList(load)
                    
                    # SortedDict
                    our_roaring_splist = RoaringSplitList(load)
                    
                    # INSERT

                    t = %timeit -q -o -r 1 -n 1 insert_our(our_splitlist, trial_insert_data)
                    writer.writerow(['SortedList', f'SplitList-{load}', n_elements, f'{t.average:.8f}', 'insert'])

                    t = %timeit -q -o -r 1 -n 1 insert_our(our_mono_splist, trial_insert_data)
                    writer.writerow(['SortedList', f'MonoboundSplitList-{load}', n_elements, f'{t.average:.8f}', 'insert'])

                    t = %timeit -q -o -r 1 -n 1 insert_our(our_roaring_splist, trial_insert_data)
                    writer.writerow(['SortedDict', f'RoaringSplitList-{load}', n_elements, f'{t.average:.8f}', 'insert'])

                    # LOOKUP
                    
                    # lookup data that was inserted
                    t = %timeit -q -o -r 1 -n 1 lookup_our(our_splitlist, trial_insert_data)
                    writer.writerow(['SortedList', f'SplitList-{load}', n_elements, f'{t.average:.8f}', 'lookup_inserted'])

                    t = %timeit -q -o -r 1 -n 1 lookup_our(our_mono_splist, trial_insert_data)
                    writer.writerow(['SortedList', f'MonoboundSplitList-{load}', n_elements, f'{t.average:.8f}', 'lookup_inserted'])

                    t = %timeit -q -o -r 1 -n 1 lookup_our(our_roaring_splist, trial_insert_data)
                    writer.writerow(['SortedDict', f'RoaringSplitList-{load}', n_elements, f'{t.average:.8f}', 'lookup_inserted'])
                    
                    # lookup data that was NOT inserted
                    t = %timeit -q -o -r 1 -n 1 lookup_our(our_splitlist, trial_lookup_not_inserted_data)
                    writer.writerow(['SortedList', f'SplitList-{load}', n_elements, f'{t.average:.8f}', 'lookup_not_inserted'])

                    t = %timeit -q -o -r 1 -n 1 lookup_our(our_mono_splist, trial_lookup_not_inserted_data)
                    writer.writerow(['SortedList', f'MonoboundSplitList-{load}', n_elements, f'{t.average:.8f}', 'lookup_not_inserted'])

                    t = %timeit -q -o -r 1 -n 1 lookup_our(our_roaring_splist, trial_lookup_not_inserted_data)
                    writer.writerow(['SortedDict', f'RoaringSplitList-{load}', n_elements, f'{t.average:.8f}', 'lookup_not_inserted'])
                    
                    # small cleanup
                    del our_splitlist
                    del our_mono_splist
                    del our_roaring_splist
                                        
                # cleanup
                del trial_insert_data
                del trial_lookup_not_inserted_data
                
                del sortedcontainers_list
                del pyskiplist
                del our_teleport_list
                del sortedcontainers_sorted_dict

                gc.collect() # Sir garbage collector, please clean up this mess.

                trial_total = time.time() - trial_start

                print(f'Done: {trial_total:.2f}s --')
            

-- Trial 1/600. Seed: 0. Data size: 10. Running... Done: 0.12s --
-- Trial 2/600. Seed: 1. Data size: 10. Running... Done: 0.12s --
-- Trial 3/600. Seed: 2. Data size: 10. Running... Done: 0.12s --
-- Trial 4/600. Seed: 3. Data size: 10. Running... Done: 0.13s --
-- Trial 5/600. Seed: 4. Data size: 10. Running... Done: 0.12s --
-- Trial 6/600. Seed: 5. Data size: 10. Running... Done: 0.12s --
-- Trial 7/600. Seed: 6. Data size: 10. Running... Done: 0.13s --
-- Trial 8/600. Seed: 7. Data size: 10. Running... Done: 0.12s --
-- Trial 9/600. Seed: 8. Data size: 10. Running... Done: 0.12s --
-- Trial 10/600. Seed: 9. Data size: 10. Running... Done: 0.13s --
-- Trial 11/600. Seed: 10. Data size: 10. Running... Done: 0.12s --
-- Trial 12/600. Seed: 11. Data size: 10. Running... Done: 0.12s --
-- Trial 13/600. Seed: 12. Data size: 10. Running... Done: 0.12s --
-- Trial 14/600. Seed: 13. Data size: 10. Running... Done: 0.12s --
-- Trial 15/600. Seed: 14. Data size: 10. Running... Done: 0.12s --