## <u>Cuckoo Filter</u>
### Maggie Drew & Adam Gibbs

Implementation of Cuckoo Filter for estimating set membership on data streams. Implementation psuedocode from the following paper: https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf. This document also helped with figuring out Partial-Kay Cuckoo Hashing: https://williams-cs.github.io/cs358-s20/lectures/lecture10/bloom.pdf. This notebook contains the following:

1. Imports used in the notebook
2. All global variables and python data structures (lists, dictionaries, etc.) used
3. Code for Cuckoo Filter functions
4. Cuckoo filter test on simulated data stream
5. Results
6. Analysis of Results 

In [1]:
# IMPORTS
import matplotlib as plt
import numpy as np 
import pandas as pd
import gzip
import random
import math

# uncomment if library is needed
# !{sys.executable} -m pip install mmh3
import mmh3


In [2]:
# VARIABLES and DATA STRUCTURES

# dataset to be used as data stream
dataset = './input.txt'
dataset2 = './input2.txt'

# number of buckets in hash table
bucket_size = 29
# number of fingerprints in each bucket
entries_per_bucket = 2

# fingerprint size (in bits)
fingerprint_size = 7
fingerprint_mod = 2**(fingerprint_size)



In [3]:
# Class containing basic Cuckoo Filter with insert, lookup, and delete functionality
# Creates a Cuckoo Filter with buckets_size buckets
# Each bucket has depth 1 and is initialized to NaN to start
# Uses mmh3 for hash function, so elements passed into functions should be strings
#    however, the element will be cast to a string, so as long as element can be a string you're good

class CuckooFilter():


    # CONSTRUCTOR
    def __init__(self, bucket_size, k=5):
        self.bucket_size = bucket_size
        self.buckets = np.empty(bucket_size)
        self.buckets[:] = np.NaN
        self.coefficients = []
        for i in range(0,k):
            self.coefficients.append(random.uniform(0,7919))
    
    # HELPER METHODS

    # creates fingerprint for element in datastream
    def create_fingerprint(self, x):
        return mmh3.hash(str(x),5) % fingerprint_mod

    # defines our hash function
    # for intehers we have modulus bucket_size
    def hash1(self, x):
        return mmh3.hash(str(x),1) % self.bucket_size

    def hash2(self, x):
        return mmh3.hash(str(x),2) % self.bucket_size

    # def k_ind_hash(self, x):
    #     hash_val = self.coefficients[0]
    #     for i in range(1, len(self.coefficients)):
    #         hash_val += self.coefficients[i]*math.pow(int(x),i)
    #     return round(hash_val) % self.bucket_size
        

    # method used to relocate entries when an incoming element has no free bucket
    # returns true if all old buckets are successfully relocated
    # returns false if all buckets are filled
    def relocate(self, old, bucket):

        count = 0
        while count < len(self.buckets):
            # h1 = self.hash_func(f) % self.bucket_size
            # h2 = (h1 ^ self.hash_func(f)) % self.bucket_size
            # if h1 == bucket:
            #     new_bucket = int(h2)
            # else:
            #     new_bucket = int(h1)
            #print(bucket)
            #print(old)
            bucket = (bucket ^ self.hash2(int(old))) % self.bucket_size

            if pd.isnull(self.buckets[bucket]): 
                self.buckets[bucket] = old
                #print(str(old) + " in " + str(bucket))
                return True
            else:
                temp = self.buckets[bucket]
                self.buckets[bucket] = old
                #print(str(old) + " in " + str(bucket))
                old = temp
            count += 1
        
        return False
    
    # CORE METHODS - insert, lookup, and delete

    # method used to insert an element
    # returns true if the element is inserted
    #    or returns true if element has already been inserted
    # returns false if the element is not inserted
    def insert(self, x):

        # get fingerprint for element
        f = self.create_fingerprint(x)

        # find two hash function values
        # second hash function is... 
        # [hash_func(x) + fingerprint(x)] % bucket_size
        h1 = self.hash1(x)
        h2 = (h1 ^ self.hash2(f)) % self.bucket_size

        # if first bucket is empty or x is already in it,,, add f and return true
        # elif second bucket is empty or x is already in it,,, add f and return true
        # else randomly pick bucket 1 or 2, then,
        # add f to that bucket and relocate the value it replaces, 
        #    if relocation is successful,,, return true
        #    if relocation is not successful,,, undo changes to buckets and return false
        if pd.isnull(self.buckets[h1]) or self.buckets[h1] == f:
            self.buckets[h1] = f
            #print(str(x) + " in " + str(h1))
            return True
        elif pd.isnull(self.buckets[h2]) or self.buckets[h2] == f:
            self.buckets[h2] = f
            #print(str(x) + " in " + str(h2))
            return True
        else:
            rand_num = random.random()
            if rand_num < 0.5:
                # store buckets in case relocation is unsuccessful
                old_buckets = np.copy(self.buckets)
                old = self.buckets[h1]
                self.buckets[h1] = f
                added = self.relocate(old, h1)
                
                # if relocation is unsuccessful, restore buckets return false
                if not added:
                    self.buckets = old_buckets
                    return added
                else:
                    return added
            else:
                # store buckets in case relocation is unsuccessful
                old_buckets = np.copy(self.buckets)
                old = self.buckets[h2]
                self.buckets[h2] = f
                added = self.relocate(old, h2)
                
                # if relocation is unsuccessful, restore buckets return false
                if not added:
                    self.buckets = old_buckets
                    return added
                else:
                    return added

    # method used to determine if an element is in the filter
    # gets the fingerprint and calcuates the hash functions, then 
    # if element is in first or second bucket,,, return true
    # otherwise,,, return false
    def lookup(self, x):
        f = self.create_fingerprint(x)
        h1 = self.hash1(x)
        h2 = (h1 ^ self.hash2(f)) % self.bucket_size

        #print(str(x) + " h1: " + str(h1) + " h2: " + str(h2))
        if self.buckets[h1] == f or self.buckets[h2] == f:
            return True
        else:
            return False

    # method used to delete an element in the filter
    # gets the fingerprint and calcuates the hash functions, then 
    # if element is in first bucket, put a NaN in that bucket,,, return true 
    # if element is in second bucket, put a NaN in that bucket,,, return true 
    # otherwise element was not in filter,,, return false
    def delete(self, x):
        f = self.create_fingerprint(x)
        h1 = self.hash1(x)
        h2 = (h1 ^ self.hash2(f)) % self.bucket_size

        if self.buckets[h1] == f:
            self.buckets[h1] = np.NaN
            return True
        elif self.buckets[h2] == f:
            self.buckets[h2] = np.NaN
            return True
        
        return False


In [4]:
# Class containing Extended Cuckoo Filter with insert, lookup, and delete functionality
# Creates a Cuckoo Filter with buckets_size buckets
# Each bucket has depth 1 and is initialized to NaN to start
# Uses mmh3 for hash function, so elements passed into functions should be strings
#    however, the element will be cast to a string, so as long as element can be a string you're good

class ExtendedCuckooFilter():


    # CONSTRUCTOR
    def __init__(self, bucket_size, entries_per_bucket, k=5):
        self.bucket_size = bucket_size
        self.entries_per_bucket = entries_per_bucket
        #self.buckets = [[np.NaN for num in range(0, entries_per_bucket)] for num in range(0, bucket_size)]
        self.buckets = np.empty((bucket_size, entries_per_bucket))
        self.buckets[:][:] = np.NaN
        self.coefficients = []
        for i in range(0,k):
            self.coefficients.append(random.uniform(0,7919))
    
    # HELPER METHODS

    # creates fingerprint for element in datastream
    def create_fingerprint(self, x):
        return mmh3.hash(str(x)) % fingerprint_mod

    # defines our hash function
    # for intehers we have modulus bucket_size
    def hash_func(self, x):
        return mmh3.hash(str(x)) % self.bucket_size

    def k_ind_hash(self, x):
        hash_val = self.coefficients[0]
        for i in range(1, len(self.coefficients)):
            hash_val += self.coefficients[i]*math.pow(int(x),i)
        return round(hash_val) % self.bucket_size
        

    # method used to relocate entries when an incoming element has no free bucket
    # returns true if all old buckets are successfully relocated
    # returns false if all buckets are filled
    def relocate(self, f, bucket):

        count = 0
        while count < len(self.buckets):
            # h1 = self.hash_func(f)
            # h2 = (h1 + hash_func(f)) % self.bucket_size
            # if h1 == bucket:
            #     new_bucket = int(h2)
            # else:
            #     new_bucket = int(h1)
           
            new_bucket = (bucket + self.hash_func(f)) % self.bucket_size

            for i in range(0, self.entries_per_bucket):
                if pd.isnull(self.buckets[new_bucket][i]): 
                    self.buckets[new_bucket][i] = f
                    return True
            
            randIndex = random.randint(0,self.entries_per_bucket-1)
            old = self.buckets[new_bucket][randIndex]
            self.buckets[new_bucket][randIndex] = f
            f = old
            count += 1
        
        return False
    
    # CORE METHODS - insert, lookup, and delete

    # method used to insert an element
    # returns true if the element is inserted
    #    or returns true if element has already been inserted
    # returns false if the element is not inserted
    def insert(self, x):

        # get fingerprint for element
        f = self.create_fingerprint(x)

        # find two hash function values
        # second hash function is... 
        # [hash_func(x) + fingerprint(x)] % bucket_size
        h1 = self.hash_func(x)
        h2 = (h1 + self.hash_func(f)) % self.bucket_size

        # if first bucket is empty or x is already in it,,, add f and return true
        # elif second bucket is empty or x is already in it,,, add f and return true
        # else randomly pick bucket 1 or 2, then,
        # add f to that bucket and relocate the value it replaces, 
        #    if relocation is successful,,, return true
        #    if relocation is not successful,,, undo changes to buckets and return false

        for i in range(0, self.entries_per_bucket):
            if pd.isnull(self.buckets[h1][i]) or self.buckets[h1][i] == f:
                self.buckets[h1][i] = f
                return True
        for i in range(0, self.entries_per_bucket):
            if pd.isnull(self.buckets[h2][i]) or self.buckets[h2][i] == f:
                self.buckets[h2][i] = f
                return True
        
        rand_num = random.random()
        if rand_num < 0.5:
            # store buckets in case relocation is unsuccessful
            old_buckets = np.copy(self.buckets)
            randIndex = random.randint(0,self.entries_per_bucket-1)
            old = self.buckets[h1][randIndex]
            self.buckets[h1][randIndex] = f
            added = self.relocate(old, h1)
            
            # if relocation is unsuccessful, restore buckets return false
            if not added:
                self.buckets = old_buckets
                return added
            else:
                return added
        else:
            # store buckets in case relocation is unsuccessful
            old_buckets = np.copy(self.buckets)
            randIndex = random.randint(0,self.entries_per_bucket-1)
            old = self.buckets[h2][randIndex]
            self.buckets[h2][randIndex] = f
            added = self.relocate(old, h2)
            
            # if relocation is unsuccessful, restore buckets return false
            if not added:
                self.buckets = old_buckets
                return added
            else:
                return added

    # method used to determine if an element is in the filter
    # gets the fingerprint and calcuates the hash functions, then 
    # if element is in first or second bucket,,, return true
    # otherwise,,, return false
    def lookup(self, x):
        f = self.create_fingerprint(x)
        h1 = self.hash_func(x)
        h2 = (h1 + self.hash_func(f)) % self.bucket_size

        for i in range(0, self.entries_per_bucket):

            if self.buckets[h1][i] == f or self.buckets[h2][i] == f:
                return True
        return False

    # method used to delete an element in the filter
    # gets the fingerprint and calcuates the hash functions, then 
    # if element is in first bucket, put a NaN in that bucket,,, return true 
    # if element is in second bucket, put a NaN in that bucket,,, return true 
    # otherwise element was not in filter,,, return false
    def delete(self, x):
        f = self.create_fingerprint(x)
        h1 = self.hash_func(x)
        h2 = (h1 + self.hash_func(f)) % self.bucket_size

        for i in range(0, self.entries_per_bucket):
            if self.buckets[h1][i] == f:
                self.buckets[h1][i] = np.NaN
                return True

        for i in range(0, self.entries_per_bucket):
            if self.buckets[h2][i] == f:
                self.buckets[h2][i] = np.NaN
                return True
        
        return False


In [5]:
# READ IN DATA AND APPLY FILTER

cf = CuckooFilter(bucket_size)

with open(dataset, 'rt') as data_stream:
    for line in data_stream:
        for element in line.split():
            print("Insert " + str(element) + "? " + str(cf.insert(element)) + " f=" + " " + str(cf.create_fingerprint(element)))
print("\nFilter: \n" + str(cf.buckets))

Insert 5? True f= 103
Insert 4? True f= 91
Insert 9? True f= 17
Insert 11? True f= 12
Insert 33? True f= 23
Insert 46? True f= 30
Insert 77? True f= 31
Insert 4? True f= 91
Insert 9? True f= 17
Insert 13? True f= 76
Insert 42? True f= 22
Insert 155? True f= 104
Insert 207? True f= 37

Filter: 
[ 12.  nan  nan  nan  22.  nan  nan  76.  nan  nan  nan  23.  91.  nan
 103.  nan  nan  nan  37.  nan  nan  31.  nan  nan 104.  nan  nan  30.
  17.]


In [15]:
ele = "11"

size = bucket_size
print(size)

def hash1(x):
    return mmh3.hash(str(x),1) % size
def hash2(x):
    return mmh3.hash(str(x),2) % size

def fing(x):
    return mmh3.hash(str(x),5) % fingerprint_mod

fp = fing(ele)
print(fp)

b1 = hash1(ele)
b2 = (b1 ^ hash2(fp)) % size

print(b1)
print(b2)

bucket = b2

bucket = (bucket ^ hash2(fp)) % size
print(bucket)
print((bucket ^ hash2(fp)) % size)

print((3 ^ hash2(140)) % size)

9
140
3
5
3
5
5


In [6]:
# lookup every element that is added to the filter
# return true is expected as Cuckoo Filters have no false negatives
with open(dataset, 'rt') as data_stream:
    for line in data_stream:
        for element in line.split():
            print("Lookup " + str(element) + "? " + str(cf.lookup(element)))

print()

# lookup 5 values that were not inserted into the filter
# return false expected, but there could be false positives
# however, false positives will such a low number of insertions are rare
for test in ['36', '75', '3', '0', '101']:
    print("Lookup " + str(test) + "? " + str(cf.lookup(test)))

Lookup 5? True
Lookup 4? True
Lookup 9? True
Lookup 11? True
Lookup 33? True
Lookup 46? True
Lookup 77? True
Lookup 4? True
Lookup 9? True
Lookup 13? True
Lookup 42? True
Lookup 155? True
Lookup 207? True

Lookup 36? False
Lookup 75? False
Lookup 3? False
Lookup 0? False
Lookup 101? False


In [8]:
print(cf.hash1("4"))
print((cf.hash1("4") ^ cf.create_fingerprint("4"))%bucket_size)

4
2


In [7]:
# delete 3 elements that are in the filter
# return true expected
print(cf.delete('5'))
print(cf.delete('33'))
print(cf.delete('77'))
# delete an element that is not in the filter
# return false expected
print(cf.delete('55'))

print()

# see if any deleted elements remain in the filter
# return false expected every time
print(cf.lookup('5'))
print(cf.lookup('33'))
print(cf.lookup('77'))
print(cf.lookup('55'))

True
True
True
False

False
False
False
False


In [8]:
# READ IN DATA AND APPLY FILTER

cf2 = CuckooFilter(bucket_size)

with open(dataset2, 'rt') as data_stream:
    for line in data_stream:
        for element in line.split():
            print("Insert " + str(element) + "? " + str(cf2.insert(element)))
print("\nFilter: \n" + str(cf2.buckets))

Insert aarp? True
Insert abandon? True
Insert abandoned? True
Insert abandoning? True
Insert abb? True
Insert abc? True
Insert abcs? True
Insert aboard? True
Insert abortion? True

Filter: 
[ 8. nan nan nan nan 29. 47. nan nan 19. nan nan nan 49. nan nan nan nan
 nan nan nan nan 26. 25. nan 60. nan nan 98.]


In [9]:
# lookup every element that is added to the filter
# return true is expected as Cuckoo Filters have no false negatives
with open(dataset2, 'rt') as data_stream:
    for line in data_stream:
        for element in line.split():
            print("Lookup " + str(element) + "? " + str(cf2.lookup(element)))

print()

# lookup 5 values that were not inserted into the filter
# return false expected, but there could be false positives
# however, false positives will such a low number of insertions are rare
for test in ['mirror', 'hello', 'bird', 'super', 'raspberry']:
    print("Lookup " + str(test) + "? " + str(cf2.lookup(test)))

Lookup aarp? True
Lookup abandon? True
Lookup abandoned? True
Lookup abandoning? True
Lookup abb? True
Lookup abc? True
Lookup abcs? True
Lookup aboard? True
Lookup abortion? True

Lookup mirror? False
Lookup hello? False
Lookup bird? False
Lookup super? False
Lookup raspberry? False


In [10]:
# delete 3 elements that are in the filter
# return true expected
print(cf2.delete('aarp'))
print(cf2.delete('abortion'))
# delete an element that is not in the filter
# return false expected
print(cf2.delete('mirror'))

print()

# see if any deleted elements remain in the filter
# return false expected every time
print(cf2.lookup('aarp'))
print(cf2.lookup('abortion'))
print(cf2.lookup('mirror'))

True
True
False

False
False
False
