## <u>Cuckoo Filter</u>
### Maggie Drew & Adam Gibbs

Implementation of a Cuckoo Filter for estimating set membership on data streams. The implementation follows the pseudocode from the following paper: https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf. This notebook contains the following:

1. Imports used in the notebook
2. All global variables and python data structures (lists, dictionaries, etc.) used
3. Code for Cuckoo Filter functions
4. Cuckoo filter test on simulated data stream
5. Results
6. Analysis of Results 

In [9]:
# IMPORTS
import matplotlib as plt
import numpy as np 
import pandas as pd
import gzip
import hashlib
import random


In [10]:
# VARIABLES and DATA STRUCTURES

# dataset to be used as the data stream
# (text file read line by line; each whitespace-separated token is parsed as an int)
dataset = './input.txt'

# number of buckets in the hash table
# make this prime!
bucket_size = 29

# numpy array of buckets for elements to hash into
# buckets = []
# for i in range(0,bucket_size):
#    buckets.append(np.NaN)
# print(buckets)


In [11]:
class CuckooFilter():
    """Cuckoo filter with one fingerprint slot per bucket.

    Every item ``x`` has a fingerprint ``f`` and two candidate buckets:
    ``h1 = x % bucket_size`` and ``h2 = (h1 + f) % bucket_size``. An empty
    bucket is marked with NaN.

    NOTE: ``create_fingerprint`` is currently the identity function, so the
    stored fingerprints are the items themselves and ``relocate`` can recompute
    both candidate buckets from a stored fingerprint. If the fingerprint
    function is changed, the candidate-bucket computation in ``relocate`` must
    be revisited.
    """

    # CONSTRUCTOR
    def __init__(self, bucket_size):
        """Create an empty filter with ``bucket_size`` buckets."""
        self.bucket_size = bucket_size
        # np.nan works on every NumPy release; the np.NaN alias was removed
        # in NumPy 2.0.
        self.buckets = [np.nan for _ in range(bucket_size)]

    # HELPER METHODS
    @staticmethod
    def _is_empty(slot):
        """Return True when ``slot`` holds the NaN empty marker."""
        # NaN never compares equal to anything (itself included), so an
        # ``== np.nan`` test is always False; use isnull instead.
        return pd.isnull(slot)

    def create_fingerprint(self, x):
        """Return the fingerprint stored for item ``x`` (currently ``x`` itself)."""
        return x

    def hash_func(self, x):
        """Return the primary bucket index of ``x``: ``x % bucket_size``."""
        return x % self.bucket_size

    def relocate(self, f, bucket):
        """Re-home fingerprint ``f`` that was just evicted from ``bucket``.

        ``f`` is moved to its other candidate bucket. If that bucket is
        occupied, its occupant is evicted in turn and the process repeats,
        for at most ``len(self.buckets)`` moves.

        Returns:
            True if every displaced fingerprint found a bucket. False if the
            move limit was reached; in that case the fingerprint evicted last
            is no longer stored in the filter.
        """
        for _ in range(len(self.buckets)):
            h1 = self.hash_func(f)
            h2 = (h1 + f) % self.bucket_size
            # Go to whichever candidate is NOT the bucket f was evicted from.
            new_bucket = h2 if h1 == bucket else h1

            occupant = self.buckets[new_bucket]
            self.buckets[new_bucket] = f
            if self._is_empty(occupant):
                return True

            # The occupant is now the homeless fingerprint; remember where it
            # came from so the next pass sends it to its alternate bucket.
            f = occupant
            bucket = new_bucket

        return False

    # CORE METHODS
    def insert(self, x):
        """Insert item ``x``.

        Returns:
            True if ``x`` is stored (or was already present) and every
            displaced fingerprint was re-homed; False if relocation gave up
            (see ``relocate``).
        """
        f = self.create_fingerprint(x)

        h1 = self.hash_func(x)
        h2 = (h1 + f) % self.bucket_size  # might have to switch from + operation

        # Already present in either candidate bucket: nothing to do. Checking
        # both first prevents storing a second copy that a single delete()
        # would leave behind.
        if self.buckets[h1] == f or self.buckets[h2] == f:
            return True

        for candidate in (h1, h2):
            if self._is_empty(self.buckets[candidate]):
                self.buckets[candidate] = f
                return True

        # Both candidates are taken: evict one at random and re-home it.
        victim_bucket = h1 if random.random() < 0.5 else h2
        evicted = self.buckets[victim_bucket]
        self.buckets[victim_bucket] = f
        return self.relocate(evicted, victim_bucket)

    def lookup(self, x):
        """Return True if the fingerprint of ``x`` is in one of its candidate buckets."""
        f = self.create_fingerprint(x)
        h1 = self.hash_func(x)
        h2 = (h1 + f) % self.bucket_size

        return self.buckets[h1] == f or self.buckets[h2] == f

    def delete(self, x):
        """Remove ``x`` from the filter.

        Returns:
            True if a matching fingerprint was found and cleared, else False.
        """
        f = self.create_fingerprint(x)
        h1 = self.hash_func(x)
        h2 = (h1 + f) % self.bucket_size

        for candidate in (h1, h2):
            if self.buckets[candidate] == f:
                self.buckets[candidate] = np.nan
                return True

        return False


In [13]:
# READ IN DATA AND APPLY FILTER

# Build a filter with the configured number of buckets.
cf = CuckooFilter(bucket_size)

# Stream the dataset line by line; every whitespace-separated token is
# parsed as an int and inserted into the filter.
with open(dataset, 'rt') as data_stream:
    for line in data_stream:
        for element in line.split():
            # The True/False result of insert() is not checked here.
            cf.insert(int(element))
# Show the final contents of every bucket.
print(cf.buckets)

[nan, nan, nan, nan, 4, 5, nan, nan, 33, 9, nan, 11, nan, 13, nan, nan, nan, 46, nan, 77, nan, nan, nan, nan, nan, nan, 42, nan, nan]


In [15]:
# Re-read the same dataset and print the result of looking up every element
# in the filter populated above (one True/False per element).
with open(dataset, 'rt') as data_stream:
    for line in data_stream:
        for element in line.split():
            print(cf.lookup(int(element)))

True
True
True
True
True
True
True
True
True
True
True
