## <u>Cuckoo Filter</u>
### Maggie Drew & Adam Gibbs

Implementation of a Cuckoo Filter for estimating set membership on data streams. The implementation follows the pseudocode from the following paper: https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf. This notebook contains the following:

1. Imports used in the notebook
2. All global variables and python data structures (lists, dictionaries, etc.) used
3. Code for Cuckoo Filter functions
4. Cuckoo filter test on simulated data stream
5. Results
6. Analysis of Results 

In [10]:
# IMPORTS
import matplotlib as plt
import numpy as np 
import pandas as pd
import gzip
#import hashlib
import random


In [11]:
# VARIABLES and DATA STRUCTURES

# Path of the text file used as the data stream. The cells below read it line
# by line and parse every whitespace-separated token as an integer.
dataset = './input.txt'

# Number of buckets in the filter's hash table; passed to CuckooFilter below.
# Keep this prime -- presumably so the modulus-based hashes spread keys more
# evenly across the buckets; TODO confirm the rationale.
bucket_size = 29



In [12]:
# Class containing basic Cuckoo Filter with insert, lookup, and delete functionality
# Creates a Cuckoo Filter with bucket_size buckets
# Each bucket has depth 1 and is initialized to NaN to start

class CuckooFilter:
    """Basic cuckoo filter with insert, lookup and delete.

    The table has ``bucket_size`` buckets, each holding at most one
    fingerprint. An empty bucket is marked with NaN. Every element has two
    candidate buckets::

        h1 = hash_func(x)
        h2 = (h1 + fingerprint(x)) % bucket_size

    When both candidates are occupied, one resident is evicted and moved to
    its own alternate bucket, possibly evicting further entries in turn.
    """

    # CONSTRUCTOR
    def __init__(self, bucket_size):
        """Create an empty filter with ``bucket_size`` single-slot buckets."""
        self.bucket_size = bucket_size
        # np.nan is the spelling available in every NumPy release; the
        # np.NaN alias was removed in NumPy 2.0.
        self.buckets = [np.nan] * bucket_size

    # HELPER METHODS

    def create_fingerprint(self, x):
        """Return the fingerprint stored for ``x`` (currently ``x`` itself)."""
        return x

    def hash_func(self, x):
        """Return the primary bucket index of ``x``: ``x`` modulo the table size."""
        return x % self.bucket_size

    def _bucket_pair(self, key, f):
        """Return the two candidate bucket indices for ``key`` with fingerprint ``f``."""
        h1 = self.hash_func(key)
        h2 = (h1 + f) % self.bucket_size
        # int() keeps the results usable as list indices even when the
        # arithmetic above produced a NumPy or float value.
        return int(h1), int(h2)

    def relocate(self, f, bucket):
        """Move the evicted fingerprint ``f`` out of ``bucket`` into its alternate.

        If the alternate bucket is occupied, its resident is evicted in turn
        and the process repeats, for at most ``len(self.buckets)`` moves.

        Returns True once some fingerprint lands in an empty bucket, False if
        the move budget is exhausted (the table is then left in a displaced
        state; ``insert`` restores its snapshot in that case).
        """
        for _ in range(len(self.buckets)):
            h1, h2 = self._bucket_pair(f, f)
            # f came from `bucket`, so it must go to the other candidate.
            target = h2 if h1 == bucket else h1

            if pd.isnull(self.buckets[target]):
                self.buckets[target] = f
                return True

            # Swap: f takes the slot, the previous resident becomes the
            # fingerprint that still needs a home.
            f, self.buckets[target] = self.buckets[target], f
            # The newly evicted fingerprint came from `target`; its alternate
            # must be computed relative to that bucket, not the original one,
            # otherwise it can bounce straight back and never make progress.
            bucket = target

        return False

    # CORE METHODS - insert, lookup, and delete

    def insert(self, x):
        """Insert ``x`` into the filter.

        Returns True if ``x`` was stored or was already present in one of its
        two candidate buckets, False if no room could be made for it. A failed
        insert leaves the table exactly as it was.
        """
        f = self.create_fingerprint(x)
        h1, h2 = self._bucket_pair(x, f)

        # Check both candidates for an existing copy first, so an element
        # sitting in its second bucket is not stored again when its first
        # bucket happens to be empty.
        if self.buckets[h1] == f or self.buckets[h2] == f:
            return True

        # Take the first free candidate bucket, preferring h1.
        for index in (h1, h2):
            if pd.isnull(self.buckets[index]):
                self.buckets[index] = f
                return True

        # Both candidates are occupied: evict one resident at random and try
        # to relocate it.
        victim_bucket = h1 if random.random() < 0.5 else h2
        # .copy() keeps the snapshot a plain list with the original values;
        # np.copy would turn it into a float ndarray on rollback.
        snapshot = self.buckets.copy()
        evicted = self.buckets[victim_bucket]
        self.buckets[victim_bucket] = f

        if self.relocate(evicted, victim_bucket):
            return True

        # Relocation failed: undo every displacement made along the way.
        self.buckets = snapshot
        return False

    def lookup(self, x):
        """Return True if the fingerprint of ``x`` is in either candidate bucket."""
        f = self.create_fingerprint(x)
        h1, h2 = self._bucket_pair(x, f)
        # Comparing against an empty (NaN) bucket is always False.
        return bool(self.buckets[h1] == f or self.buckets[h2] == f)

    def delete(self, x):
        """Remove ``x`` from the filter.

        Clears the first candidate bucket that holds the fingerprint of ``x``
        (resetting it to NaN) and returns True; returns False if neither
        candidate bucket holds it.
        """
        f = self.create_fingerprint(x)
        h1, h2 = self._bucket_pair(x, f)

        for index in (h1, h2):
            if self.buckets[index] == f:
                self.buckets[index] = np.nan
                return True

        return False


In [17]:
# READ IN DATA AND APPLY FILTER

# Build a fresh filter sized by the notebook-level bucket_size setting.
cf = CuckooFilter(bucket_size)

# Feed every whitespace-separated integer of the data stream into the filter,
# reporting for each one whether the insert succeeded.
with open(dataset, 'rt') as data_stream:
    for line in data_stream:
        for element in line.split():
            inserted = cf.insert(int(element))
            print(f"Insert {element}? {inserted}")

# Show the final bucket contents.
print(f"\nFilter: \n{cf.buckets}")

Insert 5? True
Insert 4? True
Insert 9? True
Insert 11? True
Insert 33? True
Insert 46? True
Insert 77? True
Insert 4? True
Insert 9? True
Insert 13? True
Insert 42? True

Filter: 
[nan, nan, nan, nan, 4, 5, nan, nan, 33, 9, nan, 11, nan, 13, nan, nan, nan, 46, nan, 77, nan, nan, nan, nan, nan, nan, 42, nan, nan]


In [None]:
# Replay the data stream and print, for each element, whether the filter
# reports it as present.
with open(dataset, 'rt') as data_stream:
    for line in data_stream:
        for element in line.split():
            found = cf.lookup(int(element))
            print(found)

In [6]:
# Scratch check: a value read out of a list slot is unaffected by a later
# assignment to that slot (the save-then-overwrite pattern used when evicting
# a bucket entry).
# The list is named `values` rather than `list` so the builtin `list` is not
# shadowed for the rest of the session.
values = [1, 2, 3, 4, 5, 6]

one = values[0]
values[0] = 100
print(one)
print(values[0])

1
100
