# Count Min Sketch Implementation

Description: <br>
Count-Min Sketch (Section 12.2.2.2, pp. 403–405, Aggarwal)

## 1. Step by step definition

In [154]:
# importing libraries
# https://pypi.org/project/mmh3/
import mmh3 # can be used to hash strings
import random
import math

In [155]:
# sketch dimensions (w and d in the textbook notation)
# d: number of rows, i.e. number of hash functions
noHashFunctions = 5
# w: number of counters in each row
length = 7

In [156]:
# build the sketch: one zero-filled row of `length` counters per hash function
arrays = [[0] * length for _ in range(noHashFunctions)]

print(arrays)

[[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]]


In [157]:
#we need to define pairwise independent hash functions that map an item from 0 to length - 1
def hashItem(item,seed):
    """Map ``item`` to a counter index in ``[0, length - 1]`` for one sketch row.

    ``seed`` selects the hash function; callers pass the row number so that
    every row of the sketch hashes differently. The item is hashed through
    its ``str()`` form, so ``2000`` and ``"2000"`` land in the same bucket.

    Note: scaling to ``[0, length - 1]`` and rounding to the nearest integer
    gives the two edge buckets (``0`` and ``length - 1``) only half the share
    of each interior bucket, so the split is not perfectly even.
    """
    # largest value of a 32-bit unsigned int
    uint32Max = 4294967295
    # signed = False -> the digest is a 32-bit unsigned int
    digest = mmh3.hash(str(item), seed, signed = False)
    # relative position of the digest inside [0, 1]
    fraction = digest / uint32Max
    # stretch to [0, length - 1], then snap to the nearest integer index
    return round(fraction * (length - 1))

In [158]:
#counting an item
def count(item):
    """Record one occurrence of ``item`` in the module-level sketch ``arrays``.

    Each of the ``noHashFunctions`` rows gets exactly one counter incremented;
    the row number doubles as the hash seed, so every row uses its own hash.
    """
    for rowNo in range(noHashFunctions):
        slot = hashItem(item = item, seed = rowNo)
        counters = arrays[rowNo]
        counters[slot] = counters[slot] + 1

In [159]:
# feed the sketch: item_1 once, item_2 twice
item_1 = 2000
item_2 = "countmin"
for item in (item_1, item_2, item_2):
    count(item)
print(arrays)

[[0, 0, 0, 1, 2, 0, 0], [0, 0, 2, 1, 0, 0, 0], [0, 0, 2, 0, 1, 0, 0], [0, 3, 0, 0, 0, 0, 0], [0, 1, 0, 0, 2, 0, 0]]


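As shown, the fourth array (`arrays[3]`) has a collision: both items were hashed to index 1, so that counter holds 3 (1 for `item_1` plus 2 for `item_2`).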

In [160]:
#finding the count of an item
def getCount(item):
    """Estimate how many times ``item`` was counted in the module-level sketch.

    Looks up the item's counter in each of the ``noHashFunctions`` rows and
    returns the smallest one: collisions can only inflate a counter, so the
    minimum is the value least affected by them. With zero rows the result
    is ``float('inf')``.
    """
    candidates = (
        arrays[rowNo][hashItem(item = item, seed = rowNo)]
        for rowNo in range(noHashFunctions)
    )
    return min(candidates, default = float('inf'))

In [161]:
# estimated counts, one per line: item_1 first, then item_2
for item in (item_1, item_2):
    print(getCount(item))

1
2


## 2. Class Definition

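Let $\lVert count \rVert_1$ be the sum of all counts stored in the data structure, i.e. the sum of the values in any one row of the sketch (every row sums to the same total, because each counted item increments exactly one counter per row). The central guarantee CMS provides is then the following:<br>
Theorem: With probability at least $1-\delta$, the estimated count exceeds the true count by at most $\epsilon \cdot \lVert count \rVert_1$ (the estimate is never below the true count). Concrete values for the error bounds $\epsilon$ and $\delta$ can be freely chosen by setting:<br>
$w=\lceil \frac{e}{\epsilon} \rceil$ and $d=\lceil \ln\frac{1}{\delta} \rceil$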

In [162]:
import mmh3
import math

class CountMinSketch():
    """Count-Min Sketch: an approximate frequency counter for a stream of items.

    The sketch is a table of ``noHashFunctions`` rows (d) by ``length``
    counters (w). Every row uses its own hash function (MurmurHash3 seeded
    with the row number). Counting an item increments one counter per row;
    querying returns the minimum of those counters. Collisions can only add
    to a counter, so the estimate may exceed the true count but is never
    below it.

    With ``w = ceil(e / epsilon)`` and ``d = ceil(ln(1 / delta))`` the
    overestimate is at most ``epsilon * (total number of counted items)``
    with probability at least ``1 - delta``.
    """

    def __init__(self,delta = 0.01,epsilon= 0.05):
        """Size and zero-initialise the sketch from the desired error bounds.

        Args:
            delta: allowed probability that the error bound is exceeded;
                must lie strictly between 0 and 1.
            epsilon: error factor relative to the total count; must be
                positive.

        Raises:
            ValueError: if ``delta`` is not in (0, 1) or ``epsilon`` is not
                positive. Such values would otherwise produce a sketch with
                no rows or no columns.
        """
        if not 0 < delta < 1:
            raise ValueError("delta must be in the open interval (0, 1)")
        if not epsilon > 0:
            raise ValueError("epsilon must be positive")

        # w: counters per row
        self.length = math.ceil(math.e / epsilon)
        # d: number of rows / hash functions (natural logarithm)
        self.noHashFunctions = math.ceil(math.log(1 / delta))
        # d x w table of counters; each row is its own list object
        self.arrays = [[0] * self.length for _ in range(self.noHashFunctions)]

    def hashItem(self,item,seed):
        """Return the column index in ``[0, length - 1]`` for ``item``.

        Args:
            item: value to hash; it is hashed through ``str(item)``, so
                ``2000`` and ``"2000"`` map to the same column.
            seed: selects the hash function; ``count`` and ``getCount`` pass
                the row number.
        """
        # signed = False -> 32-bit unsigned int
        hashCode = mmh3.hash(str(item), seed, signed = False)
        # Modulo gives every column the same share of the 32-bit range (up to
        # a negligible remainder). Scaling to [0, length - 1] and rounding
        # would leave the first and last column with only half the share of
        # the others, raising the collision rate above the 1/length the
        # error bound assumes.
        return hashCode % self.length

    def count(self, item):
        """Record one occurrence of ``item`` (one increment in every row)."""
        for row, counters in enumerate(self.arrays):
            counters[self.hashItem(item = item, seed = row)] += 1

    def getCount(self,item):
        """Return the estimated count of ``item``.

        The estimate is the minimum of the item's counters over all rows,
        which limits the effect of collisions in any single row.
        """
        return min(
            counters[self.hashItem(item = item, seed = row)]
            for row, counters in enumerate(self.arrays)
        )

In [163]:
# default error bounds; show the resulting sketch size as "w d"
cms = CountMinSketch()
print(f"{cms.length} {cms.noHashFunctions}")

55 5


In [164]:
# the two example items: an integer and a string
item_1, item_2 = 2000, "countmin"

In [165]:
# item_1 is counted once, item_2 twice
for item in (item_1, item_2, item_2):
    cms.count(item)

In [166]:
# estimated counts, one per line: item_1 first, then item_2
print(cms.getCount(item_1), cms.getCount(item_2), sep = "\n")

1
2
