# Count Min Sketch Implementation

Description: <br>
Count-Min Sketch (Aggarwal, Section 12.2.2.2, pp. 403–405)

## 1. Step by step definition

In [117]:
# importing libraries
# https://pypi.org/project/mmh3/
import mmh3 # can be used to hash strings
import random
import math

In [118]:
# parameters of the step-by-step sketch
# w: number of counters in each array (width of the sketch)
length = 7
# d: number of arrays, one per hash function (depth of the sketch)
noHashFunctions = 5

In [119]:
# build |noHashFunctions| separate counter arrays, each holding |length| zeros
arrays = [[0] * length for _ in range(noHashFunctions)]

print(arrays)

[[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]]


In [120]:
# one hash function per seed, each mapping an item to an index in 0 .. length - 1
def hashItem(item,seed):
    # largest value of the unsigned 32-bit hash returned by mmh3
    uint32Max = 4294967295
    # str() lets any item be hashed; a different seed selects a different hash function
    fraction = mmh3.hash(str(item),seed, signed = False) / uint32Max
    # scale the fraction in [0, 1] onto [0, length-1] and round to the nearest index
    # NOTE(review): rounding gives the two end buckets (0 and length-1) only half
    # an interval each, so they are hit about half as often as the inner buckets;
    # `hash % length` would spread items evenly.
    return round(fraction * (length-1))

In [121]:
# record one occurrence of an item
def count(item):
    """Increment the item's counter in every array (one counter per hash function)."""
    for seed in range(noHashFunctions):
        row = arrays[seed]
        row[hashItem(item = item, seed = seed)] += 1

In [122]:
#counting items: item_1 is counted once, item_2 twice
#(hashItem converts items with str(), so an int and a string are both accepted)
item_1 = 2000
item_2 = "countmin"
count(item_1)
count(item_2)
count(item_2)
#show the raw counters of every array after the three updates
print(arrays)

[[0, 0, 0, 1, 2, 0, 0], [0, 0, 2, 1, 0, 0, 0], [0, 0, 2, 0, 1, 0, 0], [0, 3, 0, 0, 0, 0, 0], [0, 1, 0, 0, 2, 0, 0]]


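As shown, the fourth array (index 3) has a collision: both items hash to the same cell, so that cell holds 3 (1 + 2) instead of separate counts of 1 and 2.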

In [123]:
#estimating the count of an item
def getCount(item):
    """Return the smallest counter the item maps to across all arrays.

    A collision can only inflate a counter, so the minimum over the arrays
    is the estimate least affected by collisions. With no arrays at all the
    result is float('inf').
    """
    candidates = (
        arrays[seed][hashItem(item = item, seed = seed)]
        for seed in range(noHashFunctions)
    )
    return min(candidates, default = float('inf'))

In [124]:
# query the estimates: item_1 was counted once and item_2 twice above
print(getCount(item_1))
print(getCount(item_2))

1
2


## 2. Class Definition

Let $||count||_1$ be the sum of all counts inserted into the data structure, i.e. the sum of the values in any single row of the sketch. The central guarantee CMS provides is then the following:<br>
Theorem: With probability at least $1-\delta$, the estimated count exceeds the true count by at most $\epsilon \cdot ||count||_1$ (the estimate never falls below the true count). Concrete values for the error bounds $\epsilon$ and $\delta$ can be freely chosen by setting:<br>
$w=\lceil \frac{e}{\epsilon} \rceil$ and $d=\lceil \ln\frac{1}{\delta} \rceil$

In [125]:
import mmh3
import math

class CountMinSketch():
    """Count-Min Sketch: an approximate frequency counter backed by MurmurHash3.

    The sketch keeps ``noHashFunctions`` rows of ``length`` counters. Counting
    an item increments one counter per row; a query returns the smallest of
    the item's counters. Counters are only ever incremented, so an estimate is
    never below the true count and collisions can only inflate it.

    Items are hashed through ``str(item)``, so values with the same string
    form (for example ``2000`` and ``"2000"``) share their counters.
    """

    def __init__(self,delta = 0.01,epsilon= 0.05):
        """Size the sketch from the requested error bounds.

        Args:
            delta: allowed probability that an estimate exceeds the error
                bound; must satisfy 0 < delta < 1.
            epsilon: error bound as a fraction of the total count; must be
                greater than 0.

        Raises:
            ValueError: if delta or epsilon is outside its valid range.
        """
        # Written as "not (in range)" so that NaN is rejected as well.
        if not 0 < delta < 1:
            raise ValueError("delta must be in the open interval (0, 1)")
        if not epsilon > 0:
            raise ValueError("epsilon must be greater than 0")

        # w = ceil(e / epsilon)
        self.length = math.ceil(math.e / epsilon)
        # d = ceil(ln(1 / delta)); at least one row so queries always have a counter
        self.noHashFunctions = max(1, math.ceil(math.log(1 / delta)))
        # every row is its own list, so rows never share counters
        self.arrays = [[0] * self.length for _ in range(self.noHashFunctions)]

    def hashItem(self,item,seed):
        """Map ``item`` to a counter index in ``0 .. length - 1`` for row ``seed``.

        The seed selects the hash function, giving each row its own mapping.
        """
        # unsigned 32-bit MurmurHash3 of the item's string form
        hashCode = mmh3.hash(str(item),seed, signed = False)
        # Modulo spreads the hash evenly over all buckets. Scaling to
        # [0, length-1] and rounding would give the first and last bucket only
        # half the probability of the others.
        return hashCode % self.length

    def count(self, item):
        """Record one occurrence of ``item`` in every row."""
        for seed in range(self.noHashFunctions):
            self.arrays[seed][self.hashItem(item = item, seed = seed)] += 1

    def getCount(self,item):
        """Return the estimated count of ``item``: the minimum of its counters."""
        return min(
            (
                self.arrays[seed][self.hashItem(item = item, seed = seed)]
                for seed in range(self.noHashFunctions)
            ),
            default = float('inf'),
        )

In [126]:
# build a sketch with the default error bounds and show its width and depth
cms = CountMinSketch()
print(cms.length,cms.noHashFunctions)

55 5


In [127]:
#sample items to count: one integer and one string
item_1 = 2000
item_2 = "countmin"

In [128]:
# count item_1 once
cms.count(item_1)

# count item_2 twenty times
for i in range(0,20):
    cms.count(item_2)

In [129]:
# estimated counts: item_1 was counted once, item_2 twenty times
print(cms.getCount(item_1))
print(cms.getCount(item_2))

1
20


## 3. Alternative hash function - Polynomial hashing

Assumptions:<br>
- Strings are case-insensitive: every character is converted to lowercase before hashing<br>
- Numbers (0-9) and letters (a-z) only

In [130]:
def generateAsciiMapping(string):
    """Map an alphanumeric string to a list of integer codes in 1..36.

    Digits '0'-'9' map to 1-10 and letters 'a'-'z' map to 11-36. Letters are
    case-insensitive ('A' and 'a' both map to 11).

    Args:
        string: non-empty string made only of ASCII digits and letters.

    Returns:
        A list with one integer code per character, in order.

    Raises:
        ValueError: if the string is empty or contains any character other
            than an ASCII digit or letter.
    """
    if not string:
        raise ValueError("Not alphanumeric: " + repr(string))

    codes = []
    for char in string:
        # str.isalnum() alone also accepts non-ASCII letters and digits, which
        # would produce codes outside 1..36 (or fail in ord() when lower()
        # expands to several characters), so ASCII is required explicitly.
        if not (char.isascii() and char.isalnum()):
            raise ValueError("Not alphanumeric: " + repr(string))

        lowered = char.lower()
        if lowered.isdigit():
            # '0' - '9' -> 1 - 10
            codes.append(ord(lowered) - ord('0') + 1)
        else:
            # 'a' - 'z' -> 11 - 36
            codes.append(ord(lowered) - ord('a') + 11)

    return codes

In [131]:
# sanity checks: digits start at 1, letters at 11, and upper case is folded to lower case
print(generateAsciiMapping("0"))
print(generateAsciiMapping("A"))
print(generateAsciiMapping("z"))

[1]
[11]
[36]


We have 36 characters, mapped to the integers 1 to 36, so we need a prime base p > 36. We also reduce the result modulo m; choosing m equal to the size of each array yields a valid array index.

$hash(s) = \left(\sum_{i=0}^{n-1} s[i] \cdot p^i\right) \bmod m$

In [132]:
def generatePolyhash(string,mod,prime = 37):
    """Return the polynomial hash of an alphanumeric string, reduced modulo ``mod``.

    Computes ``(sum of code[i] * prime**i) % mod`` where ``code[i]`` is the
    integer code (1..36) that generateAsciiMapping assigns to character i.

    Args:
        string: string to hash; validated by generateAsciiMapping.
        mod: modulus of the hash; the result lies in ``0 .. mod - 1`` for a
            positive ``mod``.
        prime: base of the polynomial. It should be larger than 36 (the
            largest character code) and share no factor with ``mod``;
            otherwise the powers of the base collapse to 0 and later
            characters stop contributing to the hash.

    Returns:
        The hash value as an int.
    """
    value = 0
    # running value of prime**i % mod, reduced at every step to keep numbers small
    p_power = 1

    for code in generateAsciiMapping(string):
        value = (value + code * p_power) % mod
        p_power = (p_power * prime) % mod

    return value

In [133]:
# example: hash a short string into 2 buckets
generatePolyhash("66666t",2)

1

In [134]:
class CountMinSketchPoly():
    """Count-Min Sketch whose rows are indexed through a polynomial string hash.

    An item is first reduced to an integer with generatePolyhash (modulo a
    large prime). Each row then applies its own map
    ``((a * x + b) % p) % length`` with row-specific ``a`` and ``b``, so two
    items that collide in one row do not automatically collide in the others.
    Hashing every row with the same function would make all rows identical
    copies, and taking the minimum over them would not reduce collisions.

    Items are hashed through ``str(item)`` and must be alphanumeric (see
    generateAsciiMapping). Items whose string forms differ only in case share
    their counters, because the character mapping is case-insensitive.
    """

    # large prime modulus (2**31 - 1) for the base hash and the per-row maps
    _MODULUS = 2147483647

    def __init__(self,delta = 0.01,epsilon= 0.05):
        """Size the sketch from the requested error bounds.

        Args:
            delta: allowed probability that an estimate exceeds the error
                bound; must satisfy 0 < delta < 1.
            epsilon: error bound as a fraction of the total count; must be
                greater than 0.

        Raises:
            ValueError: if delta or epsilon is outside its valid range.
        """
        # Written as "not (in range)" so that NaN is rejected as well.
        if not 0 < delta < 1:
            raise ValueError("delta must be in the open interval (0, 1)")
        if not epsilon > 0:
            raise ValueError("epsilon must be greater than 0")

        # w = ceil(e / epsilon)
        self.length = math.ceil(math.e / epsilon)
        # d = ceil(ln(1 / delta)); at least one row so queries always have a counter
        self.noHashFunctions = max(1, math.ceil(math.log(1 / delta)))
        # every row is its own list, so rows never share counters
        self.arrays = [[0] * self.length for _ in range(self.noHashFunctions)]
        # one (a, b) pair per row; seeding the generator with the row number
        # makes every sketch instance use the same row hash functions
        self._rowParams = [self._makeRowParams(seed) for seed in range(self.noHashFunctions)]

    def _makeRowParams(self, seed):
        """Return the (a, b) coefficients of row ``seed``, with 1 <= a < p and 0 <= b < p."""
        rng = random.Random(seed)
        return (rng.randrange(1, self._MODULUS), rng.randrange(self._MODULUS))

    def _baseHash(self, item):
        """Return the polynomial hash of ``str(item)`` modulo the large prime."""
        return generatePolyhash(str(item), self._MODULUS)

    def _bucket(self, base, seed):
        """Map a base hash to a counter index in ``0 .. length - 1`` for row ``seed``."""
        a, b = self._rowParams[seed]
        return ((a * base + b) % self._MODULUS) % self.length

    def hashItem(self,item,seed = 0):
        """Return the counter index of ``item`` in row ``seed`` (row 0 by default)."""
        return self._bucket(self._baseHash(item), seed)

    def count(self, item):
        """Record one occurrence of ``item`` in every row."""
        # the string hash is the expensive part, so compute it once for all rows
        base = self._baseHash(item)
        for seed in range(self.noHashFunctions):
            self.arrays[seed][self._bucket(base, seed)] += 1

    def getCount(self,item):
        """Return the estimated count of ``item``: the minimum of its counters."""
        base = self._baseHash(item)
        return min(
            (
                self.arrays[seed][self._bucket(base, seed)]
                for seed in range(self.noHashFunctions)
            ),
            default = float('inf'),
        )

In [135]:
# build a polynomial-hash sketch with the default error bounds and show its width and depth
cmsPoly = CountMinSketchPoly()
print(cmsPoly.length,cmsPoly.noHashFunctions)

55 5


In [136]:
#sample items to count: one integer and one string (both alphanumeric once passed through str())
item_1 = 2000
item_2 = "countmin"

In [137]:
# count item_1 once
cmsPoly.count(item_1)

# count item_2 twenty times
for i in range(0,20):
    cmsPoly.count(item_2)

In [138]:
# estimated counts: item_1 was counted once, item_2 twenty times
print(cmsPoly.getCount(item_1))
print(cmsPoly.getCount(item_2))

1
20
