# Count Min Sketch Implementation

Description: <br>
Count Min Sketch (12.2.2.2, Pg 403- 405 Aggarwal)

## 1. Step by step definition

In [106]:
# importing libraries
# https://pypi.org/project/mmh3/
import mmh3 # can be used to hash strings
import random
import math

In [107]:
# parameters
# w
length = 7
# d
noHashFunctions = 5

In [108]:
#initialize the |noHashFunctions| arrays with 0
arrays = []

for i in range(0,noHashFunctions):
    array = [0] * length
    arrays.append(array)
    
print(arrays)

[[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]]


In [109]:
#we need to define pairwise independent hash functions that map an item from 0 to length - 1
def hashItem(item,seed):
    #returns a 32-bit unsigned int 
    #seed is used to create independence
    hashCode = mmh3.hash(str(item),seed, signed = False)
    #maxValue of 32-bit unsigned int 
    maxValue = 4294967295
    #normalize it between 0 and length-1
    normalize = hashCode/maxValue * (length-1)
    #round ensures even splits
    #round(3.6) = 4 while int(3.6) = 3 
    return round(normalize)

In [110]:
#counting an item
def count(item):
    for i in range(0,noHashFunctions):
        index = hashItem(item = item, seed = i)
        arrays[i][index] += 1

In [111]:
#counting an item
item_1 = 2000
item_2 = "countmin"
count(item_1)
count(item_2)
count(item_2)
print(arrays)

[[0, 0, 0, 1, 2, 0, 0], [0, 0, 2, 1, 0, 0, 0], [0, 0, 2, 0, 1, 0, 0], [0, 3, 0, 0, 0, 0, 0], [0, 1, 0, 0, 2, 0, 0]]


As shown, array 4 has a count collision.

In [112]:
#finding the count of an item
def getCount(item):
    #taking the min reduces the effect of collisions
    minCount = float('inf')

    for i in range(0,noHashFunctions):
        index = hashItem(item = item, seed = i)
        count = arrays[i][index]

        if count < minCount:
            minCount = count
    
    return minCount

In [113]:
print(getCount(item_1))
print(getCount(item_2))

1
2


## 2. Class Definition

Let $||count||_1$ be the sum of all counts stored in the data structure, i.e. the sum of values in one row of the sketch. The central guarantee CMS provides is then the following:<br>
Theorem: With a probability of $1−\delta$, the error is at most $\epsilon∗||count||_1$. Concrete values for these error bounds $\epsilon$ and $\delta$ can be freely chosen by setting:<br>
$w=⌈\frac{e}{\epsilon}⌉$ and $d=⌈ln\frac{1}{\delta}⌉$

### mmh3 version

In [114]:
import mmh3
import math

class CountMinSketch():
    def __init__(self,delta = 0.001,epsilon= 0.005):
        self.length = math.ceil(math.e/epsilon)
        self.noHashFunctions = math.ceil(math.log((1/delta), math.e))
        self.arrays = []
        # stores already seen hashes to avoid recomputation
        self.cache = {}

        for i in range(0,self.noHashFunctions):
            array = [0] * self.length
            self.arrays.append(array)
            
    def hashItem(self,item,seed):
        key = str(item) + " " + str(seed)
        if key in self.cache:
            return self.cache[key]

        hashCode = mmh3.hash(str(item),seed, signed = False)
        maxValue = 4294967295
        normalize = hashCode/maxValue * (self.length-1)
        self.cache[key] = round(normalize)
        return round(normalize)
    
    def count(self, item):
        for i in range(0,self.noHashFunctions):
            index = self.hashItem(item = item, seed = i)
            self.arrays[i][index] += 1
            
    def getCount(self,item):
        minCount = float('inf')

        for i in range(0,self.noHashFunctions):
            index = self.hashItem(item = item, seed = i)
            count = self.arrays[i][index]

            if count < minCount:
                minCount = count

        return minCount

In [115]:
cms = CountMinSketch()
print(cms.length,cms.noHashFunctions)

544 7


In [116]:
#counting an item
item_1 = 2000
item_2 = "countmin"

In [117]:
cms.count(item_1)

for i in range(0,20):
    cms.count(item_2)

In [118]:
print(cms.getCount(item_1))
print(cms.getCount(item_2))

1
20


### Rolling polynomial hashing version

Assumptions:<br>
- All strings lowercase<br>
- Numbers(0-9) and Letters(a-z) only 

In [119]:
import string
import random

# this will enable us to get different independent mappings for different seeds
def randomMap(seed):
    mapping = {}
    characters =  string.ascii_lowercase + '0123456789'
    characters = list(characters)
    random.Random(seed).shuffle(characters)

    for i,char in enumerate(characters):
        mapping[char] = i + 1

    return mapping

# will return integer value to be used in the polynomial rolling algorithm
def generateAsciiMapping(item,seed):
    mapping = randomMap(seed)

    if not item.isalnum():
        raise Exception("Not alphanumeric")
        
    l = list(item)
    n = []
    
    for char in l:
        char = char.lower()
        n.append(mapping[char])
    
    return n

We have 36 characters. Therefore we need a prime > 36. To be able to map these characters to the arrays in our count min sketch, we also need a mod which is equal to the size of each array we use 

hash(s) = $(\sum_{i= 0} ^ {n-1} s[i].p^i)$mod m

In [120]:
# polynomial rolling hash algorithm
def generatePolyhash(string,mod,seed):
    p_power = 1
    prime = 39
    string = str(string)
    mapToInteger = generateAsciiMapping(string,seed)
    value = 0
    
    for no in mapToInteger:
        value = (value + (no * p_power)) % mod
        p_power = (p_power * prime) % mod 
    
    return value

All this was exported to a file called polyhash.py

In [121]:
import polyhash 
import math

class CountMinSketchPoly():
    def __init__(self,delta = 0.001,epsilon= 0.005):
        self.length = math.ceil(math.e/epsilon)
        self.noHashFunctions = math.ceil(math.log((1/delta), math.e))
        self.arrays = []
        # stores already seen hashes to avoid recomputation
        self.cache = {}

        for i in range(0,self.noHashFunctions):
            array = [0] * self.length
            self.arrays.append(array)
            
    def hashItem(self,item,seed):
        key = str(item) + " " + str(seed)
        if key in self.cache:
            return self.cache[key]
        
        hashCode = polyhash.generatePolyhash(str(item),self.length,seed)
        self.cache[key] = hashCode
        return hashCode
    
    def count(self, item):
        for i in range(0,self.noHashFunctions):
            index = self.hashItem(item = item, seed = i)
            self.arrays[i][index] += 1
            
            
    def getCount(self,item):
        minCount = float('inf')

        for i in range(0,self.noHashFunctions):
            index = self.hashItem(item = item,seed = i)
            count = self.arrays[i][index]

            if count < minCount:
                minCount = count

        return minCount

In [122]:
cmsPoly = CountMinSketchPoly()
print(cmsPoly.length,cmsPoly.noHashFunctions)

544 7


In [123]:
#counting an item
item_1 = 2000
item_2 = "countmin"

In [124]:
cmsPoly.count(item_1)

for i in range(0,20):
    cmsPoly.count(item_2)

In [125]:
print(cmsPoly.getCount(item_1))
print(cmsPoly.getCount(item_2))

1
20


# 3 . Analysis on real data

We will compare our approximation algorithms with the exact frequency counting algorithm on the webdocs dataset, which contains a high number of distinct items and may use a lot of space

In [126]:
# EXEMPLARY SOLUTION HOMEWORK 1
import sys
import os
from collections import defaultdict

def exact_algorithm(file_path):
    """Run exact frequent items algorithm"""
    M = defaultdict(int)
    dataset_size = 0
    output = list()

    with open(file_path, "r") as dataset:
        # Process the transactions as we get them
        for transaction in dataset:
            dataset_size += 1
            # Get the transaction as list of ints
            transaction = [
                int(item) for item in transaction.split() if item.isnumeric()
            ]
            # Using defaultdict properties of initializing the value for a
            # non-existing key to 0
            for item in transaction:
                M[item] += 1

    for key in M:
            output.append((key, M[key]))

    return output

In [127]:
import sys
import os
from collections import defaultdict

def cms_algorithm(file_path,hashFunction,delta,epsilon):
    dataset_size = 0
    output = list()
    M = set()
    cms = CountMinSketch(delta,epsilon)
    
    if hashFunction == "poly":
        cms = CountMinSketchPoly(delta,epsilon)

    print("Using only" ,cms.noHashFunctions, "by",cms.length, "entries")
    
    with open(file_path, "r") as dataset:
        # Process the transactions as we get them
        for transaction in dataset:
            dataset_size += 1
            # Get the transaction as list of ints
            transaction = [
                int(item) for item in transaction.split() if item.isnumeric()
            ]

            for item in transaction:
                cms.count(item)
                M.add(item)

    for key in M:
        output.append((key, cms.getCount(key)))

    return output

In [128]:
file = "./accidents.dat"

In [129]:
output_exact = exact_algorithm(file)

In [130]:
def runTests(delta, epsilon):
    output_cms_mmh3 = cms_algorithm(file,"mmh3",delta,epsilon)
    output_cms_poly = cms_algorithm(file,"poly",delta,epsilon)
    return output_cms_mmh3,output_cms_poly 

In [131]:
def avg_difference(a,b):
    difference = 0
    index = 0
    length = len(a)
    
    for i in range(0, length):
        if a[i][0] != b[i][0]:
            raise Exception("Keys do not match")
        else:
            difference += abs(a[i][1] - b[i][1])
            
    # average difference
    return difference/length

### delta = 0.1, epsilon = 0.05

In [132]:
output_cms_1,output_poly_1 = runTests(0.1,0.05) 

Using only 3 by 55 entries
Using only 3 by 55 entries


In [133]:
print(avg_difference(output_exact,output_cms_1))
print(avg_difference(output_exact,output_poly_1))

72014.64957264958
64423.11965811966


### delta = 0.1, epsilon = 0.005

In [134]:
output_cms_2,output_poly_2 = runTests(0.1,0.005) 

Using only 3 by 544 entries
Using only 3 by 544 entries


In [135]:
print(avg_difference(output_exact,output_cms_2))
print(avg_difference(output_exact,output_poly_2))

232.25
434.38247863247864


### delta = 0.1, epsilon = 0.0005

In [136]:
output_cms_3,output_poly_3 = runTests(0.1,0.0005) 

Using only 3 by 5437 entries
Using only 3 by 5437 entries


In [137]:
print(avg_difference(output_exact,output_cms_3))
print(avg_difference(output_exact,output_poly_3))

0.0
0.0
