In [130]:
import numpy as np
import urllib.request

In [3]:
!pip install mmh3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mmh3
  Downloading mmh3-3.0.0-cp38-cp38-manylinux2010_x86_64.whl (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.0/50.0 KB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mmh3
Successfully installed mmh3-3.0.0


In [4]:
import mmh3

In [131]:
eps = 0.002 # bound on realtive error
delta = 0.01

Colum numbers: $\doteq\left\lceil\frac{e}{\epsilon}\right\rceil$

In [132]:
# number of columns
col_num = int(np.ceil(np.exp(1)/eps))
print(col_num)

1360


In [133]:
# number of rows
row_num  = int(np.ceil(np.log(1/delta)))
print(row_num)

5


In [146]:
# Based on: https://github.com/pnxenopoulos/countminsketch
class CountMinSketch(object):
    ''' Class for a CountMinSketch data structure
    '''
    def __init__(self, width, depth, seeds):
        ''' Method to initialize the data structure
        @param width int: Width of the table
        @param depth int: Depth of the table (num of hash func)
        @param seeds list: Random seed list
        '''
        self.width = width
        self.depth = depth
        self.table = np.zeros([depth, width])  # Create empty table
        self.seed = seeds #seeds to define the hash functions

    def increment(self, key):
        ''' Method to add a key to the CMS
        @param key str: A string to add to the CMS
        '''
        for i in range(0, self.depth):
            #print(i,key,self.seed[i])
            index = mmh3.hash(key, self.seed[i]) % self.width
            self.table[i, index] = self.table[i, index]+1

    def estimate(self, key):
        ''' Method to estimate if a key is in a CMS
        @param key str: A string to check
        '''
        vals=[]
        for i in range(0, self.depth):
            index = mmh3.hash(key, self.seed[i]) % self.width
            vals.append(self.table[i, index])
            min_est = int(np.min(vals))
        return min_est

In [136]:
seeds=np.random.randint(1000, size = row_num)
seeds

array([884, 817, 494, 517, 620])

In [149]:
CM= CountMinSketch(width=col_num, depth=row_num, seeds=seeds)

In [150]:
CM.increment('the')

In [151]:
CM.estimate('the')

1

In [140]:
target_url = "https://www.gutenberg.org/cache/epub/1513/pg1513-images.html#sceneI_30.1"

In [152]:
CM= CountMinSketch(width=col_num, depth=row_num, seeds=seeds)

mydict={}
word_count = 0
data = urllib.request.urlopen(target_url)

for line in data:
  line = str(line)
  words = line.split(' ')
  for actword in words:
    word_count=word_count+1
    actword = actword.lower() 
    CM.increment(actword)
    try:
      mydict[actword]
    except:
      mydict[actword]=1
    else:
      mydict[actword]=mydict[actword]+1
print(f'Word count: {word_count}')

Word count: 31939


In [153]:
print(f'Error bound: {word_count*eps}' )

Error bound: 63.878


In [154]:
key = 'romeo'
print(CM.estimate(key),mydict[key])

42 37


In [157]:
key = 'juliet'
print(CM.estimate(key),mydict[key])

23 16


In [158]:
key = 'the'
print(CM.estimate(key),mydict[key])

766 762
