In [1]:
import numpy as np
import time
import os
import random as rnd
from collections import defaultdict

In [2]:
# This class is going to represent a bloom filter, so that we can organize all the implementation and methods
# of the data structure in a single class.
class Bloom_Filter:
    
    # To the constructor we're going to pass the size of the array representing the bloom filter
    # and the list of hash functions that will be used for our methods
    def _init_(self, size, hash_functions):
        self._array = np.empty(size, dtype = bool)
        self._hash_functions = hash_functions
    
    # This function is for adding elements to the bloom filter
    def insert(self, element):
        for function in self._hash_functions:
            self._array[function(element)] = True
            
    # This function is for checking if an element is possibly on the bloom filter or definitely not in it.
    # It returns True if the element is possibly on it, False if it's definetely not on it.
    def check(self, element):
        for function in self._hash_functions:
            if(not self._array[function(element)]):
                return(False)
        return(True)

In [3]:
passwords = open("passwords1.txt", "r")
# It's worth noting that when Python opens a file it's not going to save it in memory
# so we are not cheating on our task by just opening the file if we don't read it all at once

counter = 0
while(passwords.readline()):
    counter = counter + 1
passwords.close()
print(counter)

100000000


In [4]:
passwords = open("passwords1.txt", "r")

# We're going to save the minimum as well the maximum possible character in our file (characters are ordered by their ASCII code)
minimum = 102
maximum = 102

# We're going to look at only the first 1'000'000 entries of the file so to speed up the process
# implicitly assuming that the underlying distribution is homogenous throughout the file
for _ in range(1000000):
    string = passwords.readline()
    for character in string[:19]: # It's important we get rid of the last character, which is always a "\n"
        if(ord(character) < minimum):
            minimum = ord(character)
        if(ord(character) > maximum):
            maximum = ord(character)

print(minimum, chr(minimum))
print(maximum, chr(maximum))
passwords.close()

33 !
122 z


In [5]:
passwords = open("passwords1.txt", "r")

# Here we're going to save how many times a character appears on the file, at position i will be the number of times
# chr(i + 33) appeared
counter = [0] * (122 - 33 + 1)

# Again we're just looking at the first 1'000'000 to speed up the process
for _ in range(1000000):
    string = passwords.readline()
    for character in string[:19]:
        counter[ord(character) - 33] += 1

passwords.close()

In [6]:
counter

[226536,
 226375,
 226357,
 226105,
 226044,
 226000,
 226268,
 226404,
 225767,
 225890,
 225885,
 226388,
 226831,
 225541,
 225986,
 226636,
 225616,
 227077,
 226304,
 227385,
 226377,
 225768,
 226336,
 226474,
 226330,
 226024,
 226416,
 226617,
 226811,
 226216,
 226053,
 226097,
 225798,
 226659,
 225852,
 226279,
 226296,
 226135,
 226755,
 226109,
 226002,
 225869,
 226628,
 225940,
 226091,
 226075,
 225593,
 225928,
 225867,
 226701,
 225958,
 226349,
 226193,
 226762,
 225935,
 226347,
 226287,
 226075,
 0,
 0,
 0,
 0,
 0,
 0,
 226032,
 225611,
 226913,
 226309,
 225951,
 226027,
 225492,
 226486,
 225835,
 225963,
 226979,
 226746,
 225956,
 226440,
 226128,
 225894,
 225349,
 225799,
 226374,
 226345,
 225788,
 225929,
 225759,
 225880,
 226971,
 225647]

In [7]:
def get_base_10(character):
    value = ord(character)
    
    # We remember that values ranging from 91 to 96 do not appear
    if(value < 91):
        return(value - 33)
    else:
        return(value - 39)

In [8]:
def hash_1(string):
    value  = 0
    for index in range(len(string) - 1, -1, -1):
        value = (84 * value + get_base_10(string[index])) % 958505838
    return(value)

In [9]:
def rotate_string(string, step):
    return(string[step:] + string[:step])


In [10]:
def hash_function(first_hash_function, k):
    
    # Our k-th hash function will just apply hash_1 to the string rotated by k steps
    return(lambda x : hash_1(rotate_string(x, k - 1)))

In [11]:
hash_functions = [hash_function(hash_1, index + 1) for index in range(7)]

In [12]:
# The function takes as parameters the name of the file containing the first data set, the name of the file 
# containing the second data set, the size m of the array used to represen the bloom filter and
# the list of hash functions used by the bloom filter

# The function returns the number of strings from the second data set that are possibly contained in the first data set
# and the execution time for finding this number
def task(first_data_set, second_data_set, m, hash_functions):
    
    # We initialize our bloom filter
    bloom_filter = Bloom_Filter(m, hash_functions)
    
    # We add every string in the first data set to the bloom filter
    strings = open(first_data_set, "r")
    start = time.time()
    while(True):
        string = strings.readline()
        if(string == ""):
            break
        string = string[:len(string) - 1] # We need to get rid of the "\n" at the end
        bloom_filter.insert(string)
    strings.close()
    
    # We now check how many strings from the second data set are probably on the first data set
    # and we also create a list containing this possibly duplicates
    strings = open(second_data_set, "r")
    possibly_duplicates = []
    while(True):
        string = strings.readline()
        if(string == ""):
            break
        string = string[:len(string) - 1]
        if(bloom_filter.check(string)):
            possibly_duplicates.append(string)
    end = time.time()
    strings.close()
    
    return((possibly_duplicates, end - start))


In [13]:
if(not os.path.isfile("possibly_duplicates.txt")):
    result = task("passwords1.txt", "passwords2.txt", 958505838, hash_functions)
    f = open("possibly_duplicates.txt", "w")
    f.write(str(result[1]) + "\n")
    for password in result[0]:
        f.write(password + "\n")
    f.close()
else:
    f = open("possibly_duplicates.txt", "r")
    result = [[], 0]
    result[1] = float(f.readline())
    while(True):
        string = f.readline()
        if(string == ""):
            break
        string = string[:len(string) - 1]
        result[0].append(string)
    f.close()
# We print the asked results
print('Number of hash functions used: ', len(hash_functions))
print('Number of possibly duplicates: ', len(result[0]))
print('Probability of false positives: 0.01')
print('Execution time: ', result[1])

TypeError: Bloom_Filter() takes no arguments