In [1]:
import numpy as np
import time
import os
import random as rnd
from collections import defaultdict

In [3]:
class Bloom_Filter:
    """Bloom filter backed by a numpy boolean array.

    Parameters
    ----------
    size : int
        Number of slots in the underlying array.
    hash_functions : iterable of callables
        Each callable maps an element to an integer; the integer is reduced
        modulo ``size`` to select the slot to set or test.

    ``check`` may return ``True`` for an element that was never inserted
    (false positive) but never returns ``False`` for an inserted element.
    """

    def __init__(self, size, hash_functions):
        # np.zeros, not np.empty: np.empty returns uninitialised memory, so a
        # brand-new filter could already contain True slots and report
        # elements that were never inserted.
        self._array = np.zeros(size, dtype = bool)
        self._hash_functions = hash_functions

    def _index(self, function, element):
        """Return the array slot selected by ``function`` for ``element``."""
        # Wrapping keeps every hash value inside the array, so the filter size
        # does not have to match the range of the hash functions exactly.
        return function(element) % self._array.size

    def insert(self, element):
        """Mark every slot selected by the hash functions for ``element``."""
        for function in self._hash_functions:
            self._array[self._index(function, element)] = True

    def check(self, element):
        """Return True if every slot selected for ``element`` is set."""
        return all(
            self._array[self._index(function, element)]
            for function in self._hash_functions
        )

In [4]:
# Count the lines in the first data set. The `with` block guarantees the file
# is closed even if reading raises part-way through.
with open("passwords1.txt", "r") as passwords:
    counter = sum(1 for _ in passwords)
print(counter)

100000000


In [5]:
# Find the smallest and largest character code among the first 19 characters
# of each of the first 1,000,000 lines of the data set.
# NOTE(review): the 19-character cut presumably excludes the trailing newline;
# confirm against the data format.

# Both bounds start at 102 (ord('f')) and are widened as characters are seen.
minimum = 102
maximum = 102

with open("passwords1.txt", "r") as passwords:
    for _ in range(1000000):
        string = passwords.readline()
        if not string:
            # End of file reached before the sample size; nothing left to scan.
            break
        for character in string[:19]:
            code = ord(character)
            if code < minimum:
                minimum = code
            if code > maximum:
                maximum = code

print(minimum, chr(minimum))
print(maximum, chr(maximum))

33 !
122 z


In [8]:
# Tally how often each character code occurs among the first 19 characters of
# each of the first 1,000,000 lines. Slot i of `counter` holds the count for
# chr(i + 33); 33 and 122 are the extreme codes printed by the previous scan.
counter = [0] * (122 - 33 + 1)

with open("passwords1.txt", "r") as passwords:
    for _ in range(1000000):
        string = passwords.readline()
        if not string:
            # End of file reached before the sample size; nothing left to tally.
            break
        for character in string[:19]:
            counter[ord(character) - 33] += 1

In [9]:
# Display the per-character tallies; index i corresponds to chr(i + 33).
counter

[226536,
 226375,
 226357,
 226105,
 226044,
 226000,
 226268,
 226404,
 225767,
 225890,
 225885,
 226388,
 226831,
 225541,
 225986,
 226636,
 225616,
 227077,
 226304,
 227385,
 226377,
 225768,
 226336,
 226474,
 226330,
 226024,
 226416,
 226617,
 226811,
 226216,
 226053,
 226097,
 225798,
 226659,
 225852,
 226279,
 226296,
 226135,
 226755,
 226109,
 226002,
 225869,
 226628,
 225940,
 226091,
 226075,
 225593,
 225928,
 225867,
 226701,
 225958,
 226349,
 226193,
 226762,
 225935,
 226347,
 226287,
 226075,
 0,
 0,
 0,
 0,
 0,
 0,
 226032,
 225611,
 226913,
 226309,
 225951,
 226027,
 225492,
 226486,
 225835,
 225963,
 226979,
 226746,
 225956,
 226440,
 226128,
 225894,
 225349,
 225799,
 226374,
 226345,
 225788,
 225929,
 225759,
 225880,
 226971,
 225647]

In [10]:
def get_base_10(character):
    """Convert a single character into its integer digit value.

    Code points below 91 are shifted down by 33 and all others by 39, so
    '!' maps to 0, 'Z' to 57, 'a' to 58 and 'z' to 83.
    """
    code_point = ord(character)
    # The larger shift for codes >= 91 skips the six codes 91..96.
    shift = 33 if code_point < 91 else 39
    return code_point - shift

In [11]:
def hash_1(string):
    """Polynomial hash of ``string`` in base 84, reduced modulo 958505838.

    Characters are consumed from the last to the first, each converted to a
    digit with ``get_base_10``. The empty string hashes to 0.
    """
    accumulator = 0
    for character in reversed(string):
        accumulator = (accumulator * 84 + get_base_10(character)) % 958505838
    return accumulator

In [12]:
def rotate_string(string, step):
    """Return ``string`` with its first ``step`` characters moved to the end."""
    head = string[:step]
    tail = string[step:]
    return tail + head

In [13]:
def hash_function(first_hash_function, k):
    """Build the k-th hash function of a family derived from one base hash.

    Parameters
    ----------
    first_hash_function : callable
        Base hash applied to the rotated string.
    k : int
        1-based position in the family; the input is rotated left by
        ``k - 1`` characters before hashing, so ``k == 1`` is the base hash.

    Returns
    -------
    callable
        Function mapping a string to ``first_hash_function`` of its rotation.
    """
    def rotated_hash(x):
        # Use the hash that was passed in; the previous version ignored the
        # argument and always called hash_1.
        return first_hash_function(rotate_string(x, k - 1))

    return rotated_hash

In [14]:
# The seven hash functions used by the filter: hash_1 applied to the input
# rotated by 0, 1, ..., 6 characters (k = 1..7).
hash_functions = [hash_function(hash_1, k) for k in range(1, 8)]

In [15]:

def task(first_data_set, second_data_set, m, hash_functions):
    """Find lines of the second file that the Bloom filter says are in the first.

    Parameters
    ----------
    first_data_set : str
        Path of the file whose lines are inserted into the filter.
    second_data_set : str
        Path of the file whose lines are tested against the filter.
    m : int
        Size passed to ``Bloom_Filter``.
    hash_functions : iterable of callables
        Hash functions passed to ``Bloom_Filter``.

    Returns
    -------
    tuple
        ``(possibly_duplicates, elapsed_seconds)``: the lines of the second
        file that the filter reports as present, in file order, and the
        wall-clock time spent inserting and checking.
    """
    bloom_filter = Bloom_Filter(m, hash_functions)

    with open(first_data_set, "r") as strings:
        start = time.time()
        for line in strings:
            # rstrip("\n") removes only the line terminator; slicing off the
            # last character would truncate a final line that lacks one.
            bloom_filter.insert(line.rstrip("\n"))

    possibly_duplicates = []
    with open(second_data_set, "r") as strings:
        for line in strings:
            candidate = line.rstrip("\n")
            if bloom_filter.check(candidate):
                possibly_duplicates.append(candidate)
        end = time.time()

    return((possibly_duplicates, end - start))

In [13]:
# The Bloom-filter pass is expensive (see the recorded execution time below),
# so its result is cached in possibly_duplicates.txt: the first line holds the
# elapsed seconds, every following line one candidate password.
if not os.path.isfile("possibly_duplicates.txt"):
    result = task("passwords1.txt", "passwords2.txt", 958505838, hash_functions)
    # Write to a temporary file and rename it into place, so an interrupted
    # write can never leave a truncated cache that a later run would load as
    # if it were complete.
    with open("possibly_duplicates.txt.tmp", "w") as f:
        f.write(str(result[1]) + "\n")
        for password in result[0]:
            f.write(password + "\n")
    os.replace("possibly_duplicates.txt.tmp", "possibly_duplicates.txt")
else:
    with open("possibly_duplicates.txt", "r") as f:
        result = [[], 0]
        result[1] = float(f.readline())
        for line in f:
            result[0].append(line.rstrip("\n"))


print('Number of hash functions used: ', len(hash_functions))
print('Number of possibly duplicates: ', len(result[0]))
# The probability below is a fixed literal, not a value computed in this cell.
print('Probability of false positives: 0.01')
print('Execution time: ', result[1])

Number of hash functions used:  7
Number of possibly duplicates:  14261334
Probability of false positives: 0.01
Execution time:  6857.633438587189


In [15]:
def hash_dictionary(list_of_data, hash_function):
    """Group the elements of ``list_of_data`` by their hash value.

    Returns a ``defaultdict(list)`` mapping ``hash_function(element)`` to the
    list of elements with that hash, in their original order.
    """
    buckets = defaultdict(list)
    for item in list_of_data:
        key = hash_function(item)
        buckets[key].append(item)
    return buckets

In [16]:
# Bucket the Bloom-filter candidates by their hash_1 value.
possibly_duplicates_dict = hash_dictionary(list_of_data=result[0], hash_function=hash_1)

In [22]:
# Remove from the candidate buckets every password that really occurs in the
# first data set; whatever remains was flagged by the filter without being
# present, i.e. a false positive.
with open("passwords1.txt", "r") as f:
    for line in f:
        # Strip only the line terminator so a final line without a newline
        # is not truncated.
        string = line.rstrip("\n")
        hash_value = hash_1(string)
        # .get() instead of indexing: indexing a defaultdict would insert an
        # empty bucket for every hash that has no candidates.
        bucket = possibly_duplicates_dict.get(hash_value)
        if bucket and string in bucket:
            # Drop every occurrence: a candidate listed several times is a
            # true duplicate each time, not just the first.
            bucket[:] = [candidate for candidate in bucket if candidate != string]

false_positives = []
for elements in possibly_duplicates_dict.values():
    false_positives.extend(elements)

print("Number of false positives: " + str(len(false_positives)))

Number of false positives: 261334
