In [1]:
from bitarray import bitarray
import math, mmh3, pyhash
from fnvhash import fnv1a_32 as fnvh
from pyhashxx import hashxx

In [2]:
def cal_value_k(n, m):
    """Return the number of hash functions suggested for a Bloom filter.

    Computes ``k = (n / m) * ln(2)`` rounded to the nearest integer. In this
    notebook ``n`` is the length of the bit array and ``m`` is the number of
    items inserted (note this is the reverse of the more common textbook
    naming).

    Args:
        n: Number of bits in the filter.
        m: Number of items stored in the filter.

    Returns:
        The rounded value of ``k``, never less than 1.

    Raises:
        ZeroDivisionError: If ``m`` is 0.
    """
    k = round((n / m) * math.log(2))
    # A filter needs at least one hash function; for small n/m ratios the
    # rounded optimum would otherwise collapse to 0.
    return max(1, k)

In [3]:
def initialization(input_set, bits_per_item=8):
    """Allocate a zeroed bit array sized for ``input_set``.

    Args:
        input_set: Sized collection of the items that will be inserted; only
            its length is used here.
        bits_per_item: Number of bits allocated per item. Defaults to 8,
            the value this notebook originally hard-coded.

    Returns:
        Tuple ``(b, n, m)`` where ``b`` is a ``bitarray`` of length ``n`` with
        every bit cleared, ``n = m * bits_per_item`` and ``m = len(input_set)``.

    Raises:
        ValueError: If ``bits_per_item`` is smaller than 1.

    Side effects:
        Prints ``n`` and ``m``.
    """
    if bits_per_item < 1:
        raise ValueError("bits_per_item must be at least 1")
    m = len(input_set)
    n = m * bits_per_item
    b = bitarray(n)
    # bitarray(n) is explicitly cleared before use.
    b.setall(0)
    print("n:", n)
    print("m:", m)
    return b, n, m

In [4]:
def reading_input_set(file):
    """Build the Bloom filter from the lines of ``file``.

    Each line is used verbatim (its line terminator is kept) and hashed with
    six hash functions; the bit at every resulting position is set.

    Args:
        file: Path of the text file holding the member set, one entry per
            line. It is decoded as ISO-8859-1.

    Returns:
        Tuple ``(b, n, m, k, temp)``: the populated bit array, its length, the
        number of lines read, the value returned by ``cal_value_k(n, m)`` and
        a dict keyed by every line read (used for exact membership checks).
    """
    S = []
    temp = dict()
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(file, 'r', encoding='ISO-8859-1') as text:
        for line in text:
            temp[line] = line
            S.append(line)

    b, n, m = initialization(S)
    # NOTE: k is only reported to the caller; the loop below always applies
    # the same six hash functions regardless of its value.
    k = cal_value_k(n, m)

    # Build the pyhash hashers once instead of once per line.
    super_fast = pyhash.super_fast_hash()
    spooky = pyhash.spooky_32()
    farm = pyhash.farm_32()

    for line in S:
        encoded = line.encode()
        positions = (
            mmh3.hash(line, 31) % n,
            fnvh(encoded) % n,
            hashxx(encoded) % n,
            super_fast(line) % n,
            spooky(line) % n,
            farm(line) % n,
        )
        for position in positions:
            b[position] = 1
    return b, n, m, k, temp

In [5]:
def processing_input_stream(stream, b, n, m, k, temp):
    """Query the Bloom filter with every line of ``stream`` and print statistics.

    A line whose six bit positions are all set is treated as a filter hit; it
    counts as a false positive when it is absent from ``temp``. A hit that is
    present in ``temp`` is counted in neither tally. Any other line counts as
    a true negative.

    Args:
        stream: Path of the text file to test, one entry per line, decoded as
            ISO-8859-1.
        b: Bit array of the filter.
        n: Length of ``b``; used as the modulus for every hash.
        m: Number of items that were inserted into the filter.
        k: Number of hash functions used in the theoretical formula and
            reported as the optimal value. The lookup itself always uses six.
        temp: Container of the inserted lines, used for exact membership.

    Returns:
        None. Results are printed.

    Raises:
        ZeroDivisionError: If ``n`` is 0.
    """
    true_negatives, false_positives = 0, 0

    # Build the pyhash hashers once instead of once per line.
    super_fast = pyhash.super_fast_hash()
    spooky = pyhash.spooky_32()
    farm = pyhash.farm_32()

    # ``with`` guarantees the file is closed even if hashing raises.
    with open(stream, 'r', encoding='ISO-8859-1') as data:
        for line in data:
            encoded = line.encode()
            positions = (
                mmh3.hash(line, 31) % n,
                fnvh(encoded) % n,
                hashxx(encoded) % n,
                super_fast(line) % n,
                spooky(line) % n,
                farm(line) % n,
            )
            if all(b[position] == 1 for position in positions):
                if line not in temp:
                    false_positives += 1
            else:
                true_negatives += 1

    print("No. of False Positives: ", false_positives)
    print("No. of True Negatives: ", true_negatives)
    # Guard the ratio: a stream with no non-member lines has no negatives,
    # so the rate is reported as 0 instead of dividing by zero.
    non_members = false_positives + true_negatives
    actual_rate = (false_positives / non_members) if non_members else 0.0
    print('Actual False Positive Rate ='+str(round(actual_rate*100, 2))+"%")

    percentage_false_positives = (1 - (math.e)**(-k*m/n))**k
    print("% of false positives according to formula:", round(percentage_false_positives *100, 2))
    print('Optimal Value for k is:', k)

In [6]:
if __name__ == "__main__":
    # File whose lines are inserted into the Bloom filter.
    input_file = 'listed_username_30.txt'
    # File whose lines are tested against the filter.
    stream_file = 'listed_username_365.txt'
    # Build the filter; b, n, m, k and temp stay available as notebook-level names.
    b, n, m, k, temp = reading_input_set(input_file)
    # Query the filter and print the observed and theoretical false-positive rates.
    processing_input_stream(stream_file, b, n, m, k, temp)

n: 2267192
m: 283399
No. of False Positives:  45542
No. of True Negatives:  2095606
Actual False Positive Rate =2.13%
% of false positives according to formula: 2.16
Optimal Value for k is: 6
