In [17]:
# IMPORTS
import gzip
import random
import math


In [45]:
# EXACT ALGORITHM
def exact_freq_items(thres, data):
    """Find frequent single items with one full pass over the dataset.

    Args:
        thres: Minimum relative support (occurrences divided by number of
            transactions) an item needs in order to be reported.
        data: Path to a gzip-compressed text file holding one transaction
            per line, items separated by whitespace.

    Returns:
        A list of ``(item, relative_support)`` tuples, in order of first
        appearance in the file, for every item whose relative support is
        at least ``thres``.

    Side effects:
        Prints the total number of transactions (lines) read.

    Note:
        Every occurrence of an item is counted, so an item repeated inside
        one transaction contributes more than once.
        NOTE(review): presumably items are unique within a transaction --
        confirm against the dataset.
    """
    counts = {}
    n_transactions = 0

    with gzip.open(data, 'rt') as handle:
        for line in handle:
            n_transactions += 1
            for item in line.split():
                counts[item] = counts.get(item, 0) + 1

    # An empty file leaves `counts` empty, so the division below is never
    # evaluated with a zero denominator.
    frequent = [
        (item, count / n_transactions)
        for item, count in counts.items()
        if count / n_transactions >= thres
    ]

    print(n_transactions)
    return frequent


In [46]:
# Exact baseline: items with relative support >= 0.7 in the accidents dataset.
print(exact_freq_items(thres=0.7, data="./data/accidents.txt.gz"))

3401830
[('8', 0.7027423475011979), ('12', 0.9982891561306708), ('15', 0.8022681909442859), ('16', 0.9771622920604498), ('17', 0.9999059329831297), ('18', 0.9968017214264088), ('21', 0.8890685307613844), ('24', 0.8060749655332571), ('25', 0.759691107433352), ('27', 0.8349506001181717), ('28', 0.7953513256100393), ('29', 0.8795001513891053), ('31', 0.9348321344688006), ('41', 0.7778078269637224), ('43', 0.856586013998348), ('59', 0.72400443290817)]


In [52]:
# Sampling algorithm
def sample_freq_items(ssize, dsize, delta, thres, data):
    """Approximate the frequent single items from a uniform random sample.

    ``ssize`` distinct transaction indices are drawn without replacement
    from ``range(dsize)``; only those transactions are tallied while the
    file is streamed once.

    Args:
        ssize: Number of transactions to sample (positive integer, at most
            ``dsize``).
        dsize: Number of transactions in the dataset. If it exceeds the
            real number of lines in ``data``, sampled indices beyond the
            end of the file are never read, yet supports are still divided
            by ``ssize``.
        delta: Value used in the ``log(1 / delta)`` term of the error bound.
            NOTE(review): presumably the allowed failure probability in
            (0, 1) -- confirm.
        thres: Minimum relative support for an item to count as frequent.
        data: Path to a gzip-compressed text file holding one transaction
            per line, items separated by whitespace.

    Returns:
        A tuple ``(eps, output)``. ``eps`` is computed as
        ``sqrt(2 * (ds + log(1 / delta)) / ssize)`` where ``ds`` is
        ``floor(log2(max_t)) + 1`` and ``max_t`` is the longest sampled
        transaction. ``output`` is a list of ``(item, sample_support)``
        tuples for items whose sample support is at least
        ``thres - eps / 2``.
        NOTE(review): this looks like a VC-dimension-style sample bound --
        confirm against the reference the notebook follows.

    Raises:
        ValueError: If ``ssize`` is not positive, or (from
            ``random.sample``) if ``ssize`` is larger than ``dsize``.
    """
    if ssize <= 0:
        raise ValueError("ssize must be a positive integer")

    # Set membership replaces the sorted list + pop(0) walk. It is O(1) per
    # line and cannot drop the last sampled transaction, which the earlier
    # "pop, then break when empty" loop did.
    sampled = set(random.sample(range(dsize), k=ssize))
    last_index = max(sampled)

    support = dict()
    max_t = 0
    with gzip.open(data, 'rt') as reader:
        for index, transaction in enumerate(reader):
            if index in sampled:
                elements = transaction.split()

                if len(elements) > max_t:
                    max_t = len(elements)

                for element in elements:
                    support[element] = support.get(element, 0) + 1

                # Nothing after the largest sampled index is needed.
                if index == last_index:
                    break

    # For max_t >= 1, bit_length() equals floor(log2(max_t)) + 1 exactly, with
    # no floating-point rounding. It is 0 when every sampled transaction was
    # empty, where math.log(0, 2) would raise.
    ds = max_t.bit_length()
    eps = math.sqrt(2*(ds + math.log(1/delta)) / ssize)

    output = []

    for element, count in support.items():
        # Lower the threshold by eps/2 so that items whose sample support is
        # slightly under `thres` are still reported.
        if count / ssize >= thres - eps/2:
            output.append((element, count / ssize))

    return eps, output






In [54]:
# Sample 2000 of the 3401830 transactions (the count printed by the exact run).
eps, output = sample_freq_items(
    ssize=2000,
    dsize=3401830,
    delta=0.1,
    thres=0.7,
    data="./data/accidents.txt.gz",
)
print(eps, output, sep="\n")

0.09111852222788759
[('8', 0.6915), ('12', 0.997), ('15', 0.813), ('16', 0.9765), ('17', 0.9995), ('18', 0.9945), ('21', 0.891), ('24', 0.809), ('25', 0.7545), ('27', 0.846), ('29', 0.8735), ('43', 0.8605), ('22', 0.675), ('28', 0.7845), ('31', 0.929), ('41', 0.766), ('59', 0.7145), ('1', 0.699)]
