In [8]:
# IMPORTS
import gzip
import random
import math
from operator import itemgetter


In [29]:
# EXACT ALGORITHM
def exact_freq_items(thres, data):
    """Compute the exact frequency of every item in a gzipped transaction file.

    Each line of the file is treated as one transaction made of
    whitespace-separated item tokens. The support of an item is the number
    of transactions that contain it; its frequency is that support divided
    by the number of lines read (blank lines count as empty transactions).

    Side effect: prints "Size of dataset: <number of transactions>".

    Args:
        thres: minimum frequency (inclusive) an item needs to be reported.
        data: path to a gzip-compressed text file, one transaction per line.

    Returns:
        A list of (item, frequency) tuples, with item converted to int,
        ordered by frequency descending and, on ties, by item ascending.

    Raises:
        ValueError: if a reported item token cannot be converted to int.
    """
    # support[token] -> number of transactions containing that token
    support = {}
    # number of transactions, counted while streaming the file
    dsize = 0

    with gzip.open(data, 'rt') as reader:
        for transaction in reader:
            dsize += 1

            # split() drops surrounding whitespace and the trailing newline.
            # set() makes an item that is repeated inside one transaction
            # count once, so a frequency can never exceed 1.
            for item in set(transaction.split()):
                support[item] = support.get(item, 0) + 1

    # keep only the items whose frequency reaches the threshold
    output = []
    for item, count in support.items():
        freq = count / dsize
        if freq >= thres:
            output.append((int(item), freq))

    # frequency descending, ties broken by ascending item value
    output.sort(key=lambda pair: (-pair[1], pair[0]))

    # print size of dataset
    print("Size of dataset: " + str(dsize))

    return output


In [30]:
# Threshold 0.0 keeps every item, so this prints the full frequency table.
print(exact_freq_items(thres=0.0, data="./data/kosarak.txt.gz"))

Size of dataset: 2970006
[(6, 0.6074472576823077), (3, 0.4545758493417185), (11, 0.36774168132993673), (1, 0.19951676865299262), (218, 0.08949274849949798), (7, 0.08777558025135303), (4, 0.07888569922080965), (27, 0.07286247906569886), (148, 0.07062814014517142), (55, 0.06607259379273982), (64, 0.04874939646586573), (77, 0.043892840620523996), (2, 0.04336051846359906), (138, 0.03548174650152222), (294, 0.03257266146937077), (83, 0.03253730800543837), (316, 0.028133276498431316), (40, 0.02432520338342751), (136, 0.023918133498720205), (446, 0.023505003020195918), (490, 0.023459548566568553), (69, 0.022859549778687316), (90, 0.022284803465043506), (215, 0.0218100569493799), (205, 0.02158884527506005), (303, 0.02146056270593393), (278, 0.02031713067246329), (438, 0.01910097151318886), (73, 0.01893834557909984), (87, 0.018670669352183127), (269, 0.018528245397484044), (135, 0.018497942428399135), (987, 0.017983802052925147), (737, 0.017823196316775116), (273, 0.017350469999050506), (56, 0.

In [26]:
# Sampling algorithm
def sample_freq_items(ssize, dsize, delta, thres, data):
    """Estimate item frequencies from a random sample of transactions.

    Draws `ssize` distinct line indices uniformly from range(dsize), reads
    exactly those lines of the gzipped file in a single pass, and reports
    every item whose sample frequency is at least `thres - eps / 2`.

    eps is computed as sqrt(2 * (ds + ln(1 / delta)) / ssize), where
    ds = floor(log2(max_t)) + 1 and max_t is the largest number of distinct
    items in any sampled transaction (ds is 0 when max_t is 0).

    Args:
        ssize: number of transactions to sample; must be positive.
        dsize: number of transactions in the file. Indices are drawn from
            range(dsize), so if the file has fewer lines some sampled
            indices are never reached and frequencies are underestimated.
        delta: value used in the ln(1 / delta) term of eps
            (presumably a failure probability in (0, 1) - confirm).
        thres: frequency threshold; items are kept from thres - eps / 2 up.
        data: path to a gzip-compressed text file, one transaction per line.

    Returns:
        (eps, output) where output is a list of (item, sample frequency)
        tuples, item converted to int, ordered by frequency descending and,
        on ties, by item ascending.

    Raises:
        ValueError: if ssize is not positive, if ssize exceeds dsize
            (raised by random.sample), or if a reported item token cannot
            be converted to int.
    """
    if ssize <= 0:
        raise ValueError("ssize must be a positive integer")

    # Same draw as sorting the sample and walking it, but set membership is
    # O(1) per line and there is no list.pop(0) bookkeeping to get wrong.
    wanted = set(random.sample(range(dsize), k=ssize))
    remaining = ssize

    support = {}
    max_t = 0
    with gzip.open(data, 'rt') as reader:
        for index, transaction in enumerate(reader):
            if index not in wanted:
                continue

            # set(): an item repeated inside one transaction counts once
            items = set(transaction.split())
            max_t = max(max_t, len(items))
            for item in items:
                support[item] = support.get(item, 0) + 1

            # stop only after the last sampled transaction has been counted
            remaining -= 1
            if remaining == 0:
                break

    # For n >= 1, n.bit_length() == floor(log2(n)) + 1 exactly (no float
    # rounding); for n == 0 it is 0, which avoids taking log(0).
    ds = max_t.bit_length()
    eps = math.sqrt(2 * (ds + math.log(1 / delta)) / ssize)

    cutoff = thres - eps / 2
    output = []
    for item, count in support.items():
        freq = count / ssize
        if freq >= cutoff:
            output.append((int(item), freq))

    # frequency descending, ties broken by ascending item value
    output.sort(key=lambda pair: (-pair[1], pair[0]))

    return eps, output






In [31]:
# Name of the dataset and its number of transactions (0 when unknown).
dataset = 'kosarak'
dsize = {'accidents': 3401830, 'kosarak': 2970006}.get(dataset, 0)

# NOTE(review): the file path below is fixed to kosarak and does not follow
# `dataset` - confirm this is intended before switching datasets.
eps, output = sample_freq_items(
    ssize=2000,
    dsize=dsize,
    delta=0.1,
    thres=0,
    data="./data/kosarak.txt.gz",
)
print(eps)
print(output)

0.11091701895107912
[(6, 0.612), (3, 0.4715), (11, 0.378), (1, 0.21), (7, 0.098), (27, 0.0895), (218, 0.082), (4, 0.0805), (55, 0.0685), (148, 0.0635), (77, 0.05), (64, 0.0415), (2, 0.039), (294, 0.038), (138, 0.036), (83, 0.035), (303, 0.029), (316, 0.0275), (446, 0.0265), (205, 0.026), (69, 0.0255), (215, 0.0255), (40, 0.025), (135, 0.0245), (73, 0.024), (490, 0.023), (136, 0.022), (278, 0.0215), (87, 0.021), (438, 0.02), (90, 0.0195), (269, 0.019), (56, 0.0185), (273, 0.0185), (987, 0.018), (49, 0.0175), (504, 0.0175), (378, 0.0155), (737, 0.0155), (314, 0.015), (91, 0.0145), (512, 0.0145), (321, 0.014), (897, 0.014), (934, 0.014), (423, 0.0135), (155, 0.013), (361, 0.013), (14, 0.012), (86, 0.012), (492, 0.012), (85, 0.0115), (435, 0.011), (632, 0.011), (644, 0.0105), (667, 0.0105), (670, 0.0105), (1644, 0.0105), (32, 0.01), (265, 0.01), (678, 0.01), (25, 0.0095), (28, 0.0095), (71, 0.0095), (281, 0.0095), (747, 0.0095), (254, 0.009), (590, 0.009), (787, 0.009), (993, 0.009), (145,