In [1]:
# IMPORTS
import gzip
import random
import math
import time
import timeit
from operator import itemgetter


In [2]:
# EXACT ALGORITHM
def exact_freq_items(thres, data):
    """Compute the exact frequent items of a gzip-compressed transaction file.

    The file is read once, line by line. Each line is treated as one
    transaction made of whitespace-separated item identifiers, and every
    occurrence of an item increments that item's support. An item's
    frequency is its support divided by the number of lines read.

    Parameters
    ----------
    thres : float
        Minimum frequency; items whose frequency is >= thres are reported.
    data : str
        Path to the gzip-compressed text file of transactions.

    Returns
    -------
    list of (int, float)
        (item, frequency) pairs ordered by frequency, highest first; equal
        frequencies are ordered by ascending item value.

    Raises
    ------
    ValueError
        If a reported item identifier cannot be converted with int().

    Side effects
    ------------
    Prints the number of transactions read and the elapsed wall-clock time
    in milliseconds.
    """
    # wall-clock reference point for the runtime report
    began = time.time()

    # item (as read from the file) -> number of occurrences
    counts = {}
    # number of transactions (lines) seen so far
    n_transactions = 0

    with gzip.open(data, 'rt') as handle:
        for line in handle:
            n_transactions += 1
            # str.split() with no argument discards the trailing newline
            # and any repeated whitespace
            for item in line.split():
                counts[item] = counts.get(item, 0) + 1

    # keep only the items that reach the threshold
    frequent = []
    for item, count in counts.items():
        freq = count / n_transactions
        if freq >= thres:
            frequent.append((int(item), freq))

    # one pass: frequency descending, then item value ascending on ties
    frequent.sort(key=lambda pair: (-pair[1], pair[0]))

    # elapsed time in milliseconds, rounded to 3 decimals
    elapsed_ms = round(1000 * (time.time() - began), 3)

    print("Size of dataset: " + str(n_transactions))
    print("Runtime --- %s ms ---" % elapsed_ms)

    return frequent


In [4]:
# Disabled alternative: time a single run through timeit instead of the
# runtime the function prints itself.
#
#   def wrapper(func, *args, **kwargs):
#       def wrapped():
#           return func(*args, **kwargs)
#       return wrapped
#
#   wrapped_exact = wrapper(exact_freq_items, 0.0, "./data/kosarak.txt.gz")
#   elapsed_time = timeit.timeit(wrapped_exact, number=1)
#   print(elapsed_time)

# Exact frequent items of the webdocs dataset at threshold 0.8; the call
# itself also prints the dataset size and runtime before the result.
print(exact_freq_items(thres=0.8, data="./data/webdocs.txt.gz"))

Size of dataset: 3384164
Runtime --- 195594.204 ms ---
[(122, 0.8448319880478605)]


In [5]:
# Sampling algorithm
def sample_freq_items(ssize, dsize, delta, thres, data):
    """Approximate the frequent items of a transaction file from a random sample.

    Draws ``ssize`` distinct line indices uniformly from ``range(dsize)``,
    reads the gzip-compressed file once and counts item occurrences only in
    the transactions at those indices. Each line is one transaction of
    whitespace-separated item identifiers.

    The error bound is computed as

        eps = sqrt(2 * (ds + ln(1 / delta)) / ssize)

    where ``ds = floor(log2(max_t)) + 1`` and ``max_t`` is the number of
    items in the longest sampled transaction (``ds`` is 0 when every sampled
    transaction is empty). An item is reported when its sample frequency
    (support / ssize) is >= ``thres - eps / 2``.

    Parameters
    ----------
    ssize : int
        Number of transactions to sample; must be at least 1.
    dsize : int
        Number of transactions in the file. If the file has fewer lines than
        this, sampled indices beyond the end are never reached while
        frequencies are still divided by ``ssize``.
    delta : float
        Confidence parameter used in the eps formula above.
    thres : float
        Frequency threshold.
    data : str
        Path to the gzip-compressed text file of transactions.

    Returns
    -------
    (float, list of (int, float))
        ``eps`` and the (item, sample frequency) pairs ordered by frequency,
        highest first, ties broken by ascending item value.

    Raises
    ------
    ValueError
        If ``ssize`` is less than 1, if ``ssize`` exceeds ``dsize`` (raised
        by random.sample), or if a reported item identifier cannot be
        converted with int().

    Side effects
    ------------
    Prints the elapsed wall-clock time in milliseconds.
    """
    if ssize < 1:
        raise ValueError("ssize must be at least 1")

    # get start time of algorithm to calculate runtime later
    start_time = time.time()

    # distinct line indices to sample, visited in ascending order so the
    # file can be consumed in a single forward pass
    pending = iter(sorted(random.sample(range(dsize), k=ssize)))
    target = next(pending)

    support = dict()
    max_t = 0
    with gzip.open(data, 'rt') as reader:
        for current, transaction in enumerate(reader):
            if current != target:
                continue

            elements = transaction.split()
            max_t = max(max_t, len(elements))

            for element in elements:
                support[element] = support.get(element, 0) + 1

            # Advance only after the current sample has been counted, and
            # stop reading only once no sampled index is left; this way the
            # last sampled transaction is included and ssize == 1 works.
            target = next(pending, None)
            if target is None:
                break

    # For an integer max_t >= 1, bit_length() equals floor(log2(max_t)) + 1
    # exactly (no floating-point rounding at powers of two); it is 0 when
    # max_t == 0, which avoids taking the logarithm of zero.
    ds = max_t.bit_length()
    eps = math.sqrt(2 * (ds + math.log(1 / delta)) / ssize)

    # keep items whose sample frequency reaches the relaxed threshold
    output = []
    cutoff = thres - eps / 2
    for element, count in support.items():
        freq = count / ssize
        if freq >= cutoff:
            output.append((int(element), freq))

    # frequency descending, ties broken by ascending item value
    output.sort(key=lambda pair: (-pair[1], pair[0]))

    # get runtime
    # multiply by 1000 to return value in ms and round to 3 decimals
    final_time = round(1000 * (time.time() - start_time), 3)

    # print runtime
    print("Runtime --- %s ms ---" % final_time)

    return eps, output






In [7]:

# Disabled runs on the other datasets:
#   eps, output = sample_freq_items(2000, 3401830, 0.1, 0.8, "./data/accidents.txt.gz")
#   eps, output = sample_freq_items(2000, 2970006, 0.1, 0.8, "./data/kosarak.txt.gz")

# Sample 2000 transactions of webdocs (dsize 3384164) with delta = 0.1 and
# threshold 0.8, then show the error bound followed by the reported items.
eps, output = sample_freq_items(
    ssize=2000,
    dsize=3384164,
    delta=0.1,
    thres=0.8,
    data="./data/webdocs.txt.gz",
)
print(eps, output, sep="\n")

Runtime --- 13660.104 ms ---
0.12370361794625914
[(122, 0.84), (8, 0.774)]
