In [None]:
# IMPORTS
import gzip
import random
import math
import time
import matplotlib.pyplot as plt
from statistics import mean
from random import seed
from operator import itemgetter


In [None]:
# EXACT ALGORITHM
def exact_freq_items(thres, data):
    """Compute the exact frequency of every item in a transaction file.

    Each line of the file is treated as one transaction made of
    whitespace-separated integer item ids. The frequency of an item is the
    number of times it occurs in the file divided by the number of lines
    read. Blank lines count as (empty) transactions, and an item repeated
    within one line is counted once per occurrence.

    Args:
        thres: minimum frequency (inclusive) an item needs in order to be
            reported. Frequencies are never negative, so 0 reports all items.
        data: path of the text file to read.

    Returns:
        A tuple ``(output, final_time, dsize)``:
          * ``output`` - list of ``(item, frequency)`` tuples ordered by
            frequency descending, ties broken by item id ascending;
          * ``final_time`` - elapsed time in milliseconds, rounded to 3
            decimals;
          * ``dsize`` - number of lines (transactions) read.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a reported item id cannot be parsed as an integer.
    """
    # perf_counter() is monotonic and high resolution; time.time() is a
    # wall clock that can jump and would distort a short measurement.
    start_time = time.perf_counter()

    # support[item] -> number of occurrences; dsize -> number of transactions
    support = {}
    dsize = 0

    with open(data, 'rt') as reader:
        for transaction in reader:
            dsize += 1
            # split() with no argument also discards the trailing newline
            for element in transaction.split():
                support[element] = support.get(element, 0) + 1

    # If the file is empty, support is empty too, so dsize is never used as
    # a divisor while it is 0.
    output = [
        (int(element), count / dsize)
        for element, count in support.items()
        if count / dsize >= thres
    ]

    # Frequency descending, then item id ascending. A single composite key
    # yields the same order as sorting by item and then stably by frequency.
    output.sort(key=lambda pair: (-pair[1], pair[0]))

    # elapsed time in ms, rounded to 3 decimals
    final_time = round(1000 * (time.perf_counter() - start_time), 3)

    return output, final_time, dsize


In [None]:
# Exact pass over the webdocs dataset (dataset id 1).
# The threshold is 0, so every item is kept. The item frequencies, the
# runtime and the dataset size are stored for the cells below.

output1, runtime1, dsize1 = exact_freq_items(thres=0, data="./data/webdocs.txt")


In [None]:
# Exact pass over the kosarak dataset (dataset id 2).
# The threshold is 0, so every item is kept. The item frequencies, the
# runtime and the dataset size are stored for the cells below.

output2, runtime2, dsize2 = exact_freq_items(thres=0, data="./data/kosarak.txt")

In [None]:
# Exact pass over the accidents dataset (dataset id 3).
# The threshold is 0, so every item is kept. The item frequencies, the
# runtime and the dataset size are stored for the cells below.

output3, runtime3, dsize3 = exact_freq_items(thres=0, data="./data/accidents.txt")

In [None]:
# Plot results
# For each dataset, draw the exact item frequencies (x) against the
# normalised rank of the item in the frequency-descending list (y).


def rank_fraction_curve(output):
    """Turn a frequency-sorted item list into plot coordinates.

    Args:
        output: list of ``(item, frequency)`` tuples, as returned by
            ``exact_freq_items`` (sorted by frequency, descending).

    Returns:
        ``(xs, ys)`` where ``xs[k]`` is the frequency of the k-th item and
        ``ys[k]`` is ``(k + 1) / len(output)``, i.e. the share of items
        ranked at or above position k. Both lists are empty for an empty
        input, so no division by zero can occur.
    """
    size = len(output)
    xs = [freq for _, freq in output]
    ys = [rank / size for rank in range(1, size + 1)]
    return xs, ys


size1, size2, size3 = len(output1), len(output2), len(output3)

x1, y1 = rank_fraction_curve(output1)
x2, y2 = rank_fraction_curve(output2)
x3, y3 = rank_fraction_curve(output3)

plt.plot(x1,y1, label="Webdocs", linestyle="-")
plt.plot(x2,y2, label="Kosarak", linestyle="--")
plt.plot(x3,y3, label="accidents", linestyle="-.")
# give plot a title and axis labels so the figure can be read on its own
plt.title("Distribution of exact item frequencies")
plt.xlabel("Item frequency")
plt.ylabel("Fraction of items with frequency >= x")
plt.yscale("log")
plt.legend()
plt.show()

In [None]:
# Sampling algorithm

def sample_freq_items(ssize, dsize, delta, thres, data, rng_seed=1998):
    """Estimate item frequencies from a random sample of transactions.

    ``ssize`` distinct line indices are drawn without replacement from
    ``range(dsize)``; the lines of ``data`` at those indices are read and
    item occurrences are counted. An item is reported when its sample
    frequency is at least ``thres - eps / 2``, where

        eps = sqrt(2 * (ds + ln(1 / delta)) / ssize)

    and ``ds`` is ``floor(log2(max_t)) + 1`` for the longest sampled
    transaction length ``max_t`` (0 if every sampled transaction is empty).

    Args:
        ssize: number of transactions to sample; must be at least 1 and at
            most ``dsize``.
        dsize: number of transactions (lines) in ``data``. If the file has
            fewer lines than this, indices past the end are never read while
            the frequencies are still divided by ``ssize``.
        delta: value used in the ``ln(1 / delta)`` term of ``eps``
            (presumably the allowed failure probability - confirm against
            the reference the formula was taken from).
        thres: frequency threshold; see above for how it is relaxed.
        data: path of the text file holding one transaction per line.
        rng_seed: seed handed to ``random.seed`` before sampling. The
            default reproduces the fixed sample this function always drew;
            pass ``None`` to seed from system entropy.

    Returns:
        A tuple ``(runtime, eps, output)``: elapsed milliseconds rounded to
        3 decimals, the ``eps`` value above, and a list of
        ``(item, sample_frequency)`` tuples ordered by frequency descending,
        ties broken by item id ascending.

    Raises:
        ValueError: if ``ssize`` is smaller than 1 or larger than ``dsize``,
            or if a reported item id is not an integer.
        OSError: if the file cannot be opened.
    """
    if ssize < 1:
        raise ValueError("ssize must be at least 1")

    seed(rng_seed)

    # perf_counter() is monotonic, unlike the time.time() wall clock
    start_time = time.perf_counter()

    # sorted line indices to pick up during a single pass over the file
    samples = sorted(random.sample(range(dsize), k=ssize))

    support = {}
    max_t = 0
    # position in `samples` of the next index to read; walking with an index
    # avoids list.pop(0), which is linear per call
    pos = 0
    with open(data, 'rt') as reader:
        for current, transaction in enumerate(reader):
            if current != samples[pos]:
                continue

            elements = transaction.split()
            max_t = max(max_t, len(elements))
            for element in elements:
                support[element] = support.get(element, 0) + 1

            # Stop only after the last sampled line has been counted.
            # Stopping when the last index is fetched would drop one
            # transaction from every sample.
            pos += 1
            if pos == ssize:
                break

    # For max_t >= 1, bit_length() equals floor(log2(max_t)) + 1 exactly.
    # It is 0 for max_t == 0, where math.log would raise.
    ds = max_t.bit_length()
    eps = math.sqrt(2 * (ds + math.log(1 / delta)) / ssize)

    cutoff = thres - eps / 2
    output = [
        (int(element), count / ssize)
        for element, count in support.items()
        if count / ssize >= cutoff
    ]

    # Frequency descending, then item id ascending. A single composite key
    # yields the same order as sorting by item and then stably by frequency.
    output.sort(key=lambda pair: (-pair[1], pair[0]))

    # elapsed time in ms, rounded to 3 decimals
    runtime = round(1000 * (time.perf_counter() - start_time), 3)

    return runtime, eps, output






In [None]:
#dsize1 = 3384164
# Sampling run on webdocs: 2000 sampled transactions, delta 0.1 and
# threshold 0.5. dsize1 is the dataset size returned by the exact run.
runtime, eps, output = sample_freq_items(
    ssize=2000,
    dsize=dsize1,
    delta=0.1,
    thres=0.5,
    data="./data/webdocs.txt",
)
print(eps)
print(output)

In [None]:
#dsize2 = 2970006
# Sampling run on kosarak: 2000 sampled transactions, delta 0.1 and
# threshold 0.3. dsize2 is the dataset size returned by the exact run.
runtime, eps, output = sample_freq_items(
    ssize=2000,
    dsize=dsize2,
    delta=0.1,
    thres=0.3,
    data="./data/kosarak.txt",
)
print(eps)
print(output)

In [None]:
#dsize3 = 3401830

# Experiment on the accidents dataset: for several sample sizes, run the
# sampling algorithm a few times and compare its runtime and its frequency
# estimates with the exact algorithm.
# NOTE(review): if sample_freq_items uses a fixed seed, the repeated runs
# draw the same sample, so only the runtimes differ between runs.

thres = 0.8
ssizes = [100,1000,10000,100000]
n_runs = 5
accidents_path = "./data/accidents.txt"

max_runs = []
min_runs = []
avg_runs = []
max_eps_runs = []

# sample_outputs[i][j] is the (item, frequency) list of run j at ssizes[i]
sample_outputs = []

for ssize in ssizes:
    runtimes = []
    epss = []
    outputs_for_size = []

    for run in range(n_runs):
        runtime,eps,output = sample_freq_items(ssize, dsize3, 0.1, thres, accidents_path)
        runtimes.append(runtime)
        epss.append(eps)
        outputs_for_size.append(output)

    max_runs.append(max(runtimes))
    min_runs.append(min(runtimes))
    avg_runs.append(mean(runtimes))
    max_eps_runs.append(max(epss))
    sample_outputs.append(outputs_for_size)

#exact runs
output, runtime_eps, dsize = exact_freq_items(thres, accidents_path)
output_eps2, runtime_eps2, dsize_eps2 = exact_freq_items(thres-max_eps_runs[0], accidents_path)

# Exact frequency of each item, looked up by item id. The sampled list and
# the exact list are each ordered by their own frequencies, so comparing
# them position by position would pair up unrelated items.
exact_freq = dict(output_eps2)
needs_full_lookup = any(
    item not in exact_freq
    for outputs_for_size in sample_outputs
    for run_output in outputs_for_size
    for item, _freq in run_output
)
if needs_full_lookup:
    # A sampled item fell below the relaxed exact threshold; fetch the exact
    # frequency of every item so its error can still be measured.
    exact_all, _, _ = exact_freq_items(0, accidents_path)
    exact_freq = dict(exact_all)

# per sample size: max / min / mean over runs of the per-run max / min / mean
# absolute difference between sampled and exact frequency
max_err_runs = []
min_err_runs = []
avg_err_runs = []

for outputs_for_size in sample_outputs:
    run_max = []
    run_min = []
    run_avg = []
    for run_output in outputs_for_size:
        errors = [abs(freq - exact_freq[item]) for item, freq in run_output]
        run_max.append(max(errors))
        run_min.append(min(errors))
        run_avg.append(mean(errors))
    max_err_runs.append(max(run_max))
    min_err_runs.append(min(run_min))
    avg_err_runs.append(mean(run_avg))

# Plot the lines for the runtimes of the sampling algorithm
plt.plot(ssizes, max_runs, label="max runtime", linestyle=":")
plt.plot(ssizes, min_runs, label="min runtime", linestyle="--")
plt.plot(ssizes, avg_runs, label="avg runtime", linestyle="-.")
# plot a horizontal line of exact runtime across the whole x range
plt.plot([ssizes[0],ssizes[-1]], [runtime_eps, runtime_eps], label="exact", linestyle="-")
# give plot a title and axis labels
plt.title("Runtime vs Sample Size")
plt.xlabel("Sample size")
plt.ylabel("Runtime (ms)")
# give both axes a log scale
plt.xscale("log")
plt.yscale("log")
# show legend and plot
plt.legend()
plt.show()

# plot the max, min, and avg error for each sample size
plt.plot(ssizes, max_err_runs, label="max error", linestyle=":")
plt.plot(ssizes, min_err_runs, label="min error", linestyle="--")
plt.plot(ssizes, avg_err_runs, label="avg error", linestyle="-.")
# divide max eps by 2 for all sample sizes and plot them as a line
max_eps_runs2 = [x/2 for x in max_eps_runs]
# plot the halved values so the curve matches its "max eps/2" label
plt.plot(ssizes, max_eps_runs2, label="max eps/2", linestyle="-")
# give plot a title and axis labels
plt.title("Error vs epsilon/2")
plt.xlabel("Sample size")
plt.ylabel("Error")
# give both axes a log scale
plt.xscale("log")
plt.yscale("log")
# show legend and plot
plt.legend()
plt.show()
