### Import

In [None]:
import time
import altair as alt
from copy import copy

%run implementations.ipynb
%run utilities.ipynb

### Load Data

In [None]:
# Load the npm package statistics into `sizes`. The CSV is read with
# header=None, so its columns are addressed by integer label (0, 1, 2, ...).
# Later cells read column 0 as an object identifier, column 1 as the object
# size and column 2 as a count that is summed into `total_downloads`.
sizes = pd.read_csv('npmpackages/npm_no_scope_full_stats_nonzero_downloads.csv', header=None)
# Last expression of the cell: shows the first rows for a quick sanity check.
sizes.head()

In [None]:
# Load the Unsplash "lite" data set into `sizes` (again without a header row,
# so columns are addressed by integer label).
# NOTE(review): this re-binds `sizes`; if the npm cell was run before, its data
# is replaced -- presumably only one of the two load cells is meant to be run.
sizes = pd.read_csv('unsplash/lite/unsplash_lite.csv', header=None)
# Last expression of the cell: shows the first rows for a quick sanity check.
sizes.head()

### Runtimes

In [None]:
# Benchmark the four methods for every value in `c_list` and collect
# (method, c, seconds) tuples in `runtimes`.
#
# Timing uses time.perf_counter(): it is monotonic and has the highest
# available resolution, whereas time.time() is wall-clock time that can jump
# when the system clock is adjusted.
size_list = sizes[1].tolist()

# Number of rows per distinct value of column 1.
size_counts = defaultdict(int)

for size in size_list:
    size_counts[size] += 1

uniq_sizes = sorted(size_counts)

c_list = [1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.09, 1.1]

# Number of runs averaged for each of the three repeated measurements; the
# averaging below divides by this value, so the two can no longer drift apart.
n_repeats = 10

runtimes = []

for c in c_list:
    p_alpaca_no_metrics_time = 0.
    min_mi_per_obj_time = 0.
    pure_maxl_time = 0.

    # --- P-ALPaCA ---------------------------------------------------------
    for _rep in range(n_repeats):
        start = time.perf_counter()
        p_alpaca_results = p_alpaca_no_metrics(sizes, c)
        stop = time.perf_counter()
        p_alpaca_no_metrics_time += stop - start

    p_alpaca_no_metrics_time /= n_repeats

    # --- MI per object ----------------------------------------------------
    # The timed region covers both the partition computation and the
    # construction of the size -> ceiling map derived from it.
    for _rep in range(n_repeats):
        start = time.perf_counter()

        partition, dyn_solution = opt_partition_MI(sizes, c)

        size_map = {}
        current_block = 0
        # Element [1] of each partition entry is used as that block's ceiling.
        current_ceiling = partition[current_block][1]

        # uniq_sizes is ascending, so moving on to the next block whenever a
        # size exceeds the current ceiling walks the partition in order.
        for size in uniq_sizes:
            if size > current_ceiling:
                current_block += 1
                current_ceiling = partition[current_block][1]
            size_map[size] = current_ceiling

        stop = time.perf_counter()

        min_mi_per_obj_time += stop - start

    min_mi_per_obj_time /= n_repeats

    # --- Pure MaxL --------------------------------------------------------
    for _rep in range(n_repeats):
        start = time.perf_counter()

        size_map = pureMaxL(sizes, c)

        stop = time.perf_counter()

        pure_maxl_time += stop - start

    pure_maxl_time /= n_repeats

    # --- MI per request ---------------------------------------------------
    # Measured once only (not averaged), unlike the three methods above.
    start = time.perf_counter()
    min_mi_per_req_results = BlahutArimoto(sizes, c, max_itr=10000000, eps=5e-3)
    stop = time.perf_counter()

    min_mi_per_req_time = stop - start

    runtimes.append(("P-ALPaCA", c, p_alpaca_no_metrics_time))
    runtimes.append(("MI_per_object", c, min_mi_per_obj_time))
    runtimes.append(("Pure_MaxL", c, pure_maxl_time))
    runtimes.append(("MI_per_request", c, min_mi_per_req_time))

    # Progress indicator: one line per finished c.
    print(c)

In [None]:
# Tabulate the (method, c, runtime) tuples collected in `runtimes`.
# The columns match the three-element tuples appended by the benchmark cell
# that records a single averaged runtime per method.
df_runtimes = pd.DataFrame(runtimes, columns =['Method', 'c', 'Runtime'])

In [None]:
import statistics


def _mean_and_rel_std_dev(samples):
    """Return ``(mean, relative standard deviation in percent)`` of *samples*.

    The relative standard deviation is the population standard deviation
    (``statistics.pstdev``) divided by the mean, multiplied by 100.
    """
    mean = statistics.mean(samples)
    return mean, 100 * (statistics.pstdev(samples) / mean)


# Benchmark the four methods for every value in `c_list` and collect
# (method, c, mean seconds, relative std dev in %) tuples in `runtimes`.
#
# Timing uses time.perf_counter(): it is monotonic and has the highest
# available resolution, whereas time.time() is wall-clock time that can jump
# when the system clock is adjusted.
size_list = sizes[1].tolist()

# Number of rows per distinct value of column 1.
size_counts = defaultdict(int)

for size in size_list:
    size_counts[size] += 1

uniq_sizes = sorted(size_counts)

c_list = [1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.09, 1.1]

# Number of timing samples taken per method and per c.
n_repeats = 10

runtimes = []

for c in c_list:

    # --- P-ALPaCA ---------------------------------------------------------
    time_samples = []

    for _rep in range(n_repeats):
        start = time.perf_counter()
        p_alpaca_results = p_alpaca_no_metrics(sizes, c)
        stop = time.perf_counter()
        time_samples.append(stop - start)

    p_alpaca_no_metrics_time, p_alpaca_no_metrics_rel_std_dev = _mean_and_rel_std_dev(time_samples)

    # --- MI per object ----------------------------------------------------
    # The timed region covers both the partition computation and the
    # construction of the size -> ceiling map derived from it.
    time_samples = []

    for _rep in range(n_repeats):
        start = time.perf_counter()

        partition, dyn_solution = opt_partition_MI(sizes, c)

        size_map = {}
        current_block = 0
        # Element [1] of each partition entry is used as that block's ceiling.
        current_ceiling = partition[current_block][1]

        # uniq_sizes is ascending, so moving on to the next block whenever a
        # size exceeds the current ceiling walks the partition in order.
        for size in uniq_sizes:
            if size > current_ceiling:
                current_block += 1
                current_ceiling = partition[current_block][1]
            size_map[size] = current_ceiling

        stop = time.perf_counter()

        time_samples.append(stop - start)

    min_mi_per_obj_time, min_mi_per_obj_rel_std_dev = _mean_and_rel_std_dev(time_samples)

    # --- Pure MaxL --------------------------------------------------------
    time_samples = []

    for _rep in range(n_repeats):
        start = time.perf_counter()

        size_map = pureMaxL(sizes, c)

        stop = time.perf_counter()

        time_samples.append(stop - start)

    pure_maxl_time, pure_maxl_rel_std_dev = _mean_and_rel_std_dev(time_samples)

    # --- MI per request ---------------------------------------------------
    time_samples = []

    for _rep in range(n_repeats):
        start = time.perf_counter()
        min_mi_per_req_results = BlahutArimoto(sizes, c, max_itr=10000000, eps=5e-4)
        stop = time.perf_counter()

        time_samples.append(stop - start)
    # Report element [2] of the last BlahutArimoto result next to c.
    print(str(c) + '\t' + str(min_mi_per_req_results[2]))
    min_mi_per_req_time, min_mi_per_req_rel_std_dev = _mean_and_rel_std_dev(time_samples)

    runtimes.append(("P-ALPaCA", c, p_alpaca_no_metrics_time, p_alpaca_no_metrics_rel_std_dev))
    runtimes.append(("MI_per_object", c, min_mi_per_obj_time, min_mi_per_obj_rel_std_dev))
    runtimes.append(("Pure_MaxL", c, pure_maxl_time, pure_maxl_rel_std_dev))
    runtimes.append(("MI_per_request", c, min_mi_per_req_time, min_mi_per_req_rel_std_dev))

    # Progress indicator: one line per finished c.
    print(c)

In [None]:
# Tabulate the (method, c, mean runtime, relative std dev) tuples collected in
# `runtimes` by the benchmark cell that keeps per-run timing samples.
df_runtimes = pd.DataFrame(runtimes, columns =['Method', 'c', 'Avg Runtime', 'Rel Std Dev'])

In [None]:
# Line chart of 'Avg Runtime' over c, one line per method; the method is
# encoded redundantly as colour and as dash pattern.
# Last expression of the cell, so the notebook renders the chart.
alt.Chart(df_runtimes).mark_line().encode(
    x=alt.X('c', scale=alt.Scale(domain=(1.01, 1.09))),
    y=alt.Y('Avg Runtime', scale=alt.Scale(domain=(0, 3.5))),
    color='Method',
    strokeDash='Method',
)

In [None]:
# Keep every method except MI_per_request (presumably so the remaining
# methods can be plotted on a finer y-scale -- confirm).
#
# `df_runtimes` stores the timings either under 'Runtime' (single averaged
# value) or under 'Avg Runtime' (mean plus relative std dev), depending on
# which benchmark cell built it. Selecting 'Runtime' unconditionally raises
# KeyError for the latter, so accept both and always expose the column as
# 'Runtime', which is the name the chart consuming this frame expects.
runtime_col = 'Runtime' if 'Runtime' in df_runtimes.columns else 'Avg Runtime'
df_runtimes_temp = (
    df_runtimes
    .loc[df_runtimes['Method'] != 'MI_per_request', ['Method', 'c', runtime_col]]
    .rename(columns={runtime_col: 'Runtime'})
)

In [None]:
# Line chart of 'Runtime' over c for the filtered frame, one line per method;
# the method is encoded redundantly as colour and as dash pattern.
# Last expression of the cell, so the notebook renders the chart.
alt.Chart(df_runtimes_temp).mark_line().encode(
    x=alt.X('c', scale=alt.Scale(domain=(1.01, 1.09))),
    y=alt.Y('Runtime', scale=alt.Scale(domain=(0, .35))),
    color='Method',
    strokeDash='Method',
)

In [None]:
# Persist the runtime table so it can be re-loaded without re-running the
# benchmarks.
# NOTE(review): the file name mentions "unsplash" regardless of which data set
# was loaded into `sizes` -- adjust it when benchmarking the npm data.
df_runtimes.to_csv('evaluation/runtimes-unsplash_1-01_to_1-10.csv')

__Print__ $\LaTeX{}$

In [None]:
# Re-load the previously saved runtime table, replacing any in-memory
# `df_runtimes`, so the export below works without re-running the benchmarks.
df_runtimes = pd.read_csv('evaluation/runtimes-unsplash_1-01_to_1-10.csv')
# Last expression of the cell: shows the first rows for a quick sanity check.
df_runtimes.head()

In [None]:
# Plot-style option string for every method; the export cell prints each one
# directly after "\addplot".
legend = {
    "MI_per_request": "[line width=1pt, densely dotted, curvecolor]",
    "MI_per_object": "[line width=1pt, dash pattern=on 1pt off 3pt on 3pt off 3pt, curvecolor]",
    "P-ALPaCA": "[line width=1pt, solid, curvecolor]",
    "Pure_MaxL": "[line width=1.5pt, dash pattern=on 1pt off 3pt on 3pt off 3pt, black]",
}

# Methods in export order, i.e. the insertion order of the legend entries.
methods = list(legend)

In [None]:
# Print one "\addplot <options> table {% ... };" block per method, with one
# "<c> <avg runtime>" line for every row of that method.
for method in methods:
    is_method = df_runtimes['Method'] == method
    df_method_temp = df_runtimes.loc[is_method, ['c', 'Avg Runtime']]

    print("\\addplot " + legend[method])
    print("table {%")
    # Plain tuples without the index: (c, Avg Runtime) in column order.
    for c_value, avg_runtime in df_method_temp.itertuples(index=False, name=None):
        print(c_value, avg_runtime)
    print("};")

### Incremental Update

In [None]:
# Load the npm data set (no header row, integer column labels) and pick the
# objects to update: the ten rows with the largest value in column 2.
sizes = pd.read_csv(
    'npmpackages/npm_no_scope_full_stats_nonzero_downloads.csv',
    header=None,
)
# Sum of column 2; used later to turn per-object counts into probabilities.
total_downloads = sizes[2].sum()

sorted_sizes = sizes.sort_values(by=2, ascending=False)
# Column 0 holds the identifiers that the update loop looks rows up by.
update_names = sorted_sizes[0].head(10).tolist()
print(update_names)

In [None]:
# Load the Unsplash "lite" data set (no header row, integer column labels) and
# pick the objects to update: the ten rows with the largest value in column 2.
sizes = pd.read_csv(
    'unsplash/lite/unsplash_lite.csv',
    header=None,
)
# Sum of column 2; used later to turn per-object counts into probabilities.
total_downloads = sizes[2].sum()

sorted_sizes = sizes.sort_values(by=2, ascending=False)
# Column 0 holds the identifiers that the update loop looks rows up by.
update_names = sorted_sizes[0].head(10).tolist()
print(update_names)

In [None]:
# Parameters shared with the update loop: the factor `c` passed to the solver
# and the factor by which an updated object's size grows.
c = 1.1
growth = 1.25

# Time the initial solve. time.perf_counter() is monotonic and has the highest
# available resolution, whereas time.time() is wall-clock time that can jump
# when the system clock is adjusted.
start = time.perf_counter()
# The returned state is kept in the notebook namespace; the update loop reads
# i_max, cndl_idxs, p_ji, s, orig_p_j_dict and size_to_i from it.
(i_max, cndl_idxs, p_ji, p_i, s,
 orig_p_j_dict, size_to_i, this_MI, max_L) = BlahutArimoto_init(sizes, c, max_itr=10000000, eps=5e-3)
stop = time.perf_counter()
print(str(stop - start))
print(this_MI)

In [None]:
# For each selected object, simulate its size growing by `growth`, patch a
# copy of the initial output distribution accordingly, and time how long the
# incremental solver needs starting from that patched distribution.
# `sizes` is modified only for the duration of the solver call and is always
# restored afterwards, so the iterations are independent of each other.
for o_id in update_names:
    # Work on a copy so every object starts from the initial distribution.
    p_j_dict = copy(orig_p_j_dict)

    # perf_counter is monotonic and high-resolution; the timed region covers
    # the bookkeeping below as well as the incremental solve.
    start = time.perf_counter()

    # First row whose column 0 equals the object identifier.
    o_index = sizes.index[sizes[0].values == o_id][0]
    o_old_size = sizes.at[o_index, 1]
    o_old_prob = sizes.at[o_index, 2] / total_downloads

    o_new_size = math.ceil(o_old_size * growth)

    # Candidate range of the old size: indices this_i .. i_max[this_i] into
    # `s`, with the matching probabilities stored contiguously in `p_ji`
    # starting at cndl_idxs[this_i].
    this_i = size_to_i[o_old_size]
    i_num_cndls = i_max[this_i] - this_i + 1
    i_idx = cndl_idxs[this_i]

    # Remove this object's contribution from every candidate size.
    for offset in range(i_num_cndls):
        j_size = s[this_i + offset]
        p_j_dict[j_size] -= p_ji[i_idx + offset] * o_old_prob

    # Feasible sizes for the grown object: the new size itself plus every
    # known size in [o_new_size, c * o_new_size].
    feasible_sizes = {o_new_size}
    for size in sorted(p_j_dict.keys()):
        if size < o_new_size:
            continue
        if c * o_new_size < size:
            # Sizes are ascending, so nothing further can be feasible.
            break
        feasible_sizes.add(size)

    # Spread the object's probability evenly over the feasible sizes.
    equal_share = o_old_prob / len(feasible_sizes)

    for f_size in feasible_sizes:
        # .get() covers the case where o_new_size is not yet a key of a
        # plain dict; for a defaultdict the result is unchanged.
        p_j_dict[f_size] = p_j_dict.get(f_size, 0.) + equal_share

    sizes.at[o_index, 1] = o_new_size
    try:
        # The solver's return value is not needed; only its runtime is.
        BlahutArimoto_incr(sizes, c, p_j_dict, max_itr=10000000, eps=5e-3)

        stop = time.perf_counter()
        print(stop - start)
    finally:
        # Undo the temporary size change even if the solver raises, so
        # `sizes` is never left in a modified state.
        sizes.at[o_index, 1] = o_old_size