# Compaction Project

## Problem Formulation

Given a set of chunks whose sizes are

$$
S_n = \{d_1, \cdots, d_n\},
$$

where the positive integer $d_i \leq 2048$ for all $i = 1, \cdots, n$. Suppose the remaining operators need time 

$$
f(d_i) = C_1 + d_i \times C_2
$$

to process a data chunk with the size $d_i$. Our goal is to compact the set $S$, i.e., we need a transformation

$$
\mathcal{M}: S_n \rightarrow S'_m \triangleq \{d'_1,  \cdots, d'_m\},
$$

where $\sum_i^n d_i = \sum_j^m d'_j$ and $m$ is an arbitrary integer less than $n$, to minimize 

$$
\sum_j^m f(d'_j) + cost(M, S).
$$

where $cost(\mathcal{M}, S)$ is the cost of the transformation $\mathcal{M}$ on the set $S$. 

The cost of combining two or more chunks into one: $d_i + \cdots + d_j = d'_s \leq 2048$, is 

$$
g(d'_s) = C_3 + d'_s \times C_4.
$$

**Note:** This formulated problem is easier than the real compaction problem because we have the sizes of all data chunks in advance, rather than a chunk stream.

In [58]:
# utils
from termcolor import colored

def print_color(text, color='black'):
    print(colored(text, color))

## 1. Chunk Sizes Distribution

In [195]:
import numpy as np

# Generate random chunk sizes from a Gaussian distribution
def generate_chunk_sizes(n, mean=64, scale=256):
    return np.minimum(2048, np.maximum(1, np.random.normal(mean, scale, n))).astype(int)

## 2. Compaction Simulator

In [246]:
#             fixed cost      per tuple cost
# probe()     1.5             0.03
# next()      0.9             0.06
# --------------------------------------
# compact()   0.3             0.03
# --------------------------------------

k_pcs_fixed_cost = (1.5 + 0.9)
k_pcs_per_tuple_cost = (0.03 + 0.06)
k_cpt_fixed_cost = 0.3
k_cpt_per_tuple_cost = 0.03

def simulate_join(sizes, compact_func, chunk_factor=1, level=1):
    prc_cost = 0
    cpt_cost = 0
    next_sizes = np.array(sizes)

    for _ in range(level):
        # join
        prc_cost += k_pcs_fixed_cost * len(next_sizes) + np.sum(next_sizes) * k_pcs_per_tuple_cost
        next_sizes = np.repeat(np.array(next_sizes) // chunk_factor, chunk_factor)

        # compact
        next_sizes, cost = compact_func(next_sizes, chunk_factor, level)
        cpt_cost += cost

    return prc_cost, cpt_cost

## 3. Compaction Strategies

In [250]:
def compute_prs_cost(chunk_size, chunk_factor, level):
    cost = 0
    next_sizes = [chunk_size]

    for _ in range(level):
        cost += k_pcs_fixed_cost * len(next_sizes) + np.sum(next_sizes) * k_pcs_per_tuple_cost
        next_sizes = np.repeat(np.array(next_sizes) // chunk_factor, chunk_factor)

    return cost


# Compute the cost of a single compaction
def compute_cpt_cost(sizes_in_one_compaction):
    return k_cpt_fixed_cost + np.sum(sizes_in_one_compaction) * k_cpt_per_tuple_cost


# Strategy 1: Do not compact any chunks
def alg_no_compaction(chunk_sizes, chunk_factor, level):
    return chunk_sizes, 0


# Strategy 2: Fully compact all chunks
def alg_full_compaction(chunk_sizes, chunk_factor, level):
    transformed_sizes = []
    cpt_cost = 0
    cpt_sizes = []
    
    for size in chunk_sizes:
        if size == 2048: 
            transformed_sizes.append(size)
            continue

        if sum(cpt_sizes) + size <= 2048:
            cpt_sizes.append(size)
        else:
            cpt_cost += compute_cpt_cost(cpt_sizes)
            transformed_sizes.append(sum(cpt_sizes))
            cpt_sizes = [size]
    
    if cpt_sizes:
        cpt_cost += compute_cpt_cost(cpt_sizes)
        transformed_sizes.append(sum(cpt_sizes))

    return transformed_sizes, cpt_cost

### Optimal Algorithm

In [253]:
# Strategy 3: sort all chunks ascendingly and compact them in order, until compaction is not beneficial
def alg_sort_compaction(chunk_sizes, chunk_factor, level):
    sorted_sizes = sorted(chunk_sizes)
    transformed_sizes = []
    cpt_cost = 0
    cpt_sizes = []

    i = 0
    for i in range(len(sorted_sizes)):
        size = sorted_sizes[i]
        if sum(cpt_sizes) + size <= 2048:
            gain = compute_prs_cost(sum(cpt_sizes), chunk_factor, level) + compute_prs_cost(cpt_sizes, chunk_factor, level) - compute_prs_cost(sum(cpt_sizes) + size, chunk_factor, level)
            gain -= k_cpt_fixed_cost + size * k_cpt_per_tuple_cost
            if gain > 0:
                cpt_sizes.append(size)
            else:
                if len(cpt_sizes) > 1:
                    cpt_cost += compute_cpt_cost(cpt_sizes)
                transformed_sizes.append(sum(cpt_sizes))
                break
        else:
            cpt_cost += compute_cpt_cost(cpt_sizes)
            transformed_sizes.append(sum(cpt_sizes))
            cpt_sizes = [size]

    for j in range(i, len(sorted_sizes)):
        transformed_sizes.append(sorted_sizes[j])

    return transformed_sizes, cpt_cost

In [258]:
chunk_sizes = generate_chunk_sizes(n=int(1e7 / 2048), mean=2048, scale=0)
chunk_factor = 3
level = 3

grades = {
    "No Compaction": simulate_join(chunk_sizes, alg_no_compaction, chunk_factor, level), 
    "Full Compaction": simulate_join(chunk_sizes, alg_full_compaction, chunk_factor, level),
    "Sort Compaction": simulate_join(chunk_sizes, alg_sort_compaction, chunk_factor, level),
}

for grade in grades:
    prc_cost = grades[grade][0]/1e6
    cpt_cost = grades[grade][1]/1e6
    print_color(f"[{grade}]", 'green')
    print(f"\t Total Cost: {prc_cost + cpt_cost:.2f}\tCompute Cost: {prc_cost:.2f}s\t Compaction Cost: {cpt_cost:.2f}s")

[32m[No Compaction][0m
	 Total Cost: 2.85	Compute Cost: 2.85s	 Compaction Cost: 0.00s
[32m[Full Compaction][0m
	 Total Cost: 3.64	Compute Cost: 2.73s	 Compaction Cost: 0.90s
[32m[Sort Compaction][0m
	 Total Cost: 3.06	Compute Cost: 2.75s	 Compaction Cost: 0.30s
