# VM sizing for COS Lite

Before we start, we import some preliminary libraries.

In [1]:
%pip install -q ipywidgets

In [2]:
import numpy as np
from matplotlib import pyplot as plt
from scipy import integrate

from ipywidgets import interactive, fixed

We will also define the actual solver and plotting routine.

VM sizing depends on the ingestion rate. Currently, the model has only two independent variables: log lines per minute and metric datapoints per minute.

Expressed in matrix notation, it may look like this:

$$
\left(\begin{matrix}
\mathrm{cpu} \\
\mathrm{mem} \\
\mathrm{disk} 
\end{matrix}\right)
= \begin{bmatrix}
\mathrm{-cpu\ coeffs-} \\ 
\mathrm{-mem\ coeffs-} \\ 
\mathrm{-disk\ coeffs-} 
\end{bmatrix}
\cdot \left(\begin{matrix}
\mathrm{loglines/min} \\
\mathrm{datapoints/min}
\end{matrix}\right)
+ \left(\begin{matrix}
\mathrm{cos\ idle\ cpu} \\
\mathrm{cos,\ microk8s\ idle\ mem} \\
\mathrm{non\ cos\ disk\ fill\ rate}
\end{matrix}\right)
$$

In [12]:
from typing import Tuple
def total_estimation_from_per_pod(loglines_per_minute, datapoints_per_minute) -> Tuple[float, float, float]:
    """Estimate VM resource needs from the log and metric ingestion rates.

    Each resource is an affine function of the two ingestion rates:

        resource = a * (loglines/min) + b * (datapoints/min) + c

    which corresponds to the matrix form ``y = A x + b``:

        / cpu [vCPUs]   \\     /  -- cpu coeffs --  \\  / loglines/min   \\     /  idle cpu            \\
        | mem [GB]      |  =  |  -- mem coeffs --  |  | datapoints/min |  +  |  idle mem            |
        \\ disk [GB/day] /     \\  -- disk coeffs -- /  \\                /     \\  baseline fill rate  /

    The product is written out term by term instead of using ``A.dot(x)`` so
    that the inputs may also be arrays (for example the matrices produced by
    ``np.meshgrid``); the arithmetic is then applied elementwise.

    Args:
        loglines_per_minute: Log lines ingested per minute (scalar or array).
        datapoints_per_minute: Metric datapoints ingested per minute (scalar
            or array).

    Returns:
        A 3-tuple ``(cpu, mem_gb, storage_gb_per_day)``.
    """
    # Every row is (a, b, c) for one component. The rows are summed
    # column-wise to obtain the coefficients of the whole VM.
    cpu_rows = {  # in vCPUs
        "loki": (6.84e-6, 0, 0.483),  # contributes to cpu only via loglines
        "prom": (0, 1.08e-7, 0.173),  # contributes to cpu only via metric datapoints
        "grafana": (0, 0, 0.25),
        "traefik": (0, 0, 0.08),
        "host os": (0, 0, 0.1),  # microk8s, ...
    }
    mem_rows = {  # in GB
        "loki": (3.52e-6, 0, 2.07),
        "prom": (0, 1.47e-6, 0.25),
        "grafana": (0, 0, 0.2),
        "traefik": (0, 0, 0.2),
        "host os": (0, 0, 4),  # microk8s, ...
    }

    cpu_coeffs = np.array(list(cpu_rows.values())).sum(axis=0)
    mem_coeffs = np.array(list(mem_rows.values())).sum(axis=0)

    # Disk coefficients come from a fit. The constant term is 0 because the
    # fitted value was about 1e-12, which is effectively zero. The initial
    # system size (about 4 GB) drops out because this is a rate (GB/day).
    disk_coeffs = np.array([3.18e-4, 3.24e-6, 0])  # in GB/day

    def _affine(coeffs):
        # Evaluate a * loglines + b * datapoints + c for one resource.
        per_logline, per_datapoint, constant = coeffs
        return per_logline * loglines_per_minute + per_datapoint * datapoints_per_minute + constant

    return (_affine(cpu_coeffs), _affine(mem_coeffs), _affine(disk_coeffs))

In [51]:
def plot_total_estimation(num_datapoints: float = None, num_loglines: float = None):
    """Plot CPU, memory and disk contours over the ingestion-rate plane.

    The x axis covers 0 to 6e6 metric datapoints per minute and the y axis
    covers 0 to 360e3 log lines per minute. Three contour families, computed
    with ``total_estimation_from_per_pod``, are drawn on one set of axes:

    * cpu: solid red lines at levels 1 to 8,
    * mem: dashed blue lines at levels 2 to 16 in steps of 2,
    * disk: dash-dotted gray lines at levels 25 to 125 in steps of 25.

    Args:
        num_datapoints: Datapoints per minute of an operating point to mark.
        num_loglines: Log lines per minute of an operating point to mark.

    The operating point is marked with a red "x" and annotated with its
    estimated cpu, memory and disk figures only when both arguments are given.
    The figure is shown with ``plt.show()``; nothing is returned.
    """
    # np.linspace uses its default number of samples (50) on each axis.
    datapoints_axis = np.linspace(0, 6e6)  # "x", datapoints per minute
    loglines_axis = np.linspace(0, 360e3)  # "y", loglines per minute
    datapoints_mat, loglines_mat = np.meshgrid(datapoints_axis, loglines_axis)

    cpu_grid, mem_grid, disk_grid = total_estimation_from_per_pod(loglines_mat, datapoints_mat)

    fig, ax = plt.subplots()

    # (values, colour, line style, contour levels) for each resource.
    contour_specs = (
        (cpu_grid, "red", "-", [1, 2, 3, 4, 5, 6, 7, 8]),
        (mem_grid, "blue", "--", [2, 4, 6, 8, 10, 12, 14, 16]),
        (disk_grid, "gray", "-.", [25, 50, 75, 100, 125]),
    )

    # Draw every contour family first, then label them, in the same order.
    contour_sets = [
        ax.contour(
            datapoints_mat,
            loglines_mat,
            values,
            colors=colour,
            linestyles=line_style,
            linewidths=1,
            levels=levels,
        )
        for values, colour, line_style, levels in contour_specs
    ]
    for contour_set in contour_sets:
        ax.clabel(contour_set, inline=True, fontsize=9)

    ax.grid(linestyle="--")
    ax.set_title("VM size estimation from per-pod data")
    ax.set_xlabel("Metrics datapoints / min")
    ax.set_ylabel("Log lines / min")

    marker_missing = num_datapoints is None or num_loglines is None
    if not marker_missing:
        # Annotate the requested operating point with its scalar estimate.
        cpu, mem, disk = total_estimation_from_per_pod(num_loglines, num_datapoints)
        ax.plot(num_datapoints, num_loglines, 'rx')
        ax.text(num_datapoints, num_loglines, f"{cpu:.1f} cpu\n{mem:.1f} gb\n{disk:.1f} gb/day")

    plt.show()


In [50]:
# Interactive explorer: one slider per ingestion rate; the plot is redrawn
# whenever a slider moves.
# Each tuple is (min, max, step). An explicit step is given because the
# default float-slider step (0.1) is negligible on ranges this large.
# The slider ranges match the axes drawn by plot_total_estimation
# (0..6e6 datapoints/min, 0..360e3 loglines/min).
w = interactive(
    plot_total_estimation,
    num_datapoints=(0.0, 6e6, 1e4),
    num_loglines=(0.0, 360e3, 1e3),
)
w

interactive(children=(FloatSlider(value=3000000.0, description='num_datapoints', max=6000000.0), FloatSlider(v…