In [None]:
import pickle
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

In [None]:
# Hash identifiers of the functions under study. They are passed to `load_df`
# as `targets` (matched against the `HashFunction` column) and iterated in
# sorted order by `scale_dfs`.
keys = [
    "12f4b1fc58f4a13b0e5bc8d854c0b9cf9abd7422082631bfec2787ebb7ffb928",
    "33c62168d74030abe560c3ad9d3281a040db4ff5eb2006e423afb99e129c6ee9",
    "e3207d8e111207a6411ab78f0523b4b0f7a20689410cfdfa9fb7c1216ee84841",
    "c531d4a648804161bb266dc5e338e80ed8287f50ae1443cfa3d065b348970163",
    "e0bcfea24c34654ffc13a9bd43681b6e1b89a608fcc5b014a27e3572621fc5d3",
    "a4cd330134a6d4973a5cbbd2ba5e77ddf52f31a920a853ee4ddd80ecf0c6edca",
]

In [None]:
def load_df(dir_path, day, targets=None):
    """Load one day's per-function invocation CSV.

    Args:
        dir_path: Directory containing the CSV files. May be a `pathlib.Path`
            or anything `Path()` accepts (e.g. a plain string).
        day: Integer day number; formatted as two zero-padded digits in the
            file name (`invocations_per_function_md.anon.dNN.csv`).
        targets: Optional list-like of `HashFunction` values to keep. When
            None, the full table is returned untouched.

    Returns:
        The full DataFrame when `targets` is None; otherwise only the rows
        whose `HashFunction` is in `targets`, sorted by `HashFunction` and
        re-indexed from 0.
    """
    invocation_fpattern = "invocations_per_function_md.anon.d%02d.csv"
    # Coerce to Path so string directories work as well as Path objects.
    csv_path = Path(dir_path) / (invocation_fpattern % day)
    df = pd.read_csv(csv_path)
    if targets is None:
        return df
    selected = df[df.HashFunction.isin(targets)]
    # Sorting gives every day the same row order for the same set of functions.
    return selected.sort_values("HashFunction").reset_index(drop=True)

# Directory holding the per-day invocation CSV files.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
target_dir = Path("/srv/local/bj2/azure_2019")

# Load days 8 through 12 (the range end is exclusive), keeping only the rows
# whose HashFunction is listed in `keys`. Maps day number -> filtered DataFrame.
dfs = {}
for i in range(8, 13):
    dfs[i] = load_df(target_dir, i, targets=keys)

In [None]:
def parse_df(df, day):
    """Reshape a wide per-minute table into one row per function.

    Args:
        df: DataFrame with a `HashFunction` column and columns named
            "1" through "1440" holding integer-convertible counts.
        day: Value stored in the `day` column of every output row.

    Returns:
        DataFrame with columns `hash_func`, `day` and `counts`, where each
        `counts` cell is an int32 `pd.Series` of that row's 1440 values.
    """
    minute_cols = [str(minute) for minute in range(1, 1441)]

    hash_funcs = []
    per_row_counts = []
    for _, row in df.iterrows():
        hash_funcs.append(row["HashFunction"])
        minute_values = np.array([row[col] for col in minute_cols], dtype=np.int32)
        per_row_counts.append(pd.Series(minute_values, dtype=np.int32))

    return pd.DataFrame(
        {
            "hash_func": hash_funcs,
            "day": np.repeat(day, len(hash_funcs)),
            "counts": per_row_counts,
        }
    )

# Convert every loaded day into the (hash_func, day, counts) layout, keyed by day.
parsed_dfs = {day: parse_df(df, day) for day, df in dfs.items()}

In [None]:
def scale1d(count, target=50, limit_only=False):
    """Rescale a 1-D count trace so that its peak equals `target`.

    Args:
        count: numpy array or pandas Series of counts. Must support `len`,
            element-wise arithmetic and `.astype`.
        target: Desired peak value after scaling.
        limit_only: When True, a trace whose peak does not exceed `target`
            is returned unchanged (the very same object); only larger traces
            are scaled down. When False, every trace is rescaled.

    Returns:
        The rescaled counts rounded to the nearest integer as int32, or
        `count` itself when `limit_only` is True and no scaling is needed.
        A trace whose peak is zero yields int32 zeros instead of the
        NaN-derived values a 0/0 division would produce. An empty input is
        returned as-is (cast to int32 when `limit_only` is False).
    """
    if len(count) == 0:
        # np.max raises on empty input; there is nothing to scale anyway.
        return count if limit_only else count.astype(np.int32)

    peak = np.max(count)
    if limit_only and not peak > target:
        return count

    if peak == 0:
        # Dividing by a zero peak gives NaN/inf, which cast to meaningless
        # int32 values; for a (non-negative) count trace this means all zeros.
        return (count * 0).astype(np.int32)

    # Normalize to [0, 1] and remap to [0, target].
    return np.round(count / peak * target).astype(np.int32)

def scale2d(counts2d, target=50):
    """Apply `scale1d` to every row of `counts2d` and stack the results.

    Each row is passed as `scale1d(row, target)`, i.e. with `limit_only`
    left at its default, and the per-row results are combined with
    `np.stack`.

    NOTE(review): because `limit_only` is not forwarded, rows whose peak is
    already below `target` are scaled up as well — confirm this is intended.
    """
    return np.stack([scale1d(row, target) for row in counts2d])

def scale_df(df, target=50):
    """Return a copy of `df` whose `counts` entries went through `scale1d`.

    Every cell of the `counts` column is passed as `scale1d(cell, target)`
    (with `limit_only` left at its default) and wrapped in a new
    `pd.Series`. The input frame itself is not modified.

    NOTE(review): because `limit_only` is not forwarded, traces whose peak is
    already below `target` are scaled up as well — confirm this is intended.
    """
    result = df.copy()
    rescaled = [
        pd.Series(scale1d(result.loc[label, "counts"], target))
        for label in result.index
    ]
    result.counts = rescaled
    return result

In [None]:
def scale1d_avg(count, target):
    """Rescale `count` so that its mean equals `target`.

    Args:
        count: numpy array or pandas Series of counts.
        target: Desired mean of the rescaled trace.

    Returns:
        `count` multiplied by `target / mean(count)` (floating point, not
        rounded). A trace whose mean is zero cannot be scaled to a non-zero
        mean; float zeros are returned for it instead of NaN values.
    """
    avg = np.mean(count)
    if avg == 0:
        # target / 0 is inf and 0 * inf is NaN; counts are presumably
        # non-negative, so a zero mean means an all-zero trace.
        return count * 0.0
    factor = target / avg
    return count * factor

In [None]:
import functools


def scale_dfs(dfs, scale_fn):
    """Scale each function's trace across all days with a single `scale_fn` call.

    For every hash in the module-level `keys` (taken in sorted order), the
    `counts` series of all days are concatenated in `dfs` iteration order,
    passed through `scale_fn` as one numpy array, and split back into
    equal-sized per-day pieces.

    Args:
        dfs: Mapping of day -> DataFrame with `hash_func` and `counts`
            columns. Row `i` of every frame must belong to the `i`-th sorted
            key; this is checked with an assertion.
        scale_fn: Callable taking a 1-D numpy array and returning an array of
            the same length.

    Returns:
        Dict with the same days as `dfs`, each holding a copy of the day's
        frame whose `counts` column contains the scaled series.
    """
    per_func_scaled = []
    for pos, expected_hash in enumerate(sorted(keys)):
        daily_counts = []
        for frame in dfs.values():
            row = frame.iloc[pos]
            assert row.hash_func == expected_hash
            daily_counts.append(row.counts)
        # Scale the whole multi-day trace at once so all days share one factor.
        joined = pd.concat(daily_counts).to_numpy()
        chunks = np.split(scale_fn(joined), len(dfs))
        per_func_scaled.append([pd.Series(chunk) for chunk in chunks])

    # Transpose from per-function lists of days to per-day tuples of functions.
    per_day_scaled = zip(*per_func_scaled)

    scaled_dfs = {}
    for (day, frame), day_counts in zip(dfs.items(), per_day_scaled):
        scaled = frame.copy()
        scaled.counts = day_counts
        scaled_dfs[day] = scaled
    return scaled_dfs

# Scaling applied to each function's multi-day trace: with limit_only=True a
# trace whose peak is at most 50 is left untouched, and any other trace is
# rescaled so that its peak becomes 50.
scale_fn = functools.partial(scale1d, target=50, limit_only=True)
# Alternative: rescale each trace to a fixed mean instead of capping its peak.
# scale_fn = functools.partial(scale1d_avg, target=210)
scaled_dfs = scale_dfs(parsed_dfs, scale_fn)

In [None]:
def generate_count_dict(dfs):
    """Build one continuous series per function from per-day frames.

    Args:
        dfs: Mapping of day -> DataFrame with `hash_func` and `counts`
            columns, where each `counts` cell is a `pd.Series`.

    Returns:
        Dict mapping each hash to the concatenation of its `counts` series
        in ascending day order, re-indexed from 0.
    """
    pieces_by_func = defaultdict(list)
    for day in sorted(dfs):
        frame = dfs[day]
        for _, row in frame.iterrows():
            pieces_by_func[row.hash_func].append(row.counts)

    merged = {}
    for hash_func, pieces in pieces_by_func.items():
        merged[hash_func] = pd.concat(pieces).reset_index(drop=True)
    return merged

# One concatenated multi-day series per function, before and after scaling.
unscaled_count_dict = generate_count_dict(parsed_dfs)
scaled_count_dict = generate_count_dict(scaled_dfs)

In [None]:
# Mean count per function (first 5 hash characters shown) before scaling...
for k, count in unscaled_count_dict.items():
    print(k[:5], np.mean(count))

print()
# ...and after scaling.
for k, count in scaled_count_dict.items():
    print(k[:5], np.mean(count))

# Sum of the per-function means of the scaled traces.
print(np.sum([np.mean(count) for k, count in scaled_count_dict.items()]))

In [None]:
def plot(counts_dict):
    """Draw one wide line chart per function.

    Args:
        counts_dict: Mapping of hash -> sequence of counts. The legend label
            is the first 10 characters of the hash.

    A red vertical line is drawn at every multiple of 1440 that lies
    strictly inside the series.
    """
    step = 1440
    for hash_func, counts in counts_dict.items():
        n_points = len(counts)
        fig, ax = plt.subplots(figsize=(18, 4))
        ax.plot(counts, label=hash_func[:10])
        ax.set_xlim(0, n_points)
        boundary = step
        while boundary < n_points:
            ax.axvline(boundary, color="red")
            boundary += step
        ax.legend()

# Plot the original (unscaled) traces, one figure per function.
plot(unscaled_count_dict)

In [None]:
# Plot the scaled traces, one figure per function.
plot(scaled_count_dict)