# Time Analysis

We need to extract the training times and the metric computation times. This should be straightforward given the way Phil's code pulls the data.

In [13]:
import numpy as np
import pandas as pd
import utils
import wandb
from constants import DATASET_NAMES, MAX_SIZES

In [14]:
# Fetch the "train", "metrics" and "preproc" run tables from Weights & Biases
# for every dataset that has an entry in MAX_SIZES.
api = wandb.Api()
datasets = list(MAX_SIZES)
train_data = utils.get_data(api, "train", datasets)
metric_data = utils.get_data(api, "metrics", datasets)
preproc_data = utils.get_data(api, "preproc", datasets)

# Join the three tables on (dataset, model, sample count). merge() defaults to
# an inner join, so only combinations present in all three tables survive.
data = metric_data.merge(
    train_data, on=["dataset_name", "model_name", "n_samples"]
).merge(preproc_data, on=["dataset_name", "model_name", "n_samples"])

In [15]:
# Every LogME / renggli cell is a record with a "time" entry; replace the
# record by that timing so both columns become plain numbers.
data["LogME"] = [record["time"] for record in data["LogME"]]
data["renggli"] = [record["time"] for record in data["renggli"]]
In [16]:
# Metric cost = metric computation time + inference time.
# Training cost = training time + pre-processing time + evaluation time.
# NOTE: the columns are overwritten in place, so re-running this cell adds the
# extra times a second time.
data["LogME"] = data["LogME"].add(data["inference_times"])
data["renggli"] = data["renggli"].add(data["inference_times"])
data["train_runtime"] = (
    data["train_runtime"].add(data["preproc_times"]).add(data["eval_runtime"])
)

In [17]:
# One group per (dataset_name, n_samples, n_metric_samples) combination; each
# group is later condensed into a single row of the timing table.
group_data = data.groupby(
    by=["dataset_name", "n_samples", "n_metric_samples"],
)

In [19]:
def extract_timings(data, name):
    """Summarise the training and metric timings of one group of runs.

    Args:
        data: sub dataframe table, containing results for one combination of
                (dataset, n_samples, n_metric_samples); the "train_runtime",
                "LogME" and "renggli" columns are read
        name: labels for this set; name[0] is looked up in DATASET_NAMES,
                name[1] is reported as "n_samples" and name[2] as
                "n_metric_samples"

    Returns:
        A dict describing one table row: the rounded "min - max" ranges of
        the training and metric times (raw values divided by 60) and
        "metric_advantage", the rounded ratio of the mean training time to
        the mean metric time.
    """
    train_times = data["train_runtime"]
    # LogME and renggli are pooled: the range and the mean cover both metrics.
    pooled_metric_times = np.asarray(
        [data["LogME"].values, data["renggli"].values]
    )

    # Only the displayed ranges are divided by 60 (presumably seconds ->
    # minutes, TODO confirm); the means stay unscaled because they are only
    # used as a ratio.
    train_low = np.around(train_times.min() / 60)
    train_high = np.around(train_times.max() / 60)
    metric_low = np.around(np.min(pooled_metric_times) / 60)
    metric_high = np.around(np.max(pooled_metric_times) / 60)
    advantage = np.around(train_times.mean() / np.mean(pooled_metric_times))

    return {
        "dataset": DATASET_NAMES[name[0]],
        "n_samples": name[1],
        "train_time": f"{train_low} - {train_high}",
        "n_metric_samples": name[2],
        "metric_time": f"{metric_low} - {metric_high}",
        "metric_advantage": advantage,
    }


def get_timings(group_data):
    """Build the timing table, one row per group.

    Args:
        group_data: iterable of (name, sub dataframe) pairs, e.g. the result
                of a pandas groupby

    Returns:
        A DataFrame whose rows are the dicts produced by extract_timings.
    """
    rows = [extract_timings(frame, label) for label, frame in group_data]
    return pd.DataFrame(rows)

In [20]:
# Condense every (dataset_name, n_samples, n_metric_samples) group into one
# summary row of the timing table.
timings = get_timings(group_data)

In [16]:
# Keep only the rows where the metric was computed on as many samples as were
# used for training.
equal_timings = timings.loc[timings["n_samples"].eq(timings["n_metric_samples"])]

In [11]:
# Rows whose n_samples equals one of the sizes listed in MAX_SIZES.
# NOTE(review): membership is tested against the sizes of all datasets pooled
# together, not against the size registered for the row's own dataset --
# confirm this is intended.
max_timings = timings.loc[timings["n_samples"].isin(MAX_SIZES.values())]

In [23]:
# Rows whose n_samples equals one of the sizes listed in MAX_SIZES.
# NOTE(review): membership is tested against the sizes of all datasets pooled
# together, not against the size registered for the row's own dataset --
# confirm this is intended.
max_training_samps_timing = timings.loc[
    timings["n_samples"].isin([*MAX_SIZES.values()])
]

In [18]:
# Write the equal-sample-size rows to "timings.csv" in the current working
# directory. With pandas' default settings the row index is written as the
# first column as well.
equal_timings.to_csv("timings.csv")