# Evaluate *ritme* trials of all usecases


## Setup

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from src.evaluate_trials import (
    boxplot_metric,
    multi_boxplot_metric,
    plot_complexity_vs_metric,
    plot_trend_over_time,
    plot_trend_over_time_multi_models,
)

# silence all FutureWarning messages so they do not clutter the cell outputs
warnings.filterwarnings("ignore", category=FutureWarning)

# IPython magics: autoreload mode 2 re-imports edited modules before each cell
# is executed; figures are rendered inline in the notebook
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
######## USER INPUTS ########

# path to the CSV file with the extracted MLflow logs of all trials
# (created with script extract_all_logs.sh)
log_folder_location = "merged_all_trials.csv"

# which usecase to analyze: "u1", "u2", "u3" or "all"
usecase = "u2"

# which sampler to analyze: "tpe", "random" or "all"
# (any value other than "all" is matched against the experiment tag)
sampler = "random"

# how many of the best trials (lowest validation RMSE) to consider for the
# complexity vs. performance plot
top_x = 1000

# dpi used when saving the figures
dpi = 400
######## END USER INPUTS #####

In [None]:
# Per-usecase settings read by the cells below: the figure title, the model
# type from which the final top trial is picked, and the flag passed as
# x_log_scale to the validation RMSE boxplots.
if usecase == "u1":
    title = "Usecase 1"
    best_model_type = "xgb"
    log_x_scale = False
elif usecase == "u2":
    title = "Usecase 2"
    best_model_type = "linreg"
    log_x_scale = False
elif usecase == "u3":
    title = "Usecase 3"
    best_model_type = "xgb"
    log_x_scale = True
else:
    title = "All usecases"
    # Both settings are read by later cells, so they have to be defined here
    # as well - otherwise those cells fail with a NameError.
    # No single best model type exists across usecases: with None the
    # model-type specific top trial selection simply matches no trial.
    best_model_type = None
    log_x_scale = False

## Extract trial information

In [None]:
# load the information of every logged trial and rank the trials from the
# lowest (best) to the highest validation RMSE
all_trials = pd.read_csv(log_folder_location).sort_values(
    by="metrics.rmse_val", ascending=True
)
print(f"Found {all_trials.shape[0]} trials")

In [None]:
# Restrict the trials to the requested usecase and sampler, both of which are
# read from the experiment tag of each trial.
# na=False: trials without an experiment tag are excluded instead of putting
# NaN into the boolean mask, which would break `~` and the row selection.
# regex=False: the search terms are literal substrings, not regex patterns.
if usecase != "all":
    print(f"Analyzing trials for usecase: {usecase}")
    experiment_tags = all_trials["tags.experiment_tag"]
    if usecase == "u3":
        # usecase 3 only keeps the "u3_galaxy" experiments and drops those
        # tagged with "w_start"
        # NOTE(review): presumably warm-start experiments - confirm
        usecase_mask = experiment_tags.str.startswith(
            "u3_galaxy", na=False
        ) & ~experiment_tags.str.contains("w_start", regex=False, na=False)
    else:
        usecase_mask = experiment_tags.str.startswith(usecase, na=False)
    all_trials = all_trials[usecase_mask]

if sampler != "all":
    print(f"Analyzing trials for sampler: {sampler}")
    sampler_mask = all_trials["tags.experiment_tag"].str.contains(
        sampler, regex=False, na=False
    )
    all_trials = all_trials[sampler_mask]

print(f"Selected {all_trials.shape[0]} trials")

In [None]:
# all_trials is sorted by ascending validation RMSE, so its first row is the
# best trial overall
top_1_trial = all_trials.iloc[:1]
top_1_trial["tags.experiment_tag"]

## Insights on performance

In [None]:
# boxplots of the validation RMSE of all selected trials, grouped by each of
# the data processing choices and by the model type
performance_group_specs = [
    ("params.data_aggregation", "Data aggregation"),
    ("params.data_selection", "Data selection"),
    ("params.data_transform", "Data transform"),
    ("params.data_enrich", "Data enrichment"),
    ("params.model", "Model type"),
]
fig, axes = multi_boxplot_metric(
    all_trials,
    metric_col="metrics.rmse_val",
    metric_name="RMSE Validation",
    group_specs=performance_group_specs,
    title=title,
    x_log_scale=log_x_scale,
    order_by_median=True,
    showfliers=False,
)

# store the figure next to the other result figures
fig.savefig(
    f"result_figures/boxplot_all_trials_{usecase}_{sampler}.pdf",
    dpi=dpi,
    bbox_inches="tight",
)

## Model complexity vs. performance: top X trials

In [None]:
# boxplots of the number of features used by all selected trials, grouped by
# model type (log scale requested via x_log_scale)
complexity_group_specs = [("params.model", "Model type")]
fig, axes = multi_boxplot_metric(
    all_trials,
    metric_col="metrics.nb_features",
    metric_name="Number of features",
    group_specs=complexity_group_specs,
    title=title,
    figsize=(6, 4),
    x_log_scale=True,
    order_by_median=True,
    showfliers=False,
)
plt.tight_layout()

# store the figure next to the other result figures
fig.savefig(
    f"result_figures/boxplot_all_trials_{usecase}_{sampler}_nb_fts.pdf",
    dpi=dpi,
    bbox_inches="tight",
)

In [None]:
# the top_x best trials: all_trials is sorted by ascending validation RMSE
top_x_trials = all_trials.head(top_x)

# number of top trials per model type - printed explicitly, since a notebook
# only displays the value of the last expression of a cell
print(top_x_trials["params.model"].value_counts())

# validation RMSE vs. model complexity of the top trials, grouped by model type
figc, _ = plot_complexity_vs_metric(
    top_x_trials,
    metric_col="metrics.rmse_val",
    metric_name="RMSE Validation",
    group_col="params.model",
    group_name="Model type",
    n=top_x,
    figsize=(7, 6),
    title=title,
    x_log_scale=True,
)

figc.savefig(
    f"result_figures/complexity_top_trials_{usecase}_{sampler}.pdf",
    bbox_inches="tight",
    dpi=dpi,
)

In [None]:
# NOTE(review): presumably 0.2 is the share of trials run with the random
# scheduler and 2500 the total number of trials - confirm
n_random_trials = 0.2 * 2500
print(f"number of trials w random scheduler: {n_random_trials}")

In [None]:
# # Training over time

# metric = "rmse_val"
# for model in all_trials["params.model"].unique():
#     model_trials = all_trials[all_trials["params.model"] == model]
#     plot_trend_over_time(
#         model_trials,
#         f"metrics.{metric}",
#         window=100,
#         title_prefix=f"Model: {model}",
#         figsize=(12, 6),
#         first_n=None,
#         y_log_scale=True,
#     )

In [None]:
# trend of the validation RMSE over time for the different model types
# (window of 100, log-scaled y axis requested via y_log_scale)
fig, axes = plot_trend_over_time_multi_models(
    all_trials,
    y_col="metrics.rmse_val",
    title_prefix="Model: ",
    window=100,
    first_n=None,
    y_log_scale=True,
    figsize=(7, 3 * 4),
)

# store the figure next to the other result figures
fig.savefig(
    f"result_figures/trend_over_time_{usecase}_{sampler}.pdf",
    dpi=dpi,
    bbox_inches="tight",
)

# Top 1 trial

The top 1 trial is chosen based on held-out test set performance - that is why we restrict the selection to `best_model_type` here!

In [None]:
# all_trials is sorted by ascending validation RMSE, so the first row of the
# configured model type is the best trial of that model type
is_best_model_type = all_trials["params.model"] == best_model_type
top_1_trial = all_trials[is_best_model_type].head(1)

top_1_trial["metrics.nb_features"]

In [None]:
# experiment tag of the currently selected top trial
top_1_trial["tags.experiment_tag"]

In [None]:
# NOTE: despite its name this holds the first 10 rows of all_trials, i.e. the
# 10 best trials across all model types, not only the single best one
top_1_trial_true = all_trials.iloc[:10]
top_1_trial_true["tags.experiment_tag"]

In [None]:
# validation RMSE of the trials held in top_1_trial_true
top_1_trial_true["metrics.rmse_val"]