In [None]:
from pathlib import Path

import hvplot.pandas  # noqa: F401  # imported for its side effect: registers the `.hvplot` accessor
import pandas as pd

import wandb

api = wandb.Api()

# Project is specified by <entity/project-name>
runs = api.runs("ingmarnitze_team/darts")

# Local cache of the downloaded runs, one JSON record per line.
# JSON Lines is used because the "summary" and "config" columns hold nested
# dicts: a CSV would store them as plain strings, and the later
# `pd.json_normalize` calls need real dicts. Reader and writer below must
# always use the same format.
cache = Path("../data/analysis/sweep-cv-large-planet.jsonl")
cache.parent.mkdir(parents=True, exist_ok=True)
if cache.exists():
    # dtype/convert_dates are disabled so pandas does not reinterpret run names
    # or other values while loading.
    runs_df = pd.read_json(cache, orient="records", lines=True, dtype=False, convert_dates=False)
    print("Loaded from cache")
else:
    run_records = [
        {
            # .summary contains the output keys/values for metrics like accuracy.
            #  We call ._json_dict to omit large files
            "summary": run.summary._json_dict,
            # .config contains the hyperparameters.
            #  We remove special values that start with _.
            "config": {k: v for k, v in run.config.items() if not k.startswith("_")},
            # .name is the human-readable name of the run.
            "name": run.name,
        }
        for run in runs
    ]
    # Explicit columns keep the frame's layout stable even when no run is returned.
    runs_df = pd.DataFrame(run_records, columns=["summary", "config", "name"])
    runs_df.to_json(cache, orient="records", lines=True)
runs_df

In [None]:
# Sanity check: report the first run whose config lacks a key that the
# reference run (the first run with a non-empty config) has.
# NOTE(review): this inspects the "config" column; if the run summaries were
# meant to be compared instead, switch the column name — confirm.
reference_keys = set()
for _, row in runs_df.iterrows():
    run_config = row["config"]
    # Initialise the reference only once, from the first non-empty config.
    # The guard has to test the reference set, not the current row, otherwise
    # the reference stays empty and no difference can ever be detected.
    if not reference_keys:
        reference_keys = set(run_config.keys())

    missing_keys = reference_keys - set(run_config.keys())
    if missing_keys:
        print(row["name"], missing_keys)
        break

In [None]:
# Flatten the nested summary and config dicts and put them side by side,
# giving one row per run.
summary_df = pd.json_normalize(runs_df["summary"])
config_df = pd.json_normalize(runs_df["config"])
metrics_df = pd.concat([summary_df, config_df], axis=1)

# Filter out rows which are not part of the sweep.
# The comparison must be parenthesised: `&` binds tighter than `>`, so
# `a & b > 0` would be evaluated as `(a & b) > 0`.
is_sweep_trial = metrics_df["trial_name"].notna() & (metrics_df["epoch"] > 0)
metrics_df = metrics_df[is_sweep_trial]

# Drop columns which are all nan
metrics_df = metrics_df.dropna(axis=1, how="all")

# Kick out all visualization columns (val0/val1/val2 samples) as well as all
# prc, cmx and roc columns. `str.startswith` accepts a tuple of prefixes.
unwanted_prefixes = (
    *(f"val{i}-samples/" for i in range(3)),
    "val/prc",
    "val/cmx",
    "val/roc",
)
unwanted_columns = [c for c in metrics_df.columns if c.startswith(unwanted_prefixes)]
metrics_df = metrics_df.drop(columns=unwanted_columns)

# Kick out learning_rate because it is doubled for some reason
# (presumably it shows up in both the summary and the config — TODO confirm).
metrics_df = metrics_df.drop(columns=["learning_rate"])
metrics_df

In [None]:
# Add some inverses for log plotting
metrics_df["focal_loss_alpha_inv"] = 1 - metrics_df["focal_loss_alpha"]
metrics_df["val/JaccardIndex_inv"] = 1 - metrics_df["val/JaccardIndex"]
metrics_df["val/AveragePrecision_inv"] = 1 - metrics_df["val/AveragePrecision"]


In [None]:
# Restrict the results to one architecture/encoder combination.
uses_upernet = metrics_df["config.model.arch"] == "UPerNet"
uses_maxvit_tiny = metrics_df["config.model.encoder_name"] == "tu-maxvit_tiny_rw_224"
metrics_df_combi = metrics_df[uses_upernet & uses_maxvit_tiny]
metrics_df_combi

In [None]:
# Average precision on x, focal-loss gamma on y, colored by Jaccard index, grouped by fold.
metrics_df_combi.hvplot.scatter(
    x="val/AveragePrecision",
    y="focal_loss_gamma",
    c="val/JaccardIndex",
    groupby="fold",
)

In [None]:
# Jaccard index on x, focal-loss gamma on y, colored by average precision, grouped by fold.
metrics_df_combi.hvplot.scatter(
    x="val/JaccardIndex",
    y="focal_loss_gamma",
    c="val/AveragePrecision",
    groupby="fold",
)

In [None]:
# Jaccard index on x, average precision on y, colored by focal-loss gamma, grouped by fold.
metrics_df_combi.hvplot.scatter(
    x="val/JaccardIndex",
    y="val/AveragePrecision",
    c="focal_loss_gamma",
    groupby="fold",
)

In [None]:
# Jaccard index distribution per architecture (UPerNet and Unet only), grouped by fold.
compared_archs = ["UPerNet", "Unet"]
arch_subset_df = metrics_df[metrics_df["config.model.arch"].isin(compared_archs)]
arch_subset_df.hvplot.violin(by="config.model.arch", y="val/JaccardIndex", groupby="fold")