In [1]:
import wandb
import os
import os.path as path
import numpy as np
import pandas as pd
# Module-level W&B public-API client; the run-repair cell that follows lists runs through it.
api = wandb.Api()


In [2]:
# repair experiment name:
# Drop the stray '' that sits directly in front of the "_2024_" timestamp.
for r in api.runs("CTGAN"):
    if "''_2024_" not in r.name:
        continue
    r.name = r.name.replace("''_2024_", "_2024_")
    r.update()


# apply correct group name:
# The expected group is whatever precedes the first "_2024_" in the run name;
# only runs whose stored group differs are written back.
for project in ("CTGAN-ctgan", "CTGAN"):
    for r in api.runs(project):
        if "_2024_" not in r.name:
            continue
        expected_group = r.name.split("_2024_")[0]
        if r.group != expected_group:
            r.group = expected_group
            r.update()

# Aggregate Statistics

In [3]:
def get_filtered_runs(project_name, group_name):
    """Query W&B for the runs of one group inside a project.

    A fresh ``wandb.Api`` client is created on every call. The query keeps
    only runs that satisfy all of the following filters:

    * ``group`` equals ``group_name``
    * ``state`` is ``"finished"``
    * ``tags`` are excluded via ``$nin`` for ``"old"`` and ``"surplus"``
    * ``host`` matches the regular expression ``^paul``

    Args:
        project_name: Passed to ``Api.runs`` as ``path``.
        group_name: Group the runs must belong to.

    Returns:
        Whatever ``wandb.Api().runs(...)`` returns for this query.
    """
    run_filters = {
        "group": group_name,
        "state": "finished",
        "tags": {"$nin": ["old", "surplus"]},
        "host": {"$regex": "^paul"},
    }
    return wandb.Api().runs(path=project_name, filters=run_filters)

def _mean_and_se(samples, round_digits: int):
    """Return (mean, standard error) of ``samples``, each rounded once at the end."""
    arr = np.asarray(samples, dtype=float)
    # np.std uses its default ddof=0 here, exactly as the original computation did.
    std = float(np.std(arr))
    # Divide the *unrounded* std so rounding error is not amplified in the SE.
    se = std / float(np.sqrt(arr.size))
    return round(float(np.mean(arr)), round_digits), round(se, round_digits)


def compute_metric_stats(runs, metric_name: str, use_max: bool = True, round_digits: int = 4) -> dict:
    """Compute statistics across WandB runs for a given metric.

    For each run the history of ``metric_name`` is fetched and two numbers are
    recorded: the last logged value and the best value (maximum if ``use_max``
    else minimum). Runs that have no non-NaN value for the metric are skipped
    instead of aborting the aggregation.

    Args:
        runs: Iterable of run objects exposing ``lastHistoryStep`` and
            ``history(samples=..., keys=[...])`` returning a DataFrame.
        metric_name: Name of the logged metric (history column) to summarise.
        use_max: If True the per-run optimum is the maximum, otherwise the minimum.
        round_digits: Number of decimals every returned statistic is rounded to.

    Returns:
        An empty dict when no run contributed data, otherwise a dict with:
        ``opt_max_value``/``opt_min_value`` (best per-run optimum overall),
        ``opt_mean`` and ``opt_se`` (mean / standard error of the per-run
        optima), ``last_mean`` and ``last_se`` (mean / standard error of the
        per-run final values).
    """
    pick_best = max if use_max else min
    last_values, peak_values = [], []

    for run in runs:
        # Request as many samples as the run has steps
        # (presumably to avoid W&B down-sampling the history -- TODO confirm).
        history = run.history(samples=run.lastHistoryStep + 1, keys=[metric_name])
        if metric_name not in history:
            # Run never logged this metric (e.g. empty history frame).
            continue
        # NaNs would make builtin max/min order-dependent, so drop them first.
        values = history[metric_name].dropna().to_numpy()
        if values.size == 0:
            continue
        last_values.append(values[-1])
        peak_values.append(pick_best(values))

    if not last_values:
        return {}

    opt_peak = round(float(pick_best(peak_values)), round_digits)
    opt_mean, opt_se = _mean_and_se(peak_values, round_digits)
    last_mean, last_se = _mean_and_se(last_values, round_digits)

    return {
        f"opt_{'max' if use_max else 'min'}_value": opt_peak,
        "opt_mean": opt_mean,
        "opt_se": opt_se,
        "last_mean": last_mean,
        "last_se": last_se,
    }

In [7]:
# Metrics to summarise (currently left out: "valid/variational_performance").
METRICS = [
    "valid/Column_Shape",
    "valid/Column_Pair_Trend",
]

PROJECT = "CTGAN"

# Candidate run-group selections; exactly one of them is bound to GROUPS below.
GROUPS_ADULTS = ["1ew", "2ew", "3ew", "5ew", "10ew"]
GROUPS_CANCER = [
    "cancer_1ew_bs100",
    "cancer_2ew_bs100",
    "cancer_3ew_bs100",
    "cancer_5ew_bs100",
    "cancer_10ew_bs100",
]
GROUPS_SUPERSTORE = [
    "superstore_1ew",
    "superstore_2ew",
    "superstore_3ew",
    "superstore_5ew",
    "superstore_10ew",
]
GROUPS_WEIGHTINGS = [
    "superstore_5ew",
    "superstore_5rand_normal",
    "superstore_5rand_uniform",
    "superstore_5rand_bernoulli",
    "superstore_5ew_gn",
    "superstore_5rand_bernoulli_gn",
]
# NOTE: this second assignment replaces the full weighting list defined just above.
GROUPS_WEIGHTINGS = ["superstore_5rand_uniform"]

STATISTIC_TEST = [
    "1ew",
    "5ew",
    "cancer_1ew_bs100",
    "cancer_5ew_bs100",
    "superstore_1ew",
    "superstore_5ew",
]

# Alternative selections:
#   GROUPS = GROUPS_ADULTS + GROUPS_CANCER + GROUPS_SUPERSTORE
#   GROUPS = GROUPS_WEIGHTINGS
GROUPS = STATISTIC_TEST

for group in GROUPS:
    runs = get_filtered_runs(PROJECT, group)
    print("GROUP: ", group, f"({len(runs)})")
    for r in runs:
        print("\t", r.name)

    for metric in METRICS:
        print()
        print("METRIC: ", metric)
        metrics = compute_metric_stats(runs, metric, round_digits=3)
        for key, value in metrics.items():
            # Three decimals without the leading zero, e.g. 0.828 -> ".828".
            if value != 0:
                value = f"{value:.3f}".lstrip('0')
            else:
                value = '.000'
            print(f"{key}:\t\t{value}")

    print()
    print()


GROUP:  superstore_5rand_uniform (6)
	 superstore_5rand_uniform_2024_12_12_17_05_38
	 superstore_5rand_uniform_2024_12_12_17_05_38
	 superstore_5rand_uniform_2024_12_13_18_57_37
	 superstore_5rand_uniform_2024_12_13_19_37_28
	 superstore_5rand_uniform_2024_12_13_19_37_48
	 superstore_5rand_uniform_2024_12_13_21_02_22

METRIC:  valid/Column_Shape
opt_max_value:		.828
opt_mean:		.821
opt_se :		.002
last_mean:		.776
last_se:		.003

METRIC:  valid/Column_Pair_Trend
opt_max_value:		.600
opt_mean:		.594
opt_se :		.002
last_mean:		.519
last_se:		.010




In [13]:
# Download Metric as csv:
# Collects the history of one metric for every filtered run of the selected
# groups and writes it as one column per run into a CSV file under "02/".

METRIC = "valid/Column_Pair_Trend"
GROUPS = ["superstore_1ew", "superstore_2ew", "superstore_3ew", "superstore_5ew", "superstore_10ew"]

PROJECT = "CTGAN"

run_list = []
history_list = []

# retrieve all runs
for group in GROUPS:
    runs = get_filtered_runs(PROJECT, group)
    for run in runs:
        run_list.append(run)

# retrieve all histories:
# Gather the columns first and join them in one step. Assigning Series one by
# one into an initially empty DataFrame pins the index to the first run's
# length, which silently truncates every longer run.
metric_columns = {}
for idx, run in enumerate(run_list):
    history = run.history(samples=run.lastHistoryStep+1, keys=[METRIC])
    if METRIC not in history:
        # Nothing logged for this metric; skip the run instead of raising KeyError.
        print("skipping {} (no data for {})".format(run.name, METRIC))
        continue
    # The enumerate index keeps column names unique when run names repeat.
    metric_columns[run.name + "_{}".format(idx)] = history[METRIC]

# axis=1 concat takes the union of all row indices, so shorter runs are NaN-padded.
metric_df = pd.concat(metric_columns, axis=1) if metric_columns else pd.DataFrame()

# Shift the row index so it starts at 1 instead of 0.
metric_df.index = (metric_df.index+1)

# Make sure the output directory exists before writing.
os.makedirs("02", exist_ok=True)
metric_df.to_csv("02/{}_{}.csv".format(GROUPS[0].split("/")[-1], METRIC.split("/")[-1]))

