In [1]:
import wandb
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
# Authenticate without embedding the secret in the notebook: with no explicit
# key, wandb.login() resolves the credentials from the WANDB_API_KEY
# environment variable or an existing netrc entry (and prompts otherwise).
# NOTE(review): an API key used to be hardcoded in this cell; treat it as
# leaked and rotate it in the W&B account settings.
wandb.login()
api = wandb.Api()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /export/home/0schindl/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbelaschindler[0m ([33mbelaschindler-university-hamburg[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
def get_expert_distribution(run):
    """Extract which expert learned which task from a finished run.

    The run history is scanned for columns whose name starts with ``Expert``
    and ends with ``learned task`` (e.g. ``"Expert 3 learned task"``); the
    non-null values of such a column are the task ids logged for that expert.

    Args:
        run: Object exposing ``state``, ``config`` (a mapping that provides
            ``"moe_max_experts"``) and ``history()`` returning a DataFrame,
            such as a run obtained from the W&B API.

    Returns:
        ``None`` when ``run.state`` is not ``"finished"`` -- callers must
        check for this before unpacking. Otherwise a tuple
        ``(ft_tasks, expert_distributions)``:

        * ``expert_distributions`` maps the expert index to the list of task
          ids found in that expert's column.
        * ``ft_tasks`` holds, ordered by task id, the expert recorded for
          every task whose id is ``>= moe_max_experts``. If a task id occurs
          in several columns, the last column scanned wins.
    """
    if run.state != "finished":
        return None

    # Fetch the history a single time and reuse it for both the column scan
    # and the value lookup (it used to be requested twice).
    # NOTE(review): the W&B API samples the history by default -- confirm
    # that no "learned task" entries are dropped for long runs.
    history = run.history()
    ft_buffer = run.config.get("moe_max_experts")

    expert_distributions = dict()
    # Maps (task id - ft_buffer) -> expert. A dict replaces the former
    # fixed-size 1000-slot list, so arbitrarily large task ids no longer
    # raise an IndexError.
    ft_assignments = dict()

    for column in history.columns:
        if not (column.startswith("Expert") and column.endswith("learned task")):
            continue

        # The expert index is the second space-separated token of the name.
        expert = int(column.split(" ")[1])
        tasks = [int(task) for task in history[column].dropna().tolist()]

        expert_distributions.setdefault(expert, []).extend(tasks)

        for task in tasks:
            if task >= ft_buffer:
                ft_assignments[task - ft_buffer] = expert

    # Emit the experts ordered by task offset, skipping offsets never seen.
    ft_tasks = [ft_assignments[offset] for offset in sorted(ft_assignments)]

    return ft_tasks, expert_distributions


In [4]:
# Sweep whose finished runs are collected into df_sweep below.
sweep_id = "belaschindler-university-hamburg/0schindl-LayUp_sweeps_question1_selection_method/wk4w5q0t"
sweep = api.sweep(sweep_id)
runs = sweep.runs

data = []
for run in runs:
    config = run.config
    summary = run.summary

    dataset = config.get("dataset")
    selection_method = config.get("selection_method")
    mean_acc = summary.get("task_mean/acc")
    run_id = run.id
    state = run.state

    # Keep only finished runs that report every field the ranking needs.
    if state != "finished":
        continue
    if any(value is None for value in (dataset, selection_method, mean_acc)):
        continue

    data.append({
        "run_id": run_id,
        "dataset": dataset,
        "selection_method": selection_method,
        "mean_acc": mean_acc,
    })

# Passing the columns explicitly keeps the frame well-formed even when no run
# qualifies; an empty, column-less frame would make later column lookups such
# as groupby('dataset') fail with a KeyError.
df_sweep = pd.DataFrame(data, columns=["run_id", "dataset", "selection_method", "mean_acc"])
print("\nDataFrame der Sweep-Runs:")
print(df_sweep.head())




DataFrame der Sweep-Runs:
     run_id        dataset selection_method  mean_acc
0  f1etfz4c  dil_imagenetr           kl_div  0.035325
1  lolfv1ky  dil_imagenetr   inv_eucld_dist  0.042349
2  ut9m2ban  dil_imagenetr           ws_div  0.052182
3  ccodxf0x  dil_imagenetr       eucld_dist  0.075996
4  oh5mrj60  dil_imagenetr           around  0.062176


In [5]:
fill_value = "N/A"  # Set the desired fill value here

def rank_selection_methods(group):
    """Return the selection methods of one dataset group, best mean_acc first.

    The result is a Series of method names re-indexed from 0, so position 0
    is the method with the highest ``mean_acc``.
    """
    by_accuracy = group.sort_values(by='mean_acc', ascending=False)
    methods = by_accuracy['selection_method']
    return methods.reset_index(drop=True)

# Rank the methods of every dataset. Concatenating the per-group results
# always yields a Series indexed by (dataset, rank position);
# groupby(...).apply(...) instead returns a wide DataFrame whenever all
# datasets happen to have the same number of runs, which a per-dataset
# lookup cannot handle.
ranked_methods = pd.concat(
    {dataset: rank_selection_methods(group) for dataset, group in df_sweep.groupby('dataset')},
    names=['dataset'],
)

# One column per dataset, one row per rank. Datasets with fewer runs are
# padded with fill_value, and no column is cut to the length of the first
# dataset.
overview_ranked = ranked_methods.unstack(level=0)
overview_ranked.columns.name = None  # keep the printed header free of a "dataset" label
overview_ranked = overview_ranked.fillna(fill_value)
max_rows = len(overview_ranked)

# Label the rows "1.", "2.", ... according to the actual number of rows.
overview_ranked.index = [f"{i+1}." for i in range(len(overview_ranked))]


print("\nÜbersicht der Selection Methods nach Dataset:")
# Widen the display only for this printout; the previous pandas options are
# restored when the context exits.
with pd.option_context('display.width', 5000, 'display.max_colwidth', None):
    print(overview_ranked)



Übersicht der Selection Methods nach Dataset:
              cars            cddb        cifar100             cub   dil_imagenetr       imageneta       imagenetr limited_domainnet   omnibenchmark            vtab
1.          kl_div          around      inv_ws_div      inv_ws_div      eucld_dist      inv_kl_div      inv_kl_div        inv_ws_div      inv_ws_div          kl_div
2.      inv_kl_div          ws_div      inv_kl_div          ws_div          around          kl_div          ws_div            around          ws_div      inv_kl_div
3.  inv_eucld_dist          kl_div          around      inv_kl_div          ws_div  inv_eucld_dist  inv_eucld_dist            kl_div          around  inv_eucld_dist
4.      eucld_dist      inv_kl_div      eucld_dist          kl_div      inv_ws_div      inv_ws_div      eucld_dist    inv_eucld_dist      eucld_dist      inv_ws_div
5.          around  inv_eucld_dist          kl_div      eucld_dist  inv_eucld_dist      eucld_dist          around        eucld_

In [6]:
def rank_run_ids(group):
    """Return the run ids of one dataset group, best mean_acc first.

    The result is a Series of run ids re-indexed from 0, so position 0 is the
    run with the highest ``mean_acc``.
    """
    by_accuracy = group.sort_values(by='mean_acc', ascending=False)
    run_ids = by_accuracy['run_id']
    return run_ids.reset_index(drop=True)

# Rank the run ids of every dataset. Concatenating the per-group results
# always yields a Series indexed by (dataset, rank position);
# groupby(...).apply(...) instead returns a wide DataFrame whenever all
# datasets happen to have the same number of runs, which a per-dataset
# lookup cannot handle.
ranked_run_ids_per_dataset = pd.concat(
    {dataset: rank_run_ids(group) for dataset, group in df_sweep.groupby('dataset')},
    names=['dataset'],
)

# One column per dataset, one row per rank; no column is cut to the length of
# the first dataset. Missing ranks deliberately stay NaN in the frame itself
# so that code consuming it can detect them with pd.notna(); fill_value is
# only applied to the printed copy.
overview_run_ids = ranked_run_ids_per_dataset.unstack(level=0)
overview_run_ids.columns.name = None  # keep the printed header free of a "dataset" label
max_rows_run_ids = len(overview_run_ids)

# Label the rows "1.", "2.", ... according to the actual number of rows.
overview_run_ids.index = [f"{i+1}." for i in range(len(overview_run_ids))]

print("\nÜbersicht der Run IDs nach Dataset (sortiert nach mean_acc):")
# Widen the display only for this printout; the previous pandas options are
# restored when the context exits.
with pd.option_context('display.width', 5000, 'display.max_colwidth', None):
    print(overview_run_ids.fillna(fill_value))


Übersicht der Run IDs nach Dataset (sortiert nach mean_acc):
        cars      cddb  cifar100       cub dil_imagenetr imageneta imagenetr limited_domainnet omnibenchmark      vtab
1.  al08o88u  fw7owkrl  q9r1xc9a  b0dzsjax      ccodxf0x  i456qxf5  e7fey5va          jg82arnp      53rkgrjf  msix6frk
2.  fc1z9ff6  sbuithvw  vip27gx3  1v5fru6p      oh5mrj60  f7ho9rvo  9lcro1ez          yantgyei      czo0hmq2  ynjcw7e5
3.  ua1ld836  2v1rqtqi  5wzxh6k3  6o4wlyxx      ut9m2ban  kc8w724c  xml5gzh7          3r9p6naq      kjxo0x09  1x3l5agm
4.  kvj4knc9  ntbz0it8  4hjczvft  kf6ehova      f9yiptc2  kemmqdbx  b2u56ro8          8iyhagho      5arkgaql  as2k8u4j
5.  kjh5dj04  5u5u48j0  p5xo2f0o  iqy2i7z6      lolfv1ky  8ckoklh7  qr5u1wcw          lcpf3ofo      8djuj7se  923atyey
6.  da82toeh  ozgume7j  3487s1kp  8asspb0s      f1etfz4c  v58sf8i1  oju7pgkq          xuphq28p      wb55zhl3  yc32vq2v
7.  93y21l92  fjt2ri5i  4ww7bxkx  zv1t9s88           NaN  srie9ojl  a9576ypx          gvfz3ds9      otpft

In [7]:
overview_expert_distributions = pd.DataFrame(index=overview_run_ids.index, columns=overview_run_ids.columns)

# W&B project that the run ids in overview_run_ids are looked up in.
project_path = "belaschindler-university-hamburg/0schindl-LayUp_sweeps_question1_selection_method"

# Look up every ranked run and record its expert assignment.
for col in overview_run_ids.columns:
    for index, run_id in overview_run_ids[col].items():
        # Ranks without a run are NaN (or already hold the textual
        # placeholder); there is nothing to query for them.
        if pd.isna(run_id) or run_id == fill_value:
            overview_expert_distributions.loc[index, col] = fill_value
            continue

        run = api.run(f"{project_path}/{run_id}")
        result = get_expert_distribution(run)

        # get_expert_distribution returns None for runs that are not
        # finished; subscripting that result would raise a TypeError.
        if result is None:
            overview_expert_distributions.loc[index, col] = fill_value
            continue

        # Store the expert list as its string representation.
        overview_expert_distributions.loc[index, col] = str(result[0])

print("\nÜbersicht der Expertenverteilungen nach Dataset (sortiert nach mean_acc):")
# Widen the display only for this printout; the previous pandas options are
# restored when the context exits.
with pd.option_context('display.width', 5000, 'display.max_colwidth', None):
    print(overview_expert_distributions)



Übersicht der Expertenverteilungen nach Dataset (sortiert nach mean_acc):
               cars    cddb         cifar100              cub             dil_imagenetr        imageneta        imagenetr limited_domainnet    omnibenchmark    vtab
1.  [1, 1, 1, 1, 1]  [0, 1]  [0, 0, 1, 0, 1]  [1, 1, 4, 1, 1]  [1, 1, 1, 1, 1, 1, 1, 1]  [4, 4, 4, 3, 3]  [3, 0, 0, 0, 0]         [2, 0, 0]  [1, 1, 2, 2, 2]  [2, 2]
2.  [4, 3, 2, 2, 2]  [1, 2]  [2, 0, 1, 1, 2]  [3, 3, 3, 3, 3]  [0, 1, 2, 3, 4, 5, 6, 0]  [1, 2, 1, 1, 2]  [4, 4, 4, 4, 4]         [0, 1, 2]  [0, 3, 0, 3, 3]  [0, 0]
3.  [4, 0, 4, 4, 4]  [2, 2]  [0, 1, 2, 3, 4]  [2, 0, 3, 2, 2]  [5, 1, 1, 1, 1, 1, 1, 1]  [3, 3, 3, 0, 4]  [2, 3, 3, 3, 3]         [2, 2, 2]  [0, 1, 2, 3, 4]  [0, 1]
4.  [2, 1, 1, 2, 1]  [0, 1]  [4, 4, 4, 4, 3]  [1, 1, 1, 1, 4]  [3, 6, 2, 0, 5, 3, 3, 3]  [3, 0, 3, 0, 4]  [1, 1, 1, 4, 3]         [2, 2, 2]  [0, 0, 4, 4, 4]  [0, 0]
5.  [0, 1, 2, 3, 4]  [2, 1]  [4, 4, 4, 4, 4]  [4, 4, 1, 4, 1]  [3, 6, 3, 3, 4, 4, 3, 3]  [2, 2, 1, 1

In [None]:
# Further sweeps of the same project.
# NOTE(review): none of these identifiers is used in this cell -- the query
# below still goes through `sweep_id`, which is not assigned here and therefore
# depends on state left behind by an earlier cell. Presumably one of the
# sweeps below was meant to be loaded; confirm and pass it explicitly.
sweep_DIL = "belaschindler-university-hamburg/0schindl-LayUp_sweeps_question1_selection_method/6kim8tiu"
sweep_CIL1 = "belaschindler-university-hamburg/0schindl-LayUp_sweeps_question1_selection_method/p7zmthx9"
sweep_CIL2 = "belaschindler-university-hamburg/0schindl-LayUp_sweeps_question1_selection_method/jdpa9z1x"
sweep_CIL3 = "belaschindler-university-hamburg/0schindl-LayUp_sweeps_question1_selection_method/cjddpel4"
sweep_CIL4 = "belaschindler-university-hamburg/0schindl-LayUp_sweeps_question1_selection_method/hxigp6ck"


sweep = api.sweep(sweep_id)
runs = sweep.runs

data = []
for run in runs:
    config = run.config
    summary = run.summary

    dataset = config.get("dataset")
    selection_method = config.get("selection_method")
    mean_acc = summary.get("task_mean/acc")
    run_id = run.id
    state = run.state

    # Keep only finished runs that report every field the ranking needs.
    if state != "finished":
        continue
    if any(value is None for value in (dataset, selection_method, mean_acc)):
        continue

    data.append({
        "run_id": run_id,
        "dataset": dataset,
        "selection_method": selection_method,
        "mean_acc": mean_acc,
    })

# Passing the columns explicitly keeps the frame well-formed even when no run
# qualifies; an empty, column-less frame would make later column lookups such
# as groupby('dataset') fail with a KeyError.
df_sweep = pd.DataFrame(data, columns=["run_id", "dataset", "selection_method", "mean_acc"])
print("\nDataFrame der Sweep-Runs:")
print(df_sweep.head())
