# **Result Analysis -** AmazeVille Topological Streams

___
___

### **0. Streams**

**Topological Stream 1 :**

- amazeville-maze_2-high
- amazeville-maze_4-high
- amazeville-maze_4-high
- amazeville-maze_2-high

**Topological Stream 2 :**

- amazeville-maze_1-low
- amazeville-maze_1-low
- amazeville-maze_4-low
- amazeville-maze_3-low

___

### **1. Initialization**

#### *1.A. Imports*

In [2]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import os
import pandas as pd
import seaborn as sns

from offbench.utils.logger import extract_metrics_from_event_file
from offbench.utils.paths import WORKSPACE_PATH
from hydra import compose, initialize
from omegaconf import DictConfig
from typing import Union

#### *1.B. Parameters*

In [3]:
if True:

    # Root folder of the experiments. Runs are looked up under
    # <root>/<stream_name>/<agent_id>/seed_<seed>/logs.
    all_experiments_folder_path = os.path.join(WORKSPACE_PATH, "experiments", "2_continual", "godot_goal")

    seeds = [100 * i for i in range(1, 4)]

    # Evaluation metrics are only kept when logged at this step.
    max_gradient_steps = 100000

    # Number of tasks per stream.
    n_tasks = 4

    stream_names = [
        "amazeville_topological_1",
        "amazeville_topological_2",
    ]

    stream_name_to_title = {
        "amazeville_topological_1": "AmazeVille - Topological Stream 1",
        "amazeville_topological_2": "AmazeVille - Topological Stream 2",
    }

    # Baselines without hyper-parameters: agent id -> short method name.
    agent_id_to_method = {
        "hgcbc_scratch_1_mlp": "SC1",
        "hgcbc_scratch_n_mlp": "SCN",
        "hgcbc_finetune_1_mlp": "FT1",
        "hgcbc_finetune_n_mlp": "FTN",
        "hgcbc_freeze_mlp": "FRZ",
        "hgcbc_pnn_mlp": "PNN",
        "hgcbc_replay_mlp": "RPL",
    }

    agent_ids = list(agent_id_to_method)

    # Regularisation baselines (EWC, L2): one agent per lambda.
    reg_lbs = [0.01, 0.1, 1.0, 10.0, 100.0]
    agent_ids += ["hgcbc_ewc_mlp_lb{}".format(lb) for lb in reg_lbs]
    agent_ids += ["hgcbc_l2_mlp_lb{}".format(lb) for lb in reg_lbs]

    for lb in reg_lbs:
        agent_id_to_method["hgcbc_ewc_mlp_lb{}".format(lb)] = "EWC (λ={})".format(lb)
        agent_id_to_method["hgcbc_l2_mlp_lb{}".format(lb)] = "L2 (λ={})".format(lb)

    # HiSPO: one agent per (high epsilon, low epsilon, lambda) combination.
    # A single template is shared by every mapping so that the ids cannot
    # drift apart (the algo mapping previously used a "_2" suffix instead of
    # "_l2" and therefore never matched the ids listed in `agent_ids`).
    hsp_eps = [0.1, 0.05]
    hsp_lbs = [0.1, 0.05]
    hispo_id_template = "hgcbc_hispofix_mlp_heps{}_leps{}_lb{}_l2"
    hispo_configs = [(heps, leps, lb) for heps in hsp_eps for leps in hsp_eps for lb in hsp_lbs]
    agent_ids += [hispo_id_template.format(heps, leps, lb) for heps, leps, lb in hispo_configs]

    for heps, leps, lb in hispo_configs:
        agent_id_to_method[hispo_id_template.format(heps, leps, lb)] = "HiSPO (hε={}, lε={}, λ={})".format(heps, leps, lb)

    # Every agent of this analysis uses the same base algorithm.
    agent_id_to_algo = {agent_id: "HBC" for agent_id in agent_ids}

#### 1.C. Data

In [4]:
def generate_data():
    """Collect the logged metrics of every run into one long DataFrame.

    For each (stream, agent, seed) whose ``logs`` folder exists, all
    ``events.out.tfevents`` files are read and the following scalars are kept:

    - ``EVALUATIONS...`` tags logged at ``max_gradient_steps`` whose metric
      name contains ``"success"``; the train / eval task indices are parsed
      from the tag.
    - ``SIZE`` / ``HIGH_SIZE`` / ``LOW_SIZE`` tags whose step is at most
      ``n_tasks``; the step is used as the train task index and the value is
      replicated for every eval task index.

    Sizes are then divided by the inference size of the reference run
    (``hgcbc_scratch_1_mlp``, first stream, first seed, task 1), and runs that
    do not cover every train task index in ``1..n_tasks`` are dropped.

    Returns:
        DataFrame with one row per (agent_id, stream_name, seed,
        train_task_idx, eval_task_idx) holding ``success`` and the six
        normalised size columns.
    """

    # Tag prefix -> prefix prepended to the metric name. No key is a prefix
    # of another one, so at most one of them matches a given tag.
    size_tag_prefixes = {"SIZE": "", "HIGH_SIZE": "high_", "LOW_SIZE": "low_"}

    index_cols = ["stream_name", "agent_id", "seed", "train_task_idx", "eval_task_idx"]

    # ROUGH DATA
    ############

    data = []

    for stream_name in stream_names:

        for agent_id in agent_ids:

            for seed in seeds:

                experiment_folder_path = os.path.join(all_experiments_folder_path, stream_name, agent_id, f"seed_{seed}")
                path_to_logs = os.path.join(experiment_folder_path, "logs")

                if not os.path.exists(path_to_logs):
                    continue

                events_files = [f for f in os.listdir(path_to_logs) if "events.out.tfevents" in f]

                run_keys = {"stream_name": stream_name, "agent_id": agent_id, "seed": seed}
                all_metrics = []

                for events_file in events_files:

                    metrics = extract_metrics_from_event_file(os.path.join(path_to_logs, events_file))

                    for tag, step, value in metrics:

                        if tag.startswith("EVALUATIONS"):

                            if step != max_gradient_steps:
                                continue

                            # First tag segment: second "_" token minus its first
                            # character is the train task index. Second segment:
                            # first "_" token minus its first character is the
                            # eval task index, the rest is the metric name.
                            tag_parts = tag.split("/")
                            train_task_idx = int(tag_parts[0].split("_")[1][1:])
                            eval_token, *metric_tokens = tag_parts[1].split("_")
                            eval_task_idx = int(eval_token[1:])
                            metric_name = "_".join(metric_tokens)

                            if train_task_idx > n_tasks or eval_task_idx > n_tasks:
                                continue

                            if "success" not in metric_name:
                                continue

                            all_metrics.append({
                                **run_keys,
                                "train_task_idx": train_task_idx,
                                "eval_task_idx": eval_task_idx,
                                "metric": metric_name,
                                "value": value,
                            })

                        else:

                            metric_prefix = next(
                                (prefix for tag_prefix, prefix in size_tag_prefixes.items() if tag.startswith(tag_prefix)),
                                None,
                            )

                            if metric_prefix is None or step > n_tasks:
                                continue

                            metric = metric_prefix + tag.split("/")[1]

                            # The size does not depend on the evaluated task:
                            # replicate it for every eval task index.
                            for eval_task_idx in range(1, n_tasks + 1):
                                all_metrics.append({
                                    **run_keys,
                                    "train_task_idx": step,
                                    "eval_task_idx": eval_task_idx,
                                    "metric": metric,
                                    "value": value,
                                })

                # Build the run table once, after ALL event files were read, and
                # skip the run when that fails (the previous code appended
                # whatever `df` was left over from an earlier run).
                if not all_metrics:
                    print(f"Error with {experiment_folder_path}: no usable metrics")
                    continue

                try:
                    df = pd.DataFrame(all_metrics)
                    df = df.pivot_table(index=index_cols, columns="metric", values="value")
                    df = df.reset_index(drop=False)
                except Exception as err:
                    print(f"Error with {experiment_folder_path}: {err}")
                    continue

                data.append(df)

    data = pd.concat(data)
    data = data.reset_index(drop=True)
    data = data.rename(columns={"mean_success": "success"})

    # NORMALIZED SIZE
    #################

    # Reference run: scratch agent, first stream, first seed, first task.
    is_reference = (
        (data["agent_id"] == "hgcbc_scratch_1_mlp") &
        (data["stream_name"] == stream_names[0]) &
        (data["seed"] == seeds[0]) &
        (data["train_task_idx"] == 1) &
        (data["eval_task_idx"] == 1)
    )

    for prefix in ("high_", "low_", ""):
        # Read the reference before the column itself is rescaled.
        ref_size = data.loc[is_reference, prefix + "inference_size"].values
        data[prefix + "inference_size"] = data[prefix + "inference_size"] / ref_size
        data[prefix + "total_size"] = data[prefix + "total_size"] / ref_size

    # FILTERING
    ###########

    # keep only the (agent, stream, seed) runs covering every train task index
    required_tasks = set(range(1, n_tasks + 1))
    valid_agents = [
        run_key
        for run_key, group in data.groupby(["agent_id", "stream_name", "seed"])
        if required_tasks.issubset(set(group["train_task_idx"]))
    ]

    filtered_data = data.set_index(["agent_id", "stream_name", "seed"])
    data = filtered_data.loc[valid_agents].reset_index()

    data = data[[
        "agent_id", "stream_name", "seed",
        "train_task_idx", "eval_task_idx",
        "success",
        "inference_size", "total_size",
        "high_inference_size", "high_total_size",
        "low_inference_size", "low_total_size",
    ]]

    # remove duplicate rows
    data = data.drop_duplicates()

    return data

In [5]:
# Load the evaluation and size metrics of every run found in the experiment logs.
data = generate_data()

#### 1.D. Metrics

In [6]:
def generate_metrics(data: pd.DataFrame) -> pd.DataFrame:
    """Compute the continual-learning metrics of every run after each task.

    For each (stream, agent, seed) run and each N in ``1..n_tasks``, with
    ``perf[i, j]`` the success on eval task ``j`` after training on task ``i``:

    - ``PER``: mean of ``perf[N, k]`` for k in 1..N.
    - ``BWT``: mean of ``perf[N, k] - perf[k, k]`` for k in 1..N.
    - ``FWT``: mean of ``perf[k, k] - scratch[k, k]`` for k in 1..N, where
      ``scratch`` is the matrix of ``hgcbc_scratch_1_mlp`` for the same stream
      and seed.
    - memory columns: the size values of the first row with
      ``train_task_idx == N``.

    Args:
        data: long table with the columns produced by ``generate_data``.

    Returns:
        One row per (stream_name, method, seed, train_task_idx); ``method`` is
        the display name looked up in ``agent_id_to_method``.

    Raises:
        KeyError: if a required (train, eval) entry is missing, e.g. when the
            scratch run does not exist for a (stream, seed) pair.
    """

    group_cols = ["stream_name", "agent_id", "seed"]
    scratch_agent_id = "hgcbc_scratch_1_mlp"
    matrix_kwargs = {"index": "train_task_idx", "columns": "eval_task_idx", "values": "success"}

    # Scratch matrices only depend on (stream, seed): build each one once.
    scratch_matrices = {}

    processed_data = []

    for (stream, agent, seed), df_sub in data.groupby(group_cols):

        # Both matrices are the same for every N, so they are built outside
        # the loop over N.
        perf_matrix = df_sub.pivot(**matrix_kwargs)

        if (stream, seed) not in scratch_matrices:
            df_scratch_sub = data[
                (data["stream_name"] == stream) &
                (data["seed"] == seed) &
                (data["agent_id"] == scratch_agent_id)
            ]
            scratch_matrices[(stream, seed)] = df_scratch_sub.pivot(**matrix_kwargs)
        scratch_matrix = scratch_matrices[(stream, seed)]

        for N in range(1, n_tasks + 1):

            seen_tasks = range(1, N + 1)

            # Performance (mean for 1 to N)
            PER = sum(perf_matrix.loc[N, k] for k in seen_tasks) / N

            # BWT: change on task k between right after learning it and after task N
            BWT = sum(perf_matrix.loc[N, k] - perf_matrix.loc[k, k] for k in seen_tasks) / N

            # FWT: gain on task k over the scratch model of the same (stream, seed)
            FWT = sum(perf_matrix.loc[k, k] - scratch_matrix.loc[k, k] for k in seen_tasks) / N

            # MEM: sizes logged for the model obtained after task N
            final_model_row = df_sub[df_sub["train_task_idx"] == N].iloc[0]

            processed_data.append({
                "stream_name": stream,
                "agent_id": agent,
                "seed": seed,
                "train_task_idx": N,
                "PER": PER,
                "BWT": BWT,
                "FWT": FWT,
                "HIGH_MEM_inference": final_model_row["high_inference_size"],
                "HIGH_MEM_total": final_model_row["high_total_size"],
                "LOW_MEM_inference": final_model_row["low_inference_size"],
                "LOW_MEM_total": final_model_row["low_total_size"],
                "MEM_inference": final_model_row["inference_size"],
                "MEM_total": final_model_row["total_size"],
            })

    processed_data = pd.DataFrame(processed_data)

    processed_data["method"] = processed_data["agent_id"].apply(lambda x: agent_id_to_method[x])

    processed_data = processed_data[[
        "stream_name", "method", "seed",
        "train_task_idx",
        "PER", "BWT", "FWT",
        "HIGH_MEM_inference", "HIGH_MEM_total",
        "LOW_MEM_inference", "LOW_MEM_total",
        "MEM_inference", "MEM_total"
    ]]

    return processed_data

In [7]:
# Derive the per-run PER / BWT / FWT and memory metrics from the collected data.
metrics = generate_metrics(data)

___

### **2. Tables by Streams**

#### 2.A. Stream 1

In [8]:
# Summary table for stream 1: metrics after the last task, averaged over seeds.
metrics_1 = metrics[metrics["stream_name"] == stream_names[0]]
metrics_1 = metrics_1[metrics_1["train_task_idx"] == n_tasks]

percent_metrics = ["PER", "BWT", "FWT"]
memory_metrics = [
    "HIGH_MEM_inference", "HIGH_MEM_total",
    "LOW_MEM_inference", "LOW_MEM_total",
    "MEM_inference", "MEM_total",
]

# average over seeds
metrics_1 = metrics_1.groupby(["stream_name", "method"]).agg(
    {metric_name: ["mean", "std"] for metric_name in percent_metrics + memory_metrics}
).reset_index()
# flatten the (metric, stat) column pairs into "<metric>_<stat>" names
metrics_1.columns = ["_".join(col).strip() for col in metrics_1.columns.values]
metrics_1 = metrics_1.sort_values("PER_mean", ascending=False)

# The table is sorted by PER, so the first row of each hyper-parameter family
# is its best configuration: keep it and drop the other ones. Slicing with
# [1:] is a no-op when a family is absent or has a single row.
for family in ("EWC", "L2", "HiSPO"):
    family_idx = metrics_1[metrics_1["method_"].str.contains(family, regex=False)].index
    metrics_1 = metrics_1.drop(family_idx[1:])

# show: performance metrics as percentages, memory as ratios, all with one
# decimal (MEM_total_std was previously left unrounded)
for stat in ("mean", "std"):
    for metric_name in percent_metrics:
        metrics_1[f"{metric_name}_{stat}"] = (metrics_1[f"{metric_name}_{stat}"] * 100).round(1)
    for metric_name in memory_metrics:
        metrics_1[f"{metric_name}_{stat}"] = metrics_1[f"{metric_name}_{stat}"].round(1)

metrics_1

Unnamed: 0,stream_name_,method_,PER_mean,PER_std,BWT_mean,BWT_std,FWT_mean,FWT_std,HIGH_MEM_inference_mean,HIGH_MEM_inference_std,HIGH_MEM_total_mean,HIGH_MEM_total_std,LOW_MEM_inference_mean,LOW_MEM_inference_std,LOW_MEM_total_mean,LOW_MEM_total_std,MEM_inference_mean,MEM_inference_std,MEM_total_mean,MEM_total_std
17,amazeville_topological_1,PNN,85.0,3.1,0.0,0.0,7.8,7.1,9.9,0.0,9.9,0.0,9.9,0.0,9.9,0.0,9.9,0.0,9.9,0.0
7,amazeville_topological_1,FTN,81.3,5.7,0.0,0.0,4.1,2.8,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0
20,amazeville_topological_1,SCN,77.2,7.3,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0
8,amazeville_topological_1,"HiSPO (hε=0.05, lε=0.05, λ=0.1)",75.2,1.4,0.0,0.0,-1.9,8.2,2.0,0.0,2.0,0.0,2.3,0.6,2.3,0.6,2.2,0.3,2.2,0.290608
18,amazeville_topological_1,RPL,61.2,5.8,4.3,7.6,-20.3,3.9,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,amazeville_topological_1,EWC (λ=0.1),56.5,4.4,-25.3,1.4,4.6,3.6,1.0,0.0,9.0,0.0,1.0,0.0,9.0,0.0,1.0,0.0,9.0,0.0
6,amazeville_topological_1,FT1,56.3,6.3,-24.9,1.3,4.1,2.8,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
12,amazeville_topological_1,L2 (λ=0.01),48.0,10.6,-30.2,5.1,1.0,1.8,1.0,0.0,5.0,0.0,1.0,0.0,5.0,0.0,1.0,0.0,5.0,0.0
5,amazeville_topological_1,FRZ,46.8,12.8,0.0,0.0,-30.3,5.9,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
19,amazeville_topological_1,SC1,46.8,12.8,-30.3,5.9,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


#### 2.B. Stream 2

In [9]:
# Summary table for stream 2: metrics after the last task, averaged over seeds.
metrics_2 = metrics[metrics["stream_name"] == stream_names[1]]
metrics_2 = metrics_2[metrics_2["train_task_idx"] == n_tasks]

percent_metrics = ["PER", "BWT", "FWT"]
memory_metrics = [
    "HIGH_MEM_inference", "HIGH_MEM_total",
    "LOW_MEM_inference", "LOW_MEM_total",
    "MEM_inference", "MEM_total",
]

# average over seeds
metrics_2 = metrics_2.groupby(["stream_name", "method"]).agg(
    {metric_name: ["mean", "std"] for metric_name in percent_metrics + memory_metrics}
).reset_index()
# flatten the (metric, stat) column pairs into "<metric>_<stat>" names
metrics_2.columns = ["_".join(col).strip() for col in metrics_2.columns.values]
metrics_2 = metrics_2.sort_values("PER_mean", ascending=False)

# The table is sorted by PER, so the first row of each hyper-parameter family
# is its best configuration: keep it and drop the other ones. Slicing with
# [1:] is a no-op when a family is absent or has a single row.
for family in ("EWC", "L2", "HiSPO"):
    family_idx = metrics_2[metrics_2["method_"].str.contains(family, regex=False)].index
    metrics_2 = metrics_2.drop(family_idx[1:])

# show: performance metrics as percentages, memory as ratios, all with one
# decimal (MEM_total_std was previously left unrounded)
for stat in ("mean", "std"):
    for metric_name in percent_metrics:
        metrics_2[f"{metric_name}_{stat}"] = (metrics_2[f"{metric_name}_{stat}"] * 100).round(1)
    for metric_name in memory_metrics:
        metrics_2[f"{metric_name}_{stat}"] = metrics_2[f"{metric_name}_{stat}"].round(1)

metrics_2

Unnamed: 0,stream_name_,method_,PER_mean,PER_std,BWT_mean,BWT_std,FWT_mean,FWT_std,HIGH_MEM_inference_mean,HIGH_MEM_inference_std,HIGH_MEM_total_mean,HIGH_MEM_total_std,LOW_MEM_inference_mean,LOW_MEM_inference_std,LOW_MEM_total_mean,LOW_MEM_total_std,MEM_inference_mean,MEM_inference_std,MEM_total_mean,MEM_total_std
17,amazeville_topological_2,PNN,93.4,1.6,0.0,0.0,5.7,1.8,9.9,0.0,9.9,0.0,9.9,0.0,9.9,0.0,9.9,0.0,9.9,0.0
7,amazeville_topological_2,FTN,91.0,3.5,0.0,0.0,3.2,1.6,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0
18,amazeville_topological_2,RPL,90.2,4.5,3.1,0.9,-0.7,1.6,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,amazeville_topological_2,EWC (λ=100.0),89.6,3.6,-1.4,5.7,3.2,3.7,1.0,0.0,9.0,0.0,1.0,0.0,9.0,0.0,1.0,0.0,9.0,0.0
20,amazeville_topological_2,SCN,87.8,3.2,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0
6,amazeville_topological_2,FT1,85.2,3.3,-5.8,3.1,3.2,1.6,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
9,amazeville_topological_2,"HiSPO (hε=0.1, lε=0.05, λ=0.05)",83.5,3.2,0.0,0.0,-4.3,6.4,3.3,0.6,3.3,0.6,4.0,0.0,4.0,0.0,3.7,0.3,3.7,0.286743
13,amazeville_topological_2,L2 (λ=0.1),82.6,3.6,-8.5,2.9,3.3,4.3,1.0,0.0,5.0,0.0,1.0,0.0,5.0,0.0,1.0,0.0,5.0,0.0
19,amazeville_topological_2,SC1,74.4,2.5,-13.3,2.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
5,amazeville_topological_2,FRZ,70.1,3.1,0.0,0.0,-17.7,4.3,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


#### 2.C. Both

In [10]:
# Summary table over both streams: metrics after the last task, averaged over
# streams and seeds.
metrics_all = metrics
metrics_all = metrics_all[metrics_all["train_task_idx"] == n_tasks]

percent_metrics = ["PER", "BWT", "FWT"]
memory_metrics = [
    "HIGH_MEM_inference", "HIGH_MEM_total",
    "LOW_MEM_inference", "LOW_MEM_total",
    "MEM_inference", "MEM_total",
]

# average over seeds (and streams)
metrics_all = metrics_all.groupby(["method"]).agg(
    {metric_name: ["mean", "std"] for metric_name in percent_metrics + memory_metrics}
).reset_index()
# flatten the (metric, stat) column pairs into "<metric>_<stat>" names
metrics_all.columns = ["_".join(col).strip() for col in metrics_all.columns.values]
metrics_all = metrics_all.sort_values("PER_mean", ascending=False)

# The table is sorted by PER, so the first row of each hyper-parameter family
# is its best configuration: keep it and drop the other ones. Slicing with
# [1:] is a no-op when a family is absent or has a single row.
for family in ("EWC", "L2", "HiSPO"):
    family_idx = metrics_all[metrics_all["method_"].str.contains(family, regex=False)].index
    metrics_all = metrics_all.drop(family_idx[1:])

# show: performance metrics as percentages, memory as ratios, all with one
# decimal (MEM_total_std was previously left unrounded)
for stat in ("mean", "std"):
    for metric_name in percent_metrics:
        metrics_all[f"{metric_name}_{stat}"] = (metrics_all[f"{metric_name}_{stat}"] * 100).round(1)
    for metric_name in memory_metrics:
        metrics_all[f"{metric_name}_{stat}"] = metrics_all[f"{metric_name}_{stat}"].round(1)

metrics_all

Unnamed: 0,method_,PER_mean,PER_std,BWT_mean,BWT_std,FWT_mean,FWT_std,HIGH_MEM_inference_mean,HIGH_MEM_inference_std,HIGH_MEM_total_mean,HIGH_MEM_total_std,LOW_MEM_inference_mean,LOW_MEM_inference_std,LOW_MEM_total_mean,LOW_MEM_total_std,MEM_inference_mean,MEM_inference_std,MEM_total_mean,MEM_total_std
17,PNN,89.2,5.1,0.0,0.0,6.7,4.8,9.9,0.0,9.9,0.0,9.9,0.0,9.9,0.0,9.9,0.0,9.9,0.0
7,FTN,86.1,6.8,0.0,0.0,3.7,2.1,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0
20,SCN,82.5,7.7,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0
9,"HiSPO (hε=0.1, lε=0.05, λ=0.05)",79.3,5.1,0.0,0.0,-3.1,6.2,2.7,0.8,2.7,0.8,3.2,1.0,3.2,1.0,2.9,0.9,2.9,0.861788
18,RPL,75.7,16.5,3.7,4.9,-10.5,11.1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,EWC (λ=0.1),71.0,16.3,-15.2,11.4,3.8,3.0,1.0,0.0,9.0,0.0,1.0,0.0,9.0,0.0,1.0,0.0,9.0,0.0
6,FT1,70.8,16.4,-15.4,10.7,3.7,2.1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
13,L2 (λ=0.1),65.0,20.4,-19.0,11.9,1.5,3.7,1.0,0.0,5.0,0.0,1.0,0.0,5.0,0.0,1.0,0.0,5.0,0.0
19,SC1,60.6,17.2,-21.8,10.1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
5,FRZ,58.5,15.2,0.0,0.0,-24.0,8.3,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [11]:
# Export a copy of the combined table in which each method is reduced to the
# first word of its label (the text before the first space).
metrics_all_simplified = metrics_all.copy(deep=True).rename(columns={"method_": "method"})
metrics_all_simplified["method"] = [
    label.split(" ")[0] for label in metrics_all_simplified["method"]
]

# write the table next to the notebook, without the index column
metrics_all_simplified.to_csv("./analysis.csv", index=False)

# display
metrics_all_simplified

Unnamed: 0,method,PER_mean,PER_std,BWT_mean,BWT_std,FWT_mean,FWT_std,HIGH_MEM_inference_mean,HIGH_MEM_inference_std,HIGH_MEM_total_mean,HIGH_MEM_total_std,LOW_MEM_inference_mean,LOW_MEM_inference_std,LOW_MEM_total_mean,LOW_MEM_total_std,MEM_inference_mean,MEM_inference_std,MEM_total_mean,MEM_total_std
17,PNN,89.2,5.1,0.0,0.0,6.7,4.8,9.9,0.0,9.9,0.0,9.9,0.0,9.9,0.0,9.9,0.0,9.9,0.0
7,FTN,86.1,6.8,0.0,0.0,3.7,2.1,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0
20,SCN,82.5,7.7,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0
9,HiSPO,79.3,5.1,0.0,0.0,-3.1,6.2,2.7,0.8,2.7,0.8,3.2,1.0,3.2,1.0,2.9,0.9,2.9,0.861788
18,RPL,75.7,16.5,3.7,4.9,-10.5,11.1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,EWC,71.0,16.3,-15.2,11.4,3.8,3.0,1.0,0.0,9.0,0.0,1.0,0.0,9.0,0.0,1.0,0.0,9.0,0.0
6,FT1,70.8,16.4,-15.4,10.7,3.7,2.1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
13,L2,65.0,20.4,-19.0,11.9,1.5,3.7,1.0,0.0,5.0,0.0,1.0,0.0,5.0,0.0,1.0,0.0,5.0,0.0
19,SC1,60.6,17.2,-21.8,10.1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
5,FRZ,58.5,15.2,0.0,0.0,-24.0,8.3,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


___
___