In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import sbibm
import sbibm.visualisation

import altair as alt
from altair_saver import save
alt.renderers.enable('jupyterlab')

RendererRegistry.enable('jupyterlab')

In [3]:
# Superscript display labels for the three simulation budgets.
num_obs_map = {
    1_000: "10³",
    10_000: "10⁴",
    100_000: "10⁵",
}
# Translates the superscript labels into the "1eN" spelling.
num_sims_remap = {
    "10³": "1e3",
    "10⁴": "1e4",
    "10⁵": "1e5",
}
# Tasks considered when comparing the sbibm rows against the swyft rows.
relevant_tasks = [
    "gaussian_linear_uniform",
    "slcp_distractors",
    "slcp",
    "two_moons",
    "gaussian_mixture",
]

# Load the marginal C2ST summary from marginalized sbibm

In [4]:
# Read the marginal C2ST summary (first CSV column is the index) and swap the
# numeric simulation budgets for their display labels from num_obs_map.
sbibm_df = pd.read_csv(
    "../marginalize_sbibm/marginal-c2st-summary.csv",
    index_col=0,
).assign(num_simulations=lambda frame: frame["num_simulations"].map(num_obs_map))

In [5]:
# Show the column labels of the loaded summary frame.
sbibm_df.columns

Index(['task', 'num_simulations', 'num_observation', 'algorithm', 'seed', '0',
       '1', '2', '3', '4', '5', '6', '7', '8', '9', '[0, 1]', '[0, 2]',
       '[0, 3]', '[0, 4]', '[0, 5]', '[0, 6]', '[0, 7]', '[0, 8]', '[0, 9]',
       '[1, 2]', '[1, 3]', '[1, 4]', '[1, 5]', '[1, 6]', '[1, 7]', '[1, 8]',
       '[1, 9]', '[2, 3]', '[2, 4]', '[2, 5]', '[2, 6]', '[2, 7]', '[2, 8]',
       '[2, 9]', '[3, 4]', '[3, 5]', '[3, 6]', '[3, 7]', '[3, 8]', '[3, 9]',
       '[4, 5]', '[4, 6]', '[4, 7]', '[4, 8]', '[4, 9]', '[5, 6]', '[5, 7]',
       '[5, 8]', '[5, 9]', '[6, 7]', '[6, 8]', '[6, 9]', '[7, 8]', '[7, 9]',
       '[8, 9]', '1-dim mean', '1-dim var', '2-dim mean', '2-dim var', 'path',
       'folder'],
      dtype='object')

In [6]:
# Average the marginal C2ST scores within each (task, budget, algorithm) group;
# num_observation is deliberately not a grouping key, so it is averaged over.
sbibm_result = sbibm_df.groupby(["task", "num_simulations", "algorithm"]).agg(
    **{
        "c2st 1-dim avg": ("1-dim mean", "mean"),
        "c2st 2-dim avg": ("2-dim mean", "mean"),
    }
)
# Display only the rows for the two_moons task.
sbibm_result.loc["two_moons"]

Unnamed: 0_level_0,Unnamed: 1_level_0,c2st 1-dim avg,c2st 2-dim avg
num_simulations,algorithm,Unnamed: 2_level_1,Unnamed: 3_level_1
10³,NLE,0.620182,0.76472
10³,NPE,0.59406,0.71345
10³,NRE,0.658005,0.82325
10³,REJ-ABC,0.750385,0.95615
10³,SMC-ABC,0.69706,0.908875
10³,SNLE,0.559593,0.653795
10³,SNPE,0.562393,0.646305
10³,SNRE,0.591803,0.658745
10⁴,NLE,0.581953,0.701185
10⁴,NPE,0.539565,0.604915


# Load the tmnre (swyft) results

In [7]:
# Relative path of the CSV that is read into `swyft` further down in this cell.
file = "reports/swyft_uniform_2d_results_budget.csv"

def put_in_bins(x):
    """Snap a simulation count to its nominal budget.

    Parameters
    ----------
    x : int or float
        Number of simulations.

    Returns
    -------
    int
        ``1_000`` when ``0 <= x <= 1500``, ``10_000`` when ``1500 < x <= 15000``
        and ``100_000`` for every other value. Negative and NaN inputs fail
        both range checks and therefore also land in ``100_000``.
    """
    if 0 <= x <= 1500:
        return 1_000
    # Open lower bound: with the previous ``1501 <= x`` check a non-integer
    # count such as 1500.5 fell between the two ranges and was put in the
    # largest bin.
    if 1500 < x <= 15000:
        return 10_000
    return 100_000



# Load the swyft results; the first CSV column becomes the index.
swyft = pd.read_csv(file, index_col=0)

# special for our stochastic one: the number of constraining simulations is
# snapped to a budget by put_in_bins and stored as num_simulations.
swyft = swyft.assign(
    num_simulations=swyft["num_constraining_simulations"].map(put_in_bins)
)
swyft

Unnamed: 0,task,num_simulations,num_constraining_simulations,num_observation,algorithm,seed,dimension,rounds,"C2ST (0,)","C2ST (1,)",...,"KLD_FIX (7, 9)","KLD_FIX (4, 6)","KLD_FIX (6, 8)","KLD_FIX (5, 7)","KLD_FIX (3, 8)","KLD_FIX (0, 6)","KLD_FIX (1, 8)","KLD_FIX (6,)","KLD_FIX (1, 7)","KLD_FIX (0, 9)"
0,slcp,100000,100647,10,tmnre,3254095118,5,1,0.63675,0.65310,...,,,,,,,,,,
1,slcp,100000,104968,3,tmnre,221300905,5,1,0.52980,0.54690,...,,,,,,,,,,
2,slcp,100000,99678,7,tmnre,622829621,5,1,0.53315,0.61730,...,,,,,,,,,,
3,slcp,10000,10546,10,tmnre,1259500845,5,1,0.58595,0.62285,...,,,,,,,,,,
4,slcp,100000,99616,8,tmnre,2302766548,5,1,0.51390,0.84670,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,slcp_distractors,100000,100091,1,tmnre,236924093,5,1,0.52345,0.62905,...,,,,,,,,,,
146,slcp_distractors,1000,1035,4,tmnre,2895523053,5,1,0.63915,0.96210,...,,,,,,,,,,
147,slcp_distractors,1000,957,7,tmnre,2184000798,5,1,0.65365,0.88070,...,,,,,,,,,,
148,slcp_distractors,100000,100713,9,tmnre,3896778178,5,1,0.53420,0.73435,...,,,,,,,,,,


In [8]:
# Total of the "RT" column over all loaded swyft rows
# (presumably a per-run runtime — TODO confirm meaning and unit).
swyft["RT"].sum()

220838.20588999998

In [9]:
# too many sims?
# Flag rows whose constraining simulations exceed the budget stored in num_simulations.
had_too_many_sims = swyft["num_constraining_simulations"].gt(swyft["num_simulations"])
print("Number of sims which were allowed too many to constrain", had_too_many_sims.sum())

# List the flagged rows, ordered by task and then observation number.
printable_columns = ["task", "num_constraining_simulations", "num_observation"]
swyft.loc[had_too_many_sims, printable_columns].sort_values(["task", "num_observation"])

Number of sims which were allowed too many to constrain 105


Unnamed: 0,task,num_constraining_simulations,num_observation
71,gaussian_linear_uniform,103129,1
88,gaussian_linear_uniform,10035,1
60,gaussian_linear_uniform,1027,2
67,gaussian_linear_uniform,102039,2
81,gaussian_linear_uniform,10272,2
...,...,...,...
100,two_moons,1027,9
102,two_moons,10393,9
118,two_moons,100439,9
89,two_moons,104919,10


In [10]:
# rename sims to match: replace the numeric budgets with their num_obs_map labels.
# Values that have no entry in num_obs_map (for example labels written by an
# earlier run of this cell) are kept unchanged, so re-running the cell no longer
# turns the whole column into NaN.
swyft['num_simulations'] = swyft['num_simulations'].map(
    lambda budget: num_obs_map.get(budget, budget)
)

In [11]:
# which ones had at least one zoomin
# (tasks that have at least one row whose `rounds` value is not 1)
swyft.loc[swyft["rounds"].ne(1), "task"].unique()

array(['slcp', 'gaussian_mixture', 'gaussian_linear_uniform', 'two_moons'],
      dtype=object)

In [12]:
# are there any difference between sbibm marginalized and this?
# Left-join the relevant sbibm rows onto swyft and list every
# (task, budget, observation) combination that has no swyft counterpart.
columns = ["task", "num_simulations", "num_observation"]
sbibm_relevant = sbibm_df.loc[sbibm_df["task"].isin(relevant_tasks)]
merged = sbibm_relevant.merge(swyft, on=columns, how="left", indicator=True)
merged.loc[merged["_merge"] == "left_only", columns].drop_duplicates().sort_values(columns)

Unnamed: 0,task,num_simulations,num_observation


In [13]:
# Named aggregations: output column -> (source column, reducer).
metrics_to_combine = {"c2st 1-dim avg": ("C2ST 1-dim mean", "mean")}
# The 2-dim average is only added when the source column is present.
if "C2ST 2-dim mean" in swyft.columns:
    metrics_to_combine["c2st 2-dim avg"] = ("C2ST 2-dim mean", "mean")

# One row per (task, budget, algorithm); num_observation is not a grouping key.
swyft_result = swyft.groupby(["task", "num_simulations", "algorithm"]).agg(
    **metrics_to_combine
)
swyft_result

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,c2st 1-dim avg,c2st 2-dim avg
task,num_simulations,algorithm,Unnamed: 3_level_1,Unnamed: 4_level_1
gaussian_linear_uniform,10³,tmnre,0.587281,0.635547
gaussian_linear_uniform,10⁴,tmnre,0.530641,0.550524
gaussian_linear_uniform,10⁵,tmnre,0.509203,0.525162
gaussian_mixture,10³,tmnre,0.678205,0.775395
gaussian_mixture,10⁴,tmnre,0.54151,0.679445
gaussian_mixture,10⁵,tmnre,0.517097,0.54989
slcp,10³,tmnre,0.730239,0.861641
slcp,10⁴,tmnre,0.6397,0.762426
slcp,10⁵,tmnre,0.589901,0.715465
slcp_distractors,10³,tmnre,0.784208,0.890414


# Combine

In [14]:
# Stack the swyft and sbibm aggregates into one table ordered by its index.
combined = pd.concat((swyft_result, sbibm_result)).sort_index()
# Show the slcp rows ordered by their 1-dim average.
combined.loc["slcp"].sort_values(by="c2st 1-dim avg")

Unnamed: 0_level_0,Unnamed: 1_level_0,c2st 1-dim avg,c2st 2-dim avg
num_simulations,algorithm,Unnamed: 2_level_1,Unnamed: 3_level_1
10⁵,SNLE,0.525365,0.543238
10⁵,SNPE,0.546987,0.58283
10⁵,NLE,0.554348,0.599687
10⁵,SNRE,0.560103,0.612055
10⁴,SNLE,0.564206,0.610173
10⁵,tmnre,0.589901,0.715465
10⁵,NPE,0.592778,0.66835
10⁴,NLE,0.598044,0.654428
10⁴,SNPE,0.607068,0.689357
10⁴,tmnre,0.6397,0.762426


# Now let's see what sbibm plotting can do

In [15]:
def split_metric_by_task_dimension(df, metric: str = "C2ST"):
    """Turn every row of ``df`` into a 1-dim record and a 2-dim record.

    Each input row yields two output rows, in that order: one whose ``task``
    has ``"_1d"`` appended and carries the ``"<metric> 1-dim mean"`` value,
    and one with ``"_2d"`` and the ``"<metric> 2-dim mean"`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``task``, ``num_simulations``, ``num_observation``,
        ``algorithm``, ``"<metric> 1-dim mean"`` and ``"<metric> 2-dim mean"``.
    metric : str, default "C2ST"
        Prefix of the two source columns and name of the value column in the
        result. (The value column used to be hard-coded as ``"C2ST"``, which is
        unchanged for the default argument.)

    Returns
    -------
    pandas.DataFrame
        Columns ``task``, ``num_simulations``, ``num_observation``,
        ``algorithm`` and ``metric``; ``2 * len(df)`` rows. Both rows derived
        from an input row keep that row's index label. Column dtypes follow the
        input frame. An empty ``df`` gives an empty frame with these columns.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    shared_columns = ["task", "num_simulations", "num_observation", "algorithm"]
    shared = df[shared_columns]

    def _marginal(suffix, dim):
        # Values are passed positionally (to_numpy) so that duplicate index
        # labels in ``df`` cannot trip pandas' index alignment.
        return shared.assign(
            **{
                "task": (shared["task"] + suffix).to_numpy(),
                metric: df[f"{metric} {dim}-dim mean"].to_numpy(),
            }
        )

    stacked = pd.concat([_marginal("_1d", 1), _marginal("_2d", 2)])

    # Interleave positionally so each 1-dim row is directly followed by its
    # 2-dim sibling, matching the row order of the row-by-row construction.
    n_rows = len(df)
    order = [pos for pair in zip(range(n_rows), range(n_rows, 2 * n_rows)) for pos in pair]
    return stacked.iloc[order]

In [16]:
# Prepare the sbibm frame: rename its two marginal-mean columns to the
# "C2ST ..." names, then list the algorithms it contains.
target_1d_metric_name = "C2ST 1-dim mean"
target_2d_metric_name = "C2ST 2-dim mean"
sbibm_df = sbibm_df.rename(
    columns={
        "1-dim mean": target_1d_metric_name,
        "2-dim mean": target_2d_metric_name,
    }
)
# end="\n\n" emits the same trailing blank line as a separate bare print().
print(sbibm_df["algorithm"].unique().tolist(), end="\n\n")

['SMC-ABC', 'NRE', 'REJ-ABC', 'NLE', 'SNPE', 'SNRE', 'SNLE', 'NPE']



In [17]:
# Relabel two algorithms; the new labels start with a space
# (presumably to steer their sort position in the figure — TODO confirm).
sbibm_df["algorithm"] = sbibm_df["algorithm"].replace({"REJ-ABC": " REJ-ABC"})
swyft["algorithm"] = swyft["algorithm"].replace({"tmnre": " Our Method"})

# Hex colour per algorithm label, handed to the plotting call as colors_dict.
color_dict = dict(
    [
        ("REJ-ABC", "#4A8CFB"),
        ("NLE", "#60D098"),
        ("NPE", "#CC66CC"),
        ("NRE", "#FFCA58"),
        ("NRE_B", "#FFCA58"),
        ("SMC-ABC", "#215FC6"),
        ("SNLE", "#339966"),
        ("SNPE", "#990099"),
        ("SNRE", "#FFA60A"),
        ("SNRE_B", "#FFA60A"),
        ("CMNRE", "#000000"),
    ]
)


def task_map(name):
    """Return the display name that sbibm reports for the task called ``name``."""
    return sbibm.get_task(name).name_display

In [18]:
# Stack both result sets, then draw and save one figure per task found in swyft.
analyze = pd.concat([sbibm_df, swyft])
for task in swyft["task"].unique():
    # Keep this task's rows and expand them into separate 1-dim / 2-dim records.
    analyze_subset = split_metric_by_task_dimension(analyze[analyze["task"] == task])

    # The figure is requested for the "C2ST-ddm" metric, so mirror the values under that name.
    analyze_subset["C2ST-ddm"] = analyze_subset["C2ST"]
    # Final display labels for algorithms (one lookup pass) and simulation budgets.
    analyze_subset["algorithm"] = analyze_subset["algorithm"].map(
        lambda name: {"NRE": "NRE_B", "SNRE": "SNRE_B", " Our Method": "  TMNRE"}.get(name, name)
    )
    analyze_subset["num_simulations"] = analyze_subset["num_simulations"].map(num_sims_remap)

    # Only two_moons gets labels; only gaussian_mixture gets x-axis labels and the column title.
    do_column_label = task == "two_moons"
    do_x_label = task == "gaussian_mixture"
    column_title = "Number of Simulations" if do_x_label else ""

    # Shape and detail are both keyed on the (suffixed) task column.
    plot_keywords = {
        "shape": alt.Shape('task:N', legend=None),
        "detail": alt.Detail('task:N'),
    }
    fig = sbibm.visualisation.fig_metric(
        analyze_subset,
        title=f"● 1-dim / ■ 2-dim Marginal {task_map(task)}",
        metric="C2ST-ddm",
        config="custom",
        keywords=plot_keywords,
        colors_dict=color_dict,
        labels=do_column_label,
        x_axis_kwargs={"labels": do_x_label},
        column_title=column_title,
    )
    save(fig, f"figures/{task}.png")