In [3]:
from reverb.training.utils import DEFAULT_TRAINING_KWARGS, DEFAULT_MODEL_KWARGS, DEFAULT_DATA_KWARGS
import segmentation_models_pytorch as smp

def _experiment(run_name, training_kwargs, model_kwargs=DEFAULT_MODEL_KWARGS):
    """Assemble one ablation entry.

    Args:
        run_name: Run identifier; the training loop appends ``_<repeat>`` to it.
        training_kwargs: Training options set explicitly for this ablation.
        model_kwargs: Model options; defaults to the shared ``DEFAULT_MODEL_KWARGS``
            object (the very same object, not a copy).

    Returns:
        Dict with the keys ``run_name``, ``training_kwargs``, ``model_kwargs`` and
        ``data_kwargs``. Every supervised ablation uses ``DEFAULT_DATA_KWARGS``.
    """
    return {
        "run_name": run_name,
        "training_kwargs": training_kwargs,
        "model_kwargs": model_kwargs,
        "data_kwargs": DEFAULT_DATA_KWARGS,
    }


# Supervised ablation configurations, keyed by experiment name.
# NOTE(review): the training/model dicts below list only a few keys; presumably
# train() fills the rest from its defaults -- confirm in reverb.training.utils.
supervised_experiments = {
    "baseline": _experiment(
        "ablations/supervised/baseline",
        {"max_epochs": 20},
    ),
    "bs64": _experiment(
        "ablations/supervised/bs64",
        {"max_epochs": 25, "batch_size": 64},
    ),
    "class_weight_0.1": _experiment(
        "ablations/supervised/class_weight_0.1",
        {"max_epochs": 20, "class_weights": [0.1, 1.0]},
    ),
    "class_weight_1.0": _experiment(
        "ablations/supervised/class_weight_1.0",
        {"max_epochs": 20, "class_weights": [1.0, 1.0]},
    ),
    "unetpp": _experiment(
        "ablations/supervised/unetpp",
        {"max_epochs": 25},
        {
            "model_type": smp.UnetPlusPlus,
            "encoder_name": "resnet18",
            "encoder_weights": "imagenet",
        },
    ),
    "no_imagenet": _experiment(
        "ablations/supervised/no_imagenet",
        {"max_epochs": 30},
        {"encoder_name": "resnet18", "encoder_weights": None},
    ),
    "resnet50": _experiment(
        "ablations/supervised/resnet50",
        {"max_epochs": 30},
        {"encoder_name": "resnet50", "encoder_weights": "imagenet"},
    ),
    "resnet101": _experiment(
        "ablations/supervised/resnet101",
        {"max_epochs": 30},
        {"encoder_name": "resnet101", "encoder_weights": "imagenet"},
    ),
}

In [2]:
from reverb.training.utils import train, get_eval_dataloaders, compute_results_over_eval_sets, save_evaluation_results
# Build the evaluation dataloaders once; the training loop below passes this same
# object to compute_results_over_eval_sets for every run.
eval_dataloaders = get_eval_dataloaders()


loading annotations into memory...
Done (t=0.18s)
creating index...
index created!
loading annotations into memory...
Done (t=0.04s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!


In [None]:
# Train and evaluate three repeats of every configured ablation. Repeats share the
# experiment's configuration and differ only in the `_<i>` suffix of the run name.
for experiment_config in supervised_experiments.values():
    model_kwargs = experiment_config['model_kwargs']

    for i in range(3):
        run_name = f"{experiment_config['run_name']}_{i}"

        # Train the model
        train(
            run_name=run_name,
            mode="supervised",
            model_kwargs=model_kwargs,
            data_kwargs=experiment_config['data_kwargs'],
            training_kwargs=experiment_config['training_kwargs'],
        )

        # Evaluate the freshly trained run on every eval set and store the metrics
        save_evaluation_results(
            run_name,
            compute_results_over_eval_sets(run_name, eval_dataloaders, model_kwargs=model_kwargs),
        )


NameError: name 'supervised_experiments' is not defined

In [2]:
import json
import os
import re

import pandas as pd

experiment_names = supervised_experiments.keys()

# Root directory holding one folder per repeat, named '<experiment>_<repeat index>'
# (e.g. 'baseline_0/', 'baseline_1/'), matching the run names used by the training loop.
experiments_root = './checkpoints/ablations/supervised'

# Only these metrics are copied from eval_results.json into the tables.
reported_metrics = {'miou', 'precision', 'recall'}

flattened_data = []

for exp_name in experiment_names:
    # Match '<experiment>_<digits>' exactly. A plain prefix test would also pick up
    # the repeats of any experiment whose name merely starts with this one
    # (e.g. 'foo' would swallow 'foo_bar_0').
    repeat_folder = re.compile(re.escape(exp_name) + r'_\d+')

    # Sorted so the row order of the per-repeat CSV does not depend on os.listdir.
    matching_folders = sorted(
        d for d in os.listdir(experiments_root)
        if repeat_folder.fullmatch(d) and os.path.isdir(os.path.join(experiments_root, d))
    )

    for folder in matching_folders:
        results_path = os.path.join(experiments_root, folder, 'eval_results.json')
        if not os.path.isfile(results_path):
            # Repeat has not been evaluated (yet); skip it rather than fail.
            continue

        with open(results_path, 'r') as f:
            datasets = json.load(f)

        # One long-format row per (dataset, metric) of this repeat.
        flattened_data.extend(
            {
                'Experiment': exp_name,  # Group under common experiment name
                'Repeat': folder,
                'Dataset': dataset,
                'Metric': metric,
                'Value': value
            }
            for dataset, metrics in datasets.items()
            for metric, value in metrics.items()
            if metric in reported_metrics
        )

if not flattened_data:
    # Without this guard the groupby below fails with an opaque KeyError.
    raise FileNotFoundError(
        f"No eval_results.json with reported metrics found under {experiments_root!r}"
    )

# Convert to DataFrame
df = pd.DataFrame(flattened_data)

# Mean and standard error of the mean over repeats, per experiment/dataset/metric
summary_df = (
    df.groupby(['Experiment', 'Dataset', 'Metric'])['Value']
    .agg(Mean='mean', Std_Error='sem')
    .reset_index()
)

# Save outputs
df.to_csv('individual_repeat_results.csv', index=False)
summary_df.to_csv('supervised_experiment_summary.csv', index=False)

print("Saved individual repeat results and summary statistics.")


Saved individual repeat results and summary statistics.


In [3]:
# Keep only the mIoU rows of the summary
miou_df = summary_df.loc[summary_df['Metric'].eq('miou')]

# The Metric column is constant after filtering, so leave it out of the tables
miou_without_metric = miou_df.drop(columns=['Metric'])

# Show one table per dataset, in order of first appearance
for dataset in miou_df['Dataset'].unique():
    print(f"\n--- Dataset: {dataset} ---")
    display(miou_without_metric.loc[miou_df['Dataset'].eq(dataset)])



--- Dataset: rr_eval ---


Unnamed: 0,Experiment,Dataset,Mean,Std_Error
0,baseline,rr_eval,0.515835,0.00836
9,bs64,rr_eval,0.372996,0.059421
18,class_weight_0.1,rr_eval,0.363139,0.01398
27,class_weight_1.0,rr_eval,0.473995,0.020258
36,no_imagenet,rr_eval,0.484657,0.024188
45,resnet50,rr_eval,0.50237,0.002264
54,unetpp,rr_eval,0.469051,0.009271



--- Dataset: up34_eval ---


Unnamed: 0,Experiment,Dataset,Mean,Std_Error
3,baseline,up34_eval,0.493216,0.016052
12,bs64,up34_eval,0.403132,0.023373
21,class_weight_0.1,up34_eval,0.269367,0.008173
30,class_weight_1.0,up34_eval,0.515146,0.014166
39,no_imagenet,up34_eval,0.521609,0.008068
48,resnet50,up34_eval,0.530767,0.00563
57,unetpp,up34_eval,0.446338,0.023234



--- Dataset: valid ---


Unnamed: 0,Experiment,Dataset,Mean,Std_Error
6,baseline,valid,0.41618,0.002065
15,bs64,valid,0.345951,0.014573
24,class_weight_0.1,valid,0.317233,0.002798
33,class_weight_1.0,valid,0.393538,0.009941
42,no_imagenet,valid,0.420683,0.005143
51,resnet50,valid,0.420352,0.00539
60,unetpp,valid,0.405168,0.007725


In [5]:
import pandas as pd
# Approaches whose '<approach>_experiment_summary.csv' files are combined below.
# Only the supervised file is written by this notebook; the other three are
# presumably produced by sibling notebooks -- TODO confirm.
approaches = ['supervised', 'semisupervised', 'synthetic', 'multiclass']

# Load each summary and tag its rows with the approach it came from. All metrics are
# kept here; metric selection happens later, when the LaTeX table is generated.
# A dedicated name is used so the `df` built by the aggregation cell is not overwritten.
approach_summaries = [
    pd.read_csv(f'{approach}_experiment_summary.csv').assign(Approach=approach)
    for approach in approaches
]

# Single long-format frame covering every approach
all_pds = pd.concat(approach_summaries)

In [15]:
def smart_round(val, primary=2, fallback=3):
    """Round element-wise, using a finer precision where the coarse one gives zero.

    Args:
        val: pandas object providing ``round`` and ``mask`` (e.g. a Series).
        primary: Number of decimals used by default.
        fallback: Number of decimals used for entries that round to 0 at
            ``primary`` decimals but not at ``fallback`` decimals.

    Returns:
        Object of the same shape as ``val`` holding the rounded values.
    """
    coarse = val.round(primary)
    fine = val.round(fallback)
    # Entries wiped out by the coarse rounding that the fine rounding still resolves
    needs_fallback = coarse.eq(0) & fine.ne(0)
    return coarse.mask(needs_fallback, fine)


def _format_fixed(values, primary=2, fallback=3):
    """Round with ``smart_round`` and render each value as a fixed-point string.

    Converting the rounded floats with ``astype(str)`` drops trailing zeros
    (0.50 becomes "0.5"), which mixes precisions inside one table. Here every value
    is printed with ``primary`` decimals, or ``fallback`` decimals for the entries
    where ``smart_round`` switched to the finer precision.
    """
    import pandas as pd

    rounded = smart_round(values, primary, fallback)
    # smart_round only uses the finer precision where the coarse rounding
    # collapsed a value to 0 while the finer rounding did not.
    used_fallback = (values.round(primary) == 0) & (rounded != 0)

    texts = []
    for number, fine in zip(rounded, used_fallback):
        digits = fallback if fine else primary
        texts.append(f"{number:.{digits}f}")
    return pd.Series(texts, index=values.index, dtype=object)


def generate_combined_latex_table(df, dataset_rename, metric_order=None, rename_map=None):
    """Build a LaTeX ``tabular`` of mean +/- standard error per experiment.

    Args:
        df: Long-format frame with the columns ``Approach``, ``Experiment``,
            ``Dataset``, ``Metric``, ``Mean`` and ``Std_Error``.
        dataset_rename: Mapping from dataset key to the header label shown in the
            table. Its key order defines the order of the dataset column groups.
        metric_order: Metrics to show, in column order within each dataset group.
            Defaults to ``["miou", "recall"]``.
        rename_map: Optional mapping ``(approach, experiment) -> row label``.
            Experiments without an entry are labelled with their raw name.

    Returns:
        The table as a string. Cells are written as ``\\SI[mode=text]{mean \\pm sem}{}``
        and rows are grouped under one bold heading per approach.
    """
    import pandas as pd

    # None sentinels instead of mutable default arguments
    metric_order = ["miou", "recall"] if metric_order is None else list(metric_order)
    rename_map = {} if rename_map is None else rename_map

    # Filter relevant metrics
    df = df[df["Metric"].isin(metric_order)]

    # Define column ordering
    dataset_order = list(dataset_rename.keys())
    col_order = [(ds, m) for ds in dataset_order for m in metric_order]

    # Pivot full table: rows are (Approach, Experiment), columns are (stat, Dataset, Metric)
    pivot = df.pivot_table(
        index=["Approach", "Experiment"],
        columns=["Dataset", "Metric"],
        values=["Mean", "Std_Error"]
    )

    # Format values as \SI, one output column per (dataset, metric)
    formatted = pd.DataFrame(index=pivot.index)
    for (stat_type, dataset, metric) in pivot.columns:
        if stat_type != "Mean":
            # Each (dataset, metric) pair is handled once, via its "Mean" column
            continue
        colname = (dataset, metric)
        formatted[colname] = (
            "\\SI[mode=text]{"
            + _format_fixed(pivot["Mean"][colname])
            + " \\pm "
            + _format_fixed(pivot["Std_Error"][colname])
            + "}{}"
        )

    # Ensure column order
    formatted = formatted[[col for col in col_order if col in formatted.columns]]

    # Build headers
    header1 = ["Model Type"]
    header2 = ["Experiment"]
    for ds in dataset_order:
        label = dataset_rename[ds]
        header1 += [f"\\multicolumn{{{len(metric_order)}}}{{c}}{{{label}}}"]
        header2 += metric_order

    # Build rows with grouped approaches
    rows = []
    for approach in formatted.index.get_level_values(0).unique():
        rows.append("\\midrule")
        rows.append(f"\\multicolumn{{{1 + len(col_order)}}}{{l}}{{\\textbf{{{approach}}}}} \\\\")
        for experiment in formatted.loc[approach].index:
            values = formatted.loc[(approach, experiment)]
            row_label = rename_map.get((approach, experiment), experiment)
            # Datasets/metrics missing from the data leave an empty cell
            row = [row_label] + [values.get(col, "") for col in col_order]

            rows.append(" & ".join(row) + " \\\\")

    # Construct LaTeX table
    ncols = 1 + len(col_order)
    col_spec = "l" + "c" * (ncols - 1)
    latex = "\n".join([
        "\\begin{tabular}{" + col_spec + "}",
        "\\toprule",
        " & ".join(header1) + " \\\\",
        " & ".join(header2) + " \\\\",
        "\n".join(rows),
        "\\bottomrule",
        "\\end{tabular}"
    ])

    return latex


In [14]:
# Every (approach, experiment) combination present in the combined summaries, sorted
unique_pairs = (
    all_pds[["Approach", "Experiment"]]
    .drop_duplicates()
    .sort_values(by=["Approach", "Experiment"])
)

# Print a copy-pasteable `rename_map` skeleton with an empty label per combination
template_lines = ["rename_map = {"]
template_lines.extend(
    f"    ({approach!r}, {experiment!r}): '',"
    for approach, experiment in unique_pairs.itertuples(index=False, name=None)
)
template_lines.append("}")
print("\n".join(template_lines))

rename_map = {
    ('multiclass', 'baseline'): '',
    ('semisupervised', 'alpha_0.9'): '',
    ('semisupervised', 'alpha_0.98'): '',
    ('semisupervised', 'alpha_0.999'): '',
    ('semisupervised', 'baseline'): '',
    ('semisupervised', 'lambda_0.1'): '',
    ('semisupervised', 'lambda_2.0'): '',
    ('semisupervised', 'no_ramp_up'): '',
    ('supervised', 'baseline'): '',
    ('supervised', 'bs64'): '',
    ('supervised', 'class_weight_0.1'): '',
    ('supervised', 'class_weight_1.0'): '',
    ('supervised', 'no_imagenet'): '',
    ('supervised', 'resnet50'): '',
    ('supervised', 'unetpp'): '',
    ('synthetic', 'baseline'): '',
    ('synthetic', 'long_pretrain'): '',
    ('synthetic', 'long_pretrain_bs128'): '',
    ('synthetic', 'no_pre'): '',
    ('synthetic', 'no_weight_decay_all'): '',
    ('synthetic', 'no_weight_decay_fine'): '',
}


In [16]:
# Header label per evaluation dataset; the key order sets the column-group order
dataset_rename = {
    "valid": "UP05 Validation Set",
    "up34_eval": "UPFLOW UP34",
    "rr_eval": "RHUM-RUM RR40"
}

# Row labels, grouped by approach and keyed by experiment name
labels_by_approach = {
    'multiclass': {
        'baseline': 'Multiclass Baseline',
    },
    'semisupervised': {
        # NOTE(review): the 'alpha_0.98' run is presented as the baseline row while the
        # run named 'baseline' is labelled alpha=0.995 -- looks deliberate; confirm.
        'alpha_0.9': r'$\alpha=0.9$',
        'alpha_0.98': 'Baseline',
        'alpha_0.999': r'$\alpha=0.999$',
        'baseline': r'$\alpha=0.995$',
        'lambda_0.1': r'$\lambda_{\textrm{cons}}=0.1$',
        'lambda_2.0': r'$\lambda_{\textrm{cons}}=2.0$',
        'no_ramp_up': r'No $\lambda_{\textrm{cons}}$ Ramp Up',
    },
    'supervised': {
        'baseline': 'Baseline',
        'bs64': 'Batch size 64',
        'class_weight_0.1': r'Class weight $w=0.1$',
        'class_weight_1.0': r'Class weight $w=1.0$',
        'no_imagenet': 'No ImageNet weights',
        'resnet50': 'Resnet50 encoder',
        'unetpp': 'U-net++ decoder',
    },
    'synthetic': {
        'baseline': 'Baseline',
        'long_pretrain': '20k spectrogram pretraining',
        'long_pretrain_bs128': 'Batch size 128 pretraining',
        'no_pre': 'No ImageNet weights',
        'no_weight_decay_all': 'No weight decay',
        'no_weight_decay_fine': 'Only weight decay pretraining',
    },
}

# Flatten to the (approach, experiment) -> label mapping the table generator expects
rename_map = {
    (approach, experiment): label
    for approach, experiment_labels in labels_by_approach.items()
    for experiment, label in experiment_labels.items()
}

latex = generate_combined_latex_table(all_pds, dataset_rename, rename_map=rename_map)
print(latex)


\begin{tabular}{lcccccc}
\toprule
Model Type & \multicolumn{2}{c}{UP05 Validation Set} & \multicolumn{2}{c}{UPFLOW UP34} & \multicolumn{2}{c}{RHUM-RUM RR40} \\
Experiment & miou & recall & miou & recall & miou & recall \\
\midrule
\multicolumn{7}{l}{\textbf{multiclass}} \\
Multiclass Baseline & \SI[mode=text]{0.41 \pm 0.004}{} & \SI[mode=text]{0.53 \pm 0.03}{} & \SI[mode=text]{0.49 \pm 0.01}{} & \SI[mode=text]{0.75 \pm 0.03}{} & \SI[mode=text]{0.48 \pm 0.01}{} & \SI[mode=text]{0.61 \pm 0.04}{} \\
\midrule
\multicolumn{7}{l}{\textbf{semisupervised}} \\
$\alpha=0.9$ & \SI[mode=text]{0.43 \pm 0.002}{} & \SI[mode=text]{0.56 \pm 0.01}{} & \SI[mode=text]{0.52 \pm 0.002}{} & \SI[mode=text]{0.8 \pm 0.03}{} & \SI[mode=text]{0.45 \pm 0.01}{} & \SI[mode=text]{0.58 \pm 0.04}{} \\
Baseline & \SI[mode=text]{0.43 \pm 0.002}{} & \SI[mode=text]{0.59 \pm 0.01}{} & \SI[mode=text]{0.47 \pm 0.01}{} & \SI[mode=text]{0.85 \pm 0.01}{} & \SI[mode=text]{0.5 \pm 0.02}{} & \SI[mode=text]{0.72 \pm 0.06}{} \\
$\alp