In [222]:
import os
import re
import numpy as np
import pandas as pd
import sklearn

from sklearn.metrics import f1_score, classification_report, confusion_matrix

import sys

sys.path.append("../..")

In [223]:
# CONSTANTS

# Label-cleaning methods evaluated in this notebook. Each name is used as the
# suffix of the `label_issues_<method>` / `preds_cleaned_<method>` datacard columns.
CLEANING_METHODS = ["aum", "cincer", "cleanlab", "simifeat"]

In [224]:
# Config

# Root directory holding one `results_*` folder per run. Set the
# AQUA_RESULTS_DIR environment variable to point the notebook at another copy
# of the results; the default is the original location.
BASE_PATH = os.environ.get("AQUA_RESULTS_DIR", "/zfsauton/data/public/vsanil/aqua_results")

# Run folders are named results_<timestamp>_randomseed_<seed>_<basemodel>.
# The groups are greedy, so only the last two "_"-separated fields must be
# free of underscores; the timestamp may contain them.
FOLDER_PATTERN = r"results_(?P<timestamp>.*)_randomseed_(?P<randomseed>.*)_(?P<basemodel>.*)"

# Datacard files are named <dataset>_<noisetype>_label_issues.csv. The dataset
# name may contain underscores (e.g. "dry_bean"); the noise type may not.
# The dot is escaped so that only a literal ".csv" suffix matches.
FILE_PATTERN = r"(?P<dataset>.*)_(?P<noisetype>.*)_label_issues\.csv"

In [225]:
# Read results
#
# Walk BASE_PATH and load every datacard CSV into a nested dict addressed as
#   results_dict[dataset][base_model][noise_type][random_seed]["datacard"]
# All four keys are strings parsed from folder / file names.
# Folders are visited in sorted name order, so if several run folders share the
# same seed, base model, dataset and noise type, the one sorting last wins.

results_dict = dict()

result_dirs = sorted(os.listdir(BASE_PATH))
for result_dir in result_dirs:
    re_folder = re.match(FOLDER_PATTERN, result_dir)
    result_dir_path = os.path.join(BASE_PATH, result_dir)
    # Skip stray entries (unrelated folders, loose files) instead of failing
    # on `None.group(...)` or on `os.listdir` of something that is not a folder.
    if re_folder is None or not os.path.isdir(result_dir_path):
        continue
    time_stamp = re_folder.group("timestamp")
    random_seed = re_folder.group("randomseed")
    base_model = re_folder.group("basemodel")
    for filename in os.listdir(result_dir_path):
        re_file = re.match(FILE_PATTERN, filename)
        if not re_file:
            continue
        dataset = re_file.group("dataset")
        noise_type = re_file.group("noisetype")
        # Create the intermediate levels on demand and get the per-run dict.
        run = (
            results_dict.setdefault(dataset, dict())
            .setdefault(base_model, dict())
            .setdefault(noise_type, dict())
            .setdefault(random_seed, dict())
        )
        data_path = os.path.join(result_dir_path, filename)
        # The first CSV column is used as the row index.
        run["datacard"] = pd.read_csv(data_path, index_col=0)


In [226]:
# Processed Datasets
# Names of all datasets for which at least one datacard file was loaded into `results_dict`.

results_dict.keys()

dict_keys(['electricdevices', 'crop', 'mitbih', 'pendigits', 'whalecalls', 'adult', 'dry_bean', 'credit_fraud', 'car_evaluation', 'mushrooms', 'compas', 'cifar10', 'cxr', 'imdb', 'tweeteval'])

In [227]:
# Sample Datacard
# Look up a single run: dataset -> base model -> noise type -> random seed.
# The seed is the string "42", not an int, because it is parsed from the folder name.

results_dict["imdb"]["all-distilroberta-v1"]["uniform-0.1"]["42"]["datacard"]

Unnamed: 0,is_injected_noise,noisy_label,label_issues_cincer,preds_cleaned_cincer,label_issues_aum,preds_cleaned_aum,label_issues_simifeat,preds_cleaned_simifeat,label_issues_cleanlab,preds_cleaned_cleanlab,observed_labels
0,0,1,0,1,0,1,0,1,0,1,1
1,1,0,1,1,0,1,0,1,1,1,1
2,0,1,0,1,0,1,0,1,0,1,1
3,0,1,0,1,0,1,0,1,1,1,1
4,0,1,0,1,0,1,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,1,0,0,0,1,0,0
24996,0,0,0,1,1,1,0,1,0,1,0
24997,1,1,0,0,1,0,1,0,0,0,0
24998,0,0,0,0,1,0,0,0,1,1,0


In [228]:
import numpy as np
import sklearn.metrics as skm
from aqua.metrics import get_metrics, f1_score

In [229]:
# Calculate Cleaning Method Performance
#
# For every loaded datacard, score each cleaning method on two tasks and store
# the scores in the run's "cleaning_method_performance" dict:
#   "<method>_noise_pred"       : label_issues_<method> against is_injected_noise
#   "<method>_noisy_label_pred" : preds_cleaned_<method> against noisy_label,
#                                 using only the rows with injected noise
# Each entry maps a metric name from `metrics` to its value.

# Names paired positionally with the values returned by `get_metrics`.
# NOTE(review): assumes get_metrics returns its scores in exactly this order -
# confirm against aqua.metrics.
metrics = ["f1", "weighted_f1", "accuracy", "precision", "recall", "error_rate"]

# Column count of a complete datacard: is_injected_noise, noisy_label,
# observed_labels, plus label_issues_* and preds_cleaned_* for four methods.
# Datacards with any other column count are skipped and get no
# "cleaning_method_performance" entry (presumably they are missing the output
# of at least one cleaning method - TODO confirm).
EXPECTED_DATACARD_COLUMNS = 11

for dataset in results_dict:
    for base_model in results_dict[dataset]:
        for noise_type in results_dict[dataset][base_model]:
            for random_seed, run in results_dict[dataset][base_model][noise_type].items():
                datacard = run["datacard"]
                if len(datacard.columns) != EXPECTED_DATACARD_COLUMNS:
                    continue

                cm_performance_dict = dict()
                run["cleaning_method_performance"] = cm_performance_dict

                # The injected-noise rows do not depend on the cleaning method,
                # so select them once per datacard rather than once per method.
                # NOTE(review): the recorded output of this cell shows
                # undefined-metric / empty-mean warnings, presumably from runs
                # where this selection is empty - consider guarding that case.
                noisy_datacard = datacard[datacard["is_injected_noise"] == 1]

                for cleaning_method in CLEANING_METHODS:
                    results = get_metrics(datacard["is_injected_noise"], datacard[f"label_issues_{cleaning_method}"])
                    noise_pred_results = dict(zip(metrics, results))
                    results = get_metrics(noisy_datacard["noisy_label"], noisy_datacard[f"preds_cleaned_{cleaning_method}"])
                    noisy_label_pred_results = dict(zip(metrics, results))
                    cm_performance_dict[f"{cleaning_method}_noise_pred"] = noise_pred_results
                    cm_performance_dict[f"{cleaning_method}_noisy_label_pred"] = noisy_label_pred_results

                # TODO: also store a "downstream_model_performance" dict per run.

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(aver

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(aver

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(aver

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(aver

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(aver

In [232]:
# TABLE 1
#
# Score the saved test-set predictions of every downstream-model run and keep
# one value per dataset and run in
#   results[dataset]["<noise_type>_<cleaning_method>"]
# Only files for `base_model` and for the seeds in `random_seed_filters` are used.

from tqdm import tqdm
import aqua.data.preset_dataloaders as presets
from aqua.configs import main_config, data_configs, model_configs

# Prediction files are named
#   <dataset>_<downstream_model>_<noise_type>_<cleaning_method>_<seed>_test_preds.csv
# NOTE: this rebinds FILE_PATTERN, which the config cell also defines for the
# datacard files; re-run the config cell before re-running the datacard loader.
FILE_PATTERN = r"(?P<dataset>.*)_(?P<downstream_model>.*)_(?P<noise_type>.*)_(?P<cleaning_method>.*)_(?P<random_seed>.*)_test_preds\.csv"
base_downstream_model_path = "/zfsauton/data/public/vsanil/aqua/aqua_downstream_models/"

base_model = "resnet1d"
# Noise rates shown in the table; applied by the cell that prints it.
noise_rate_filters = ("0.1", "0.0")
# Must be a real tuple: a bare ("42") is just the string "42", which turns the
# membership test below into a substring test that also accepts "4", "2" and "".
random_seed_filters = ("42",)
# Cache of test labels per dataset so each dataset is loaded only once.
dataset_test_labels = dict()

results = dict()

result_files = sorted(os.listdir(base_downstream_model_path))
for result_file in tqdm(result_files):
    re_file = re.match(FILE_PATTERN, result_file)
    if re_file is None:
        # Not a prediction file; ignore it instead of failing on None.group().
        continue
    dataset = re_file.group("dataset")
    downstream_model = re_file.group("downstream_model")
    noise_type = re_file.group("noise_type")
    cleaning_method = re_file.group("cleaning_method")
    random_seed = re_file.group("random_seed")

    if random_seed not in random_seed_filters:
        continue

    if base_model != downstream_model:    # TODO: Handle this
        continue

    if dataset not in dataset_test_labels:
        # Resolve the preset loader `load_<dataset>` by name; only the test split's labels are kept.
        data_aq, data_aq_test = getattr(presets, f'load_{dataset}')(data_configs[dataset])
        dataset_test_labels[dataset] = data_aq_test.labels

    print(result_file)
    result_path = os.path.join(base_downstream_model_path, result_file)
    # The file has no header row; the first column holds the predictions.
    y_pred = pd.read_csv(result_path, header=None)[0]

    test_results = get_metrics(dataset_test_labels[dataset], y_pred)

    run_name = f"{noise_type}_{cleaning_method}"
    if dataset not in results:
        results[dataset] = dict()
    # Index 1 is presumably the weighted F1, going by the order of the metric
    # names in `metrics` - TODO confirm against get_metrics.
    results[dataset][run_name] = test_results[1]

  8%|███▊                                              | 46/608 [00:00<00:01, 382.72it/s]

crop_resnet1d_asymmetric-0.1_aum_42_test_preds.csv
crop_resnet1d_asymmetric-0.1_cincer_42_test_preds.csv
crop_resnet1d_asymmetric-0.1_cleanlab_42_test_preds.csv
crop_resnet1d_asymmetric-0.1_nocleaning_42_test_preds.csv
crop_resnet1d_asymmetric-0.1_simifeat_42_test_preds.csv
crop_resnet1d_asymmetric-0.2_aum_42_test_preds.csv
crop_resnet1d_asymmetric-0.2_cincer_42_test_preds.csv
crop_resnet1d_asymmetric-0.2_cleanlab_42_test_preds.csv
crop_resnet1d_asymmetric-0.2_nocleaning_42_test_preds.csv
crop_resnet1d_asymmetric-0.2_simifeat_42_test_preds.csv
crop_resnet1d_asymmetric-0.4_aum_42_test_preds.csv
crop_resnet1d_asymmetric-0.4_cincer_42_test_preds.csv
crop_resnet1d_asymmetric-0.4_cleanlab_42_test_preds.csv
crop_resnet1d_asymmetric-0.4_nocleaning_42_test_preds.csv
crop_resnet1d_asymmetric-0.4_simifeat_42_test_preds.csv
crop_resnet1d_classdependent_aum_42_test_preds.csv
crop_resnet1d_classdependent_cincer_42_test_preds.csv
crop_resnet1d_classdependent_cleanlab_42_test_preds.csv
crop_resnet1d_

 14%|██████▉                                           | 85/608 [00:00<00:03, 143.76it/s]

crop_resnet1d_instancedependent-0.1_aum_42_test_preds.csv
crop_resnet1d_instancedependent-0.1_cincer_42_test_preds.csv
crop_resnet1d_instancedependent-0.1_cleanlab_42_test_preds.csv
crop_resnet1d_instancedependent-0.1_nocleaning_42_test_preds.csv
crop_resnet1d_instancedependent-0.1_simifeat_42_test_preds.csv
crop_resnet1d_instancedependent-0.2_aum_42_test_preds.csv
crop_resnet1d_instancedependent-0.2_cincer_42_test_preds.csv
crop_resnet1d_instancedependent-0.2_cleanlab_42_test_preds.csv
crop_resnet1d_instancedependent-0.2_nocleaning_42_test_preds.csv


 18%|████████▌                                        | 107/608 [00:00<00:04, 118.77it/s]

crop_resnet1d_instancedependent-0.2_simifeat_42_test_preds.csv
crop_resnet1d_instancedependent-0.4_aum_42_test_preds.csv
crop_resnet1d_instancedependent-0.4_cincer_42_test_preds.csv
crop_resnet1d_instancedependent-0.4_cleanlab_42_test_preds.csv
crop_resnet1d_instancedependent-0.4_nocleaning_42_test_preds.csv
crop_resnet1d_instancedependent-0.4_simifeat_42_test_preds.csv
crop_resnet1d_no-noise-0.0_aum_42_test_preds.csv
crop_resnet1d_no-noise-0.0_cincer_42_test_preds.csv
crop_resnet1d_no-noise-0.0_cleanlab_42_test_preds.csv
crop_resnet1d_no-noise-0.0_nocleaning_42_test_preds.csv


 20%|██████████                                        | 123/608 [00:01<00:04, 99.54it/s]

crop_resnet1d_no-noise-0.0_simifeat_42_test_preds.csv
crop_resnet1d_nonoise_nocleaning_42_test_preds.csv
crop_resnet1d_uniform-0.1_aum_42_test_preds.csv
crop_resnet1d_uniform-0.1_cincer_42_test_preds.csv
crop_resnet1d_uniform-0.1_cleanlab_42_test_preds.csv
crop_resnet1d_uniform-0.1_nocleaning_42_test_preds.csv
crop_resnet1d_uniform-0.1_simifeat_42_test_preds.csv
crop_resnet1d_uniform-0.2_aum_42_test_preds.csv
crop_resnet1d_uniform-0.2_cincer_42_test_preds.csv
crop_resnet1d_uniform-0.2_cleanlab_42_test_preds.csv


 24%|████████████                                      | 147/608 [00:01<00:04, 96.45it/s]

crop_resnet1d_uniform-0.2_nocleaning_42_test_preds.csv
crop_resnet1d_uniform-0.2_simifeat_42_test_preds.csv
crop_resnet1d_uniform-0.4_aum_42_test_preds.csv
crop_resnet1d_uniform-0.4_cincer_42_test_preds.csv
crop_resnet1d_uniform-0.4_cleanlab_42_test_preds.csv
crop_resnet1d_uniform-0.4_nocleaning_42_test_preds.csv
crop_resnet1d_uniform-0.4_simifeat_42_test_preds.csv


 30%|██████████████▊                                  | 184/608 [00:01<00:03, 112.75it/s]

electricdevices_resnet1d_asymmetric-0.1_aum_42_test_preds.csv
electricdevices_resnet1d_asymmetric-0.1_cincer_42_test_preds.csv
electricdevices_resnet1d_asymmetric-0.1_cleanlab_42_test_preds.csv
electricdevices_resnet1d_asymmetric-0.1_nocleaning_42_test_preds.csv
electricdevices_resnet1d_asymmetric-0.1_simifeat_42_test_preds.csv
electricdevices_resnet1d_asymmetric-0.2_aum_42_test_preds.csv
electricdevices_resnet1d_asymmetric-0.2_cincer_42_test_preds.csv
electricdevices_resnet1d_asymmetric-0.2_cleanlab_42_test_preds.csv
electricdevices_resnet1d_asymmetric-0.2_nocleaning_42_test_preds.csv
electricdevices_resnet1d_asymmetric-0.2_simifeat_42_test_preds.csv
electricdevices_resnet1d_asymmetric-0.4_aum_42_test_preds.csv
electricdevices_resnet1d_asymmetric-0.4_cincer_42_test_preds.csv
electricdevices_resnet1d_asymmetric-0.4_cleanlab_42_test_preds.csv
electricdevices_resnet1d_asymmetric-0.4_nocleaning_42_test_preds.csv
electricdevices_resnet1d_asymmetric-0.4_simifeat_42_test_preds.csv
electricde

 36%|█████████████████▋                               | 220/608 [00:01<00:02, 139.13it/s]

electricdevices_resnet1d_classdependent_nocleaning_42_test_preds.csv
electricdevices_resnet1d_classdependent_simifeat_42_test_preds.csv
electricdevices_resnet1d_instancedependent-0.1_aum_42_test_preds.csv
electricdevices_resnet1d_instancedependent-0.1_cincer_42_test_preds.csv
electricdevices_resnet1d_instancedependent-0.1_cleanlab_42_test_preds.csv
electricdevices_resnet1d_instancedependent-0.1_nocleaning_42_test_preds.csv
electricdevices_resnet1d_instancedependent-0.1_simifeat_42_test_preds.csv
electricdevices_resnet1d_instancedependent-0.2_aum_42_test_preds.csv
electricdevices_resnet1d_instancedependent-0.2_cincer_42_test_preds.csv
electricdevices_resnet1d_instancedependent-0.2_cleanlab_42_test_preds.csv
electricdevices_resnet1d_instancedependent-0.2_nocleaning_42_test_preds.csv
electricdevices_resnet1d_instancedependent-0.2_simifeat_42_test_preds.csv
electricdevices_resnet1d_instancedependent-0.4_aum_42_test_preds.csv
electricdevices_resnet1d_instancedependent-0.4_cincer_42_test_pre

 42%|████████████████████▍                            | 253/608 [00:02<00:02, 143.14it/s]

electricdevices_resnet1d_no-noise-0.0_cincer_42_test_preds.csv
electricdevices_resnet1d_no-noise-0.0_cleanlab_42_test_preds.csv
electricdevices_resnet1d_no-noise-0.0_nocleaning_42_test_preds.csv
electricdevices_resnet1d_no-noise-0.0_simifeat_42_test_preds.csv
electricdevices_resnet1d_nonoise_nocleaning_42_test_preds.csv
electricdevices_resnet1d_uniform-0.1_aum_42_test_preds.csv
electricdevices_resnet1d_uniform-0.1_cincer_42_test_preds.csv
electricdevices_resnet1d_uniform-0.1_cleanlab_42_test_preds.csv
electricdevices_resnet1d_uniform-0.1_nocleaning_42_test_preds.csv
electricdevices_resnet1d_uniform-0.1_simifeat_42_test_preds.csv
electricdevices_resnet1d_uniform-0.2_aum_42_test_preds.csv
electricdevices_resnet1d_uniform-0.2_cincer_42_test_preds.csv
electricdevices_resnet1d_uniform-0.2_cleanlab_42_test_preds.csv
electricdevices_resnet1d_uniform-0.2_nocleaning_42_test_preds.csv
electricdevices_resnet1d_uniform-0.2_simifeat_42_test_preds.csv
electricdevices_resnet1d_uniform-0.4_aum_42_test

 47%|███████████████████████▏                         | 287/608 [00:02<00:02, 145.58it/s]

electricdevices_resnet1d_uniform-0.4_nocleaning_42_test_preds.csv
electricdevices_resnet1d_uniform-0.4_simifeat_42_test_preds.csv
mitbih_resnet1d_asymmetric-0.1_aum_42_test_preds.csv
mitbih_resnet1d_asymmetric-0.1_cincer_42_test_preds.csv
mitbih_resnet1d_asymmetric-0.1_cleanlab_42_test_preds.csv
mitbih_resnet1d_asymmetric-0.1_nocleaning_42_test_preds.csv
mitbih_resnet1d_asymmetric-0.1_simifeat_42_test_preds.csv
mitbih_resnet1d_asymmetric-0.2_aum_42_test_preds.csv
mitbih_resnet1d_asymmetric-0.2_cincer_42_test_preds.csv
mitbih_resnet1d_asymmetric-0.2_cleanlab_42_test_preds.csv
mitbih_resnet1d_asymmetric-0.2_nocleaning_42_test_preds.csv
mitbih_resnet1d_asymmetric-0.2_simifeat_42_test_preds.csv
mitbih_resnet1d_asymmetric-0.4_aum_42_test_preds.csv
mitbih_resnet1d_asymmetric-0.4_cincer_42_test_preds.csv
mitbih_resnet1d_asymmetric-0.4_cleanlab_42_test_preds.csv
mitbih_resnet1d_asymmetric-0.4_nocleaning_42_test_preds.csv
mitbih_resnet1d_asymmetric-0.4_simifeat_42_test_preds.csv


 54%|██████████████████████████▋                      | 331/608 [00:02<00:01, 176.99it/s]

mitbih_resnet1d_classdependent_aum_42_test_preds.csv
mitbih_resnet1d_classdependent_cincer_42_test_preds.csv
mitbih_resnet1d_classdependent_cleanlab_42_test_preds.csv
mitbih_resnet1d_classdependent_nocleaning_42_test_preds.csv
mitbih_resnet1d_classdependent_simifeat_42_test_preds.csv
mitbih_resnet1d_instancedependent-0.1_aum_42_test_preds.csv
mitbih_resnet1d_instancedependent-0.1_cincer_42_test_preds.csv
mitbih_resnet1d_instancedependent-0.1_cleanlab_42_test_preds.csv
mitbih_resnet1d_instancedependent-0.1_nocleaning_42_test_preds.csv
mitbih_resnet1d_instancedependent-0.1_simifeat_42_test_preds.csv
mitbih_resnet1d_instancedependent-0.2_aum_42_test_preds.csv
mitbih_resnet1d_instancedependent-0.2_cincer_42_test_preds.csv
mitbih_resnet1d_instancedependent-0.2_cleanlab_42_test_preds.csv
mitbih_resnet1d_instancedependent-0.2_nocleaning_42_test_preds.csv
mitbih_resnet1d_instancedependent-0.2_simifeat_42_test_preds.csv
mitbih_resnet1d_instancedependent-0.4_aum_42_test_preds.csv
mitbih_resnet1d

 61%|█████████████████████████████▉                   | 372/608 [00:02<00:01, 183.20it/s]

mitbih_resnet1d_no-noise-0.0_cincer_42_test_preds.csv
mitbih_resnet1d_no-noise-0.0_cleanlab_42_test_preds.csv
mitbih_resnet1d_no-noise-0.0_nocleaning_42_test_preds.csv
mitbih_resnet1d_no-noise-0.0_simifeat_42_test_preds.csv
mitbih_resnet1d_nonoise_nocleaning_42_test_preds.csv
mitbih_resnet1d_uniform-0.1_aum_42_test_preds.csv
mitbih_resnet1d_uniform-0.1_cincer_42_test_preds.csv
mitbih_resnet1d_uniform-0.1_cleanlab_42_test_preds.csv
mitbih_resnet1d_uniform-0.1_nocleaning_42_test_preds.csv
mitbih_resnet1d_uniform-0.1_simifeat_42_test_preds.csv
mitbih_resnet1d_uniform-0.2_aum_42_test_preds.csv
mitbih_resnet1d_uniform-0.2_cincer_42_test_preds.csv
mitbih_resnet1d_uniform-0.2_cleanlab_42_test_preds.csv
mitbih_resnet1d_uniform-0.2_nocleaning_42_test_preds.csv
mitbih_resnet1d_uniform-0.2_simifeat_42_test_preds.csv
mitbih_resnet1d_uniform-0.4_aum_42_test_preds.csv
mitbih_resnet1d_uniform-0.4_cincer_42_test_preds.csv
mitbih_resnet1d_uniform-0.4_cleanlab_42_test_preds.csv
mitbih_resnet1d_uniform-0

 67%|█████████████████████████████████▋                | 410/608 [00:04<00:05, 37.44it/s]

pendigits_resnet1d_asymmetric-0.1_aum_42_test_preds.csv
pendigits_resnet1d_asymmetric-0.1_cincer_42_test_preds.csv
pendigits_resnet1d_asymmetric-0.1_cleanlab_42_test_preds.csv
pendigits_resnet1d_asymmetric-0.1_nocleaning_42_test_preds.csv
pendigits_resnet1d_asymmetric-0.1_simifeat_42_test_preds.csv
pendigits_resnet1d_asymmetric-0.2_aum_42_test_preds.csv
pendigits_resnet1d_asymmetric-0.2_cincer_42_test_preds.csv
pendigits_resnet1d_asymmetric-0.2_cleanlab_42_test_preds.csv
pendigits_resnet1d_asymmetric-0.2_nocleaning_42_test_preds.csv
pendigits_resnet1d_asymmetric-0.2_simifeat_42_test_preds.csv
pendigits_resnet1d_asymmetric-0.4_aum_42_test_preds.csv
pendigits_resnet1d_asymmetric-0.4_cincer_42_test_preds.csv
pendigits_resnet1d_asymmetric-0.4_cleanlab_42_test_preds.csv
pendigits_resnet1d_asymmetric-0.4_nocleaning_42_test_preds.csv
pendigits_resnet1d_asymmetric-0.4_simifeat_42_test_preds.csv
pendigits_resnet1d_classdependent_aum_42_test_preds.csv
pendigits_resnet1d_classdependent_cincer_42_

 74%|█████████████████████████████████████             | 450/608 [00:05<00:02, 63.58it/s]

pendigits_resnet1d_classdependent_simifeat_42_test_preds.csv
pendigits_resnet1d_instancedependent-0.1_aum_42_test_preds.csv
pendigits_resnet1d_instancedependent-0.1_cincer_42_test_preds.csv
pendigits_resnet1d_instancedependent-0.1_cleanlab_42_test_preds.csv
pendigits_resnet1d_instancedependent-0.1_nocleaning_42_test_preds.csv
pendigits_resnet1d_instancedependent-0.1_simifeat_42_test_preds.csv
pendigits_resnet1d_instancedependent-0.2_aum_42_test_preds.csv
pendigits_resnet1d_instancedependent-0.2_cincer_42_test_preds.csv
pendigits_resnet1d_instancedependent-0.2_cleanlab_42_test_preds.csv
pendigits_resnet1d_instancedependent-0.2_nocleaning_42_test_preds.csv
pendigits_resnet1d_instancedependent-0.2_simifeat_42_test_preds.csv
pendigits_resnet1d_instancedependent-0.4_aum_42_test_preds.csv
pendigits_resnet1d_instancedependent-0.4_cincer_42_test_preds.csv
pendigits_resnet1d_instancedependent-0.4_cleanlab_42_test_preds.csv
pendigits_resnet1d_instancedependent-0.4_nocleaning_42_test_preds.csv
pe

 77%|██████████████████████████████████████▍           | 467/608 [00:05<00:01, 74.00it/s]

pendigits_resnet1d_no-noise-0.0_nocleaning_42_test_preds.csv
pendigits_resnet1d_no-noise-0.0_simifeat_42_test_preds.csv
pendigits_resnet1d_nonoise_nocleaning_42_test_preds.csv
pendigits_resnet1d_uniform-0.1_aum_42_test_preds.csv
pendigits_resnet1d_uniform-0.1_cincer_42_test_preds.csv
pendigits_resnet1d_uniform-0.1_cleanlab_42_test_preds.csv
pendigits_resnet1d_uniform-0.1_nocleaning_42_test_preds.csv
pendigits_resnet1d_uniform-0.1_simifeat_42_test_preds.csv
pendigits_resnet1d_uniform-0.2_aum_42_test_preds.csv
pendigits_resnet1d_uniform-0.2_cincer_42_test_preds.csv
pendigits_resnet1d_uniform-0.2_cleanlab_42_test_preds.csv
pendigits_resnet1d_uniform-0.2_nocleaning_42_test_preds.csv
pendigits_resnet1d_uniform-0.2_simifeat_42_test_preds.csv
pendigits_resnet1d_uniform-0.4_aum_42_test_preds.csv
pendigits_resnet1d_uniform-0.4_cincer_42_test_preds.csv
pendigits_resnet1d_uniform-0.4_cleanlab_42_test_preds.csv
pendigits_resnet1d_uniform-0.4_nocleaning_42_test_preds.csv


 79%|███████████████████████████████████████▋          | 483/608 [00:05<00:01, 86.06it/s]

pendigits_resnet1d_uniform-0.4_simifeat_42_test_preds.csv


 85%|██████████████████████████████████████████▍       | 516/608 [00:54<00:56,  1.64it/s]

whalecalls_resnet1d_asymmetric-0.1_aum_42_test_preds.csv
whalecalls_resnet1d_asymmetric-0.1_cincer_42_test_preds.csv
whalecalls_resnet1d_asymmetric-0.1_cleanlab_42_test_preds.csv
whalecalls_resnet1d_asymmetric-0.1_nocleaning_42_test_preds.csv
whalecalls_resnet1d_asymmetric-0.1_simifeat_42_test_preds.csv
whalecalls_resnet1d_asymmetric-0.2_aum_42_test_preds.csv
whalecalls_resnet1d_asymmetric-0.2_cincer_42_test_preds.csv
whalecalls_resnet1d_asymmetric-0.2_cleanlab_42_test_preds.csv
whalecalls_resnet1d_asymmetric-0.2_nocleaning_42_test_preds.csv
whalecalls_resnet1d_asymmetric-0.2_simifeat_42_test_preds.csv
whalecalls_resnet1d_asymmetric-0.4_aum_42_test_preds.csv
whalecalls_resnet1d_asymmetric-0.4_cincer_42_test_preds.csv
whalecalls_resnet1d_asymmetric-0.4_cleanlab_42_test_preds.csv


 90%|████████████████████████████████████████████▉     | 546/608 [00:54<00:19,  3.10it/s]

whalecalls_resnet1d_asymmetric-0.4_nocleaning_42_test_preds.csv
whalecalls_resnet1d_asymmetric-0.4_simifeat_42_test_preds.csv
whalecalls_resnet1d_classdependent_aum_42_test_preds.csv
whalecalls_resnet1d_classdependent_cincer_42_test_preds.csv
whalecalls_resnet1d_classdependent_cleanlab_42_test_preds.csv
whalecalls_resnet1d_classdependent_nocleaning_42_test_preds.csv
whalecalls_resnet1d_classdependent_simifeat_42_test_preds.csv
whalecalls_resnet1d_instancedependent-0.1_aum_42_test_preds.csv
whalecalls_resnet1d_instancedependent-0.1_cincer_42_test_preds.csv
whalecalls_resnet1d_instancedependent-0.1_cleanlab_42_test_preds.csv
whalecalls_resnet1d_instancedependent-0.1_nocleaning_42_test_preds.csv
whalecalls_resnet1d_instancedependent-0.1_simifeat_42_test_preds.csv
whalecalls_resnet1d_instancedependent-0.2_aum_42_test_preds.csv
whalecalls_resnet1d_instancedependent-0.2_cincer_42_test_preds.csv


 94%|███████████████████████████████████████████████▏  | 574/608 [00:54<00:05,  5.80it/s]

whalecalls_resnet1d_instancedependent-0.2_cleanlab_42_test_preds.csv
whalecalls_resnet1d_instancedependent-0.2_nocleaning_42_test_preds.csv
whalecalls_resnet1d_instancedependent-0.2_simifeat_42_test_preds.csv
whalecalls_resnet1d_instancedependent-0.4_aum_42_test_preds.csv
whalecalls_resnet1d_instancedependent-0.4_cincer_42_test_preds.csv
whalecalls_resnet1d_instancedependent-0.4_cleanlab_42_test_preds.csv
whalecalls_resnet1d_instancedependent-0.4_nocleaning_42_test_preds.csv
whalecalls_resnet1d_instancedependent-0.4_simifeat_42_test_preds.csv
whalecalls_resnet1d_no-noise-0.0_aum_42_test_preds.csv
whalecalls_resnet1d_no-noise-0.0_cincer_42_test_preds.csv
whalecalls_resnet1d_no-noise-0.0_cleanlab_42_test_preds.csv
whalecalls_resnet1d_no-noise-0.0_nocleaning_42_test_preds.csv
whalecalls_resnet1d_no-noise-0.0_simifeat_42_test_preds.csv
whalecalls_resnet1d_nonoise_nocleaning_42_test_preds.csv


 99%|█████████████████████████████████████████████████▍| 601/608 [00:54<00:00, 10.69it/s]

whalecalls_resnet1d_uniform-0.1_aum_42_test_preds.csv
whalecalls_resnet1d_uniform-0.1_cincer_42_test_preds.csv
whalecalls_resnet1d_uniform-0.1_cleanlab_42_test_preds.csv
whalecalls_resnet1d_uniform-0.1_nocleaning_42_test_preds.csv
whalecalls_resnet1d_uniform-0.1_simifeat_42_test_preds.csv
whalecalls_resnet1d_uniform-0.2_aum_42_test_preds.csv
whalecalls_resnet1d_uniform-0.2_cincer_42_test_preds.csv
whalecalls_resnet1d_uniform-0.2_cleanlab_42_test_preds.csv
whalecalls_resnet1d_uniform-0.2_nocleaning_42_test_preds.csv
whalecalls_resnet1d_uniform-0.2_simifeat_42_test_preds.csv
whalecalls_resnet1d_uniform-0.4_aum_42_test_preds.csv
whalecalls_resnet1d_uniform-0.4_cincer_42_test_preds.csv
whalecalls_resnet1d_uniform-0.4_cleanlab_42_test_preds.csv


100%|██████████████████████████████████████████████████| 608/608 [00:55<00:00, 11.05it/s]

whalecalls_resnet1d_uniform-0.4_nocleaning_42_test_preds.csv
whalecalls_resnet1d_uniform-0.4_simifeat_42_test_preds.csv





In [235]:
# Print the table held in `results` as CSV: the dataset list, a header line of
# run names, then one line of values per dataset (datasets in sorted order).
# `results` is whatever the most recently executed TABLE cell produced.
datasets = sorted(results)
print(datasets)

# Columns come from the first dataset. A run name is kept when it has no noise
# rate (no "-") or mentions one of the rates in `noise_rate_filters`; the
# "nonoise_nocleaning" run is always left out.
keys = [
    run_name
    for run_name in sorted(results[datasets[0]])
    if run_name != "nonoise_nocleaning"
    and ("-" not in run_name or any(rate in run_name for rate in noise_rate_filters))
]

print(",".join(keys))
for dataset in datasets:
    row = [str(results[dataset][key]) for key in keys]
    print(",".join(row))

['crop', 'electricdevices', 'mitbih', 'pendigits', 'whalecalls']
asymmetric-0.1_aum,asymmetric-0.1_cincer,asymmetric-0.1_cleanlab,asymmetric-0.1_nocleaning,asymmetric-0.1_simifeat,classdependent_aum,classdependent_cincer,classdependent_cleanlab,classdependent_nocleaning,classdependent_simifeat,instancedependent-0.1_aum,instancedependent-0.1_cincer,instancedependent-0.1_cleanlab,instancedependent-0.1_nocleaning,instancedependent-0.1_simifeat,no-noise-0.0_aum,no-noise-0.0_cincer,no-noise-0.0_cleanlab,no-noise-0.0_nocleaning,no-noise-0.0_simifeat,uniform-0.1_aum,uniform-0.1_cincer,uniform-0.1_cleanlab,uniform-0.1_nocleaning,uniform-0.1_simifeat
0.462159,0.452555,0.03383,0.462184,0.561067,0.119114,0.142728,0.039247,0.13905,0.087039,0.511884,0.410172,0.041007,0.537823,0.531393,0.50741,0.478841,0.022351,0.527624,0.600873,0.570607,0.527054,0.068029,0.514137,0.496993
0.636555,0.612057,0.304792,0.616079,0.611635,0.063004,0.091042,0.094596,0.05099,0.0498,0.519577,0.641556,0.429865,0.624531,0.582

In [219]:
# TABLE 2
#
# For every dataset that has runs with `base_model`, collect the "weighted_f1"
# of each cleaning method's noise prediction into
#   results[dataset]["<noise_type>_<cleaning_method>_noise_pred"]
# NOTE: this rebinds `results`, which the TABLE 1 cell also writes to.

base_model = "all-distilroberta-v1"
noise_rate_filter = "0.1"
random_seed = "42"

results = dict()

for dataset in results_dict:
    if base_model not in results_dict[dataset]:
        continue
    results[dataset] = dict()
    for noise_type, runs_by_seed in results_dict[dataset][base_model].items():
        # Noise types that carry a rate ("<name>-<rate>") are kept only at the
        # selected rate; noise types without a rate are always kept.
        if ("-" in noise_type) and (not noise_type.endswith(noise_rate_filter)):
            continue
        # A run may have no entry for this seed, and the metrics cell leaves out
        # "cleaning_method_performance" for datacards it skipped. Report and
        # skip such runs instead of stopping with a KeyError.
        cm_performance = runs_by_seed.get(random_seed, {}).get("cleaning_method_performance")
        if cm_performance is None:
            print(f"skipping {dataset}/{noise_type}: no cleaning-method metrics for seed {random_seed}")
            continue
        for cleaning_method in CLEANING_METHODS:
            results[dataset][f"{noise_type}_{cleaning_method}_noise_pred"] = cm_performance[f"{cleaning_method}_noise_pred"]["weighted_f1"]

In [220]:
# Print the table held in `results` as CSV: the dataset list, a header line of
# column names, then one line of values per dataset (datasets in sorted order).
# `results` is whatever the most recently executed TABLE cell produced.
datasets = sorted(results)
# Columns come from the first dataset. "nonoise_nocleaning" is dropped here,
# once, so that the header and every row always list the same columns.
keys = sorted(key for key in results[datasets[0]] if key != "nonoise_nocleaning")
print(datasets)
print(",".join(keys))
for dataset in datasets:
    print(",".join(str(results[dataset][key]) for key in keys))

['imdb', 'tweeteval']
asymmetric-0.1_aum_noise_pred,asymmetric-0.1_cincer_noise_pred,asymmetric-0.1_cleanlab_noise_pred,asymmetric-0.1_simifeat_noise_pred,classdependent_aum_noise_pred,classdependent_cincer_noise_pred,classdependent_cleanlab_noise_pred,classdependent_simifeat_noise_pred,instancedependent-0.1_aum_noise_pred,instancedependent-0.1_cincer_noise_pred,instancedependent-0.1_cleanlab_noise_pred,instancedependent-0.1_simifeat_noise_pred,uniform-0.1_aum_noise_pred,uniform-0.1_cincer_noise_pred,uniform-0.1_cleanlab_noise_pred,uniform-0.1_simifeat_noise_pred
0.849988,0.912264,0.661696,0.91838,0.934909,0.948087,0.634458,0.963854,0.786345,0.87366,0.569475,0.862008,0.84859,0.908847,0.597322,0.916003
0.849633,0.866267,0.568563,0.873284,0.70195,0.709005,0.637259,0.718505,0.785454,0.789651,0.599058,0.813948,0.855973,0.867511,0.62691,0.874276
