# Multi Model Evaluation

In [1]:
import sys
sys.path.append("..")

import copy
import cProfile
from datasets import load_dataset
import itertools
import json
import math
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
import pandas as pd
import random
from sklearn.metrics import classification_report, accuracy_score, f1_score
import time
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, DataCollatorWithPadding, \
                         AutoModelForSequenceClassification, BertForSequenceClassification, \
                         BertModel, RobertaForSequenceClassification, RobertaModel

from resilient_nlp.mini_roben import Clustering, ClusterRepRecoverer, ClusterRecovererWithPassthrough
from resilient_nlp.models import BertClassifier
from resilient_nlp.perturbers import ToyPerturber, WordScramblerPerturber
from runner import ExperimentRunner
from word_score_attack import BertWordScoreAttack

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
#tasks = ('imdb', 'sst', 'sst_bin', 'yelp_bin', 'yelp_full')
#tasks = ('sst_bin', 'yelp_bin', 'yelp_full')
tasks = ('sst_bin',)

In [3]:
model_types = ('bert', 'roberta')

Config for final evaluation on test set

In [4]:
eval_set_size = 500
use_dev_set = False

Config for evaluation on dev set

In [5]:
#eval_set_size = 113
#use_dev_set = True

In [6]:
max_raw_length = 826
preprocess = lambda row: { 'text': row['text'].lower()[:max_raw_length]}

## IMDb Dataset

In [7]:
sampled_test_set = {}
sampled_test_set_dict = {}
sampled_test_set_adv_no_ws = {}
sampled_test_set_adv_incl_ws = {}

In [8]:
imdb = load_dataset('../output/huggingface/imdb')

Using custom data configuration redacted--imdb-f63738dec0d5e230
Reusing dataset parquet (/home/user/.cache/huggingface/datasets/parquet/redacted--imdb-f63738dec0d5e230/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
random.seed(11)
if use_dev_set:
    sampled_test_set['imdb'] = imdb['dev'].select(random.choices(range(len(imdb['dev'])), k=eval_set_size)).map(preprocess)
else:
    sampled_test_set['imdb'] = imdb['attack_eval_truncated'].select(range(eval_set_size)).map(preprocess)


# This is silly but apparently huggingface datasets are immutable?
# Representing it as something a bit more sane
sampled_test_set_dict['imdb'] = [
    {
        'text': row['text'],
        'label': row['label'],
    }
    for row in sampled_test_set['imdb']
]

Loading cached processed dataset at /home/user/.cache/huggingface/datasets/parquet/redacted--imdb-f63738dec0d5e230/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-b45d493b37ab1dc7.arrow


## SST-5 Dataset

In [10]:
treebank_detok = TreebankWordDetokenizer()

sst = load_dataset('sst').map(
    lambda row: {
        "text": treebank_detok.detokenize(row["sentence"].split()),
        "label": min(math.floor(row["label"] / 0.2), 4.0),
    }, remove_columns=['sentence', 'tokens', 'tree']
)

random.seed(11)
if use_dev_set:
    sampled_test_set['sst'] = sst['validation'].select(random.choices(range(len(sst['validation'])), k=eval_set_size)).map(preprocess)
else:
    sampled_test_set['sst'] = sst['test'].select(random.choices(range(len(sst['test'])), k=eval_set_size)).map(preprocess)

sampled_test_set_dict['sst'] = [
    {
        'text': row['text'],
        'label': row['label'],
    }
    for row in sampled_test_set['sst']
]

No config specified, defaulting to: sst/default
Reusing dataset sst (/home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-3c142acdab53f98c.arrow
Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-0bf56ce0086915ee.arrow
Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-19fdf8d124be4ba7.arrow
Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-fc3e75429c3f7637.arrow


## SST-2 Dataset

In [11]:
treebank_detok = TreebankWordDetokenizer()

sst_bin = load_dataset('sst').filter(
        lambda row: row["label"] < 0.4 or row["label"] >= 0.6
    ).map(
    lambda row: {
        "text": treebank_detok.detokenize(row["sentence"].split()),
        "label": min(math.floor(row["label"] / 0.5), 1.0),
    }
)

random.seed(11)
if use_dev_set:
    sampled_test_set['sst_bin'] = sst_bin['validation'].select(random.choices(range(len(sst_bin['validation'])), k=eval_set_size)).map(preprocess)
else:
    sampled_test_set['sst_bin'] = sst_bin['test'].select(random.choices(range(len(sst_bin['test'])), k=eval_set_size)).map(preprocess)

sampled_test_set_dict['sst_bin'] = [
    {
        'text': row['text'],
        'label': row['label'],
    }
    for row in sampled_test_set['sst_bin']
]

No config specified, defaulting to: sst/default
Reusing dataset sst (/home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-f4f1ada73617d193.arrow
Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-6279b6f0f8a08f9a.arrow
Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-6c5f77e5aefdd0e2.arrow
Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-c46f07c913633b4d.arrow
Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-87779081ead23eae.arrow
Loading cached processed dataset at /home/user/.cache/huggingface

## Yelp-2

In [12]:
yelp_bin = load_dataset('yelp_polarity')

random.seed(11)
sampled_test_set['yelp_bin'] = yelp_bin['test'].select(random.choices(range(len(sst['test'])), k=eval_set_size)).map(preprocess)

sampled_test_set_dict['yelp_bin'] = [
    {
        'text': row['text'],
        'label': row['label'],
    }
    for row in sampled_test_set['yelp_bin']
]

Reusing dataset yelp_polarity (/home/user/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/user/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544/cache-103b3f679d53323c.arrow


## Yelp-5

In [13]:
yelp_full = load_dataset('yelp_polarity')

random.seed(11)
sampled_test_set['yelp_full'] = yelp_bin['test'].select(random.choices(range(len(sst['test'])), k=eval_set_size)).map(preprocess)

sampled_test_set_dict['yelp_full'] = [
    {
        'text': row['text'],
        'label': row['label'],
    }
    for row in sampled_test_set['yelp_full']
]

Reusing dataset yelp_polarity (/home/user/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/user/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544/cache-103b3f679d53323c.arrow


### Perturbations

In [14]:
def generate_perturbed_multiset(input, wsp):
    random.seed(11)
    result = []

    for i in range(10):
        test_item = copy.deepcopy(input)

        for row in test_item:
            row['text'] = wsp.perturb([row['text']])[0][0]
        result.append(test_item)
    
    return result

Perturbed set with no whitespace modifications

In [15]:
wsp = WordScramblerPerturber(perturb_prob=0.1, weight_add=1, weight_drop=1, weight_swap=1,
                             weight_split_word=0, weight_merge_words=0)

for task in tasks:
    sampled_test_set_adv_no_ws[task] = generate_perturbed_multiset(sampled_test_set_dict[task], wsp)

Perturbed set with whitespace modifications

In [16]:
wsp = WordScramblerPerturber(perturb_prob=0.1, weight_add=1, weight_drop=1, weight_swap=1,
                             weight_split_word=1, weight_merge_words=1)

for task in tasks:
    sampled_test_set_adv_incl_ws[task] = generate_perturbed_multiset(sampled_test_set_dict[task], wsp)

## Models

### BERT, including finetuned variants

In [17]:
tokenizer = {}
model_base = {}
model_finetuned = { type: {} for type in model_types }
model_finetuned_all_pert = { type: {} for type in model_types }

In [18]:
bert_checkpoint = "bert-base-uncased"
tokenizer['bert'] = AutoTokenizer.from_pretrained(bert_checkpoint)
model_base['bert'] = BertModel.from_pretrained(bert_checkpoint).to(device)

In [19]:
bert_checkpoint_finetuned_imdb = "../output/huggingface/bert-base-uncased-imdb"
if 'bert' in model_types and 'imdb' in tasks:
    model_finetuned['bert']['imdb'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_imdb).to(device)

In [20]:
bert_checkpoint_finetuned_imdb_all_pert = "../output/huggingface/bert-base-uncased-imdb-all-pert"
if 'bert' in model_types and 'imdb' in tasks:
    model_finetuned_all_pert['bert']['imdb'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_imdb_all_pert).to(device)

In [21]:
bert_checkpoint_finetuned_sst = "../output/huggingface/bert-base-uncased-sst"
if 'bert' in model_types and 'sst' in tasks:
    model_finetuned['bert']['sst'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_sst).to(device)

In [22]:
bert_checkpoint_finetuned_sst_bin = '../output/huggingface/bert-base-uncased-sst_bin'
if 'bert' in model_types and 'sst_bin' in tasks:
    model_finetuned['bert']['sst_bin'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_sst_bin).to(device)

In [23]:
bert_checkpoint_finetuned_yelp_bin = '../output/huggingface/bert-base-uncased-yelp_bin'
if 'bert' in model_types and 'yelp_bin' in tasks:
    model_finetuned['bert']['yelp_bin'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_yelp_bin).to(device)

In [24]:
bert_checkpoint_finetuned_yelp_full = '../output/huggingface/bert-base-uncased-yelp_full'
if 'bert' in model_types and 'yelp_full' in tasks:
    model_finetuned['bert']['yelp_full'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_yelp_full).to(device)

In [25]:
roberta_checkpoint = "roberta-base"
tokenizer['roberta'] = AutoTokenizer.from_pretrained(roberta_checkpoint)
model_base['roberta'] = RobertaModel.from_pretrained(roberta_checkpoint).to(device)

In [26]:
roberta_checkpoint_finetuned_imdb = "../output/huggingface/roberta-base-imdb"
if 'roberta' in model_types and 'imdb' in tasks:
    model_finetuned['roberta']['imdb'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_imdb).to(device)

In [27]:
roberta_checkpoint_finetuned_sst = '../output/huggingface/roberta-base-sst'
if 'roberta' in model_types and 'sst' in tasks:
    model_finetuned['roberta']['sst'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_sst).to(device)

In [28]:
roberta_checkpoint_finetuned_sst_bin = '../output/huggingface/roberta-base-sst_bin'
if 'roberta' in model_types and 'sst_bin' in tasks:
    model_finetuned['roberta']['sst_bin'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_sst_bin).to(device)

In [29]:
roberta_checkpoint_finetuned_yelp_bin = '../output/huggingface/roberta-base-yelp_bin'
if 'roberta' in model_types and 'yelp_bin' in tasks:
    model_finetuned['roberta']['yelp_bin'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_yelp_bin).to(device)

In [30]:
roberta_checkpoint_finetuned_yelp_full = '../output/huggingface/roberta-base-yelp_full'
if 'roberta' in model_types and 'yelp_full' in tasks:
    model_finetuned['roberta']['yelp_full'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_yelp_full).to(device)

### RobEn clusterings (as baseline)

The first clustering is ConnComp (which very aggressively merges clusters). The second is AggClust, which uses a cost function to better preserve fidelity. The second one should generally be better.

In [31]:
roben_clustering = Clustering.from_pickle("../vocab100000_ed1.pkl")
roben_recoverer = ClusterRecovererWithPassthrough("cache", roben_clustering)
roben_clustering2 = Clustering.from_pickle("../vocab100000_ed1_gamma0.3.pkl")
roben_recoverer2 = ClusterRecovererWithPassthrough("cache", roben_clustering2)

## Model Prediction Helpers

In [32]:
max_sequence_length = 128
batch_size = 32

These are wrappers for standard (possibly finetuned) Huggingface models, using their normal tokenizers.

In [33]:
def standard_model_predict(tokenizer, model, sentences, recoverer, return_pred_tensor, recoverer_tokenize):
    if recoverer is not None:
        if recoverer_tokenize:
            tok = nltk.tokenize.treebank.TreebankWordTokenizer()
            sentences = [ " ".join(tok_list) for tok_list in tok.tokenize_sents(sentences) ]
        sentences = [ recoverer.recover(s.lower()) for s in sentences ]
        if recoverer_tokenize:
            detok = nltk.tokenize.treebank.TreebankWordDetokenizer()
            sentences = [ detok.detokenize(s.split(" ")) for s in sentences]
    tokenized = tokenizer(sentences, truncation=True, padding='max_length', max_length=max_sequence_length,
                          return_tensors='pt')
    tokenized = { k: v.to(device) for k, v in tokenized.items() }
    preds = model(**tokenized)
    if return_pred_tensor:
        return preds
    else:
        return torch.argmax(preds.logits, dim=1)

def wrap_standard_model(tokenizer, model, recoverer=None, return_pred_tensor=True, recoverer_tokenize=False):
    return lambda sentences: standard_model_predict(tokenizer, model, sentences, recoverer, return_pred_tensor,
                                                    recoverer_tokenize)

This is a wrapper for the machine trained tokenizer+embedder (aka MockingBERT)

In [34]:
def mltokenizer_model_predict(runner, model, cls_embedding, sep_embedding, pad_embedding, sentences, return_pred_tensor):
    # Truncate and lower case. Truncation is for performance only
    sentences = [ s.lower()[:8*max_sequence_length] for s in sentences]
    embedding = runner.embed(sentences=sentences,
        start_token=cls_embedding, end_token=sep_embedding, pad_token=pad_embedding,
        max_tokens=max_sequence_length)
    preds = model(inputs_embeds=embedding['inputs_embeds'], attention_mask=embedding['attention_mask'])
    if return_pred_tensor:
        return preds
    else:
        return torch.argmax(preds.logits, dim=1)

def wrap_mltokenizer_model(mltokenizer_prefix, tokenizer, model, cf_embedding, type, return_pred_tensor=True):
    filename = "../{}.pth".format(mltokenizer_prefix)
    runner = ExperimentRunner(device, model_filename=filename)
    if type == 'bert':
        cls_token_id = tokenizer.vocab['[CLS]']
        sep_token_id = tokenizer.vocab['[SEP]']
        pad_token_id = tokenizer.vocab['[PAD]']
    elif type == 'roberta':
        cls_token_id = tokenizer.vocab['<s>']
        sep_token_id = tokenizer.vocab['</s>']
        pad_token_id = tokenizer.vocab['<pad>']
    cls_embedding = cf_embedding(torch.tensor([cls_token_id], device=device)).view(-1)
    sep_embedding = cf_embedding(torch.tensor([sep_token_id], device=device)).view(-1)
    pad_embedding = cf_embedding(torch.tensor([pad_token_id], device=device)).view(-1)
    
    return lambda sentences: mltokenizer_model_predict(runner, model, cls_embedding, sep_embedding,
                                                      pad_embedding, sentences, return_pred_tensor)

## Evaluation Helpers

Evaluates a wrapped model on a test set

In [35]:
@torch.no_grad()
def evaluate_model(model, test_set):
    num_batches = math.ceil(len(test_set) / batch_size)
    
    sentences = [ x['text'] for x in test_set ]
    labels = [ x['label'] for x in test_set ]
    pred_batches = []
    
    for i in tqdm(range(num_batches)):
        bs = i * batch_size
        be = bs + batch_size
        
        output = model(sentences[bs:be])
        
        pred_batches.append(torch.argmax(output.logits, dim=1).detach().cpu())
    preds = torch.cat(pred_batches)
    
    print(classification_report(labels, preds, digits=4))
    
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='macro')
    
    return accuracy, f1

Evaluates a wrapped model on a stochastic, pseudo-adversarial test set. This means that each input sentence is replicated x times (typically 10) with randomized perturbations, and an attack is considered successful if *any* of the predictions is incorrect.

In [36]:
@torch.no_grad()
def evaluate_model_adv(model, test_sets):
    labels = [ x['label'] for x in test_sets[0] ]
    adv_preds = copy.copy(labels)
    accuracy_list = []
    f1_list = []
    
    for idx in tqdm(range(len(test_sets))):
        test_set = test_sets[idx]
        num_batches = math.ceil(len(test_set) / batch_size)
    
        sentences = [ x['text'] for x in test_set ]
        pred_batches = []
    
        for i in range(num_batches):
            bs = i * batch_size
            be = bs + batch_size
        
            output = model(sentences[bs:be])
        
            pred_batches.append(torch.argmax(output.logits, dim=1).detach().cpu())
        preds = torch.cat(pred_batches)
        
        for i in range(len(adv_preds)):
            if labels[i] != preds[i]:
                adv_preds[i] = preds[i]

        accuracy_list.append(accuracy_score(labels, adv_preds))
        f1_list.append(f1_score(labels, adv_preds, average='macro'))
    
    print(classification_report(labels, adv_preds, digits=4))    
    
    return accuracy_list, f1_list

Evaluates a model using WordScoreAttack

In [37]:
@torch.no_grad()
def evaluate_model_word_score(model, test_set, allow_whitespace_pert=True, report_prefix=None, word_scores_file=None):
    attacker = BertWordScoreAttack(
        WordScramblerPerturber(perturb_prob=1, weight_add=1, weight_drop=1, weight_swap=1,
                               weight_split_word=int(allow_whitespace_pert),
                               weight_merge_words=0),
        word_scores_file, model, tokenizer=None, max_sequence_length=max_sequence_length,
        attack_whitespace=allow_whitespace_pert,
    )

    res = attacker.attack(test_set, max_tokens_to_perturb=10, max_tries_per_token=4, mode=0, print_summary=False)

    if report_prefix is not None:
        res.to_csv(f"{report_prefix}_df.csv")
        with open(f"{report_prefix}_stats.json", "w") as f:
            json.dump(attacker.compute_attack_stats(), fp=f)            
    
    print(classification_report(res['ground_truth'], res['perturbed_preds'], digits=4))    
    
    accuracy = accuracy_score(res['ground_truth'], res['perturbed_preds'])
    f1 = f1_score(res['ground_truth'], res['perturbed_preds'], average='macro')
    
    return accuracy, f1

In [38]:
all_models = {
    'baseline': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task]),
    'baseline_all_pert': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned_all_pert[type][task]),
    'roben_1': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task], roben_recoverer),
    'roben_2': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task], roben_recoverer2),
    'roben_1_tok': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task], roben_recoverer, recoverer_tokenize=True),
    'roben_2_tok': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task], roben_recoverer2, recoverer_tokenize=True),
}

mltok_model_names = [
    '64k_lstm_clean_vanilla',
    '64k_lstm_no_whitespace_pert_vanilla',
    '64k_lstm_all_pert_vanilla',
    '64k_lstm_clean_finetuned',
    '64k_lstm_no_whitespace_pert_finetuned',
    '64k_lstm_all_pert_finetuned',
    '64k_cnn_no_whitespace_pert_finetuned',
    '2m_lstm_all_pert_finetuned',
    '32k_lstm_all_pert_finetuned_100ep',
]

for name in mltok_model_names:
    if name.endswith('_vanilla'):
        cf_embedding = lambda task, type: model_base[type].embeddings.word_embeddings
        filename = lambda task, type, name: f'output/{type}_{name}'
    else:
        cf_embedding = lambda task, type: model_finetuned[type][task].base_model.embeddings.word_embeddings
        filename = lambda task, type, name: f'output/{type}_{name}_{task}'
    # name=name is a hack to avoid Python late binding
    all_models[name] = lambda task, type, name=name, filename=filename, cf_embedding=cf_embedding: wrap_mltokenizer_model(filename(task, type, name), tokenizer[type], model_finetuned[type][task], cf_embedding(task, type), type)

In [39]:
evaluations = [
    'clean',
    'stochastic_no_ws',
    'stochastic_incl_ws',
    'word_score_no_ws',
    'word_score_incl_ws',
]

In [40]:
model_task_ids = [ f"{model}_{type}_{task}" for task, type, model in itertools.product(tasks, model_types, all_models.keys()) ]

accuracy_df = pd.DataFrame(columns=evaluations, index=model_task_ids)
f1_df = pd.DataFrame(columns=evaluations, index=model_task_ids)

for task, type in itertools.product(tasks, model_types):
    for cur_model_name, cur_model_factory in all_models.items():
        try:
            cur_model = cur_model_factory(task, type)
        except:
            print(f'Failed loading model {cur_model_name} on {type} for task {task}, skipping')
            accuracy_df.drop(f"{cur_model_name}_{type}_{task}", inplace=True)
            f1_df.drop(f"{cur_model_name}_{type}_{task}", inplace=True)
            continue
        for cur_evaluation in evaluations:
            print(f'Evaluating model {cur_model_name} on {type} on {cur_evaluation} for task {task}')
            start_time = time.time()
            random.seed(11)
            if cur_evaluation == 'clean':
                acc, f1 = evaluate_model(cur_model, sampled_test_set[task])
            elif cur_evaluation.startswith('stochastic_'):
                if cur_evaluation == 'stochastic_no_ws':
                    acc_list, f1_list = evaluate_model_adv(cur_model, sampled_test_set_adv_no_ws[task])
                elif cur_evaluation == 'stochastic_incl_ws':
                    acc_list, f1_list = evaluate_model_adv(cur_model, sampled_test_set_adv_incl_ws[task])
                acc = acc_list[-1]
                f1 = f1_list[-1]
                with open(f"../output/eval/{cur_model_name}_{type}_{task}_{cur_evaluation}_acc_list.json", "w") as f:
                    json.dump(acc_list, fp=f)
                with open(f"../output/eval/{cur_model_name}_{type}_{task}_{cur_evaluation}_f1_list.json", "w") as f:
                    json.dump(f1_list, fp=f)
            elif cur_evaluation.startswith('word_score_'):
                if cur_evaluation == 'word_score_no_ws':
                    acc, f1 = evaluate_model_word_score(cur_model, sampled_test_set[task], allow_whitespace_pert=False,
                                                        report_prefix=f"../output/eval/{cur_model_name}_{type}_{task}_{cur_evaluation}",
                                                        word_scores_file=f"../output/{task}_word_scores.json")
                elif cur_evaluation == 'word_score_incl_ws':
                    acc, f1 = evaluate_model_word_score(cur_model, sampled_test_set[task], allow_whitespace_pert=True,
                                                        report_prefix=f"../output/eval/{cur_model_name}_{type}_{task}_{cur_evaluation}",
                                                        word_scores_file=f"../output/{task}_word_scores.json")

            accuracy_df[cur_evaluation][f"{cur_model_name}_{type}_{task}"] = acc
            f1_df[cur_evaluation][f"{cur_model_name}_{type}_{task}"] = f1
            end_time = time.time()
            print(f"Evaluation took {end_time-start_time} seconds")
        del cur_model

Evaluating model baseline on bert on clean for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.32it/s]


              precision    recall  f1-score   support

         0.0     0.9482    0.8881    0.9171       268
         1.0     0.8795    0.9440    0.9106       232

    accuracy                         0.9140       500
   macro avg     0.9139    0.9160    0.9139       500
weighted avg     0.9163    0.9140    0.9141       500

Evaluation took 2.590557336807251 seconds
Evaluating model baseline on bert on stochastic_no_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:24<00:00,  2.50s/it]


              precision    recall  f1-score   support

         0.0     0.7210    0.7425    0.7316       268
         1.0     0.6920    0.6681    0.6798       232

    accuracy                         0.7080       500
   macro avg     0.7065    0.7053    0.7057       500
weighted avg     0.7075    0.7080    0.7076       500

Evaluation took 24.96378231048584 seconds
Evaluating model baseline on bert on stochastic_incl_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.51s/it]


              precision    recall  f1-score   support

         0.0     0.7107    0.7425    0.7263       268
         1.0     0.6864    0.6509    0.6681       232

    accuracy                         0.7000       500
   macro avg     0.6985    0.6967    0.6972       500
weighted avg     0.6994    0.7000    0.6993       500

Evaluation took 25.06691837310791 seconds
Evaluating model baseline on bert on word_score_no_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:54<00:00,  9.11it/s]


              precision    recall  f1-score   support

         0.0     0.4618    0.4739    0.4678       268
         1.0     0.3733    0.3621    0.3676       232

    accuracy                         0.4220       500
   macro avg     0.4176    0.4180    0.4177       500
weighted avg     0.4208    0.4220    0.4213       500

Evaluation took 54.94610905647278 seconds
Evaluating model baseline on bert on word_score_incl_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:53<00:00,  9.38it/s]


              precision    recall  f1-score   support

         0.0     0.4336    0.4627    0.4477       268
         1.0     0.3271    0.3017    0.3139       232

    accuracy                         0.3880       500
   macro avg     0.3803    0.3822    0.3808       500
weighted avg     0.3842    0.3880    0.3856       500

Evaluation took 53.2917218208313 seconds
Failed loading model baseline_all_pert on bert for task sst_bin, skipping
Evaluating model roben_1 on bert on clean for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.23it/s]


              precision    recall  f1-score   support

         0.0     0.8438    0.7052    0.7683       268
         1.0     0.7138    0.8491    0.7756       232

    accuracy                         0.7720       500
   macro avg     0.7788    0.7772    0.7719       500
weighted avg     0.7834    0.7720    0.7717       500

Evaluation took 2.62431001663208 seconds
Evaluating model roben_1 on bert on stochastic_no_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.54s/it]


              precision    recall  f1-score   support

         0.0     0.6955    0.6306    0.6614       268
         1.0     0.6148    0.6810    0.6462       232

    accuracy                         0.6540       500
   macro avg     0.6551    0.6558    0.6538       500
weighted avg     0.6580    0.6540    0.6544       500

Evaluation took 25.36691403388977 seconds
Evaluating model roben_1 on bert on stochastic_incl_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.54s/it]


              precision    recall  f1-score   support

         0.0     0.5613    0.5299    0.5451       268
         1.0     0.4899    0.5216    0.5052       232

    accuracy                         0.5260       500
   macro avg     0.5256    0.5257    0.5252       500
weighted avg     0.5281    0.5260    0.5266       500

Evaluation took 25.36521863937378 seconds
Evaluating model roben_1 on bert on word_score_no_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:56<00:00,  8.88it/s]


              precision    recall  f1-score   support

         0.0     0.5630    0.5672    0.5651       268
         1.0     0.4957    0.4914    0.4935       232

    accuracy                         0.5320       500
   macro avg     0.5293    0.5293    0.5293       500
weighted avg     0.5317    0.5320    0.5319       500

Evaluation took 56.3480019569397 seconds
Evaluating model roben_1 on bert on word_score_incl_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:44<00:00, 11.19it/s]


              precision    recall  f1-score   support

         0.0     0.3464    0.3619    0.3540       268
         1.0     0.2227    0.2112    0.2168       232

    accuracy                         0.2920       500
   macro avg     0.2846    0.2866    0.2854       500
weighted avg     0.2890    0.2920    0.2904       500

Evaluation took 44.69224834442139 seconds
Evaluating model roben_2 on bert on clean for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.25it/s]


              precision    recall  f1-score   support

         0.0     0.8750    0.7313    0.7967       268
         1.0     0.7391    0.8793    0.8031       232

    accuracy                         0.8000       500
   macro avg     0.8071    0.8053    0.7999       500
weighted avg     0.8120    0.8000    0.7997       500

Evaluation took 2.615090847015381 seconds
Evaluating model roben_2 on bert on stochastic_no_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.53s/it]


              precision    recall  f1-score   support

         0.0     0.7479    0.6754    0.7098       268
         1.0     0.6628    0.7371    0.6980       232

    accuracy                         0.7040       500
   macro avg     0.7054    0.7062    0.7039       500
weighted avg     0.7084    0.7040    0.7043       500

Evaluation took 25.281092882156372 seconds
Evaluating model roben_2 on bert on stochastic_incl_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.53s/it]


              precision    recall  f1-score   support

         0.0     0.6148    0.5896    0.6019       268
         1.0     0.5473    0.5733    0.5600       232

    accuracy                         0.5820       500
   macro avg     0.5811    0.5814    0.5810       500
weighted avg     0.5835    0.5820    0.5825       500

Evaluation took 25.275885581970215 seconds
Evaluating model roben_2 on bert on word_score_no_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:59<00:00,  8.35it/s]


              precision    recall  f1-score   support

         0.0     0.6160    0.6045    0.6102       268
         1.0     0.5527    0.5647    0.5586       232

    accuracy                         0.5860       500
   macro avg     0.5844    0.5846    0.5844       500
weighted avg     0.5866    0.5860    0.5863       500

Evaluation took 59.88006615638733 seconds
Evaluating model roben_2 on bert on word_score_incl_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:47<00:00, 10.49it/s]


              precision    recall  f1-score   support

         0.0     0.3633    0.3619    0.3626       268
         1.0     0.2661    0.2672    0.2667       232

    accuracy                         0.3180       500
   macro avg     0.3147    0.3146    0.3146       500
weighted avg     0.3182    0.3180    0.3181       500

Evaluation took 47.68703055381775 seconds
Evaluating model roben_1_tok on bert on clean for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.14it/s]


              precision    recall  f1-score   support

         0.0     0.7511    0.6418    0.6922       268
         1.0     0.6458    0.7543    0.6958       232

    accuracy                         0.6940       500
   macro avg     0.6984    0.6981    0.6940       500
weighted avg     0.7022    0.6940    0.6939       500

Evaluation took 2.6620352268218994 seconds
Evaluating model roben_1_tok on bert on stochastic_no_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.59s/it]


              precision    recall  f1-score   support

         0.0     0.7237    0.6157    0.6653       268
         1.0     0.6213    0.7284    0.6706       232

    accuracy                         0.6680       500
   macro avg     0.6725    0.6721    0.6680       500
weighted avg     0.6762    0.6680    0.6678       500

Evaluation took 25.934560537338257 seconds
Evaluating model roben_1_tok on bert on stochastic_incl_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.61s/it]


              precision    recall  f1-score   support

         0.0     0.5336    0.5037    0.5182       268
         1.0     0.4615    0.4914    0.4760       232

    accuracy                         0.4980       500
   macro avg     0.4976    0.4976    0.4971       500
weighted avg     0.5002    0.4980    0.4986       500

Evaluation took 26.063369035720825 seconds
Evaluating model roben_1_tok on bert on word_score_no_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:59<00:00,  8.47it/s]


              precision    recall  f1-score   support

         0.0     0.6904    0.6157    0.6509       268
         1.0     0.6054    0.6810    0.6410       232

    accuracy                         0.6460       500
   macro avg     0.6479    0.6484    0.6459       500
weighted avg     0.6509    0.6460    0.6463       500

Evaluation took 59.04615139961243 seconds
Evaluating model roben_1_tok on bert on word_score_incl_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:41<00:00, 11.96it/s]


              precision    recall  f1-score   support

         0.0     0.3221    0.3209    0.3215       268
         1.0     0.2189    0.2198    0.2194       232

    accuracy                         0.2740       500
   macro avg     0.2705    0.2704    0.2704       500
weighted avg     0.2742    0.2740    0.2741       500

Evaluation took 41.82783341407776 seconds
Evaluating model roben_2_tok on bert on clean for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.97it/s]


              precision    recall  f1-score   support

         0.0     0.8387    0.6791    0.7505       268
         1.0     0.6961    0.8491    0.7650       232

    accuracy                         0.7580       500
   macro avg     0.7674    0.7641    0.7578       500
weighted avg     0.7725    0.7580    0.7573       500

Evaluation took 2.7374465465545654 seconds
Evaluating model roben_2_tok on bert on stochastic_no_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.59s/it]


              precision    recall  f1-score   support

         0.0     0.8318    0.6642    0.7386       268
         1.0     0.6853    0.8448    0.7568       232

    accuracy                         0.7480       500
   macro avg     0.7585    0.7545    0.7477       500
weighted avg     0.7638    0.7480    0.7470       500

Evaluation took 25.92541480064392 seconds
Evaluating model roben_2_tok on bert on stochastic_incl_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.59s/it]


              precision    recall  f1-score   support

         0.0     0.6082    0.5560    0.5809       268
         1.0     0.5333    0.5862    0.5585       232

    accuracy                         0.5700       500
   macro avg     0.5707    0.5711    0.5697       500
weighted avg     0.5734    0.5700    0.5705       500

Evaluation took 25.92218041419983 seconds
Evaluating model roben_2_tok on bert on word_score_no_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:05<00:00,  7.68it/s]


              precision    recall  f1-score   support

         0.0     0.7964    0.6567    0.7198       268
         1.0     0.6703    0.8060    0.7319       232

    accuracy                         0.7260       500
   macro avg     0.7333    0.7314    0.7259       500
weighted avg     0.7379    0.7260    0.7254       500

Evaluation took 65.13420104980469 seconds
Evaluating model roben_2_tok on bert on word_score_incl_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:47<00:00, 10.52it/s]


              precision    recall  f1-score   support

         0.0     0.3735    0.3470    0.3598       268
         1.0     0.3028    0.3276    0.3147       232

    accuracy                         0.3380       500
   macro avg     0.3381    0.3373    0.3372       500
weighted avg     0.3407    0.3380    0.3389       500

Evaluation took 47.52352476119995 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on clean for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:05<00:00,  2.92it/s]


              precision    recall  f1-score   support

         0.0     0.9240    0.8619    0.8919       268
         1.0     0.8520    0.9181    0.8838       232

    accuracy                         0.8880       500
   macro avg     0.8880    0.8900    0.8879       500
weighted avg     0.8906    0.8880    0.8881       500

Evaluation took 5.54081392288208 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on stochastic_no_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:54<00:00,  5.50s/it]


              precision    recall  f1-score   support

         0.0     0.7490    0.7239    0.7362       268
         1.0     0.6929    0.7198    0.7061       232

    accuracy                         0.7220       500
   macro avg     0.7210    0.7219    0.7212       500
weighted avg     0.7230    0.7220    0.7223       500

Evaluation took 54.971192598342896 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on stochastic_incl_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:55<00:00,  5.54s/it]


              precision    recall  f1-score   support

         0.0     0.7082    0.6791    0.6933       268
         1.0     0.6461    0.6767    0.6611       232

    accuracy                         0.6780       500
   macro avg     0.6771    0.6779    0.6772       500
weighted avg     0.6794    0.6780    0.6784       500

Evaluation took 55.439828395843506 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on word_score_no_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [02:41<00:00,  3.09it/s]


              precision    recall  f1-score   support

         0.0     0.4431    0.4067    0.4241       268
         1.0     0.3740    0.4095    0.3909       232

    accuracy                         0.4080       500
   macro avg     0.4086    0.4081    0.4075       500
weighted avg     0.4110    0.4080    0.4087       500

Evaluation took 161.7474226951599 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on word_score_incl_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [02:25<00:00,  3.44it/s]


              precision    recall  f1-score   support

         0.0     0.3611    0.3396    0.3500       268
         1.0     0.2863    0.3060    0.2958       232

    accuracy                         0.3240       500
   macro avg     0.3237    0.3228    0.3229       500
weighted avg     0.3264    0.3240    0.3249       500

Evaluation took 145.51855397224426 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on clean for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:05<00:00,  2.85it/s]


              precision    recall  f1-score   support

         0.0     0.9016    0.8209    0.8594       268
         1.0     0.8125    0.8966    0.8525       232

    accuracy                         0.8560       500
   macro avg     0.8571    0.8587    0.8559       500
weighted avg     0.8603    0.8560    0.8562       500

Evaluation took 5.66375994682312 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on stochastic_no_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:55<00:00,  5.50s/it]


              precision    recall  f1-score   support

         0.0     0.8638    0.7575    0.8072       268
         1.0     0.7547    0.8621    0.8048       232

    accuracy                         0.8060       500
   macro avg     0.8093    0.8098    0.8060       500
weighted avg     0.8132    0.8060    0.8061       500

Evaluation took 55.00786900520325 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on stochastic_incl_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:54<00:00,  5.45s/it]


              precision    recall  f1-score   support

         0.0     0.7511    0.6530    0.6986       268
         1.0     0.6517    0.7500    0.6974       232

    accuracy                         0.6980       500
   macro avg     0.7014    0.7015    0.6980       500
weighted avg     0.7050    0.6980    0.6980       500

Evaluation took 54.50622606277466 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on word_score_no_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [03:00<00:00,  2.77it/s]


              precision    recall  f1-score   support

         0.0     0.6444    0.5410    0.5882       268
         1.0     0.5527    0.6552    0.5996       232

    accuracy                         0.5940       500
   macro avg     0.5986    0.5981    0.5939       500
weighted avg     0.6019    0.5940    0.5935       500

Evaluation took 180.78644013404846 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on word_score_incl_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [02:35<00:00,  3.21it/s]


              precision    recall  f1-score   support

         0.0     0.4397    0.3806    0.4080       268
         1.0     0.3806    0.4397    0.4080       232

    accuracy                         0.4080       500
   macro avg     0.4101    0.4101    0.4080       500
weighted avg     0.4123    0.4080    0.4080       500

Evaluation took 155.5572328567505 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on clean for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:05<00:00,  2.92it/s]


              precision    recall  f1-score   support

         0.0     0.9163    0.8172    0.8639       268
         1.0     0.8123    0.9138    0.8600       232

    accuracy                         0.8620       500
   macro avg     0.8643    0.8655    0.8620       500
weighted avg     0.8680    0.8620    0.8621       500

Evaluation took 5.527689218521118 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on stochastic_no_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:53<00:00,  5.39s/it]


              precision    recall  f1-score   support

         0.0     0.8498    0.7388    0.7904       268
         1.0     0.7378    0.8491    0.7896       232

    accuracy                         0.7900       500
   macro avg     0.7938    0.7940    0.7900       500
weighted avg     0.7978    0.7900    0.7900       500

Evaluation took 53.93970799446106 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on stochastic_incl_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:54<00:00,  5.42s/it]


              precision    recall  f1-score   support

         0.0     0.8393    0.7015    0.7642       268
         1.0     0.7101    0.8448    0.7717       232

    accuracy                         0.7680       500
   macro avg     0.7747    0.7732    0.7679       500
weighted avg     0.7794    0.7680    0.7677       500

Evaluation took 54.21471428871155 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on word_score_no_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [02:53<00:00,  2.88it/s]


              precision    recall  f1-score   support

         0.0     0.6147    0.5299    0.5691       268
         1.0     0.5316    0.6164    0.5709       232

    accuracy                         0.5700       500
   macro avg     0.5732    0.5731    0.5700       500
weighted avg     0.5762    0.5700    0.5699       500

Evaluation took 173.6905767917633 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on word_score_incl_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [02:55<00:00,  2.85it/s]


              precision    recall  f1-score   support

         0.0     0.6215    0.4963    0.5519       268
         1.0     0.5280    0.6509    0.5830       232

    accuracy                         0.5680       500
   macro avg     0.5747    0.5736    0.5674       500
weighted avg     0.5781    0.5680    0.5663       500

Evaluation took 175.5556674003601 seconds
Failed loading model 64k_lstm_clean_finetuned on bert for task sst_bin, skipping
Failed loading model 64k_lstm_no_whitespace_pert_finetuned on bert for task sst_bin, skipping
Evaluating model 64k_lstm_all_pert_finetuned on bert on clean for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:05<00:00,  2.91it/s]


              precision    recall  f1-score   support

         0.0     0.9244    0.8209    0.8696       268
         1.0     0.8168    0.9224    0.8664       232

    accuracy                         0.8680       500
   macro avg     0.8706    0.8717    0.8680       500
weighted avg     0.8745    0.8680    0.8681       500

Evaluation took 5.5547194480896 seconds
Evaluating model 64k_lstm_all_pert_finetuned on bert on stochastic_no_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:54<00:00,  5.49s/it]


              precision    recall  f1-score   support

         0.0     0.8700    0.7239    0.7902       268
         1.0     0.7329    0.8750    0.7976       232

    accuracy                         0.7940       500
   macro avg     0.8014    0.7994    0.7939       500
weighted avg     0.8063    0.7940    0.7937       500

Evaluation took 54.890515089035034 seconds
Evaluating model 64k_lstm_all_pert_finetuned on bert on stochastic_incl_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:54<00:00,  5.46s/it]


              precision    recall  f1-score   support

         0.0     0.8333    0.6903    0.7551       268
         1.0     0.7014    0.8405    0.7647       232

    accuracy                         0.7600       500
   macro avg     0.7674    0.7654    0.7599       500
weighted avg     0.7721    0.7600    0.7596       500

Evaluation took 54.60873103141785 seconds
Evaluating model 64k_lstm_all_pert_finetuned on bert on word_score_no_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [02:55<00:00,  2.85it/s]


              precision    recall  f1-score   support

         0.0     0.6303    0.4963    0.5553       268
         1.0     0.5329    0.6638    0.5912       232

    accuracy                         0.5740       500
   macro avg     0.5816    0.5800    0.5732       500
weighted avg     0.5851    0.5740    0.5720       500

Evaluation took 175.73686933517456 seconds
Evaluating model 64k_lstm_all_pert_finetuned on bert on word_score_incl_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [02:55<00:00,  2.84it/s]


              precision    recall  f1-score   support

         0.0     0.6182    0.5075    0.5574       268
         1.0     0.5286    0.6379    0.5781       232

    accuracy                         0.5680       500
   macro avg     0.5734    0.5727    0.5678       500
weighted avg     0.5766    0.5680    0.5670       500

Evaluation took 175.80747747421265 seconds
Failed loading model 64k_cnn_no_whitespace_pert_finetuned on bert for task sst_bin, skipping
Failed loading model 2m_lstm_all_pert_finetuned on bert for task sst_bin, skipping
Failed loading model 32k_lstm_all_pert_finetuned_100ep on bert for task sst_bin, skipping
Evaluating model baseline on roberta on clean for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.41it/s]


              precision    recall  f1-score   support

         0.0     0.9570    0.9142    0.9351       268
         1.0     0.9057    0.9526    0.9286       232

    accuracy                         0.9320       500
   macro avg     0.9314    0.9334    0.9318       500
weighted avg     0.9332    0.9320    0.9321       500

Evaluation took 2.5529420375823975 seconds
Evaluating model baseline on roberta on stochastic_no_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.50s/it]


              precision    recall  f1-score   support

         0.0     0.7586    0.8209    0.7885       268
         1.0     0.7714    0.6983    0.7330       232

    accuracy                         0.7640       500
   macro avg     0.7650    0.7596    0.7608       500
weighted avg     0.7646    0.7640    0.7628       500

Evaluation took 25.019185543060303 seconds
Evaluating model baseline on roberta on stochastic_incl_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.51s/it]


              precision    recall  f1-score   support

         0.0     0.7759    0.8396    0.8065       268
         1.0     0.7952    0.7198    0.7557       232

    accuracy                         0.7840       500
   macro avg     0.7856    0.7797    0.7811       500
weighted avg     0.7849    0.7840    0.7829       500

Evaluation took 25.069686889648438 seconds
Evaluating model baseline on roberta on word_score_no_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:59<00:00,  8.35it/s]


              precision    recall  f1-score   support

         0.0     0.5166    0.6381    0.5710       268
         1.0     0.4260    0.3103    0.3591       232

    accuracy                         0.4860       500
   macro avg     0.4713    0.4742    0.4650       500
weighted avg     0.4746    0.4860    0.4727       500

Evaluation took 59.93393397331238 seconds
Evaluating model baseline on roberta on word_score_incl_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:58<00:00,  8.59it/s]


              precision    recall  f1-score   support

         0.0     0.5031    0.5970    0.5461       268
         1.0     0.4066    0.3190    0.3575       232

    accuracy                         0.4680       500
   macro avg     0.4549    0.4580    0.4518       500
weighted avg     0.4583    0.4680    0.4586       500

Evaluation took 58.22168469429016 seconds
Failed loading model baseline_all_pert on roberta for task sst_bin, skipping
Evaluating model roben_1 on roberta on clean for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.15it/s]


              precision    recall  f1-score   support

         0.0     0.7911    0.8619    0.8250       268
         1.0     0.8221    0.7371    0.7773       232

    accuracy                         0.8040       500
   macro avg     0.8066    0.7995    0.8011       500
weighted avg     0.8055    0.8040    0.8029       500

Evaluation took 2.657008647918701 seconds
Evaluating model roben_1 on roberta on stochastic_no_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.53s/it]


              precision    recall  f1-score   support

         0.0     0.6657    0.8172    0.7337       268
         1.0     0.7135    0.5259    0.6055       232

    accuracy                         0.6820       500
   macro avg     0.6896    0.6715    0.6696       500
weighted avg     0.6878    0.6820    0.6742       500

Evaluation took 25.33419442176819 seconds
Evaluating model roben_1 on roberta on stochastic_incl_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.55s/it]


              precision    recall  f1-score   support

         0.0     0.6242    0.7687    0.6890       268
         1.0     0.6353    0.4655    0.5373       232

    accuracy                         0.6280       500
   macro avg     0.6298    0.6171    0.6131       500
weighted avg     0.6294    0.6280    0.6186       500

Evaluation took 25.472959756851196 seconds
Evaluating model roben_1 on roberta on word_score_no_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:59<00:00,  8.35it/s]


              precision    recall  f1-score   support

         0.0     0.5944    0.7985    0.6815       268
         1.0     0.6143    0.3707    0.4624       232

    accuracy                         0.6000       500
   macro avg     0.6044    0.5846    0.5719       500
weighted avg     0.6037    0.6000    0.5798       500

Evaluation took 59.877885818481445 seconds
Evaluating model roben_1 on roberta on word_score_incl_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:49<00:00, 10.09it/s]


              precision    recall  f1-score   support

         0.0     0.4207    0.5149    0.4631       268
         1.0     0.2442    0.1810    0.2079       232

    accuracy                         0.3600       500
   macro avg     0.3325    0.3480    0.3355       500
weighted avg     0.3388    0.3600    0.3447       500

Evaluation took 49.997939109802246 seconds
Evaluating model roben_2 on roberta on clean for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.37it/s]


              precision    recall  f1-score   support

         0.0     0.8436    0.8657    0.8545       268
         1.0     0.8400    0.8147    0.8271       232

    accuracy                         0.8420       500
   macro avg     0.8418    0.8402    0.8408       500
weighted avg     0.8419    0.8420    0.8418       500

Evaluation took 2.571233034133911 seconds
Evaluating model roben_2 on roberta on stochastic_no_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.60s/it]


              precision    recall  f1-score   support

         0.0     0.7311    0.8321    0.7784       268
         1.0     0.7692    0.6466    0.7026       232

    accuracy                         0.7460       500
   macro avg     0.7502    0.7393    0.7405       500
weighted avg     0.7488    0.7460    0.7432       500

Evaluation took 25.9934401512146 seconds
Evaluating model roben_2 on roberta on stochastic_incl_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.53s/it]


              precision    recall  f1-score   support

         0.0     0.6720    0.7873    0.7251       268
         1.0     0.6935    0.5560    0.6172       232

    accuracy                         0.6800       500
   macro avg     0.6828    0.6717    0.6712       500
weighted avg     0.6820    0.6800    0.6750       500

Evaluation took 25.32649564743042 seconds
Evaluating model roben_2 on roberta on word_score_no_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:07<00:00,  7.36it/s]


              precision    recall  f1-score   support

         0.0     0.6366    0.7910    0.7055       268
         1.0     0.6647    0.4784    0.5564       232

    accuracy                         0.6460       500
   macro avg     0.6507    0.6347    0.6309       500
weighted avg     0.6496    0.6460    0.6363       500

Evaluation took 67.95787072181702 seconds
Evaluating model roben_2 on roberta on word_score_incl_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:56<00:00,  8.92it/s]


              precision    recall  f1-score   support

         0.0     0.4537    0.5672    0.5041       268
         1.0     0.2970    0.2112    0.2469       232

    accuracy                         0.4020       500
   macro avg     0.3754    0.3892    0.3755       500
weighted avg     0.3810    0.4020    0.3848       500

Evaluation took 56.046173334121704 seconds
Evaluating model roben_1_tok on roberta on clean for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.82it/s]


              precision    recall  f1-score   support

         0.0     0.7284    0.8507    0.7849       268
         1.0     0.7861    0.6336    0.7017       232

    accuracy                         0.7500       500
   macro avg     0.7573    0.7422    0.7433       500
weighted avg     0.7552    0.7500    0.7463       500

Evaluation took 2.807769298553467 seconds
Evaluating model roben_1_tok on roberta on stochastic_no_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.73s/it]


              precision    recall  f1-score   support

         0.0     0.7066    0.8358    0.7658       268
         1.0     0.7596    0.5991    0.6699       232

    accuracy                         0.7260       500
   macro avg     0.7331    0.7175    0.7178       500
weighted avg     0.7312    0.7260    0.7213       500

Evaluation took 27.356130123138428 seconds
Evaluating model roben_1_tok on roberta on stochastic_incl_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.60s/it]


              precision    recall  f1-score   support

         0.0     0.5924    0.7537    0.6634       268
         1.0     0.5849    0.4009    0.4757       232

    accuracy                         0.5900       500
   macro avg     0.5886    0.5773    0.5695       500
weighted avg     0.5889    0.5900    0.5763       500

Evaluation took 26.046791315078735 seconds
Evaluating model roben_1_tok on roberta on word_score_no_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:04<00:00,  7.78it/s]


              precision    recall  f1-score   support

         0.0     0.6799    0.8321    0.7483       268
         1.0     0.7384    0.5474    0.6287       232

    accuracy                         0.7000       500
   macro avg     0.7091    0.6898    0.6885       500
weighted avg     0.7070    0.7000    0.6928       500

Evaluation took 64.26963114738464 seconds
Evaluating model roben_1_tok on roberta on word_score_incl_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:47<00:00, 10.53it/s]


              precision    recall  f1-score   support

         0.0     0.4317    0.5187    0.4712       268
         1.0     0.2753    0.2112    0.2390       232

    accuracy                         0.3760       500
   macro avg     0.3535    0.3649    0.3551       500
weighted avg     0.3591    0.3760    0.3635       500

Evaluation took 47.49795055389404 seconds
Evaluating model roben_2_tok on roberta on clean for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.23it/s]


              precision    recall  f1-score   support

         0.0     0.7677    0.8507    0.8071       268
         1.0     0.8030    0.7026    0.7494       232

    accuracy                         0.7820       500
   macro avg     0.7853    0.7767    0.7783       500
weighted avg     0.7840    0.7820    0.7803       500

Evaluation took 2.625157356262207 seconds
Evaluating model roben_2_tok on roberta on stochastic_no_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.60s/it]


              precision    recall  f1-score   support

         0.0     0.7559    0.8321    0.7922       268
         1.0     0.7805    0.6897    0.7323       232

    accuracy                         0.7660       500
   macro avg     0.7682    0.7609    0.7622       500
weighted avg     0.7673    0.7660    0.7644       500

Evaluation took 26.02118754386902 seconds
Evaluating model roben_2_tok on roberta on stochastic_incl_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.59s/it]


              precision    recall  f1-score   support

         0.0     0.6469    0.7724    0.7041       268
         1.0     0.6611    0.5129    0.5777       232

    accuracy                         0.6520       500
   macro avg     0.6540    0.6427    0.6409       500
weighted avg     0.6535    0.6520    0.6454       500

Evaluation took 25.857235431671143 seconds
Evaluating model roben_2_tok on roberta on word_score_no_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:06<00:00,  7.48it/s]


              precision    recall  f1-score   support

         0.0     0.7336    0.8321    0.7797       268
         1.0     0.7704    0.6509    0.7056       232

    accuracy                         0.7480       500
   macro avg     0.7520    0.7415    0.7427       500
weighted avg     0.7507    0.7480    0.7453       500

Evaluation took 66.83108067512512 seconds
Evaluating model roben_2_tok on roberta on word_score_incl_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:50<00:00,  9.94it/s]


              precision    recall  f1-score   support

         0.0     0.4537    0.5299    0.4888       268
         1.0     0.3262    0.2629    0.2912       232

    accuracy                         0.4060       500
   macro avg     0.3899    0.3964    0.3900       500
weighted avg     0.3945    0.4060    0.3971       500

Evaluation took 50.31882643699646 seconds
Failed loading model 64k_lstm_clean_vanilla on roberta for task sst_bin, skipping
Failed loading model 64k_lstm_no_whitespace_pert_vanilla on roberta for task sst_bin, skipping
Evaluating model 64k_lstm_all_pert_vanilla on roberta on clean for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:05<00:00,  2.81it/s]


              precision    recall  f1-score   support

         0.0     0.8985    0.8918    0.8951       268
         1.0     0.8761    0.8836    0.8798       232

    accuracy                         0.8880       500
   macro avg     0.8873    0.8877    0.8875       500
weighted avg     0.8881    0.8880    0.8880       500

Evaluation took 5.7604992389678955 seconds
Evaluating model 64k_lstm_all_pert_vanilla on roberta on stochastic_no_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:57<00:00,  5.74s/it]


              precision    recall  f1-score   support

         0.0     0.8145    0.8358    0.8250       268
         1.0     0.8044    0.7802    0.7921       232

    accuracy                         0.8100       500
   macro avg     0.8095    0.8080    0.8086       500
weighted avg     0.8099    0.8100    0.8098       500

Evaluation took 57.395200967788696 seconds
Evaluating model 64k_lstm_all_pert_vanilla on roberta on stochastic_incl_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:58<00:00,  5.87s/it]


              precision    recall  f1-score   support

         0.0     0.8229    0.8321    0.8275       268
         1.0     0.8035    0.7931    0.7983       232

    accuracy                         0.8140       500
   macro avg     0.8132    0.8126    0.8129       500
weighted avg     0.8139    0.8140    0.8139       500

Evaluation took 58.717193603515625 seconds
Evaluating model 64k_lstm_all_pert_vanilla on roberta on word_score_no_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [03:04<00:00,  2.71it/s]


              precision    recall  f1-score   support

         0.0     0.6364    0.6791    0.6570       268
         1.0     0.5981    0.5517    0.5740       232

    accuracy                         0.6200       500
   macro avg     0.6172    0.6154    0.6155       500
weighted avg     0.6186    0.6200    0.6185       500

Evaluation took 184.77083325386047 seconds
Evaluating model 64k_lstm_all_pert_vanilla on roberta on word_score_incl_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [03:12<00:00,  2.59it/s]


              precision    recall  f1-score   support

         0.0     0.6348    0.6940    0.6631       268
         1.0     0.6039    0.5388    0.5695       232

    accuracy                         0.6220       500
   macro avg     0.6193    0.6164    0.6163       500
weighted avg     0.6205    0.6220    0.6197       500

Evaluation took 192.936678647995 seconds
Failed loading model 64k_lstm_clean_finetuned on roberta for task sst_bin, skipping
Failed loading model 64k_lstm_no_whitespace_pert_finetuned on roberta for task sst_bin, skipping
Evaluating model 64k_lstm_all_pert_finetuned on roberta on clean for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:06<00:00,  2.56it/s]


              precision    recall  f1-score   support

         0.0     0.9325    0.8769    0.9038       268
         1.0     0.8669    0.9267    0.8958       232

    accuracy                         0.9000       500
   macro avg     0.8997    0.9018    0.8998       500
weighted avg     0.9021    0.9000    0.9001       500

Evaluation took 6.301508903503418 seconds
Evaluating model 64k_lstm_all_pert_finetuned on roberta on stochastic_no_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:00<00:00,  6.03s/it]


              precision    recall  f1-score   support

         0.0     0.8537    0.7836    0.8171       268
         1.0     0.7717    0.8448    0.8066       232

    accuracy                         0.8120       500
   macro avg     0.8127    0.8142    0.8119       500
weighted avg     0.8156    0.8120    0.8122       500

Evaluation took 60.30332326889038 seconds
Evaluating model 64k_lstm_all_pert_finetuned on roberta on stochastic_incl_ws for task sst_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:57<00:00,  5.75s/it]


              precision    recall  f1-score   support

         0.0     0.8583    0.8134    0.8352       268
         1.0     0.7967    0.8448    0.8201       232

    accuracy                         0.8280       500
   macro avg     0.8275    0.8291    0.8277       500
weighted avg     0.8297    0.8280    0.8282       500

Evaluation took 57.47007870674133 seconds
Evaluating model 64k_lstm_all_pert_finetuned on roberta on word_score_no_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [03:24<00:00,  2.44it/s]


              precision    recall  f1-score   support

         0.0     0.6367    0.6082    0.6221       268
         1.0     0.5697    0.5991    0.5840       232

    accuracy                         0.6040       500
   macro avg     0.6032    0.6037    0.6031       500
weighted avg     0.6056    0.6040    0.6045       500

Evaluation took 204.63213920593262 seconds
Evaluating model 64k_lstm_all_pert_finetuned on roberta on word_score_incl_ws for task sst_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [03:10<00:00,  2.62it/s]

              precision    recall  f1-score   support

         0.0     0.6513    0.6343    0.6427       268
         1.0     0.5900    0.6078    0.5987       232

    accuracy                         0.6220       500
   macro avg     0.6206    0.6210    0.6207       500
weighted avg     0.6229    0.6220    0.6223       500

Evaluation took 190.95674324035645 seconds
Failed loading model 64k_cnn_no_whitespace_pert_finetuned on roberta for task sst_bin, skipping
Failed loading model 2m_lstm_all_pert_finetuned on roberta for task sst_bin, skipping
Failed loading model 32k_lstm_all_pert_finetuned_100ep on roberta for task sst_bin, skipping





In [41]:
accuracy_df

Unnamed: 0,clean,stochastic_no_ws,stochastic_incl_ws,word_score_no_ws,word_score_incl_ws
baseline_bert_sst_bin,0.914,0.708,0.7,0.422,0.388
roben_1_bert_sst_bin,0.772,0.654,0.526,0.532,0.292
roben_2_bert_sst_bin,0.8,0.704,0.582,0.586,0.318
roben_1_tok_bert_sst_bin,0.694,0.668,0.498,0.646,0.274
roben_2_tok_bert_sst_bin,0.758,0.748,0.57,0.726,0.338
64k_lstm_clean_vanilla_bert_sst_bin,0.888,0.722,0.678,0.408,0.324
64k_lstm_no_whitespace_pert_vanilla_bert_sst_bin,0.856,0.806,0.698,0.594,0.408
64k_lstm_all_pert_vanilla_bert_sst_bin,0.862,0.79,0.768,0.57,0.568
64k_lstm_all_pert_finetuned_bert_sst_bin,0.868,0.794,0.76,0.574,0.568
baseline_roberta_sst_bin,0.932,0.764,0.784,0.486,0.468


In [42]:
f1_df

Unnamed: 0,clean,stochastic_no_ws,stochastic_incl_ws,word_score_no_ws,word_score_incl_ws
baseline_bert_sst_bin,0.913876,0.705721,0.697209,0.417693,0.380777
roben_1_bert_sst_bin,0.771942,0.653832,0.525162,0.529281,0.285414
roben_2_bert_sst_bin,0.799949,0.703882,0.580952,0.584402,0.314642
roben_1_tok_bert_sst_bin,0.693989,0.667979,0.497113,0.645931,0.270425
roben_2_tok_bert_sst_bin,0.757782,0.747673,0.569709,0.725867,0.337234
64k_lstm_clean_vanilla_bert_sst_bin,0.887855,0.721187,0.677193,0.407536,0.322917
64k_lstm_no_whitespace_pert_vanilla_bert_sst_bin,0.855917,0.805993,0.697999,0.59392,0.408
64k_lstm_all_pert_vanilla_bert_sst_bin,0.861973,0.789999,0.767941,0.569998,0.567439
64k_lstm_all_pert_finetuned_bert_sst_bin,0.867981,0.793933,0.759904,0.573247,0.567751
baseline_roberta_sst_bin,0.931843,0.760781,0.781054,0.465027,0.451782


In [43]:
accuracy_df.to_csv("../output/grid_accuracy.csv")

In [44]:
f1_df.to_csv("../output/grid_f1.csv")

-----