# Multi Model Evaluation

In [1]:
import sys
sys.path.append("..")

import copy
import cProfile
from datasets import load_dataset
import itertools
import json
import math
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
import pandas as pd
import random
from sklearn.metrics import classification_report, accuracy_score, f1_score
import time
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, DataCollatorWithPadding, \
                         AutoModelForSequenceClassification, BertForSequenceClassification, \
                         BertModel, RobertaForSequenceClassification, RobertaModel

from resilient_nlp.mini_roben import Clustering, ClusterRepRecoverer, ClusterRecovererWithPassthrough
from resilient_nlp.models import BertClassifier
from resilient_nlp.perturbers import ToyPerturber, WordScramblerPerturber
from runner import ExperimentRunner
from word_score_attack import BertWordScoreAttack

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# Tasks (datasets) evaluated in this run. The commented-out tuples are
# alternative selections kept for convenience.
#tasks = ('imdb', 'sst', 'sst_bin', 'yelp_bin', 'yelp_full')
#tasks = ('sst_bin', 'yelp_bin', 'yelp_full')
tasks = ('yelp_full',)

In [3]:
# Model families to evaluate; these strings are used as keys into the
# tokenizer / model dictionaries defined later in the notebook.
model_types = ('bert', 'roberta')

Config for final evaluation on test set

In [4]:
# Number of examples selected per task for evaluation.
eval_set_size = 500
# When True, the IMDb and SST cells sample from their dev/validation splits
# instead of the test-style splits.
use_dev_set = False

Config for evaluation on dev set

In [5]:
#eval_set_size = 113
#use_dev_set = True

In [6]:
# Upper bound, in characters, on the raw text kept for each example.
max_raw_length = 826


def preprocess(row):
    """Return the replacement 'text' field for a dataset row.

    The text is lower-cased first and then cut to at most max_raw_length
    characters; no other field is included in the returned dict.
    """
    lowered = row['text'].lower()
    return {'text': lowered[:max_raw_length]}

## IMDb Dataset

In [7]:
# Per-task containers, all keyed by task name:
#   sampled_test_set             - sampled and preprocessed huggingface dataset
#   sampled_test_set_dict        - the same rows as a plain list of {'text', 'label'} dicts
#   sampled_test_set_adv_no_ws   - perturbed copies without whitespace modifications
#   sampled_test_set_adv_incl_ws - perturbed copies including whitespace modifications
sampled_test_set = {}
sampled_test_set_dict = {}
sampled_test_set_adv_no_ws = {}
sampled_test_set_adv_incl_ws = {}

In [8]:
# Load the IMDb dataset from the project-local output directory (relative path).
imdb = load_dataset('../output/huggingface/imdb')

Using custom data configuration redacted--imdb-f63738dec0d5e230
Reusing dataset parquet (/home/user/.cache/huggingface/datasets/parquet/redacted--imdb-f63738dec0d5e230/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
# Fixed seed so the sampled evaluation subset is the same on every run.
random.seed(11)
# Dev mode draws eval_set_size random indices (random.choices samples with
# replacement); otherwise the first eval_set_size rows of the truncated
# attack-evaluation split are used as-is.
sampled_test_set['imdb'] = (
    imdb['dev'].select(random.choices(range(len(imdb['dev'])), k=eval_set_size))
    if use_dev_set
    else imdb['attack_eval_truncated'].select(range(eval_set_size))
).map(preprocess)

# Keep a plain list-of-dicts copy next to the huggingface dataset so that the
# perturbation code can deep-copy and edit the rows freely.
sampled_test_set_dict['imdb'] = [
    {key: row[key] for key in ('text', 'label')}
    for row in sampled_test_set['imdb']
]

Loading cached processed dataset at /home/user/.cache/huggingface/datasets/parquet/redacted--imdb-f63738dec0d5e230/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-b45d493b37ab1dc7.arrow


## SST-5 Dataset

In [10]:
treebank_detok = TreebankWordDetokenizer()

# Re-join the whitespace-split sentence with the Treebank detokenizer and bucket
# the score into classes of width 0.2, clamping the result at 4.
# NOTE(review): presumably the raw label is a score in [0, 1] - confirm.
sst = load_dataset('sst').map(
    lambda row: dict(
        text=treebank_detok.detokenize(row["sentence"].split()),
        label=min(math.floor(row["label"] / 0.2), 4.0),
    ),
    remove_columns=['sentence', 'tokens', 'tree'],
)

# Fixed seed so the sampled subset is reproducible; random.choices samples
# indices with replacement.
random.seed(11)
sampled_test_set['sst'] = (
    sst['validation'].select(random.choices(range(len(sst['validation'])), k=eval_set_size))
    if use_dev_set
    else sst['test'].select(random.choices(range(len(sst['test'])), k=eval_set_size))
).map(preprocess)

# Plain list-of-dicts copy of the sampled rows.
sampled_test_set_dict['sst'] = [
    {key: row[key] for key in ('text', 'label')}
    for row in sampled_test_set['sst']
]

No config specified, defaulting to: sst/default
Reusing dataset sst (/home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-3c142acdab53f98c.arrow
Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-0bf56ce0086915ee.arrow
Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-19fdf8d124be4ba7.arrow
Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-fc3e75429c3f7637.arrow


## SST-2 Dataset

In [11]:
treebank_detok = TreebankWordDetokenizer()

# Binary variant: rows whose score lies in [0.4, 0.6) are dropped, the rest are
# split at 0.5 into labels 0 / 1 (clamped at 1).
sst_bin = (
    load_dataset('sst')
    .filter(lambda row: row["label"] < 0.4 or row["label"] >= 0.6)
    .map(
        lambda row: dict(
            text=treebank_detok.detokenize(row["sentence"].split()),
            label=min(math.floor(row["label"] / 0.5), 1.0),
        )
    )
)

# Fixed seed so the sampled subset is reproducible; random.choices samples
# indices with replacement.
random.seed(11)
sampled_test_set['sst_bin'] = (
    sst_bin['validation'].select(random.choices(range(len(sst_bin['validation'])), k=eval_set_size))
    if use_dev_set
    else sst_bin['test'].select(random.choices(range(len(sst_bin['test'])), k=eval_set_size))
).map(preprocess)

# Plain list-of-dicts copy of the sampled rows.
sampled_test_set_dict['sst_bin'] = [
    {key: row[key] for key in ('text', 'label')}
    for row in sampled_test_set['sst_bin']
]

No config specified, defaulting to: sst/default
Reusing dataset sst (/home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-f4f1ada73617d193.arrow
Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-6279b6f0f8a08f9a.arrow
Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-6c5f77e5aefdd0e2.arrow
Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-c46f07c913633b4d.arrow
Loading cached processed dataset at /home/user/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-87779081ead23eae.arrow
Loading cached processed dataset at /home/user/.cache/huggingface

## Yelp-2

In [12]:
yelp_bin = load_dataset('yelp_polarity')

# Always samples from the 'test' split (use_dev_set is not consulted here).
# random.choices draws the indices with replacement.
random.seed(11)
sampled_test_set['yelp_bin'] = (
    yelp_bin['test']
    .select(random.choices(range(len(yelp_bin['test'])), k=eval_set_size))
    .map(preprocess)
)

# Plain list-of-dicts copy of the sampled rows.
sampled_test_set_dict['yelp_bin'] = [
    {key: row[key] for key in ('text', 'label')}
    for row in sampled_test_set['yelp_bin']
]

Reusing dataset yelp_polarity (/home/user/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?ex/s]

## Yelp-5

In [13]:
yelp_full = load_dataset('yelp_review_full')

# Always samples from the 'test' split (use_dev_set is not consulted here).
# random.choices draws the indices with replacement.
random.seed(11)
sampled_test_set['yelp_full'] = (
    yelp_full['test']
    .select(random.choices(range(len(yelp_full['test'])), k=eval_set_size))
    .map(preprocess)
)

# Plain list-of-dicts copy of the sampled rows.
sampled_test_set_dict['yelp_full'] = [
    {key: row[key] for key in ('text', 'label')}
    for row in sampled_test_set['yelp_full']
]

Reusing dataset yelp_review_full (/home/user/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/13c31a618ba62568ec8572a222a283dfc29a6517776a3ac5945fb508877dde43)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?ex/s]

### Perturbations

In [14]:
def generate_perturbed_multiset(input, wsp, num_copies=10, seed=11):
    """Build several independently perturbed copies of a test set.

    Args:
        input: list of row dicts, each with at least a 'text' key. The list and
            its rows are deep-copied, so the caller's data is never modified.
        wsp: perturber object; ``wsp.perturb([text])[0][0]`` is used as the
            perturbed replacement for ``text``.
        num_copies: number of perturbed copies to generate (default 10, the
            value that used to be hard-coded).
        seed: value passed to ``random.seed`` before generating, so the result
            is reproducible (default 11, the value that used to be hard-coded).
            Note that this reseeds the global ``random`` module state.

    Returns:
        A list of ``num_copies`` lists; each inner list has the same rows as
        ``input`` with the 'text' field replaced by a perturbed version.
    """
    random.seed(seed)
    result = []

    for _ in range(num_copies):
        perturbed_rows = copy.deepcopy(input)
        for row in perturbed_rows:
            row['text'] = wsp.perturb([row['text']])[0][0]
        result.append(perturbed_rows)

    return result

Perturbed set with no whitespace modifications

In [15]:
# Perturber with add / drop / swap enabled at equal weight and the word
# split / merge weights set to 0 (no whitespace modifications).
wsp = WordScramblerPerturber(perturb_prob=0.1, weight_add=1, weight_drop=1, weight_swap=1,
                             weight_split_word=0, weight_merge_words=0)

# One list of perturbed copies per selected task.
for task in tasks:
    sampled_test_set_adv_no_ws[task] = generate_perturbed_multiset(sampled_test_set_dict[task], wsp)

Perturbed set with whitespace modifications

In [16]:
# Same perturber settings as the no-whitespace cell, but with word split and
# word merge also enabled. Note that this rebinds the name `wsp`.
wsp = WordScramblerPerturber(perturb_prob=0.1, weight_add=1, weight_drop=1, weight_swap=1,
                             weight_split_word=1, weight_merge_words=1)

# One list of perturbed copies per selected task.
for task in tasks:
    sampled_test_set_adv_incl_ws[task] = generate_perturbed_multiset(sampled_test_set_dict[task], wsp)

## Models

### BERT and RoBERTa, including finetuned variants

In [17]:
# Tokenizers and base encoders are keyed by model type; the finetuned
# classifiers are keyed first by model type and then by task.
tokenizer, model_base = {}, {}
model_finetuned = {model_type: {} for model_type in model_types}
model_finetuned_all_pert = {model_type: dict() for model_type in model_types}

In [18]:
# Base (not finetuned) BERT: tokenizer plus encoder, moved to the selected device.
# These are loaded unconditionally, regardless of model_types.
bert_checkpoint = "bert-base-uncased"
tokenizer['bert'] = AutoTokenizer.from_pretrained(bert_checkpoint)
model_base['bert'] = BertModel.from_pretrained(bert_checkpoint).to(device)

In [19]:
# BERT classifier finetuned on IMDb; loaded only when both the model type and the task are selected.
bert_checkpoint_finetuned_imdb = "../output/huggingface/bert-base-uncased-imdb"
if 'bert' in model_types and 'imdb' in tasks:
    model_finetuned['bert']['imdb'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_imdb).to(device)

In [20]:
# BERT classifier from the "all-pert" IMDb checkpoint. In the visible part of
# the notebook this is the only entry ever stored in model_finetuned_all_pert,
# so looking up any other (type, task) pair there raises KeyError.
bert_checkpoint_finetuned_imdb_all_pert = "../output/huggingface/bert-base-uncased-imdb-all-pert"
if 'bert' in model_types and 'imdb' in tasks:
    model_finetuned_all_pert['bert']['imdb'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_imdb_all_pert).to(device)

In [21]:
# BERT classifier finetuned on the 'sst' task; loaded only when selected.
bert_checkpoint_finetuned_sst = "../output/huggingface/bert-base-uncased-sst"
if 'bert' in model_types and 'sst' in tasks:
    model_finetuned['bert']['sst'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_sst).to(device)

In [22]:
# BERT classifier finetuned on the 'sst_bin' task (specific saved checkpoint); loaded only when selected.
bert_checkpoint_finetuned_sst_bin = '../output/huggingface/bert-base-uncased-sst_bin/checkpoint-800'
if 'bert' in model_types and 'sst_bin' in tasks:
    model_finetuned['bert']['sst_bin'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_sst_bin).to(device)

In [23]:
# BERT classifier finetuned on the 'yelp_bin' task (specific saved checkpoint); loaded only when selected.
bert_checkpoint_finetuned_yelp_bin = '../output/huggingface/bert-base-uncased-yelp_bin/checkpoint-3500'
if 'bert' in model_types and 'yelp_bin' in tasks:
    model_finetuned['bert']['yelp_bin'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_yelp_bin).to(device)

In [24]:
# BERT classifier finetuned on the 'yelp_full' task (specific saved checkpoint); loaded only when selected.
bert_checkpoint_finetuned_yelp_full = '../output/huggingface/bert-base-uncased-yelp_full/checkpoint-1500'
if 'bert' in model_types and 'yelp_full' in tasks:
    model_finetuned['bert']['yelp_full'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_yelp_full).to(device)

In [25]:
# Base (not finetuned) RoBERTa: tokenizer plus encoder, moved to the selected device.
# These are loaded unconditionally, regardless of model_types.
roberta_checkpoint = "roberta-base"
tokenizer['roberta'] = AutoTokenizer.from_pretrained(roberta_checkpoint)
model_base['roberta'] = RobertaModel.from_pretrained(roberta_checkpoint).to(device)

In [26]:
# RoBERTa classifier finetuned on IMDb; loaded only when both the model type and the task are selected.
roberta_checkpoint_finetuned_imdb = "../output/huggingface/roberta-base-imdb"
if 'roberta' in model_types and 'imdb' in tasks:
    model_finetuned['roberta']['imdb'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_imdb).to(device)

In [27]:
# RoBERTa classifier finetuned on the 'sst' task (specific saved checkpoint); loaded only when selected.
roberta_checkpoint_finetuned_sst = '../output/huggingface/roberta-base-sst/checkpoint-900'
if 'roberta' in model_types and 'sst' in tasks:
    model_finetuned['roberta']['sst'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_sst).to(device)

In [28]:
# RoBERTa classifier finetuned on the 'sst_bin' task (specific saved checkpoint); loaded only when selected.
roberta_checkpoint_finetuned_sst_bin = '../output/huggingface/roberta-base-sst_bin/checkpoint-700'
if 'roberta' in model_types and 'sst_bin' in tasks:
    model_finetuned['roberta']['sst_bin'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_sst_bin).to(device)

In [29]:
# RoBERTa classifier finetuned on the 'yelp_bin' task (specific saved checkpoint); loaded only when selected.
roberta_checkpoint_finetuned_yelp_bin = '../output/huggingface/roberta-base-yelp_bin/checkpoint-2200'
if 'roberta' in model_types and 'yelp_bin' in tasks:
    model_finetuned['roberta']['yelp_bin'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_yelp_bin).to(device)

In [30]:
# RoBERTa classifier finetuned on the 'yelp_full' task (specific saved checkpoint); loaded only when selected.
roberta_checkpoint_finetuned_yelp_full = '../output/huggingface/roberta-base-yelp_full/checkpoint-7500'
if 'roberta' in model_types and 'yelp_full' in tasks:
    model_finetuned['roberta']['yelp_full'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_yelp_full).to(device)

### RobEn clusterings (as baseline)

The first clustering is ConnComp (which very aggressively merges clusters). The second is AggClust, which uses a cost function to better preserve fidelity. The second one should generally be better.

In [31]:
# Two RobEn clusterings loaded from files in the parent directory, each wrapped
# in a recoverer that the model wrappers can apply to input sentences.
# NOTE(review): from_pickle presumably unpickles these files - only load
# clustering files from a trusted source.
roben_clustering = Clustering.from_pickle("../vocab100000_ed1.pkl")
roben_recoverer = ClusterRecovererWithPassthrough("cache", roben_clustering)
roben_clustering2 = Clustering.from_pickle("../vocab100000_ed1_gamma0.3.pkl")
roben_recoverer2 = ClusterRecovererWithPassthrough("cache", roben_clustering2)

## Model Prediction Helpers

In [32]:
# Maximum sequence length passed to the tokenizer (max_length) and to the
# trained embedder (max_tokens) by the prediction helpers.
max_sequence_length = 128
# Number of sentences sent to a wrapped model per call in the evaluation helpers.
batch_size = 32

These are wrappers for standard (possibly finetuned) Huggingface models, using their normal tokenizers.

In [33]:
def standard_model_predict(tokenizer, model, sentences, recoverer, return_pred_tensor, recoverer_tokenize):
    """Run a Huggingface model on a batch of sentences using its own tokenizer.

    Args:
        tokenizer: callable tokenizer returning a mapping of tensors.
        model: model called with the tokenized tensors as keyword arguments.
        sentences: list of input strings.
        recoverer: optional object with a ``recover(str)`` method. When given,
            every sentence is lower-cased and passed through it first.
        return_pred_tensor: if true, return the raw model output; otherwise
            return the argmax over dimension 1 of ``output.logits``.
        recoverer_tokenize: if true (and a recoverer is given), sentences are
            Treebank-tokenized before recovery and detokenized afterwards.
    """
    if recoverer is not None:
        treebank = nltk.tokenize.treebank
        if recoverer_tokenize:
            token_lists = treebank.TreebankWordTokenizer().tokenize_sents(sentences)
            sentences = [" ".join(tokens) for tokens in token_lists]
        sentences = [recoverer.recover(text.lower()) for text in sentences]
        if recoverer_tokenize:
            detokenize = treebank.TreebankWordDetokenizer().detokenize
            sentences = [detokenize(text.split(" ")) for text in sentences]

    # Every input is padded / truncated to exactly max_sequence_length tokens.
    encoded = tokenizer(sentences, truncation=True, padding='max_length',
                        max_length=max_sequence_length, return_tensors='pt')
    model_inputs = {key: tensor.to(device) for key, tensor in encoded.items()}
    outputs = model(**model_inputs)
    return outputs if return_pred_tensor else torch.argmax(outputs.logits, dim=1)

def wrap_standard_model(tokenizer, model, recoverer=None, return_pred_tensor=True, recoverer_tokenize=False):
    """Return a one-argument callable (sentences -> prediction) bound to the given model setup.

    The returned callable forwards to standard_model_predict with the
    tokenizer, model, recoverer and flags captured here.
    """
    def predict(sentences):
        return standard_model_predict(tokenizer, model, sentences, recoverer,
                                      return_pred_tensor, recoverer_tokenize)

    return predict

This is a wrapper for the machine-trained tokenizer+embedder (a.k.a. MockingBERT).

In [34]:
def mltokenizer_model_predict(runner, model, cls_embedding, sep_embedding, pad_embedding, sentences, return_pred_tensor):
    """Run a model on embeddings produced by the trained tokenizer+embedder.

    Args:
        runner: object whose ``embed`` method turns sentences into a mapping
            with 'inputs_embeds' and 'attention_mask'.
        model: model called with ``inputs_embeds`` and ``attention_mask``.
        cls_embedding, sep_embedding, pad_embedding: embeddings passed to the
            runner as start, end and padding tokens.
        sentences: list of input strings.
        return_pred_tensor: if true, return the raw model output; otherwise
            return the argmax over dimension 1 of ``output.logits``.
    """
    # Lower-case, then cap the raw character count; the cap exists purely to
    # keep embedding fast.
    char_limit = 8 * max_sequence_length
    lowered = [text.lower()[:char_limit] for text in sentences]
    embedded = runner.embed(
        sentences=lowered,
        start_token=cls_embedding,
        end_token=sep_embedding,
        pad_token=pad_embedding,
        max_tokens=max_sequence_length,
    )
    outputs = model(inputs_embeds=embedded['inputs_embeds'],
                    attention_mask=embedded['attention_mask'])
    if not return_pred_tensor:
        return torch.argmax(outputs.logits, dim=1)
    return outputs

def wrap_mltokenizer_model(mltokenizer_prefix, tokenizer, model, cf_embedding, type, return_pred_tensor=True):
    """Return a callable (sentences -> prediction) backed by a trained tokenizer+embedder.

    Args:
        mltokenizer_prefix: path prefix of the saved embedder; the file loaded
            is ``"../<prefix>.pth"``.
        tokenizer: Huggingface tokenizer, used only to look up the ids of the
            special tokens in ``tokenizer.vocab``.
        model: classifier that is fed the produced embeddings.
        cf_embedding: embedding module mapping token-id tensors to vectors;
            used to obtain the special-token embeddings.
        type: model family, ``'bert'`` or ``'roberta'``; selects which special
            token strings are looked up.
        return_pred_tensor: forwarded to mltokenizer_model_predict.

    Raises:
        ValueError: if ``type`` is not a supported model family. This is checked
            before anything is loaded; previously an unknown type surfaced as
            an UnboundLocalError after the runner had been constructed.
    """
    # (start, end, padding) token strings per model family.
    special_tokens_by_type = {
        'bert': ('[CLS]', '[SEP]', '[PAD]'),
        'roberta': ('<s>', '</s>', '<pad>'),
    }
    if type not in special_tokens_by_type:
        raise ValueError(
            "Unsupported model type {!r}; expected one of {}".format(
                type, sorted(special_tokens_by_type)))

    filename = "../{}.pth".format(mltokenizer_prefix)
    runner = ExperimentRunner(device, model_filename=filename)

    # Look the vocabulary up once instead of once per special token.
    vocab = tokenizer.vocab
    cls_embedding, sep_embedding, pad_embedding = (
        cf_embedding(torch.tensor([vocab[token]], device=device)).view(-1)
        for token in special_tokens_by_type[type]
    )

    return lambda sentences: mltokenizer_model_predict(runner, model, cls_embedding, sep_embedding,
                                                      pad_embedding, sentences, return_pred_tensor)

## Evaluation Helpers

Evaluates a wrapped model on a test set

In [35]:
@torch.no_grad()
def evaluate_model(model, test_set):
    """Evaluate a wrapped model on a test set and print a classification report.

    Args:
        model: callable taking a list of sentences and returning an object
            with a ``logits`` tensor.
        test_set: sized iterable of rows with 'text' and 'label' entries.

    Returns:
        Tuple ``(accuracy, macro_f1)``.
    """
    sentences = [example['text'] for example in test_set]
    labels = [example['label'] for example in test_set]

    num_batches = math.ceil(len(test_set) / batch_size)
    pred_batches = []
    for batch_idx in tqdm(range(num_batches)):
        start = batch_idx * batch_size
        batch_output = model(sentences[start:start + batch_size])
        # Keep only the predicted class ids, on the CPU.
        pred_batches.append(torch.argmax(batch_output.logits, dim=1).detach().cpu())
    preds = torch.cat(pred_batches)

    print(classification_report(labels, preds, digits=4))

    return accuracy_score(labels, preds), f1_score(labels, preds, average='macro')

Evaluates a wrapped model on a stochastic, pseudo-adversarial test set. This means that each input sentence is replicated x times (typically 10) with randomized perturbations, and an attack is considered successful if *any* of the predictions is incorrect.

In [36]:
@torch.no_grad()
def evaluate_model_adv(model, test_sets):
    """Evaluate a wrapped model on several perturbed copies of one test set.

    An example counts as misclassified as soon as the prediction on any of its
    perturbed copies differs from the label. The recorded prediction for such
    an example is the most recent wrong prediction seen.

    Args:
        model: callable taking a list of sentences and returning an object
            with a ``logits`` tensor.
        test_sets: non-empty sequence of test sets (lists of rows with 'text'
            and 'label'). Labels are read from the first set, so all sets are
            expected to contain the same examples in the same order.

    Returns:
        Tuple ``(accuracy_list, f1_list)``; entry ``k`` holds the cumulative
        accuracy / macro-F1 after the first ``k + 1`` perturbed sets.

    Raises:
        ValueError: if ``test_sets`` is empty.
    """
    if len(test_sets) == 0:
        raise ValueError("test_sets must contain at least one perturbed copy of the test set")

    labels = [x['label'] for x in test_sets[0]]
    # Start from "everything correct" and overwrite entries as attacks succeed.
    adv_preds = list(labels)
    accuracy_list = []
    f1_list = []

    for test_set in tqdm(test_sets):
        sentences = [x['text'] for x in test_set]
        pred_batches = []

        for start in range(0, len(sentences), batch_size):
            output = model(sentences[start:start + batch_size])
            pred_batches.append(torch.argmax(output.logits, dim=1).detach().cpu())
        # Convert to plain Python ints so adv_preds never mixes labels with
        # 0-d tensors before it is handed to the sklearn metrics.
        preds = torch.cat(pred_batches).tolist()

        for i in range(len(adv_preds)):
            if labels[i] != preds[i]:
                adv_preds[i] = preds[i]

        accuracy_list.append(accuracy_score(labels, adv_preds))
        f1_list.append(f1_score(labels, adv_preds, average='macro'))

    print(classification_report(labels, adv_preds, digits=4))

    return accuracy_list, f1_list

Evaluates a model using WordScoreAttack

In [37]:
@torch.no_grad()
def evaluate_model_word_score(model, test_set, allow_whitespace_pert=True, report_prefix=None, word_scores_file=None):
    """Evaluate a wrapped model under the word-score attack.

    Args:
        model: wrapped model handed to the attacker.
        test_set: test set handed to ``attacker.attack``.
        allow_whitespace_pert: enables word splitting in the perturber and is
            passed to the attacker as ``attack_whitespace``.
        report_prefix: when not None, the attack results are written to
            ``<prefix>_df.csv`` and the attack statistics to
            ``<prefix>_stats.json``.
        word_scores_file: passed through to the attacker.

    Returns:
        Tuple ``(accuracy, macro_f1)`` computed on the perturbed predictions.
    """
    perturber = WordScramblerPerturber(
        perturb_prob=1, weight_add=1, weight_drop=1, weight_swap=1,
        weight_split_word=int(allow_whitespace_pert),
        # Word merging stays disabled even when whitespace attacks are allowed.
        weight_merge_words=0,
    )
    attack = BertWordScoreAttack(
        perturber, word_scores_file, model, tokenizer=None,
        max_sequence_length=max_sequence_length,
        attack_whitespace=allow_whitespace_pert,
    )

    results = attack.attack(test_set, max_tokens_to_perturb=10, max_tries_per_token=4, mode=0,
                            print_summary=False)

    if report_prefix is not None:
        results.to_csv(f"{report_prefix}_df.csv")
        with open(f"{report_prefix}_stats.json", "w") as stats_file:
            json.dump(attack.compute_attack_stats(), fp=stats_file)

    truth = results['ground_truth']
    perturbed = results['perturbed_preds']
    print(classification_report(truth, perturbed, digits=4))

    return accuracy_score(truth, perturbed), f1_score(truth, perturbed, average='macro')

In [38]:
# Registry of model factories, keyed by model name. Each factory takes
# (task, type) and returns a callable mapping a list of sentences to model
# output. Nothing is looked up or loaded until the factory is called.
all_models = {
    'baseline': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task]),
    'baseline_all_pert': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned_all_pert[type][task]),
    'roben_1': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task], roben_recoverer),
    'roben_2': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task], roben_recoverer2),
    'roben_1_tok': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task], roben_recoverer, recoverer_tokenize=True),
    'roben_2_tok': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task], roben_recoverer2, recoverer_tokenize=True),
}

# Names of the trained tokenizer+embedder variants to register as well.
mltok_model_names = [
    '64k_lstm_clean_vanilla',
    '64k_lstm_no_whitespace_pert_vanilla',
    '64k_lstm_all_pert_vanilla',
    '64k_lstm_clean_finetuned',
    '64k_lstm_no_whitespace_pert_finetuned',
    '64k_lstm_all_pert_finetuned',
    '64k_cnn_no_whitespace_pert_finetuned',
    '2m_lstm_all_pert_finetuned',
    '32k_lstm_all_pert_finetuned_100ep',
]

for name in mltok_model_names:
    if name.endswith('_vanilla'):
        # '_vanilla' variants: word embeddings come from the base model and the
        # file name does not depend on the task.
        cf_embedding = lambda task, type: model_base[type].embeddings.word_embeddings
        filename = lambda task, type, name: f'output/{type}_{name}'
    else:
        # All other variants: word embeddings come from the finetuned
        # classifier and the file name includes the task.
        cf_embedding = lambda task, type: model_finetuned[type][task].base_model.embeddings.word_embeddings
        filename = lambda task, type, name: f'output/{type}_{name}_{task}'
    # name=name is a hack to avoid Python late binding
    # (filename and cf_embedding are bound as default arguments for the same
    # reason: otherwise every factory would see the values from the last loop
    # iteration).
    all_models[name] = lambda task, type, name=name, filename=filename, cf_embedding=cf_embedding: wrap_mltokenizer_model(filename(task, type, name), tokenizer[type], model_finetuned[type][task], cf_embedding(task, type), type)

In [39]:
# Evaluation modes run for every model; these names also become the columns of
# the accuracy / F1 result tables.
evaluations = [
    'clean',
    'stochastic_no_ws',
    'stochastic_incl_ws',
    'word_score_no_ws',
    'word_score_incl_ws',
]

In [40]:
# One result row per (model, model type, task) combination.
model_task_ids = [
    f"{model}_{model_type}_{task}"
    for task, model_type, model in itertools.product(tasks, model_types, all_models.keys())
]

accuracy_df = pd.DataFrame(columns=evaluations, index=model_task_ids)
f1_df = pd.DataFrame(columns=evaluations, index=model_task_ids)

# The loop variable is called model_type (not `type`) so that the builtin
# `type` is not overwritten at notebook scope.
for task, model_type in itertools.product(tasks, model_types):
    for cur_model_name, cur_model_factory in all_models.items():
        row_id = f"{cur_model_name}_{model_type}_{task}"
        try:
            cur_model = cur_model_factory(task, model_type)
        except Exception as exc:
            # Missing checkpoints / unavailable combinations are skipped, but
            # the reason is reported, and KeyboardInterrupt is no longer swallowed.
            print(f'Failed loading model {cur_model_name} on {model_type} for task {task}, skipping')
            print(f'  reason: {exc!r}')
            accuracy_df = accuracy_df.drop(row_id)
            f1_df = f1_df.drop(row_id)
            continue
        for cur_evaluation in evaluations:
            print(f'Evaluating model {cur_model_name} on {model_type} on {cur_evaluation} for task {task}')
            start_time = time.time()
            # Reseed before every evaluation so each one is reproducible on its own.
            random.seed(11)
            report_prefix = f"../output/eval/{row_id}_{cur_evaluation}"
            if cur_evaluation == 'clean':
                acc, f1 = evaluate_model(cur_model, sampled_test_set[task])
            elif cur_evaluation in ('stochastic_no_ws', 'stochastic_incl_ws'):
                if cur_evaluation == 'stochastic_no_ws':
                    adv_sets = sampled_test_set_adv_no_ws[task]
                else:
                    adv_sets = sampled_test_set_adv_incl_ws[task]
                acc_list, f1_list = evaluate_model_adv(cur_model, adv_sets)
                # The last entry is the cumulative score over all perturbed copies.
                acc = acc_list[-1]
                f1 = f1_list[-1]
                with open(f"{report_prefix}_acc_list.json", "w") as f:
                    json.dump(acc_list, fp=f)
                with open(f"{report_prefix}_f1_list.json", "w") as f:
                    json.dump(f1_list, fp=f)
            elif cur_evaluation in ('word_score_no_ws', 'word_score_incl_ws'):
                acc, f1 = evaluate_model_word_score(
                    cur_model, sampled_test_set[task],
                    allow_whitespace_pert=(cur_evaluation == 'word_score_incl_ws'),
                    report_prefix=report_prefix,
                    word_scores_file=f"../output/{task}_word_scores.json")
            else:
                # Fail loudly instead of silently recording the previous
                # evaluation's scores under a new column.
                raise ValueError(f"Unknown evaluation {cur_evaluation!r}")

            # .loc avoids chained-indexing assignment, which may write to a copy.
            accuracy_df.loc[row_id, cur_evaluation] = acc
            f1_df.loc[row_id, cur_evaluation] = f1
            end_time = time.time()
            print(f"Evaluation took {end_time-start_time} seconds")
        del cur_model

Evaluating model baseline on bert on clean for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:03<00:00,  5.16it/s]


              precision    recall  f1-score   support

           0     0.7130    0.7624    0.7368       101
           1     0.5244    0.4778    0.5000        90
           2     0.5189    0.5446    0.5314       101
           3     0.5529    0.4896    0.5193        96
           4     0.7311    0.7768    0.7532       112

    accuracy                         0.6180       500
   macro avg     0.6081    0.6102    0.6082       500
weighted avg     0.6131    0.6180    0.6146       500

Evaluation took 3.2599024772644043 seconds
Evaluating model baseline on bert on stochastic_no_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.65s/it]


              precision    recall  f1-score   support

           0     0.5980    0.6040    0.6010       101
           1     0.2674    0.2556    0.2614        90
           2     0.2966    0.3465    0.3196       101
           3     0.2738    0.2396    0.2556        96
           4     0.6364    0.6250    0.6306       112

    accuracy                         0.4240       500
   macro avg     0.4145    0.4141    0.4136       500
weighted avg     0.4240    0.4240    0.4233       500

Evaluation took 26.55232071876526 seconds
Evaluating model baseline on bert on stochastic_incl_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.61s/it]


              precision    recall  f1-score   support

           0     0.6122    0.5941    0.6030       101
           1     0.2581    0.2667    0.2623        90
           2     0.2632    0.2970    0.2791       101
           3     0.2317    0.1979    0.2135        96
           4     0.5929    0.5982    0.5956       112

    accuracy                         0.4000       500
   macro avg     0.3916    0.3908    0.3907       500
weighted avg     0.4006    0.4000    0.3998       500

Evaluation took 26.09554696083069 seconds
Evaluating model baseline on bert on word_score_no_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:45<00:00, 10.95it/s]


              precision    recall  f1-score   support

           0     0.4894    0.4554    0.4718       101
           1     0.1667    0.2000    0.1818        90
           2     0.1545    0.1683    0.1611       101
           3     0.1518    0.1771    0.1635        96
           4     0.5000    0.3393    0.4043       112

    accuracy                         0.2720       500
   macro avg     0.2925    0.2680    0.2765       500
weighted avg     0.3012    0.2720    0.2825       500

Evaluation took 46.12910056114197 seconds
Evaluating model baseline on bert on word_score_incl_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:43<00:00, 11.60it/s]


              precision    recall  f1-score   support

           0     0.4783    0.4356    0.4560       101
           1     0.1635    0.1889    0.1753        90
           2     0.1062    0.1188    0.1121       101
           3     0.1339    0.1562    0.1442        96
           4     0.4684    0.3304    0.3874       112

    accuracy                         0.2500       500
   macro avg     0.2700    0.2460    0.2550       500
weighted avg     0.2781    0.2500    0.2608       500

Evaluation took 43.17601299285889 seconds
Failed loading model baseline_all_pert on bert for task yelp_full, skipping
Evaluating model roben_1 on bert on clean for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.96it/s]


              precision    recall  f1-score   support

           0     0.5093    0.8119    0.6260       101
           1     0.5000    0.3333    0.4000        90
           2     0.4381    0.4554    0.4466       101
           3     0.4545    0.2604    0.3311        96
           4     0.5714    0.6071    0.5887       112

    accuracy                         0.5020       500
   macro avg     0.4947    0.4936    0.4785       500
weighted avg     0.4966    0.5020    0.4841       500

Evaluation took 2.726670742034912 seconds
Evaluating model roben_1 on bert on stochastic_no_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.68s/it]


              precision    recall  f1-score   support

           0     0.4458    0.7327    0.5543       101
           1     0.2807    0.1778    0.2177        90
           2     0.2883    0.3168    0.3019       101
           3     0.2778    0.1562    0.2000        96
           4     0.5357    0.5357    0.5357       112

    accuracy                         0.3940       500
   macro avg     0.3657    0.3838    0.3619       500
weighted avg     0.3721    0.3940    0.3705       500

Evaluation took 26.84457516670227 seconds
Evaluating model roben_1 on bert on stochastic_incl_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.75s/it]


              precision    recall  f1-score   support

           0     0.4046    0.6931    0.5109       101
           1     0.1930    0.1222    0.1497        90
           2     0.2056    0.2178    0.2115       101
           3     0.2000    0.1146    0.1457        96
           4     0.4630    0.4464    0.4545       112

    accuracy                         0.3280       500
   macro avg     0.2932    0.3188    0.2945       500
weighted avg     0.3001    0.3280    0.3027       500

Evaluation took 27.460546493530273 seconds
Evaluating model roben_1 on bert on word_score_no_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:46<00:00, 10.64it/s]


              precision    recall  f1-score   support

           0     0.4458    0.7327    0.5543       101
           1     0.2581    0.1778    0.2105        90
           2     0.2500    0.2673    0.2584       101
           3     0.2456    0.1458    0.1830        96
           4     0.4486    0.4286    0.4384       112

    accuracy                         0.3580       500
   macro avg     0.3296    0.3504    0.3289       500
weighted avg     0.3346    0.3580    0.3354       500

Evaluation took 47.03564810752869 seconds
Evaluating model roben_1 on bert on word_score_incl_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:38<00:00, 12.86it/s]


              precision    recall  f1-score   support

           0     0.3690    0.6139    0.4610       101
           1     0.1061    0.0778    0.0897        90
           2     0.1455    0.1584    0.1517       101
           3     0.1094    0.0729    0.0875        96
           4     0.2826    0.2321    0.2549       112

    accuracy                         0.2360       500
   macro avg     0.2025    0.2310    0.2090       500
weighted avg     0.2073    0.2360    0.2138       500

Evaluation took 38.939995527267456 seconds
Evaluating model roben_2 on bert on clean for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.00it/s]


              precision    recall  f1-score   support

           0     0.5915    0.8317    0.6914       101
           1     0.4648    0.3667    0.4099        90
           2     0.4510    0.4554    0.4532       101
           3     0.5000    0.3333    0.4000        96
           4     0.6446    0.6964    0.6695       112

    accuracy                         0.5460       500
   macro avg     0.5304    0.5367    0.5248       500
weighted avg     0.5346    0.5460    0.5318       500

Evaluation took 2.7071306705474854 seconds
Evaluating model roben_2 on bert on stochastic_no_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.60s/it]


              precision    recall  f1-score   support

           0     0.5208    0.7426    0.6122       101
           1     0.2632    0.2222    0.2410        90
           2     0.2727    0.2673    0.2700       101
           3     0.2656    0.1771    0.2125        96
           4     0.5983    0.6250    0.6114       112

    accuracy                         0.4180       500
   macro avg     0.3841    0.4068    0.3894       500
weighted avg     0.3927    0.4180    0.3993       500

Evaluation took 26.03660750389099 seconds
Evaluating model roben_2 on bert on stochastic_incl_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.59s/it]


              precision    recall  f1-score   support

           0     0.4600    0.6832    0.5498       101
           1     0.1250    0.1000    0.1111        90
           2     0.1900    0.1881    0.1891       101
           3     0.2353    0.1667    0.1951        96
           4     0.5000    0.4911    0.4955       112

    accuracy                         0.3360       500
   macro avg     0.3021    0.3258    0.3081       500
weighted avg     0.3110    0.3360    0.3177       500

Evaluation took 25.937244176864624 seconds
Evaluating model roben_2 on bert on word_score_no_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:50<00:00,  9.96it/s]


              precision    recall  f1-score   support

           0     0.5203    0.7624    0.6185       101
           1     0.2778    0.2222    0.2469        90
           2     0.2523    0.2673    0.2596       101
           3     0.2500    0.1771    0.2073        96
           4     0.5524    0.5179    0.5346       112

    accuracy                         0.3980       500
   macro avg     0.3706    0.3894    0.3734       500
weighted avg     0.3778    0.3980    0.3814       500

Evaluation took 50.373942375183105 seconds
Evaluating model roben_2 on bert on word_score_incl_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:43<00:00, 11.59it/s]


              precision    recall  f1-score   support

           0     0.4211    0.6337    0.5059       101
           1     0.1343    0.1000    0.1146        90
           2     0.1327    0.1485    0.1402       101
           3     0.1250    0.1042    0.1136        96
           4     0.3636    0.2857    0.3200       112

    accuracy                         0.2600       500
   macro avg     0.2354    0.2544    0.2389       500
weighted avg     0.2415    0.2600    0.2447       500

Evaluation took 43.18964886665344 seconds
Evaluating model roben_1_tok on bert on clean for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:03<00:00,  5.16it/s]


              precision    recall  f1-score   support

           0     0.4121    0.8119    0.5467       101
           1     0.3333    0.1889    0.2411        90
           2     0.3913    0.3564    0.3731       101
           3     0.5000    0.2292    0.3143        96
           4     0.5877    0.5982    0.5929       112

    accuracy                         0.4480       500
   macro avg     0.4449    0.4369    0.4136       500
weighted avg     0.4499    0.4480    0.4223       500

Evaluation took 3.142418622970581 seconds
Evaluating model roben_1_tok on bert on stochastic_no_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:29<00:00,  2.94s/it]


              precision    recall  f1-score   support

           0     0.3980    0.7921    0.5298       101
           1     0.2222    0.1111    0.1481        90
           2     0.2816    0.2871    0.2843       101
           3     0.3590    0.1458    0.2074        96
           4     0.5446    0.5446    0.5446       112

    accuracy                         0.3880       500
   macro avg     0.3611    0.3762    0.3429       500
weighted avg     0.3682    0.3880    0.3529       500

Evaluation took 29.369975328445435 seconds
Evaluating model roben_1_tok on bert on stochastic_incl_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:29<00:00,  2.90s/it]


              precision    recall  f1-score   support

           0     0.3564    0.7129    0.4752       101
           1     0.1132    0.0667    0.0839        90
           2     0.1613    0.1485    0.1546       101
           3     0.1951    0.0833    0.1168        96
           4     0.4775    0.4732    0.4753       112

    accuracy                         0.3080       500
   macro avg     0.2607    0.2969    0.2612       500
weighted avg     0.2694    0.3080    0.2712       500

Evaluation took 29.032265424728394 seconds
Evaluating model roben_1_tok on bert on word_score_no_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:48<00:00, 10.37it/s]


              precision    recall  f1-score   support

           0     0.3816    0.7822    0.5130       101
           1     0.1765    0.1000    0.1277        90
           2     0.2872    0.2673    0.2769       101
           3     0.3864    0.1771    0.2429        96
           4     0.5288    0.4911    0.5093       112

    accuracy                         0.3740       500
   macro avg     0.3521    0.3635    0.3339       500
weighted avg     0.3595    0.3740    0.3432       500

Evaluation took 48.27109122276306 seconds
Evaluating model roben_1_tok on bert on word_score_incl_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:40<00:00, 12.24it/s]


              precision    recall  f1-score   support

           0     0.3131    0.6634    0.4254       101
           1     0.0893    0.0556    0.0685        90
           2     0.1340    0.1287    0.1313       101
           3     0.1395    0.0625    0.0863        96
           4     0.3111    0.2500    0.2772       112

    accuracy                         0.2380       500
   macro avg     0.1974    0.2320    0.1978       500
weighted avg     0.2029    0.2380    0.2035       500

Evaluation took 40.89259672164917 seconds
Evaluating model roben_2_tok on bert on clean for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.52it/s]


              precision    recall  f1-score   support

           0     0.5263    0.7921    0.6324       101
           1     0.4189    0.3444    0.3780        90
           2     0.4421    0.4158    0.4286       101
           3     0.5660    0.3125    0.4027        96
           4     0.6270    0.7054    0.6639       112

    accuracy                         0.5240       500
   macro avg     0.5161    0.5140    0.5011       500
weighted avg     0.5202    0.5240    0.5084       500

Evaluation took 2.9381890296936035 seconds
Evaluating model roben_2_tok on bert on stochastic_no_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.84s/it]


              precision    recall  f1-score   support

           0     0.4780    0.7525    0.5846       101
           1     0.2917    0.2333    0.2593        90
           2     0.3158    0.2970    0.3061       101
           3     0.3793    0.2292    0.2857        96
           4     0.5776    0.5982    0.5877       112

    accuracy                         0.4320       500
   macro avg     0.4085    0.4220    0.4047       500
weighted avg     0.4150    0.4320    0.4131       500

Evaluation took 28.412063360214233 seconds
Evaluating model roben_2_tok on bert on stochastic_incl_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.86s/it]


              precision    recall  f1-score   support

           0     0.4403    0.6931    0.5385       101
           1     0.1429    0.1111    0.1250        90
           2     0.1596    0.1485    0.1538       101
           3     0.1754    0.1042    0.1307        96
           4     0.5000    0.5357    0.5172       112

    accuracy                         0.3300       500
   macro avg     0.2836    0.3185    0.2931       500
weighted avg     0.2926    0.3300    0.3033       500

Evaluation took 28.583747386932373 seconds
Evaluating model roben_2_tok on bert on word_score_no_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:53<00:00,  9.28it/s]


              precision    recall  f1-score   support

           0     0.4938    0.7822    0.6054       101
           1     0.3143    0.2444    0.2750        90
           2     0.2887    0.2772    0.2828       101
           3     0.3455    0.1979    0.2517        96
           4     0.5508    0.5804    0.5652       112

    accuracy                         0.4260       500
   macro avg     0.3986    0.4164    0.3960       500
weighted avg     0.4043    0.4260    0.4038       500

Evaluation took 53.935322523117065 seconds
Evaluating model roben_2_tok on bert on word_score_incl_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:43<00:00, 11.59it/s]


              precision    recall  f1-score   support

           0     0.3988    0.6436    0.4924       101
           1     0.1304    0.1000    0.1132        90
           2     0.1296    0.1386    0.1340       101
           3     0.1231    0.0833    0.0994        96
           4     0.3579    0.3036    0.3285       112

    accuracy                         0.2600       500
   macro avg     0.2280    0.2538    0.2335       500
weighted avg     0.2340    0.2600    0.2396       500

Evaluation took 43.20488953590393 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on clean for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:21<00:00,  1.37s/it]


              precision    recall  f1-score   support

           0     0.7143    0.7426    0.7282       101
           1     0.5059    0.4778    0.4914        90
           2     0.5300    0.5248    0.5274       101
           3     0.5155    0.5208    0.5181        96
           4     0.7168    0.7232    0.7200       112

    accuracy                         0.6040       500
   macro avg     0.5965    0.5978    0.5970       500
weighted avg     0.6019    0.6040    0.6028       500

Evaluation took 21.974204778671265 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on stochastic_no_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:45<00:00, 22.56s/it]


              precision    recall  f1-score   support

           0     0.6100    0.6040    0.6070       101
           1     0.2442    0.2333    0.2386        90
           2     0.2931    0.3366    0.3134       101
           3     0.2340    0.2292    0.2316        96
           4     0.6346    0.5893    0.6111       112

    accuracy                         0.4080       500
   macro avg     0.4032    0.3985    0.4003       500
weighted avg     0.4135    0.4080    0.4102       500

Evaluation took 225.61358785629272 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on stochastic_incl_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:52<00:00, 23.22s/it]


              precision    recall  f1-score   support

           0     0.5728    0.5842    0.5784       101
           1     0.1625    0.1444    0.1529        90
           2     0.2373    0.2772    0.2557       101
           3     0.2604    0.2604    0.2604        96
           4     0.6019    0.5536    0.5767       112

    accuracy                         0.3740       500
   macro avg     0.3670    0.3640    0.3648       500
weighted avg     0.3777    0.3740    0.3752       500

Evaluation took 232.2303569316864 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on word_score_no_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [06:49<00:00,  1.22it/s]


              precision    recall  f1-score   support

           0     0.5275    0.4752    0.5000       101
           1     0.1717    0.1889    0.1799        90
           2     0.1652    0.1881    0.1759       101
           3     0.1524    0.1667    0.1592        96
           4     0.5556    0.4464    0.4950       112

    accuracy                         0.3000       500
   macro avg     0.3145    0.2931    0.3020       500
weighted avg     0.3245    0.3000    0.3104       500

Evaluation took 409.79142451286316 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on word_score_incl_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [06:17<00:00,  1.32it/s]


              precision    recall  f1-score   support

           0     0.4835    0.4356    0.4583       101
           1     0.1170    0.1222    0.1196        90
           2     0.1102    0.1287    0.1187       101
           3     0.1150    0.1354    0.1244        96
           4     0.5000    0.3750    0.4286       112

    accuracy                         0.2460       500
   macro avg     0.2652    0.2394    0.2499       500
weighted avg     0.2751    0.2460    0.2580       500

Evaluation took 377.7074685096741 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on clean for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:22<00:00,  1.38s/it]


              precision    recall  f1-score   support

           0     0.6847    0.7525    0.7170       101
           1     0.4868    0.4111    0.4458        90
           2     0.5268    0.5842    0.5540       101
           3     0.5632    0.5104    0.5355        96
           4     0.7193    0.7321    0.7257       112

    accuracy                         0.6060       500
   macro avg     0.5962    0.5981    0.5956       500
weighted avg     0.6016    0.6060    0.6023       500

Evaluation took 22.09602999687195 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on stochastic_no_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:44<00:00, 22.42s/it]


              precision    recall  f1-score   support

           0     0.6606    0.7129    0.6857       101
           1     0.3662    0.2889    0.3230        90
           2     0.3983    0.4653    0.4292       101
           3     0.4045    0.3750    0.3892        96
           4     0.6726    0.6786    0.6756       112

    accuracy                         0.5140       500
   macro avg     0.5004    0.5041    0.5005       500
weighted avg     0.5081    0.5140    0.5094       500

Evaluation took 224.23424768447876 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on stochastic_incl_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:46<00:00, 22.70s/it]


              precision    recall  f1-score   support

           0     0.6126    0.6733    0.6415       101
           1     0.2329    0.1889    0.2086        90
           2     0.2705    0.3267    0.2960       101
           3     0.2706    0.2396    0.2541        96
           4     0.6330    0.6161    0.6244       112

    accuracy                         0.4200       500
   macro avg     0.4039    0.4089    0.4049       500
weighted avg     0.4141    0.4200    0.4156       500

Evaluation took 226.9945273399353 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on word_score_no_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [07:33<00:00,  1.10it/s]


              precision    recall  f1-score   support

           0     0.6296    0.6733    0.6507       101
           1     0.2297    0.1889    0.2073        90
           2     0.2810    0.3366    0.3063       101
           3     0.2637    0.2500    0.2567        96
           4     0.6132    0.5804    0.5963       112

    accuracy                         0.4160       500
   macro avg     0.4035    0.4058    0.4035       500
weighted avg     0.4133    0.4160    0.4135       500

Evaluation took 453.665025472641 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on word_score_incl_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [06:30<00:00,  1.28it/s]


              precision    recall  f1-score   support

           0     0.5347    0.5347    0.5347       101
           1     0.0921    0.0778    0.0843        90
           2     0.1890    0.2376    0.2105       101
           3     0.1881    0.1979    0.1929        96
           4     0.5263    0.4464    0.4831       112

    accuracy                         0.3080       500
   macro avg     0.3060    0.2989    0.3011       500
weighted avg     0.3168    0.3080    0.3110       500

Evaluation took 390.55054664611816 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on clean for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:21<00:00,  1.35s/it]


              precision    recall  f1-score   support

           0     0.6952    0.7228    0.7087       101
           1     0.4940    0.4556    0.4740        90
           2     0.4956    0.5545    0.5234       101
           3     0.5233    0.4688    0.4945        96
           4     0.7522    0.7589    0.7556       112

    accuracy                         0.6000       500
   macro avg     0.5921    0.5921    0.5912       500
weighted avg     0.5984    0.6000    0.5984       500

Evaluation took 21.681747436523438 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on stochastic_no_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:35<00:00, 21.52s/it]


              precision    recall  f1-score   support

           0     0.6635    0.6832    0.6732       101
           1     0.3974    0.3444    0.3690        90
           2     0.3770    0.4554    0.4126       101
           3     0.3523    0.3229    0.3370        96
           4     0.6759    0.6518    0.6636       112

    accuracy                         0.5000       500
   macro avg     0.4932    0.4916    0.4911       500
weighted avg     0.5008    0.5000    0.4991       500

Evaluation took 215.19122004508972 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on stochastic_incl_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:33<00:00, 21.37s/it]


              precision    recall  f1-score   support

           0     0.6381    0.6634    0.6505       101
           1     0.3659    0.3333    0.3488        90
           2     0.3793    0.4356    0.4055       101
           3     0.3614    0.3125    0.3352        96
           4     0.6579    0.6696    0.6637       112

    accuracy                         0.4920       500
   macro avg     0.4805    0.4829    0.4808       500
weighted avg     0.4881    0.4920    0.4891       500

Evaluation took 213.69527649879456 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on word_score_no_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [07:16<00:00,  1.15it/s]


              precision    recall  f1-score   support

           0     0.6117    0.6238    0.6176       101
           1     0.2738    0.2556    0.2644        90
           2     0.2589    0.2871    0.2723       101
           3     0.2500    0.2500    0.2500        96
           4     0.6190    0.5804    0.5991       112

    accuracy                         0.4080       500
   macro avg     0.4027    0.3994    0.4007       500
weighted avg     0.4118    0.4080    0.4095       500

Evaluation took 436.53992772102356 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on word_score_incl_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [07:20<00:00,  1.13it/s]


              precision    recall  f1-score   support

           0     0.6117    0.6238    0.6176       101
           1     0.2568    0.2111    0.2317        90
           2     0.3016    0.3762    0.3348       101
           3     0.2449    0.2500    0.2474        96
           4     0.6162    0.5446    0.5782       112

    accuracy                         0.4100       500
   macro avg     0.4062    0.4012    0.4020       500
weighted avg     0.4157    0.4100    0.4111       500

Evaluation took 440.8498933315277 seconds
Failed loading model 64k_lstm_clean_finetuned on bert for task yelp_full, skipping
Failed loading model 64k_lstm_no_whitespace_pert_finetuned on bert for task yelp_full, skipping
Evaluating model 64k_lstm_all_pert_finetuned on bert on clean for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:21<00:00,  1.35s/it]


              precision    recall  f1-score   support

           0     0.6944    0.7426    0.7177       101
           1     0.5357    0.5000    0.5172        90
           2     0.5000    0.5050    0.5025       101
           3     0.5393    0.5000    0.5189        96
           4     0.7436    0.7768    0.7598       112

    accuracy                         0.6120       500
   macro avg     0.6026    0.6049    0.6032       500
weighted avg     0.6078    0.6120    0.6094       500

Evaluation took 21.573001861572266 seconds
Evaluating model 64k_lstm_all_pert_finetuned on bert on stochastic_no_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:35<00:00, 21.52s/it]


              precision    recall  f1-score   support

           0     0.6364    0.6931    0.6635       101
           1     0.4286    0.3667    0.3952        90
           2     0.3810    0.3960    0.3883       101
           3     0.3956    0.3750    0.3850        96
           4     0.6667    0.6964    0.6812       112

    accuracy                         0.5140       500
   macro avg     0.5016    0.5054    0.5027       500
weighted avg     0.5079    0.5140    0.5101       500

Evaluation took 215.23102068901062 seconds
Evaluating model 64k_lstm_all_pert_finetuned on bert on stochastic_incl_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:33<00:00, 21.38s/it]


              precision    recall  f1-score   support

           0     0.6455    0.7030    0.6730       101
           1     0.3636    0.3111    0.3353        90
           2     0.3495    0.3564    0.3529       101
           3     0.4070    0.3646    0.3846        96
           4     0.6452    0.7143    0.6780       112

    accuracy                         0.5000       500
   macro avg     0.4821    0.4899    0.4848       500
weighted avg     0.4891    0.5000    0.4933       500

Evaluation took 213.7671618461609 seconds
Evaluating model 64k_lstm_all_pert_finetuned on bert on word_score_no_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [07:26<00:00,  1.12it/s]


              precision    recall  f1-score   support

           0     0.5766    0.6337    0.6038       101
           1     0.2603    0.2111    0.2331        90
           2     0.2569    0.2772    0.2667       101
           3     0.2621    0.2812    0.2714        96
           4     0.6058    0.5625    0.5833       112

    accuracy                         0.4020       500
   macro avg     0.3923    0.3932    0.3917       500
weighted avg     0.4012    0.4020    0.4006       500

Evaluation took 446.15980672836304 seconds
Evaluating model 64k_lstm_all_pert_finetuned on bert on word_score_incl_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [07:23<00:00,  1.13it/s]


              precision    recall  f1-score   support

           0     0.5888    0.6238    0.6058       101
           1     0.2805    0.2556    0.2674        90
           2     0.2400    0.2376    0.2388       101
           3     0.2549    0.2708    0.2626        96
           4     0.5963    0.5804    0.5882       112

    accuracy                         0.4020       500
   macro avg     0.3921    0.3936    0.3926       500
weighted avg     0.4004    0.4020    0.4009       500

Evaluation took 443.9069604873657 seconds
Failed loading model 64k_cnn_no_whitespace_pert_finetuned on bert for task yelp_full, skipping
Failed loading model 2m_lstm_all_pert_finetuned on bert for task yelp_full, skipping
Failed loading model 32k_lstm_all_pert_finetuned_100ep on bert for task yelp_full, skipping
Evaluating model baseline on roberta on clean for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.10it/s]


              precision    recall  f1-score   support

           0     0.8372    0.7129    0.7701       101
           1     0.5644    0.6333    0.5969        90
           2     0.5761    0.5248    0.5492       101
           3     0.5083    0.6354    0.5648        96
           4     0.7525    0.6786    0.7136       112

    accuracy                         0.6380       500
   macro avg     0.6477    0.6370    0.6389       500
weighted avg     0.6532    0.6380    0.6422       500

Evaluation took 2.6618523597717285 seconds
Evaluating model baseline on roberta on stochastic_no_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.53s/it]


              precision    recall  f1-score   support

           0     0.7059    0.5941    0.6452       101
           1     0.3673    0.4000    0.3830        90
           2     0.3516    0.3168    0.3333       101
           3     0.3409    0.4688    0.3947        96
           4     0.6383    0.5357    0.5825       112

    accuracy                         0.4660       500
   macro avg     0.4808    0.4631    0.4677       500
weighted avg     0.4882    0.4660    0.4729       500

Evaluation took 25.310571432113647 seconds
Evaluating model baseline on roberta on stochastic_incl_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.54s/it]


              precision    recall  f1-score   support

           0     0.7073    0.5743    0.6339       101
           1     0.3600    0.4000    0.3789        90
           2     0.3511    0.3267    0.3385       101
           3     0.3468    0.4479    0.3909        96
           4     0.6400    0.5714    0.6038       112

    accuracy                         0.4680       500
   macro avg     0.4810    0.4641    0.4692       500
weighted avg     0.4885    0.4680    0.4749       500

Evaluation took 25.368825912475586 seconds
Evaluating model baseline on roberta on word_score_no_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:50<00:00,  9.81it/s]


              precision    recall  f1-score   support

           0     0.6364    0.4158    0.5030       101
           1     0.2845    0.3667    0.3204        90
           2     0.2800    0.2772    0.2786       101
           3     0.2643    0.3854    0.3136        96
           4     0.5641    0.3929    0.4632       112

    accuracy                         0.3680       500
   macro avg     0.4058    0.3676    0.3757       500
weighted avg     0.4134    0.3680    0.3795       500

Evaluation took 50.99361276626587 seconds
Evaluating model baseline on roberta on word_score_incl_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:50<00:00,  9.84it/s]


              precision    recall  f1-score   support

           0     0.6216    0.4554    0.5257       101
           1     0.2752    0.3333    0.3015        90
           2     0.2816    0.2871    0.2843       101
           3     0.2348    0.3229    0.2719        96
           4     0.5366    0.3929    0.4536       112

    accuracy                         0.3600       500
   macro avg     0.3900    0.3583    0.3674       500
weighted avg     0.3973    0.3600    0.3717       500

Evaluation took 50.854520082473755 seconds
Failed loading model baseline_all_pert on roberta for task yelp_full, skipping
Evaluating model roben_1 on roberta on clean for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.27it/s]


              precision    recall  f1-score   support

           0     0.6032    0.7525    0.6696       101
           1     0.4359    0.3778    0.4048        90
           2     0.4675    0.3564    0.4045       101
           3     0.4595    0.5312    0.4928        96
           4     0.6296    0.6071    0.6182       112

    accuracy                         0.5300       500
   macro avg     0.5191    0.5250    0.5180       500
weighted avg     0.5240    0.5300    0.5229       500

Evaluation took 2.592484951019287 seconds
Evaluating model roben_1 on roberta on stochastic_no_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.54s/it]


              precision    recall  f1-score   support

           0     0.5635    0.7030    0.6256       101
           1     0.3333    0.3000    0.3158        90
           2     0.3210    0.2574    0.2857       101
           3     0.3273    0.3750    0.3495        96
           4     0.5588    0.5089    0.5327       112

    accuracy                         0.4340       500
   macro avg     0.4208    0.4289    0.4219       500
weighted avg     0.4267    0.4340    0.4274       500

Evaluation took 25.363498210906982 seconds
Evaluating model roben_1 on roberta on stochastic_incl_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.54s/it]


              precision    recall  f1-score   support

           0     0.5276    0.6634    0.5877       101
           1     0.2439    0.2222    0.2326        90
           2     0.2179    0.1683    0.1899       101
           3     0.2456    0.2917    0.2667        96
           4     0.5051    0.4464    0.4739       112

    accuracy                         0.3640       500
   macro avg     0.3480    0.3584    0.3502       500
weighted avg     0.3548    0.3640    0.3563       500

Evaluation took 25.383612394332886 seconds
Evaluating model roben_1 on roberta on word_score_no_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:48<00:00, 10.29it/s]


              precision    recall  f1-score   support

           0     0.5469    0.6931    0.6114       101
           1     0.2750    0.2444    0.2588        90
           2     0.2840    0.2277    0.2527       101
           3     0.3109    0.3854    0.3442        96
           4     0.5217    0.4286    0.4706       112

    accuracy                         0.4000       500
   macro avg     0.3877    0.3958    0.3875       500
weighted avg     0.3939    0.4000    0.3926       500

Evaluation took 48.658371925354004 seconds
Evaluating model roben_1 on roberta on word_score_incl_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:44<00:00, 11.31it/s]


              precision    recall  f1-score   support

           0     0.4925    0.6535    0.5617       101
           1     0.2125    0.1889    0.2000        90
           2     0.1579    0.1188    0.1356       101
           3     0.2031    0.2708    0.2321        96
           4     0.3537    0.2589    0.2990       112

    accuracy                         0.3000       500
   macro avg     0.2839    0.2982    0.2857       500
weighted avg     0.2879    0.3000    0.2884       500

Evaluation took 44.2679545879364 seconds
Evaluating model roben_2 on roberta on clean for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.27it/s]


              precision    recall  f1-score   support

           0     0.6757    0.7426    0.7075       101
           1     0.5500    0.4889    0.5176        90
           2     0.5316    0.4158    0.4667       101
           3     0.4615    0.6250    0.5310        96
           4     0.6400    0.5714    0.6038       112

    accuracy                         0.5700       500
   macro avg     0.5718    0.5687    0.5653       500
weighted avg     0.5749    0.5700    0.5676       500

Evaluation took 2.5952091217041016 seconds
Evaluating model roben_2 on roberta on stochastic_no_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.54s/it]


              precision    recall  f1-score   support

           0     0.6283    0.7030    0.6636       101
           1     0.4217    0.3889    0.4046        90
           2     0.3797    0.2970    0.3333       101
           3     0.3760    0.4896    0.4253        96
           4     0.5700    0.5089    0.5377       112

    accuracy                         0.4800       500
   macro avg     0.4752    0.4775    0.4729       500
weighted avg     0.4794    0.4800    0.4763       500

Evaluation took 25.383232593536377 seconds
Evaluating model roben_2 on roberta on stochastic_incl_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.54s/it]


              precision    recall  f1-score   support

           0     0.5804    0.6436    0.6103       101
           1     0.3415    0.3111    0.3256        90
           2     0.3250    0.2574    0.2873       101
           3     0.3101    0.4167    0.3556        96
           4     0.5567    0.4821    0.5167       112

    accuracy                         0.4260       500
   macro avg     0.4227    0.4222    0.4191       500
weighted avg     0.4286    0.4260    0.4239       500

Evaluation took 25.397918224334717 seconds
Evaluating model roben_2 on roberta on word_score_no_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:52<00:00,  9.50it/s]


              precision    recall  f1-score   support

           0     0.6126    0.6733    0.6415       101
           1     0.3721    0.3556    0.3636        90
           2     0.3421    0.2574    0.2938       101
           3     0.3433    0.4792    0.4000        96
           4     0.5484    0.4554    0.4976       112

    accuracy                         0.4460       500
   macro avg     0.4437    0.4442    0.4393       500
weighted avg     0.4486    0.4460    0.4426       500

Evaluation took 52.691075563430786 seconds
Evaluating model roben_2 on roberta on word_score_incl_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:48<00:00, 10.36it/s]


              precision    recall  f1-score   support

           0     0.5364    0.5842    0.5592       101
           1     0.2619    0.2444    0.2529        90
           2     0.2500    0.1980    0.2210       101
           3     0.2746    0.4062    0.3277        96
           4     0.4762    0.3571    0.4082       112

    accuracy                         0.3600       500
   macro avg     0.3598    0.3580    0.3538       500
weighted avg     0.3654    0.3600    0.3575       500

Evaluation took 48.30201697349548 seconds
Evaluating model roben_1_tok on roberta on clean for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.82it/s]


              precision    recall  f1-score   support

           0     0.5226    0.8020    0.6328       101
           1     0.3857    0.3000    0.3375        90
           2     0.4925    0.3267    0.3929       101
           3     0.4400    0.4583    0.4490        96
           4     0.6019    0.5804    0.5909       112

    accuracy                         0.5000       500
   macro avg     0.4885    0.4935    0.4806       500
weighted avg     0.4938    0.5000    0.4865       500

Evaluation took 2.7910046577453613 seconds
Evaluating model roben_1_tok on roberta on stochastic_no_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.73s/it]


              precision    recall  f1-score   support

           0     0.4906    0.7723    0.6000       101
           1     0.2958    0.2333    0.2609        90
           2     0.3562    0.2574    0.2989       101
           3     0.3297    0.3125    0.3209        96
           4     0.5283    0.5000    0.5138       112

    accuracy                         0.4220       500
   macro avg     0.4001    0.4151    0.3989       500
weighted avg     0.4059    0.4220    0.4052       500

Evaluation took 27.279224157333374 seconds
Evaluating model roben_1_tok on roberta on stochastic_incl_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.73s/it]


              precision    recall  f1-score   support

           0     0.4423    0.6832    0.5370       101
           1     0.1622    0.1333    0.1463        90
           2     0.2535    0.1782    0.2093       101
           3     0.2292    0.2292    0.2292        96
           4     0.4757    0.4375    0.4558       112

    accuracy                         0.3400       500
   macro avg     0.3126    0.3323    0.3155       500
weighted avg     0.3203    0.3400    0.3232       500

Evaluation took 27.29029679298401 seconds
Evaluating model roben_1_tok on roberta on word_score_no_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:53<00:00,  9.43it/s]


              precision    recall  f1-score   support

           0     0.4937    0.7723    0.6023       101
           1     0.3014    0.2444    0.2699        90
           2     0.3333    0.2178    0.2635       101
           3     0.3173    0.3438    0.3300        96
           4     0.5354    0.4732    0.5024       112

    accuracy                         0.4160       500
   macro avg     0.3962    0.4103    0.3936       500
weighted avg     0.4021    0.4160    0.3994       500

Evaluation took 53.07710552215576 seconds
Evaluating model roben_1_tok on roberta on word_score_incl_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:46<00:00, 10.70it/s]


              precision    recall  f1-score   support

           0     0.4207    0.6832    0.5208       101
           1     0.1857    0.1444    0.1625        90
           2     0.2394    0.1683    0.1977       101
           3     0.2252    0.2604    0.2415        96
           4     0.3929    0.2946    0.3367       112

    accuracy                         0.3140       500
   macro avg     0.2928    0.3102    0.2918       500
weighted avg     0.2980    0.3140    0.2962       500

Evaluation took 46.7955584526062 seconds
Evaluating model roben_2_tok on roberta on clean for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.83it/s]


              precision    recall  f1-score   support

           0     0.6387    0.7525    0.6909       101
           1     0.5342    0.4333    0.4785        90
           2     0.5185    0.4158    0.4615       101
           3     0.4786    0.5833    0.5258        96
           4     0.6455    0.6339    0.6396       112

    accuracy                         0.5680       500
   macro avg     0.5631    0.5638    0.5593       500
weighted avg     0.5664    0.5680    0.5632       500

Evaluation took 2.7836661338806152 seconds
Evaluating model roben_2_tok on roberta on stochastic_no_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.73s/it]


              precision    recall  f1-score   support

           0     0.6016    0.7327    0.6607       101
           1     0.4366    0.3444    0.3851        90
           2     0.4074    0.3267    0.3626       101
           3     0.3793    0.4583    0.4151        96
           4     0.5780    0.5625    0.5701       112

    accuracy                         0.4900       500
   macro avg     0.4806    0.4849    0.4787       500
weighted avg     0.4847    0.4900    0.4834       500

Evaluation took 27.318827152252197 seconds
Evaluating model roben_2_tok on roberta on stochastic_incl_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.73s/it]


              precision    recall  f1-score   support

           0     0.5635    0.7030    0.6256       101
           1     0.2973    0.2444    0.2683        90
           2     0.2791    0.2376    0.2567       101
           3     0.2931    0.3542    0.3208        96
           4     0.5510    0.4821    0.5143       112

    accuracy                         0.4100       500
   macro avg     0.3968    0.4043    0.3971       500
weighted avg     0.4034    0.4100    0.4033       500

Evaluation took 27.298561334609985 seconds
Evaluating model roben_2_tok on roberta on word_score_no_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:59<00:00,  8.41it/s]


              precision    recall  f1-score   support

           0     0.5984    0.7228    0.6547       101
           1     0.4133    0.3444    0.3758        90
           2     0.3827    0.3069    0.3407       101
           3     0.4000    0.5000    0.4444        96
           4     0.5980    0.5446    0.5701       112

    accuracy                         0.4880       500
   macro avg     0.4785    0.4838    0.4771       500
weighted avg     0.4833    0.4880    0.4817       500

Evaluation took 59.525678634643555 seconds
Evaluating model roben_2_tok on roberta on word_score_incl_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:52<00:00,  9.50it/s]


              precision    recall  f1-score   support

           0     0.5217    0.5941    0.5556       101
           1     0.2927    0.2667    0.2791        90
           2     0.2471    0.2079    0.2258       101
           3     0.2645    0.3333    0.2949        96
           4     0.4742    0.4107    0.4402       112

    accuracy                         0.3660       500
   macro avg     0.3600    0.3625    0.3591       500
weighted avg     0.3650    0.3660    0.3633       500

Evaluation took 52.67255401611328 seconds
Failed loading model 64k_lstm_clean_vanilla on roberta for task yelp_full, skipping
Failed loading model 64k_lstm_no_whitespace_pert_vanilla on roberta for task yelp_full, skipping
Evaluating model 64k_lstm_all_pert_vanilla on roberta on clean for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:21<00:00,  1.37s/it]


              precision    recall  f1-score   support

           0     0.8046    0.6931    0.7447       101
           1     0.5368    0.5667    0.5514        90
           2     0.5532    0.5149    0.5333       101
           3     0.5470    0.6667    0.6009        96
           4     0.7196    0.6875    0.7032       112

    accuracy                         0.6280       500
   macro avg     0.6323    0.6258    0.6267       500
weighted avg     0.6371    0.6280    0.6303       500

Evaluation took 21.975029945373535 seconds
Evaluating model 64k_lstm_all_pert_vanilla on roberta on stochastic_no_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:59<00:00, 23.90s/it]


              precision    recall  f1-score   support

           0     0.7500    0.6535    0.6984       101
           1     0.4362    0.4556    0.4457        90
           2     0.4516    0.4158    0.4330       101
           3     0.4914    0.5938    0.5377        96
           4     0.6697    0.6518    0.6606       112

    accuracy                         0.5580       500
   macro avg     0.5598    0.5541    0.5551       500
weighted avg     0.5656    0.5580    0.5600       500

Evaluation took 239.00967121124268 seconds
Evaluating model 64k_lstm_all_pert_vanilla on roberta on stochastic_incl_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:37<00:00, 21.73s/it]


              precision    recall  f1-score   support

           0     0.7711    0.6337    0.6957       101
           1     0.4095    0.4778    0.4410        90
           2     0.4483    0.3861    0.4149       101
           3     0.4737    0.5625    0.5143        96
           4     0.6577    0.6518    0.6547       112

    accuracy                         0.5460       500
   macro avg     0.5520    0.5424    0.5441       500
weighted avg     0.5583    0.5460    0.5491       500

Evaluation took 217.34920406341553 seconds
Evaluating model 64k_lstm_all_pert_vanilla on roberta on word_score_no_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [08:07<00:00,  1.03it/s]


              precision    recall  f1-score   support

           0     0.7531    0.6040    0.6703       101
           1     0.3846    0.4444    0.4124        90
           2     0.3370    0.3069    0.3212       101
           3     0.3417    0.4271    0.3796        96
           4     0.6019    0.5536    0.5767       112

    accuracy                         0.4700       500
   macro avg     0.4837    0.4672    0.4721       500
weighted avg     0.4899    0.4700    0.4766       500

Evaluation took 487.4838879108429 seconds
Evaluating model 64k_lstm_all_pert_vanilla on roberta on word_score_incl_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [08:29<00:00,  1.02s/it]


              precision    recall  f1-score   support

           0     0.7439    0.6040    0.6667       101
           1     0.3868    0.4556    0.4184        90
           2     0.3412    0.2871    0.3118       101
           3     0.3607    0.4583    0.4037        96
           4     0.5905    0.5536    0.5714       112

    accuracy                         0.4740       500
   macro avg     0.4846    0.4717    0.4744       500
weighted avg     0.4903    0.4740    0.4785       500

Evaluation took 509.5371949672699 seconds
Failed loading model 64k_lstm_clean_finetuned on roberta for task yelp_full, skipping
Failed loading model 64k_lstm_no_whitespace_pert_finetuned on roberta for task yelp_full, skipping
Evaluating model 64k_lstm_all_pert_finetuned on roberta on clean for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:21<00:00,  1.36s/it]


              precision    recall  f1-score   support

           0     0.7789    0.7327    0.7551       101
           1     0.5213    0.5444    0.5326        90
           2     0.5667    0.5050    0.5340       101
           3     0.5345    0.6458    0.5849        96
           4     0.7333    0.6875    0.7097       112

    accuracy                         0.6260       500
   macro avg     0.6269    0.6231    0.6233       500
weighted avg     0.6325    0.6260    0.6275       500

Evaluation took 21.731281042099 seconds
Evaluating model 64k_lstm_all_pert_finetuned on roberta on stochastic_no_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:35<00:00, 21.56s/it]


              precision    recall  f1-score   support

           0     0.7363    0.6634    0.6979       101
           1     0.4444    0.4889    0.4656        90
           2     0.5000    0.4455    0.4712       101
           3     0.4737    0.5625    0.5143        96
           4     0.6792    0.6429    0.6606       112

    accuracy                         0.5640       500
   macro avg     0.5667    0.5606    0.5619       500
weighted avg     0.5728    0.5640    0.5667       500

Evaluation took 215.6561803817749 seconds
Evaluating model 64k_lstm_all_pert_finetuned on roberta on stochastic_incl_ws for task yelp_full


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:37<00:00, 21.72s/it]


              precision    recall  f1-score   support

           0     0.7416    0.6535    0.6947       101
           1     0.4082    0.4444    0.4255        90
           2     0.4535    0.3861    0.4171       101
           3     0.4500    0.5625    0.5000        96
           4     0.6822    0.6518    0.6667       112

    accuracy                         0.5440       500
   macro avg     0.5471    0.5397    0.5408       500
weighted avg     0.5541    0.5440    0.5465       500

Evaluation took 217.17877173423767 seconds
Evaluating model 64k_lstm_all_pert_finetuned on roberta on word_score_no_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [08:20<00:00,  1.00s/it]


              precision    recall  f1-score   support

           0     0.6867    0.5644    0.6196       101
           1     0.3654    0.4222    0.3918        90
           2     0.3736    0.3366    0.3542       101
           3     0.3554    0.4479    0.3963        96
           4     0.6139    0.5536    0.5822       112

    accuracy                         0.4680       500
   macro avg     0.4790    0.4649    0.4688       500
weighted avg     0.4857    0.4680    0.4737       500

Evaluation took 500.0615990161896 seconds
Evaluating model 64k_lstm_all_pert_finetuned on roberta on word_score_incl_ws for task yelp_full


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [08:15<00:00,  1.01it/s]

              precision    recall  f1-score   support

           0     0.6629    0.5842    0.6211       101
           1     0.3679    0.4333    0.3980        90
           2     0.3412    0.2871    0.3118       101
           3     0.3333    0.4062    0.3662        96
           4     0.6117    0.5625    0.5860       112

    accuracy                         0.4580       500
   macro avg     0.4634    0.4547    0.4566       500
weighted avg     0.4701    0.4580    0.4617       500

Evaluation took 495.07842326164246 seconds
Failed loading model 64k_cnn_no_whitespace_pert_finetuned on roberta for task yelp_full, skipping
Failed loading model 2m_lstm_all_pert_finetuned on roberta for task yelp_full, skipping
Failed loading model 32k_lstm_all_pert_finetuned_100ep on roberta for task yelp_full, skipping





In [41]:
# Show the accuracy grid: one row per "<model>_<model_type>_<task>" combination,
# one column per evaluation condition (clean, stochastic_* and word_score_*).
# NOTE(review): the rendered table stops at baseline_roberta_yelp_full even though
# the evaluation log also covers other roberta models — presumably the display is
# truncated; confirm the frame holds every evaluated model before using the export.
accuracy_df

Unnamed: 0,clean,stochastic_no_ws,stochastic_incl_ws,word_score_no_ws,word_score_incl_ws
baseline_bert_yelp_full,0.618,0.424,0.4,0.272,0.25
roben_1_bert_yelp_full,0.502,0.394,0.328,0.358,0.236
roben_2_bert_yelp_full,0.546,0.418,0.336,0.398,0.26
roben_1_tok_bert_yelp_full,0.448,0.388,0.308,0.374,0.238
roben_2_tok_bert_yelp_full,0.524,0.432,0.33,0.426,0.26
64k_lstm_clean_vanilla_bert_yelp_full,0.604,0.408,0.374,0.3,0.246
64k_lstm_no_whitespace_pert_vanilla_bert_yelp_full,0.606,0.514,0.42,0.416,0.308
64k_lstm_all_pert_vanilla_bert_yelp_full,0.6,0.5,0.492,0.408,0.41
64k_lstm_all_pert_finetuned_bert_yelp_full,0.612,0.514,0.5,0.402,0.402
baseline_roberta_yelp_full,0.638,0.466,0.468,0.368,0.36


In [42]:
# Show the F1 grid, laid out like the accuracy grid (rows are
# "<model>_<model_type>_<task>", columns are evaluation conditions).
# The values line up with the "macro avg" f1-score of the classification reports
# printed during evaluation (e.g. baseline/roberta/clean: 0.6389), not "weighted avg".
f1_df

Unnamed: 0,clean,stochastic_no_ws,stochastic_incl_ws,word_score_no_ws,word_score_incl_ws
baseline_bert_yelp_full,0.608165,0.413634,0.390684,0.276493,0.255006
roben_1_bert_yelp_full,0.478485,0.361919,0.294478,0.328914,0.208954
roben_2_bert_yelp_full,0.524805,0.389412,0.308117,0.373376,0.23888
roben_1_tok_bert_yelp_full,0.413613,0.342863,0.261185,0.333937,0.197752
roben_2_tok_bert_yelp_full,0.501116,0.404686,0.293054,0.396013,0.233497
64k_lstm_clean_vanilla_bert_yelp_full,0.597016,0.400331,0.364848,0.302015,0.249919
64k_lstm_no_whitespace_pert_vanilla_bert_yelp_full,0.595588,0.500533,0.404928,0.403471,0.3011
64k_lstm_all_pert_vanilla_bert_yelp_full,0.59123,0.491073,0.480753,0.400679,0.401956
64k_lstm_all_pert_finetuned_bert_yelp_full,0.60323,0.502663,0.484768,0.391652,0.392576
baseline_roberta_yelp_full,0.638913,0.467747,0.469194,0.375741,0.367415


In [43]:
# Persist the accuracy grid as CSV; the path is relative to the notebook's working
# directory, and any existing file at that path is overwritten.
accuracy_df.to_csv("../output/grid_accuracy.csv")

In [44]:
# Persist the F1 grid as CSV; the path is relative to the notebook's working
# directory, and any existing file at that path is overwritten.
f1_df.to_csv("../output/grid_f1.csv")

-----