# Multi Model Evaluation

In [1]:
import sys
sys.path.append("..")

import copy
import cProfile
from datasets import load_dataset
import itertools
import json
import math
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
import pandas as pd
import random
from sklearn.metrics import classification_report, accuracy_score, f1_score
import time
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, DataCollatorWithPadding, \
                         AutoModelForSequenceClassification, BertForSequenceClassification, \
                         BertModel, RobertaForSequenceClassification, RobertaModel

from resilient_nlp.mini_roben import Clustering, ClusterRepRecoverer, ClusterRecovererWithPassthrough
from resilient_nlp.models import BertClassifier
from resilient_nlp.perturbers import ToyPerturber, WordScramblerPerturber
from runner import ExperimentRunner
from word_score_attack import BertWordScoreAttack

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# Tasks evaluated in this run. The commented-out tuples are alternative selections.
#tasks = ('imdb', 'sst', 'sst_bin', 'yelp_bin', 'yelp_full')
#tasks = ('sst_bin', 'yelp_bin', 'yelp_full')
tasks = ('yelp_bin',)

In [3]:
# Model families to evaluate; used as keys into the tokenizer / model registries.
model_types = ('bert', 'roberta')

Config for final evaluation on test set

In [4]:
# Number of examples selected per task for evaluation.
eval_set_size = 500
# When True the IMDb and SST cells sample from their dev/validation split
# instead of the test-style split. The Yelp cells do not consult this flag.
use_dev_set = False

Config for evaluation on dev set

In [5]:
#eval_set_size = 113
#use_dev_set = True

In [6]:
# Upper bound on the number of raw characters kept from each example.
max_raw_length = 826


def preprocess(row):
    """Return the row's text lower-cased and cut to `max_raw_length` characters."""
    lowered = row['text'].lower()
    return {'text': lowered[:max_raw_length]}

## IMDb Dataset

In [7]:
# Per-task containers, each keyed by task name.
# Sampled evaluation subset as returned by the dataset's select(...).map(...).
sampled_test_set = {}
# The same rows as a plain list of {'text', 'label'} dicts.
sampled_test_set_dict = {}
# Perturbed copies of the sampled rows, generated without whitespace perturbations.
sampled_test_set_adv_no_ws = {}
# Perturbed copies of the sampled rows, generated with whitespace perturbations enabled.
sampled_test_set_adv_incl_ws = {}

In [8]:
# IMDb dataset variant; the sampling cell reads its 'dev' and
# 'attack_eval_truncated' splits.
imdb = load_dataset('artemis13fowl/imdb')

Using custom data configuration artemis13fowl--imdb-f63738dec0d5e230
Reusing dataset parquet (/home/jasko/.cache/huggingface/datasets/parquet/artemis13fowl--imdb-f63738dec0d5e230/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
# Fixed seed so the sampled indices are identical on every run.
random.seed(11)
if use_dev_set:
    # NOTE(review): random.choices samples WITH replacement, so the dev sample can
    # contain duplicate rows — confirm this is intended.
    sampled_test_set['imdb'] = imdb['dev'].select(random.choices(range(len(imdb['dev'])), k=eval_set_size)).map(preprocess)
else:
    # Takes the first eval_set_size rows in order; no random sampling on this path.
    sampled_test_set['imdb'] = imdb['attack_eval_truncated'].select(range(eval_set_size)).map(preprocess)


# This is silly but apparently huggingface datasets are immutable?
# Representing it as something a bit more sane
# (a plain list of dicts that the perturbation step can deep-copy and modify).
sampled_test_set_dict['imdb'] = [
    {
        'text': row['text'],
        'label': row['label'],
    }
    for row in sampled_test_set['imdb']
]

Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/parquet/artemis13fowl--imdb-f63738dec0d5e230/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-b45d493b37ab1dc7.arrow


## SST-5 Dataset

In [10]:
# Detokenizer used to turn the space-separated SST tokens back into natural text.
treebank_detok = TreebankWordDetokenizer()

# Build the 5-class SST task: detokenize the sentence and bucket the label.
sst = load_dataset('sst').map(
    lambda row: {
        "text": treebank_detok.detokenize(row["sentence"].split()),
        # Buckets of width 0.2, clamped to the top class index.
        # Presumably row["label"] is a score in [0, 1] — TODO confirm.
        # NOTE(review): the clamp value is the float 4.0 while the other buckets are
        # ints, so a label of exactly 1.0 yields 4.0 rather than 4.
        "label": min(math.floor(row["label"] / 0.2), 4.0),
    }, remove_columns=['sentence', 'tokens', 'tree']
)

# Fixed seed so the sampled indices are identical on every run.
# NOTE(review): random.choices samples WITH replacement, so duplicates are possible.
random.seed(11)
if use_dev_set:
    sampled_test_set['sst'] = sst['validation'].select(random.choices(range(len(sst['validation'])), k=eval_set_size)).map(preprocess)
else:
    sampled_test_set['sst'] = sst['test'].select(random.choices(range(len(sst['test'])), k=eval_set_size)).map(preprocess)

# Plain list-of-dicts copy of the sampled rows.
sampled_test_set_dict['sst'] = [
    {
        'text': row['text'],
        'label': row['label'],
    }
    for row in sampled_test_set['sst']
]

No config specified, defaulting to: sst/default
Reusing dataset sst (/home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-3c142acdab53f98c.arrow
Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-0bf56ce0086915ee.arrow
Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-19fdf8d124be4ba7.arrow
Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-fc3e75429c3f7637.arrow


## SST-2 Dataset

In [11]:
# Detokenizer used to turn the space-separated SST tokens back into natural text.
treebank_detok = TreebankWordDetokenizer()

# Build the binary SST task: drop rows whose label lies in [0.4, 0.6), then map the
# remaining labels to two classes.
sst_bin = load_dataset('sst').filter(
        lambda row: row["label"] < 0.4 or row["label"] >= 0.6
    ).map(
    lambda row: {
        "text": treebank_detok.detokenize(row["sentence"].split()),
        # Buckets of width 0.5, clamped to the top class index.
        # Presumably row["label"] is a score in [0, 1] — TODO confirm.
        # NOTE(review): the clamp value is the float 1.0 while the other bucket is an int.
        "label": min(math.floor(row["label"] / 0.5), 1.0),
    }
)

# Fixed seed so the sampled indices are identical on every run.
# NOTE(review): random.choices samples WITH replacement, so duplicates are possible.
random.seed(11)
if use_dev_set:
    sampled_test_set['sst_bin'] = sst_bin['validation'].select(random.choices(range(len(sst_bin['validation'])), k=eval_set_size)).map(preprocess)
else:
    sampled_test_set['sst_bin'] = sst_bin['test'].select(random.choices(range(len(sst_bin['test'])), k=eval_set_size)).map(preprocess)

# Plain list-of-dicts copy of the sampled rows.
sampled_test_set_dict['sst_bin'] = [
    {
        'text': row['text'],
        'label': row['label'],
    }
    for row in sampled_test_set['sst_bin']
]

No config specified, defaulting to: sst/default
Reusing dataset sst (/home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-f4f1ada73617d193.arrow
Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-6279b6f0f8a08f9a.arrow
Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-6c5f77e5aefdd0e2.arrow
Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-c46f07c913633b4d.arrow
Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-87779081ead23eae.arrow
Loading cached processed dataset at /home/jasko/.cache/huggi

## Yelp-2

In [12]:
# Binary Yelp sentiment dataset.
yelp_bin = load_dataset('yelp_polarity')

# Fixed seed so the sampled indices are identical on every run.
# NOTE(review): this cell always samples from the 'test' split; use_dev_set is not
# consulted here. random.choices samples WITH replacement, so duplicates are possible.
random.seed(11)
sampled_test_set['yelp_bin'] = yelp_bin['test'].select(random.choices(range(len(yelp_bin['test'])), k=eval_set_size)).map(preprocess)

# Plain list-of-dicts copy of the sampled rows.
sampled_test_set_dict['yelp_bin'] = [
    {
        'text': row['text'],
        'label': row['label'],
    }
    for row in sampled_test_set['yelp_bin']
]

Reusing dataset yelp_polarity (/home/jasko/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544/cache-4aafc3a0650c7800.arrow


## Yelp-5

In [13]:
# Full (multi-class) Yelp review dataset.
yelp_full = load_dataset('yelp_review_full')

# Fixed seed so the sampled indices are identical on every run.
# NOTE(review): this cell always samples from the 'test' split; use_dev_set is not
# consulted here. random.choices samples WITH replacement, so duplicates are possible.
random.seed(11)
sampled_test_set['yelp_full'] = yelp_full['test'].select(random.choices(range(len(yelp_full['test'])), k=eval_set_size)).map(preprocess)

# Plain list-of-dicts copy of the sampled rows.
sampled_test_set_dict['yelp_full'] = [
    {
        'text': row['text'],
        'label': row['label'],
    }
    for row in sampled_test_set['yelp_full']
]

Reusing dataset yelp_review_full (/home/jasko/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/13c31a618ba62568ec8572a222a283dfc29a6517776a3ac5945fb508877dde43)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/13c31a618ba62568ec8572a222a283dfc29a6517776a3ac5945fb508877dde43/cache-be339907fd6abb29.arrow


### Perturbations

In [14]:
def generate_perturbed_multiset(input, wsp, num_copies=10, seed=11):
    """Build several independently perturbed copies of a test set.

    Args:
        input: list of row dicts, each with at least a 'text' key. It is not
            modified; every copy is produced from a deep copy.
        wsp: perturber object exposing `perturb(list_of_texts)`; the perturbed
            text is read from `perturb([text])[0][0]`.
        num_copies: number of perturbed copies to generate (default 10, the
            value that used to be hard-coded).
        seed: value passed to `random.seed` before generating, so repeated
            calls start from the same `random` state (default 11).
            Presumably the perturber draws from the `random` module — TODO confirm.

    Returns:
        A list of `num_copies` lists, each the same length and order as
        `input`, with the 'text' field of every row replaced by its
        perturbed version.
    """
    random.seed(seed)
    result = []

    for _ in range(num_copies):
        test_item = copy.deepcopy(input)

        # Perturb one sentence at a time so the order of perturber calls is
        # row-by-row within each copy.
        for row in test_item:
            row['text'] = wsp.perturb([row['text']])[0][0]
        result.append(test_item)

    return result

Perturbed set with no whitespace modifications

In [15]:
# Perturber with the split-word and merge-words weights set to 0, so only the
# add / drop / swap perturbation weights are non-zero.
wsp = WordScramblerPerturber(perturb_prob=0.1, weight_add=1, weight_drop=1, weight_swap=1,
                             weight_split_word=0, weight_merge_words=0)

# Generate the perturbed copies for every selected task.
for task in tasks:
    sampled_test_set_adv_no_ws[task] = generate_perturbed_multiset(sampled_test_set_dict[task], wsp)

Perturbed set with whitespace modifications

In [16]:
# Perturber with every weight set to 1, including split-word and merge-words.
# Note that this rebinds the global name `wsp`.
wsp = WordScramblerPerturber(perturb_prob=0.1, weight_add=1, weight_drop=1, weight_swap=1,
                             weight_split_word=1, weight_merge_words=1)

# Generate the perturbed copies for every selected task.
for task in tasks:
    sampled_test_set_adv_incl_ws[task] = generate_perturbed_multiset(sampled_test_set_dict[task], wsp)

## Models

### BERT, including finetuned variants

In [17]:
# Registries populated by the model-loading cells.
# tokenizer / model_base are keyed by model type; the fine-tuned registries are
# nested as registry[model type][task].
tokenizer = dict()
model_base = dict()
model_finetuned = {model_type: dict() for model_type in model_types}
model_finetuned_all_pert = {model_type: dict() for model_type in model_types}

In [18]:
# Base (not fine-tuned) BERT tokenizer and encoder, registered under 'bert'.
bert_checkpoint = "bert-base-uncased"
tokenizer['bert'] = AutoTokenizer.from_pretrained(bert_checkpoint)
model_base['bert'] = BertModel.from_pretrained(bert_checkpoint).to(device)

In [19]:
# BERT classifier for the 'imdb' task; loaded only when both 'bert' and 'imdb' are selected.
bert_checkpoint_finetuned_imdb = "artemis13fowl/bert-base-uncased-imdb"
if 'bert' in model_types and 'imdb' in tasks:
    model_finetuned['bert']['imdb'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_imdb).to(device)

In [20]:
# BERT classifier for the 'imdb' task stored in the "all_pert" registry; this is the
# only cell that populates model_finetuned_all_pert among the loading cells shown here.
bert_checkpoint_finetuned_imdb_all_pert = "jjezabek/bert-base-uncased-imdb-all-pert"
if 'bert' in model_types and 'imdb' in tasks:
    model_finetuned_all_pert['bert']['imdb'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_imdb_all_pert).to(device)

In [21]:
# BERT classifier for the 'sst' task; loaded only when both 'bert' and 'sst' are selected.
bert_checkpoint_finetuned_sst = "jjezabek/bert-base-uncased-sst"
if 'bert' in model_types and 'sst' in tasks:
    model_finetuned['bert']['sst'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_sst).to(device)

In [22]:
# BERT classifier for the 'sst_bin' task; loaded only when both 'bert' and 'sst_bin' are selected.
# NOTE(review): absolute path under /home/jasko — must be changed to run on another machine.
bert_checkpoint_finetuned_sst_bin = '/home/jasko/resilient_nlp/output/bert-base-uncased-sst_bin/checkpoint-800'
if 'bert' in model_types and 'sst_bin' in tasks:
    model_finetuned['bert']['sst_bin'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_sst_bin).to(device)

In [23]:
# BERT classifier for the 'yelp_bin' task; loaded only when both 'bert' and 'yelp_bin' are selected.
# NOTE(review): absolute path under /home/jasko — must be changed to run on another machine.
bert_checkpoint_finetuned_yelp_bin = '/home/jasko/resilient_nlp/output/bert-base-uncased-yelp_bin/checkpoint-3500'
if 'bert' in model_types and 'yelp_bin' in tasks:
    model_finetuned['bert']['yelp_bin'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_yelp_bin).to(device)

In [24]:
# BERT classifier for the 'yelp_full' task; loaded only when both 'bert' and 'yelp_full' are selected.
# NOTE(review): absolute path under /home/jasko — must be changed to run on another machine.
bert_checkpoint_finetuned_yelp_full = '/home/jasko/resilient_nlp/output/bert-base-uncased-yelp_full/checkpoint-1500'
if 'bert' in model_types and 'yelp_full' in tasks:
    model_finetuned['bert']['yelp_full'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_yelp_full).to(device)

In [25]:
# Base (not fine-tuned) RoBERTa tokenizer and encoder, registered under 'roberta'.
roberta_checkpoint = "roberta-base"
tokenizer['roberta'] = AutoTokenizer.from_pretrained(roberta_checkpoint)
model_base['roberta'] = RobertaModel.from_pretrained(roberta_checkpoint).to(device)

In [26]:
# RoBERTa classifier for the 'imdb' task; loaded only when both 'roberta' and 'imdb' are selected.
roberta_checkpoint_finetuned_imdb = "jjezabek/roberta-base-imdb"
if 'roberta' in model_types and 'imdb' in tasks:
    model_finetuned['roberta']['imdb'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_imdb).to(device)

In [27]:
# RoBERTa classifier for the 'sst' task; loaded only when both 'roberta' and 'sst' are selected.
# NOTE(review): absolute path under /home/jasko — must be changed to run on another machine.
roberta_checkpoint_finetuned_sst = '/home/jasko/resilient_nlp/output/roberta-base-sst/checkpoint-900'
if 'roberta' in model_types and 'sst' in tasks:
    model_finetuned['roberta']['sst'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_sst).to(device)

In [28]:
# RoBERTa classifier for the 'sst_bin' task; loaded only when both 'roberta' and 'sst_bin' are selected.
# NOTE(review): absolute path under /home/jasko — must be changed to run on another machine.
roberta_checkpoint_finetuned_sst_bin = '/home/jasko/resilient_nlp/output/roberta-base-sst_bin/checkpoint-700'
if 'roberta' in model_types and 'sst_bin' in tasks:
    model_finetuned['roberta']['sst_bin'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_sst_bin).to(device)

In [29]:
# RoBERTa classifier for the 'yelp_bin' task; loaded only when both 'roberta' and 'yelp_bin' are selected.
# NOTE(review): absolute path under /home/jasko — must be changed to run on another machine.
roberta_checkpoint_finetuned_yelp_bin = '/home/jasko/resilient_nlp/output/roberta-base-yelp_bin/checkpoint-2200'
if 'roberta' in model_types and 'yelp_bin' in tasks:
    model_finetuned['roberta']['yelp_bin'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_yelp_bin).to(device)

In [30]:
# RoBERTa classifier for the 'yelp_full' task; loaded only when both 'roberta' and 'yelp_full' are selected.
# NOTE(review): absolute path under /home/jasko — must be changed to run on another machine.
roberta_checkpoint_finetuned_yelp_full = '/home/jasko/resilient_nlp/output/roberta-base-yelp_full/checkpoint-7500'
if 'roberta' in model_types and 'yelp_full' in tasks:
    model_finetuned['roberta']['yelp_full'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_yelp_full).to(device)

### RobEn clusterings (as baseline)

The first clustering (`roben_clustering`) is ConnComp, which merges clusters very aggressively. The second (`roben_clustering2`) is AggClust, which uses a cost function to better preserve fidelity, so it should generally perform better.

In [31]:
# Load the two clusterings and wrap each in a recoverer that is later used to
# rewrite sentences before tokenization.
# NOTE(review): from_pickle presumably unpickles these files — only load pickles
# from a trusted source.
roben_clustering = Clustering.from_pickle("../vocab100000_ed1.pkl")
roben_recoverer = ClusterRecovererWithPassthrough("cache", roben_clustering)
roben_clustering2 = Clustering.from_pickle("../vocab100000_ed1_gamma0.3.pkl")
roben_recoverer2 = ClusterRecovererWithPassthrough("cache", roben_clustering2)

## Model Prediction Helpers

In [32]:
# Maximum number of tokens per example passed to the tokenizers / embedder.
max_sequence_length = 128
# Number of sentences sent to a model per forward pass in the evaluation helpers.
batch_size = 32

These are wrappers for standard (possibly finetuned) Huggingface models, using their normal tokenizers.

In [33]:
def standard_model_predict(tokenizer, model, sentences, recoverer, return_pred_tensor, recoverer_tokenize):
    """Run a Hugging Face classifier on a batch of raw sentences.

    Args:
        tokenizer: callable tokenizer producing 'pt' tensors for the model.
        model: model called with the tokenized tensors as keyword arguments.
        sentences: list of input strings.
        recoverer: optional object with `recover(str)`; when given, every
            lower-cased sentence is passed through it before tokenization.
        return_pred_tensor: if true, return the raw model output; otherwise
            return the argmax over `output.logits` along dim 1.
        recoverer_tokenize: if true (and a recoverer is given), sentences are
            Treebank-tokenized before recovery and detokenized afterwards.
    """
    if recoverer is not None:
        if recoverer_tokenize:
            # Split punctuation off words so the recoverer sees one token per word.
            word_tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
            sentences = [" ".join(words) for words in word_tokenizer.tokenize_sents(sentences)]

        sentences = [recoverer.recover(sentence.lower()) for sentence in sentences]

        if recoverer_tokenize:
            # Undo the tokenization so the model tokenizer sees natural text again.
            word_detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()
            sentences = [word_detokenizer.detokenize(sentence.split(" ")) for sentence in sentences]

    encoded = tokenizer(
        sentences,
        truncation=True,
        padding='max_length',
        max_length=max_sequence_length,
        return_tensors='pt',
    )
    model_inputs = {key: tensor.to(device) for key, tensor in encoded.items()}
    output = model(**model_inputs)

    if not return_pred_tensor:
        return torch.argmax(output.logits, dim=1)
    return output

def wrap_standard_model(tokenizer, model, recoverer=None, return_pred_tensor=True, recoverer_tokenize=False):
    """Bind a tokenizer/model pair into a `sentences -> prediction` callable.

    The returned callable forwards to `standard_model_predict` with the
    arguments captured here.
    """
    def predict(sentences):
        return standard_model_predict(
            tokenizer,
            model,
            sentences,
            recoverer,
            return_pred_tensor,
            recoverer_tokenize,
        )

    return predict

This is a wrapper for the machine-trained tokenizer + embedder (a.k.a. MockingBERT).

In [34]:
def mltokenizer_model_predict(runner, model, cls_embedding, sep_embedding, pad_embedding, sentences, return_pred_tensor):
    """Classify sentences using embeddings produced by `runner.embed`.

    Args:
        runner: object whose `embed(...)` returns a mapping with
            'inputs_embeds' and 'attention_mask'.
        model: classifier called with those two entries as keyword arguments.
        cls_embedding, sep_embedding, pad_embedding: embeddings passed to the
            runner as start, end and padding tokens respectively.
        sentences: list of input strings.
        return_pred_tensor: if true, return the raw model output; otherwise
            return the argmax over `output.logits` along dim 1.
    """
    # Truncate and lower case. Truncation is for performance only
    char_limit = 8 * max_sequence_length
    prepared = [sentence.lower()[:char_limit] for sentence in sentences]

    embedded = runner.embed(
        sentences=prepared,
        start_token=cls_embedding,
        end_token=sep_embedding,
        pad_token=pad_embedding,
        max_tokens=max_sequence_length,
    )
    output = model(
        inputs_embeds=embedded['inputs_embeds'],
        attention_mask=embedded['attention_mask'],
    )

    if not return_pred_tensor:
        return torch.argmax(output.logits, dim=1)
    return output

def wrap_mltokenizer_model(mltokenizer_prefix, tokenizer, model, cf_embedding, type, return_pred_tensor=True):
    """Bind a trained tokenizer+embedder and a classifier into a predictor.

    Args:
        mltokenizer_prefix: path prefix of the saved embedder; the file
            "../<prefix>.pth" is handed to `ExperimentRunner`.
        tokenizer: tokenizer whose `vocab` maps the special tokens to ids.
        model: classifier that consumes `inputs_embeds` / `attention_mask`.
        cf_embedding: embedding module used to look up the special-token vectors.
        type: model family, 'bert' or 'roberta'; selects the special-token names.
        return_pred_tensor: forwarded to `mltokenizer_model_predict`.

    Returns:
        A callable taking a list of sentences and returning the prediction.

    Raises:
        ValueError: if `type` is not a supported model family. Previously this
            surfaced as an UnboundLocalError, and only after the embedder file
            had already been loaded.
    """
    # (start, end, padding) special tokens for each supported model family.
    special_tokens_by_type = {
        'bert': ('[CLS]', '[SEP]', '[PAD]'),
        'roberta': ('<s>', '</s>', '<pad>'),
    }
    # Validate before doing any expensive loading.
    if type not in special_tokens_by_type:
        supported = ", ".join(sorted(special_tokens_by_type))
        raise ValueError(f"Unsupported model type {type!r}; expected one of: {supported}")

    filename = "../{}.pth".format(mltokenizer_prefix)
    runner = ExperimentRunner(device, model_filename=filename)

    def special_token_embedding(token):
        # Look up the token id and flatten its embedding to a 1-D vector.
        token_id = tokenizer.vocab[token]
        return cf_embedding(torch.tensor([token_id], device=device)).view(-1)

    cls_token, sep_token, pad_token = special_tokens_by_type[type]
    cls_embedding = special_token_embedding(cls_token)
    sep_embedding = special_token_embedding(sep_token)
    pad_embedding = special_token_embedding(pad_token)

    return lambda sentences: mltokenizer_model_predict(runner, model, cls_embedding, sep_embedding,
                                                      pad_embedding, sentences, return_pred_tensor)

## Evaluation Helpers

Evaluates a wrapped model on a test set

In [35]:
@torch.no_grad()
def evaluate_model(model, test_set):
    """Evaluate a wrapped model on a test set and print a classification report.

    Args:
        model: callable taking a list of sentences and returning an object
            with a `logits` tensor.
        test_set: iterable of rows with 'text' and 'label' entries.

    Returns:
        Tuple `(accuracy, macro_f1)`.
    """
    texts = [row['text'] for row in test_set]
    gold = [row['label'] for row in test_set]

    batch_predictions = []
    # One progress-bar step per batch of `batch_size` sentences.
    for start in tqdm(range(0, len(test_set), batch_size)):
        batch_output = model(texts[start:start + batch_size])
        predicted = torch.argmax(batch_output.logits, dim=1)
        batch_predictions.append(predicted.detach().cpu())
    preds = torch.cat(batch_predictions)

    print(classification_report(gold, preds, digits=4))

    return accuracy_score(gold, preds), f1_score(gold, preds, average='macro')

Evaluates a wrapped model on a stochastic, pseudo-adversarial test set. This means that each input sentence is replicated x times (typically 10) with randomized perturbations, and an attack is considered successful if *any* of the predictions is incorrect.

In [36]:
@torch.no_grad()
def evaluate_model_adv(model, test_sets):
    """Evaluate a wrapped model on a stochastic, pseudo-adversarial test set.

    Every element of `test_sets` is a perturbed copy of the same examples.
    An example counts as misclassified as soon as any copy is misclassified;
    the recorded prediction is then the most recent incorrect one.

    Args:
        model: callable taking a list of sentences and returning an object
            with a `logits` tensor.
        test_sets: non-empty list of test sets (lists of rows with 'text' and
            'label'). Labels are taken from the first set; all sets are
            expected to have the same length and order.

    Returns:
        Tuple `(accuracy_list, f1_list)` holding the cumulative accuracy and
        macro-F1 after each successive perturbed copy.
    """
    labels = [ x['label'] for x in test_sets[0] ]
    # Starts as "everything correct"; entries are overwritten on any miss.
    adv_preds = copy.copy(labels)
    accuracy_list = []
    f1_list = []

    for test_set in tqdm(test_sets):
        sentences = [ x['text'] for x in test_set ]
        pred_batches = []

        for bs in range(0, len(sentences), batch_size):
            output = model(sentences[bs:bs + batch_size])
            pred_batches.append(torch.argmax(output.logits, dim=1).detach().cpu())

        # Convert to plain Python ints so adv_preds never mixes labels with
        # 0-dim tensors before being handed to scikit-learn.
        preds = torch.cat(pred_batches).tolist()

        for i, (label, pred) in enumerate(zip(labels, preds)):
            if label != pred:
                adv_preds[i] = pred

        accuracy_list.append(accuracy_score(labels, adv_preds))
        f1_list.append(f1_score(labels, adv_preds, average='macro'))

    print(classification_report(labels, adv_preds, digits=4))

    return accuracy_list, f1_list

Evaluates a model using WordScoreAttack

In [37]:
@torch.no_grad()
def evaluate_model_word_score(model, test_set, allow_whitespace_pert=True, report_prefix=None, word_scores_file=None):
    """Evaluate a wrapped model under the word-score attack.

    Args:
        model: wrapped model handed to `BertWordScoreAttack`.
        test_set: examples passed to `attacker.attack`.
        allow_whitespace_pert: enables word splitting in the perturber and
            `attack_whitespace` in the attacker. The merge-words weight is 0
            in either case.
        report_prefix: when not None, the attack result is written to
            "<prefix>_df.csv" and the attack stats to "<prefix>_stats.json".
        word_scores_file: path forwarded to `BertWordScoreAttack`.

    Returns:
        Tuple `(accuracy, macro_f1)` computed on the perturbed predictions.
    """
    perturber = WordScramblerPerturber(
        perturb_prob=1,
        weight_add=1,
        weight_drop=1,
        weight_swap=1,
        weight_split_word=int(allow_whitespace_pert),
        weight_merge_words=0,
    )
    attacker = BertWordScoreAttack(
        perturber,
        word_scores_file,
        model,
        tokenizer=None,
        max_sequence_length=max_sequence_length,
        attack_whitespace=allow_whitespace_pert,
    )

    attack_result = attacker.attack(test_set, max_tokens_to_perturb=10, max_tries_per_token=4, mode=0,
                                    print_summary=False)

    if report_prefix is not None:
        attack_result.to_csv(f"{report_prefix}_df.csv")
        with open(f"{report_prefix}_stats.json", "w") as stats_file:
            json.dump(attacker.compute_attack_stats(), fp=stats_file)

    gold = attack_result['ground_truth']
    attacked = attack_result['perturbed_preds']
    print(classification_report(gold, attacked, digits=4))

    return accuracy_score(gold, attacked), f1_score(gold, attacked, average='macro')

In [38]:
# Registry of model factories keyed by model name. Each factory takes (task, type)
# and returns a wrapped `sentences -> prediction` callable. The lookups into the
# model registries happen when the factory is called, so a missing fine-tuned
# model raises at that point rather than here.
all_models = {
    'baseline': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task]),
    'baseline_all_pert': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned_all_pert[type][task]),
    'roben_1': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task], roben_recoverer),
    'roben_2': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task], roben_recoverer2),
    'roben_1_tok': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task], roben_recoverer, recoverer_tokenize=True),
    'roben_2_tok': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task], roben_recoverer2, recoverer_tokenize=True),
}

# Names of the trained tokenizer+embedder variants to register below.
mltok_model_names = [
    '64k_lstm_clean_vanilla',
    '64k_lstm_no_whitespace_pert_vanilla',
    '64k_lstm_all_pert_vanilla',
    '64k_lstm_clean_finetuned',
    '64k_lstm_no_whitespace_pert_finetuned',
    '64k_lstm_all_pert_finetuned',
    '64k_cnn_no_whitespace_pert_finetuned',
    '2m_lstm_all_pert_finetuned',
    '32k_lstm_all_pert_finetuned_100ep',
]

for name in mltok_model_names:
    if name.endswith('_vanilla'):
        # "_vanilla" variants take special-token embeddings from the base model and
        # use a file name that does not depend on the task.
        cf_embedding = lambda task, type: model_base[type].embeddings.word_embeddings
        filename = lambda task, type, name: f'output/{type}_{name}'
    else:
        # Other variants take special-token embeddings from the task's fine-tuned
        # model and use a task-specific file name.
        cf_embedding = lambda task, type: model_finetuned[type][task].base_model.embeddings.word_embeddings
        filename = lambda task, type, name: f'output/{type}_{name}_{task}'
    # name=name is a hack to avoid Python late binding
    # (filename and cf_embedding are bound as defaults for the same reason).
    all_models[name] = lambda task, type, name=name, filename=filename, cf_embedding=cf_embedding: wrap_mltokenizer_model(filename(task, type, name), tokenizer[type], model_finetuned[type][task], cf_embedding(task, type), type)

In [39]:
# Evaluation modes run for every model; also the column names of the result tables.
# The "_no_ws" / "_incl_ws" suffixes select whether whitespace perturbations are used.
evaluations = [
    'clean',
    'stochastic_no_ws',
    'stochastic_incl_ws',
    'word_score_no_ws',
    'word_score_incl_ws',
]

In [40]:
# One result row per (model, type, task) combination, one column per evaluation.
model_task_ids = [ f"{model}_{type}_{task}" for task, type, model in itertools.product(tasks, model_types, all_models.keys()) ]

accuracy_df = pd.DataFrame(columns=evaluations, index=model_task_ids)
f1_df = pd.DataFrame(columns=evaluations, index=model_task_ids)

# NOTE(review): the loop variable `type` shadows the builtin at notebook scope; it is
# kept because the name may be relied on elsewhere in the notebook.
for task, type in itertools.product(tasks, model_types):
    for cur_model_name, cur_model_factory in all_models.items():
        row_id = f"{cur_model_name}_{type}_{task}"
        try:
            cur_model = cur_model_factory(task, type)
        except Exception as exc:
            # Catch Exception only (so Ctrl-C still interrupts the run) and report
            # the reason, so a missing checkpoint is distinguishable from a real bug.
            print(f'Failed loading model {cur_model_name} on {type} for task {task}, skipping ({exc!r})')
            accuracy_df = accuracy_df.drop(row_id)
            f1_df = f1_df.drop(row_id)
            continue
        for cur_evaluation in evaluations:
            print(f'Evaluating model {cur_model_name} on {type} on {cur_evaluation} for task {task}')
            start_time = time.time()
            # Re-seed before every evaluation so each one starts from the same `random` state.
            random.seed(11)
            report_prefix = f"../output/eval/{row_id}_{cur_evaluation}"
            if cur_evaluation == 'clean':
                acc, f1 = evaluate_model(cur_model, sampled_test_set[task])
            elif cur_evaluation in ('stochastic_no_ws', 'stochastic_incl_ws'):
                if cur_evaluation == 'stochastic_no_ws':
                    adv_test_sets = sampled_test_set_adv_no_ws[task]
                else:
                    adv_test_sets = sampled_test_set_adv_incl_ws[task]
                acc_list, f1_list = evaluate_model_adv(cur_model, adv_test_sets)
                # The headline number is the value after all perturbed copies were seen.
                acc = acc_list[-1]
                f1 = f1_list[-1]
                with open(f"{report_prefix}_acc_list.json", "w") as f:
                    json.dump(acc_list, fp=f)
                with open(f"{report_prefix}_f1_list.json", "w") as f:
                    json.dump(f1_list, fp=f)
            elif cur_evaluation in ('word_score_no_ws', 'word_score_incl_ws'):
                acc, f1 = evaluate_model_word_score(cur_model, sampled_test_set[task],
                                                    allow_whitespace_pert=(cur_evaluation == 'word_score_incl_ws'),
                                                    report_prefix=report_prefix,
                                                    word_scores_file=f"../output/{task}_word_scores.json")
            else:
                # Without this, an unknown name would silently record the previous
                # evaluation's acc/f1 under the new column.
                raise ValueError(f"Unknown evaluation {cur_evaluation!r}")

            # .loc avoids chained-indexing assignment, which pandas does not
            # guarantee to write through to the original frame.
            accuracy_df.loc[row_id, cur_evaluation] = acc
            f1_df.loc[row_id, cur_evaluation] = f1
            end_time = time.time()
            print(f"Evaluation took {end_time-start_time} seconds")
        del cur_model

Evaluating model baseline on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:03<00:00,  5.04it/s]


              precision    recall  f1-score   support

           0     0.9346    0.9720    0.9529       250
           1     0.9708    0.9320    0.9510       250

    accuracy                         0.9520       500
   macro avg     0.9527    0.9520    0.9520       500
weighted avg     0.9527    0.9520    0.9520       500

Evaluation took 3.320714235305786 seconds
Evaluating model baseline on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.59s/it]


              precision    recall  f1-score   support

           0     0.8626    0.9040    0.8828       250
           1     0.8992    0.8560    0.8770       250

    accuracy                         0.8800       500
   macro avg     0.8809    0.8800    0.8799       500
weighted avg     0.8809    0.8800    0.8799       500

Evaluation took 26.032347917556763 seconds
Evaluating model baseline on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.60s/it]


              precision    recall  f1-score   support

           0     0.8667    0.8840    0.8752       250
           1     0.8816    0.8640    0.8727       250

    accuracy                         0.8740       500
   macro avg     0.8741    0.8740    0.8740       500
weighted avg     0.8741    0.8740    0.8740       500

Evaluation took 26.618725061416626 seconds
Evaluating model baseline on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:28<00:00,  5.62it/s]


              precision    recall  f1-score   support

           0     0.7023    0.7360    0.7187       250
           1     0.7227    0.6880    0.7049       250

    accuracy                         0.7120       500
   macro avg     0.7125    0.7120    0.7118       500
weighted avg     0.7125    0.7120    0.7118       500

Evaluation took 89.48078799247742 seconds
Evaluating model baseline on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:26<00:00,  5.79it/s]


              precision    recall  f1-score   support

           0     0.6917    0.7360    0.7132       250
           1     0.7179    0.6720    0.6942       250

    accuracy                         0.7040       500
   macro avg     0.7048    0.7040    0.7037       500
weighted avg     0.7048    0.7040    0.7037       500

Evaluation took 86.41984486579895 seconds
Failed loading model baseline_all_pert on bert for task yelp_bin, skipping
Evaluating model roben_1 on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.93it/s]


              precision    recall  f1-score   support

           0     0.8453    0.9400    0.8902       250
           1     0.9324    0.8280    0.8771       250

    accuracy                         0.8840       500
   macro avg     0.8889    0.8840    0.8836       500
weighted avg     0.8889    0.8840    0.8836       500

Evaluation took 2.7370617389678955 seconds
Evaluating model roben_1 on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.65s/it]


              precision    recall  f1-score   support

           0     0.7789    0.8880    0.8299       250
           1     0.8698    0.7480    0.8043       250

    accuracy                         0.8180       500
   macro avg     0.8244    0.8180    0.8171       500
weighted avg     0.8244    0.8180    0.8171       500

Evaluation took 26.549556255340576 seconds
Evaluating model roben_1 on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.65s/it]


              precision    recall  f1-score   support

           0     0.7428    0.8200    0.7795       250
           1     0.7991    0.7160    0.7553       250

    accuracy                         0.7680       500
   macro avg     0.7709    0.7680    0.7674       500
weighted avg     0.7709    0.7680    0.7674       500

Evaluation took 26.518423318862915 seconds
Evaluating model roben_1 on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:28<00:00,  5.67it/s]


              precision    recall  f1-score   support

           0     0.7448    0.8520    0.7948       250
           1     0.8271    0.7080    0.7629       250

    accuracy                         0.7800       500
   macro avg     0.7859    0.7800    0.7789       500
weighted avg     0.7859    0.7800    0.7789       500

Evaluation took 88.28096389770508 seconds
Evaluating model roben_1 on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:22<00:00,  6.06it/s]


              precision    recall  f1-score   support

           0     0.6355    0.7600    0.6922       250
           1     0.7015    0.5640    0.6253       250

    accuracy                         0.6620       500
   macro avg     0.6685    0.6620    0.6587       500
weighted avg     0.6685    0.6620    0.6587       500

Evaluation took 82.60048627853394 seconds
Evaluating model roben_2 on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.15it/s]


              precision    recall  f1-score   support

           0     0.8918    0.9560    0.9228       250
           1     0.9526    0.8840    0.9170       250

    accuracy                         0.9200       500
   macro avg     0.9222    0.9200    0.9199       500
weighted avg     0.9222    0.9200    0.9199       500

Evaluation took 2.642221212387085 seconds
Evaluating model roben_2 on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.58s/it]


              precision    recall  f1-score   support

           0     0.8513    0.9160    0.8825       250
           1     0.9091    0.8400    0.8732       250

    accuracy                         0.8780       500
   macro avg     0.8802    0.8780    0.8778       500
weighted avg     0.8802    0.8780    0.8778       500

Evaluation took 25.846231698989868 seconds
Evaluating model roben_2 on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.58s/it]


              precision    recall  f1-score   support

           0     0.8029    0.8960    0.8469       250
           1     0.8824    0.7800    0.8280       250

    accuracy                         0.8380       500
   macro avg     0.8426    0.8380    0.8375       500
weighted avg     0.8426    0.8380    0.8375       500

Evaluation took 25.84341859817505 seconds
Evaluating model roben_2 on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:31<00:00,  5.46it/s]


              precision    recall  f1-score   support

           0     0.8073    0.8880    0.8457       250
           1     0.8756    0.7880    0.8295       250

    accuracy                         0.8380       500
   macro avg     0.8414    0.8380    0.8376       500
weighted avg     0.8414    0.8380    0.8376       500

Evaluation took 91.68138718605042 seconds
Evaluating model roben_2 on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:25<00:00,  5.84it/s]


              precision    recall  f1-score   support

           0     0.6833    0.7680    0.7232       250
           1     0.7352    0.6440    0.6866       250

    accuracy                         0.7060       500
   macro avg     0.7092    0.7060    0.7049       500
weighted avg     0.7092    0.7060    0.7049       500

Evaluation took 85.6818060874939 seconds
Evaluating model roben_1_tok on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.55it/s]


              precision    recall  f1-score   support

           0     0.8280    0.9240    0.8733       250
           1     0.9140    0.8080    0.8577       250

    accuracy                         0.8660       500
   macro avg     0.8710    0.8660    0.8655       500
weighted avg     0.8710    0.8660    0.8655       500

Evaluation took 2.926464796066284 seconds
Evaluating model roben_1_tok on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.86s/it]


              precision    recall  f1-score   support

           0     0.7657    0.8760    0.8172       250
           1     0.8551    0.7320    0.7888       250

    accuracy                         0.8040       500
   macro avg     0.8104    0.8040    0.8030       500
weighted avg     0.8104    0.8040    0.8030       500

Evaluation took 28.61399555206299 seconds
Evaluating model roben_1_tok on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.85s/it]


              precision    recall  f1-score   support

           0     0.7193    0.8200    0.7664       250
           1     0.7907    0.6800    0.7312       250

    accuracy                         0.7500       500
   macro avg     0.7550    0.7500    0.7488       500
weighted avg     0.7550    0.7500    0.7488       500

Evaluation took 28.546339988708496 seconds
Evaluating model roben_1_tok on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:35<00:00,  5.21it/s]


              precision    recall  f1-score   support

           0     0.7593    0.8960    0.8220       250
           1     0.8732    0.7160    0.7868       250

    accuracy                         0.8060       500
   macro avg     0.8162    0.8060    0.8044       500
weighted avg     0.8162    0.8060    0.8044       500

Evaluation took 95.93481588363647 seconds
Evaluating model roben_1_tok on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:26<00:00,  5.75it/s]


              precision    recall  f1-score   support

           0     0.6201    0.7640    0.6846       250
           1     0.6927    0.5320    0.6018       250

    accuracy                         0.6480       500
   macro avg     0.6564    0.6480    0.6432       500
weighted avg     0.6564    0.6480    0.6432       500

Evaluation took 86.99793529510498 seconds
Evaluating model roben_2_tok on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.72it/s]


              precision    recall  f1-score   support

           0     0.8686    0.9520    0.9084       250
           1     0.9469    0.8560    0.8992       250

    accuracy                         0.9040       500
   macro avg     0.9078    0.9040    0.9038       500
weighted avg     0.9078    0.9040    0.9038       500

Evaluation took 2.840550422668457 seconds
Evaluating model roben_2_tok on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.78s/it]


              precision    recall  f1-score   support

           0     0.8345    0.9280    0.8788       250
           1     0.9189    0.8160    0.8644       250

    accuracy                         0.8720       500
   macro avg     0.8767    0.8720    0.8716       500
weighted avg     0.8767    0.8720    0.8716       500

Evaluation took 27.797950744628906 seconds
Evaluating model roben_2_tok on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.78s/it]


              precision    recall  f1-score   support

           0     0.7727    0.8840    0.8246       250
           1     0.8645    0.7400    0.7974       250

    accuracy                         0.8120       500
   macro avg     0.8186    0.8120    0.8110       500
weighted avg     0.8186    0.8120    0.8110       500

Evaluation took 27.7800772190094 seconds
Evaluating model roben_2_tok on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:39<00:00,  5.04it/s]


              precision    recall  f1-score   support

           0     0.8250    0.9240    0.8717       250
           1     0.9136    0.8040    0.8553       250

    accuracy                         0.8640       500
   macro avg     0.8693    0.8640    0.8635       500
weighted avg     0.8693    0.8640    0.8635       500

Evaluation took 99.31512236595154 seconds
Evaluating model roben_2_tok on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:30<00:00,  5.50it/s]


              precision    recall  f1-score   support

           0     0.6829    0.7840    0.7300       250
           1     0.7465    0.6360    0.6868       250

    accuracy                         0.7100       500
   macro avg     0.7147    0.7100    0.7084       500
weighted avg     0.7147    0.7100    0.7084       500

Evaluation took 90.88443303108215 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:22<00:00,  1.39s/it]


              precision    recall  f1-score   support

           0     0.9412    0.9600    0.9505       250
           1     0.9592    0.9400    0.9495       250

    accuracy                         0.9500       500
   macro avg     0.9502    0.9500    0.9500       500
weighted avg     0.9502    0.9500    0.9500       500

Evaluation took 22.25829005241394 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:40<00:00, 22.07s/it]


              precision    recall  f1-score   support

           0     0.8964    0.9000    0.8982       250
           1     0.8996    0.8960    0.8978       250

    accuracy                         0.8980       500
   macro avg     0.8980    0.8980    0.8980       500
weighted avg     0.8980    0.8980    0.8980       500

Evaluation took 220.65621900558472 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:38<00:00, 21.82s/it]


              precision    recall  f1-score   support

           0     0.8770    0.8840    0.8805       250
           1     0.8831    0.8760    0.8795       250

    accuracy                         0.8800       500
   macro avg     0.8800    0.8800    0.8800       500
weighted avg     0.8800    0.8800    0.8800       500

Evaluation took 218.20307517051697 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [12:45<00:00,  1.53s/it]


              precision    recall  f1-score   support

           0     0.7716    0.7160    0.7427       250
           1     0.7351    0.7880    0.7606       250

    accuracy                         0.7520       500
   macro avg     0.7533    0.7520    0.7517       500
weighted avg     0.7533    0.7520    0.7517       500

Evaluation took 765.3168187141418 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [12:35<00:00,  1.51s/it]


              precision    recall  f1-score   support

           0     0.7213    0.7040    0.7126       250
           1     0.7109    0.7280    0.7194       250

    accuracy                         0.7160       500
   macro avg     0.7161    0.7160    0.7160       500
weighted avg     0.7161    0.7160    0.7160       500

Evaluation took 755.2156369686127 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:22<00:00,  1.39s/it]


              precision    recall  f1-score   support

           0     0.9526    0.9640    0.9583       250
           1     0.9636    0.9520    0.9577       250

    accuracy                         0.9580       500
   macro avg     0.9581    0.9580    0.9580       500
weighted avg     0.9581    0.9580    0.9580       500

Evaluation took 22.23691964149475 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:45<00:00, 22.54s/it]


              precision    recall  f1-score   support

           0     0.9286    0.9360    0.9323       250
           1     0.9355    0.9280    0.9317       250

    accuracy                         0.9320       500
   macro avg     0.9320    0.9320    0.9320       500
weighted avg     0.9320    0.9320    0.9320       500

Evaluation took 225.37646770477295 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:49<00:00, 22.95s/it]


              precision    recall  f1-score   support

           0     0.8915    0.9200    0.9055       250
           1     0.9174    0.8880    0.9024       250

    accuracy                         0.9040       500
   macro avg     0.9044    0.9040    0.9040       500
weighted avg     0.9044    0.9040    0.9040       500

Evaluation took 229.53277111053467 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [14:04<00:00,  1.69s/it]


              precision    recall  f1-score   support

           0     0.8984    0.8840    0.8911       250
           1     0.8858    0.9000    0.8929       250

    accuracy                         0.8920       500
   macro avg     0.8921    0.8920    0.8920       500
weighted avg     0.8921    0.8920    0.8920       500

Evaluation took 844.0933077335358 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:17<00:00,  1.59s/it]


              precision    recall  f1-score   support

           0     0.7854    0.7760    0.7807       250
           1     0.7787    0.7880    0.7833       250

    accuracy                         0.7820       500
   macro avg     0.7820    0.7820    0.7820       500
weighted avg     0.7820    0.7820    0.7820       500

Evaluation took 797.2182626724243 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:22<00:00,  1.38s/it]


              precision    recall  f1-score   support

           0     0.9414    0.9640    0.9526       250
           1     0.9631    0.9400    0.9514       250

    accuracy                         0.9520       500
   macro avg     0.9523    0.9520    0.9520       500
weighted avg     0.9523    0.9520    0.9520       500

Evaluation took 22.06883931159973 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:39<00:00, 21.91s/it]


              precision    recall  f1-score   support

           0     0.9252    0.9400    0.9325       250
           1     0.9390    0.9240    0.9315       250

    accuracy                         0.9320       500
   macro avg     0.9321    0.9320    0.9320       500
weighted avg     0.9321    0.9320    0.9320       500

Evaluation took 219.07126712799072 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:38<00:00, 21.83s/it]


              precision    recall  f1-score   support

           0     0.9291    0.9440    0.9365       250
           1     0.9431    0.9280    0.9355       250

    accuracy                         0.9360       500
   macro avg     0.9361    0.9360    0.9360       500
weighted avg     0.9361    0.9360    0.9360       500

Evaluation took 218.3376088142395 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:36<00:00,  1.63s/it]


              precision    recall  f1-score   support

           0     0.9072    0.8600    0.8830       250
           1     0.8669    0.9120    0.8889       250

    accuracy                         0.8860       500
   macro avg     0.8870    0.8860    0.8859       500
weighted avg     0.8870    0.8860    0.8859       500

Evaluation took 816.25426030159 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:40<00:00,  1.64s/it]


              precision    recall  f1-score   support

           0     0.9163    0.8760    0.8957       250
           1     0.8812    0.9200    0.9002       250

    accuracy                         0.8980       500
   macro avg     0.8988    0.8980    0.8980       500
weighted avg     0.8988    0.8980    0.8980       500

Evaluation took 821.0216379165649 seconds
Failed loading model 64k_lstm_clean_finetuned on bert for task yelp_bin, skipping
Failed loading model 64k_lstm_no_whitespace_pert_finetuned on bert for task yelp_bin, skipping
Evaluating model 64k_lstm_all_pert_finetuned on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:21<00:00,  1.37s/it]


              precision    recall  f1-score   support

           0     0.9486    0.9600    0.9543       250
           1     0.9595    0.9480    0.9537       250

    accuracy                         0.9540       500
   macro avg     0.9541    0.9540    0.9540       500
weighted avg     0.9541    0.9540    0.9540       500

Evaluation took 21.91558527946472 seconds
Evaluating model 64k_lstm_all_pert_finetuned on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:39<00:00, 21.91s/it]


              precision    recall  f1-score   support

           0     0.9331    0.9480    0.9405       250
           1     0.9472    0.9320    0.9395       250

    accuracy                         0.9400       500
   macro avg     0.9401    0.9400    0.9400       500
weighted avg     0.9401    0.9400    0.9400       500

Evaluation took 219.15632319450378 seconds
Evaluating model 64k_lstm_all_pert_finetuned on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:39<00:00, 21.96s/it]


              precision    recall  f1-score   support

           0     0.9294    0.9480    0.9386       250
           1     0.9469    0.9280    0.9374       250

    accuracy                         0.9380       500
   macro avg     0.9382    0.9380    0.9380       500
weighted avg     0.9382    0.9380    0.9380       500

Evaluation took 219.64597511291504 seconds
Evaluating model 64k_lstm_all_pert_finetuned on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:46<00:00,  1.65s/it]


              precision    recall  f1-score   support

           0     0.9103    0.8520    0.8802       250
           1     0.8609    0.9160    0.8876       250

    accuracy                         0.8840       500
   macro avg     0.8856    0.8840    0.8839       500
weighted avg     0.8856    0.8840    0.8839       500

Evaluation took 826.0417170524597 seconds
Evaluating model 64k_lstm_all_pert_finetuned on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:51<00:00,  1.66s/it]


              precision    recall  f1-score   support

           0     0.9079    0.8680    0.8875       250
           1     0.8736    0.9120    0.8924       250

    accuracy                         0.8900       500
   macro avg     0.8908    0.8900    0.8899       500
weighted avg     0.8908    0.8900    0.8899       500

Evaluation took 831.5052070617676 seconds
Failed loading model 64k_cnn_no_whitespace_pert_finetuned on bert for task yelp_bin, skipping
Failed loading model 2m_lstm_all_pert_finetuned on bert for task yelp_bin, skipping
Failed loading model 32k_lstm_all_pert_finetuned_100ep on bert for task yelp_bin, skipping
Evaluating model baseline on roberta on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.90it/s]


              precision    recall  f1-score   support

           0     0.9640    0.9640    0.9640       250
           1     0.9640    0.9640    0.9640       250

    accuracy                         0.9640       500
   macro avg     0.9640    0.9640    0.9640       500
weighted avg     0.9640    0.9640    0.9640       500

Evaluation took 2.7555718421936035 seconds
Evaluating model baseline on roberta on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.54s/it]


              precision    recall  f1-score   support

           0     0.8929    0.9000    0.8964       250
           1     0.8992    0.8920    0.8956       250

    accuracy                         0.8960       500
   macro avg     0.8960    0.8960    0.8960       500
weighted avg     0.8960    0.8960    0.8960       500

Evaluation took 25.357050895690918 seconds
Evaluating model baseline on roberta on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.54s/it]


              precision    recall  f1-score   support

           0     0.8906    0.9120    0.9012       250
           1     0.9098    0.8880    0.8988       250

    accuracy                         0.9000       500
   macro avg     0.9002    0.9000    0.9000       500
weighted avg     0.9002    0.9000    0.9000       500

Evaluation took 25.41081213951111 seconds
Evaluating model baseline on roberta on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:30<00:00,  5.51it/s]


              precision    recall  f1-score   support

           0     0.7407    0.8000    0.7692       250
           1     0.7826    0.7200    0.7500       250

    accuracy                         0.7600       500
   macro avg     0.7617    0.7600    0.7596       500
weighted avg     0.7617    0.7600    0.7596       500

Evaluation took 90.71817278862 seconds
Evaluating model baseline on roberta on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:30<00:00,  5.51it/s]


              precision    recall  f1-score   support

           0     0.7593    0.8200    0.7885       250
           1     0.8043    0.7400    0.7708       250

    accuracy                         0.7800       500
   macro avg     0.7818    0.7800    0.7796       500
weighted avg     0.7818    0.7800    0.7796       500

Evaluation took 90.71153163909912 seconds
Failed loading model baseline_all_pert on roberta for task yelp_bin, skipping
Evaluating model roben_1 on roberta on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.26it/s]


              precision    recall  f1-score   support

           0     0.8662    0.9320    0.8979       250
           1     0.9264    0.8560    0.8898       250

    accuracy                         0.8940       500
   macro avg     0.8963    0.8940    0.8938       500
weighted avg     0.8963    0.8940    0.8938       500

Evaluation took 2.5955681800842285 seconds
Evaluating model roben_1 on roberta on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.54s/it]


              precision    recall  f1-score   support

           0     0.8095    0.8840    0.8451       250
           1     0.8722    0.7920    0.8302       250

    accuracy                         0.8380       500
   macro avg     0.8409    0.8380    0.8377       500
weighted avg     0.8409    0.8380    0.8377       500

Evaluation took 25.38089942932129 seconds
Evaluating model roben_1 on roberta on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.54s/it]


              precision    recall  f1-score   support

           0     0.7687    0.8640    0.8136       250
           1     0.8447    0.7400    0.7889       250

    accuracy                         0.8020       500
   macro avg     0.8067    0.8020    0.8012       500
weighted avg     0.8067    0.8020    0.8012       500

Evaluation took 25.391318559646606 seconds
Evaluating model roben_1 on roberta on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:28<00:00,  5.66it/s]


              precision    recall  f1-score   support

           0     0.7599    0.8480    0.8015       250
           1     0.8281    0.7320    0.7771       250

    accuracy                         0.7900       500
   macro avg     0.7940    0.7900    0.7893       500
weighted avg     0.7940    0.7900    0.7893       500

Evaluation took 88.38461422920227 seconds
Evaluating model roben_1 on roberta on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:24<00:00,  5.92it/s]


              precision    recall  f1-score   support

           0     0.6667    0.7920    0.7239       250
           1     0.7438    0.6040    0.6667       250

    accuracy                         0.6980       500
   macro avg     0.7053    0.6980    0.6953       500
weighted avg     0.7053    0.6980    0.6953       500

Evaluation took 84.46801948547363 seconds
Evaluating model roben_2 on roberta on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.27it/s]


              precision    recall  f1-score   support

           0     0.8951    0.9560    0.9246       250
           1     0.9528    0.8880    0.9193       250

    accuracy                         0.9220       500
   macro avg     0.9240    0.9220    0.9219       500
weighted avg     0.9240    0.9220    0.9219       500

Evaluation took 2.594374895095825 seconds
Evaluating model roben_2 on roberta on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.54s/it]


              precision    recall  f1-score   support

           0     0.8481    0.9160    0.8808       250
           1     0.9087    0.8360    0.8708       250

    accuracy                         0.8760       500
   macro avg     0.8784    0.8760    0.8758       500
weighted avg     0.8784    0.8760    0.8758       500

Evaluation took 25.38547706604004 seconds
Evaluating model roben_2 on roberta on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.54s/it]


              precision    recall  f1-score   support

           0     0.8162    0.8880    0.8506       250
           1     0.8772    0.8000    0.8368       250

    accuracy                         0.8440       500
   macro avg     0.8467    0.8440    0.8437       500
weighted avg     0.8467    0.8440    0.8437       500

Evaluation took 25.404032707214355 seconds
Evaluating model roben_2 on roberta on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:31<00:00,  5.46it/s]


              precision    recall  f1-score   support

           0     0.8071    0.9040    0.8528       250
           1     0.8909    0.7840    0.8340       250

    accuracy                         0.8440       500
   macro avg     0.8490    0.8440    0.8434       500
weighted avg     0.8490    0.8440    0.8434       500

Evaluation took 91.5969626903534 seconds
Evaluating model roben_2 on roberta on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:28<00:00,  5.68it/s]


              precision    recall  f1-score   support

           0     0.7208    0.8160    0.7655       250
           1     0.7880    0.6840    0.7323       250

    accuracy                         0.7500       500
   macro avg     0.7544    0.7500    0.7489       500
weighted avg     0.7544    0.7500    0.7489       500

Evaluation took 88.05123710632324 seconds
Evaluating model roben_1_tok on roberta on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.83it/s]


              precision    recall  f1-score   support

           0     0.8442    0.9320    0.8859       250
           1     0.9241    0.8280    0.8734       250

    accuracy                         0.8800       500
   macro avg     0.8842    0.8800    0.8797       500
weighted avg     0.8842    0.8800    0.8797       500

Evaluation took 2.7872912883758545 seconds
Evaluating model roben_1_tok on roberta on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.72s/it]


              precision    recall  f1-score   support

           0     0.7993    0.8920    0.8431       250
           1     0.8778    0.7760    0.8238       250

    accuracy                         0.8340       500
   macro avg     0.8386    0.8340    0.8334       500
weighted avg     0.8386    0.8340    0.8334       500

Evaluation took 27.238959312438965 seconds
Evaluating model roben_1_tok on roberta on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.72s/it]


              precision    recall  f1-score   support

           0     0.7431    0.8560    0.7955       250
           1     0.8302    0.7040    0.7619       250

    accuracy                         0.7800       500
   macro avg     0.7866    0.7800    0.7787       500
weighted avg     0.7866    0.7800    0.7787       500

Evaluation took 27.22798180580139 seconds
Evaluating model roben_1_tok on roberta on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:34<00:00,  5.32it/s]


              precision    recall  f1-score   support

           0     0.7742    0.8640    0.8166       250
           1     0.8462    0.7480    0.7941       250

    accuracy                         0.8060       500
   macro avg     0.8102    0.8060    0.8053       500
weighted avg     0.8102    0.8060    0.8053       500

Evaluation took 94.06026005744934 seconds
Evaluating model roben_1_tok on roberta on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:28<00:00,  5.64it/s]


              precision    recall  f1-score   support

           0     0.6566    0.7800    0.7130       250
           1     0.7291    0.5920    0.6534       250

    accuracy                         0.6860       500
   macro avg     0.6928    0.6860    0.6832       500
weighted avg     0.6928    0.6860    0.6832       500

Evaluation took 88.67803359031677 seconds
Evaluating model roben_2_tok on roberta on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.83it/s]


              precision    recall  f1-score   support

           0     0.8906    0.9440    0.9165       250
           1     0.9404    0.8840    0.9113       250

    accuracy                         0.9140       500
   macro avg     0.9155    0.9140    0.9139       500
weighted avg     0.9155    0.9140    0.9139       500

Evaluation took 2.7848637104034424 seconds
Evaluating model roben_2_tok on roberta on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.72s/it]


              precision    recall  f1-score   support

           0     0.8593    0.9280    0.8923       250
           1     0.9217    0.8480    0.8833       250

    accuracy                         0.8880       500
   macro avg     0.8905    0.8880    0.8878       500
weighted avg     0.8905    0.8880    0.8878       500

Evaluation took 27.23576068878174 seconds
Evaluating model roben_2_tok on roberta on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.72s/it]


              precision    recall  f1-score   support

           0     0.8094    0.9000    0.8523       250
           1     0.8874    0.7880    0.8347       250

    accuracy                         0.8440       500
   macro avg     0.8484    0.8440    0.8435       500
weighted avg     0.8484    0.8440    0.8435       500

Evaluation took 27.243335485458374 seconds
Evaluating model roben_2_tok on roberta on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:39<00:00,  5.04it/s]


              precision    recall  f1-score   support

           0     0.8444    0.9120    0.8769       250
           1     0.9043    0.8320    0.8667       250

    accuracy                         0.8720       500
   macro avg     0.8744    0.8720    0.8718       500
weighted avg     0.8744    0.8720    0.8718       500

Evaluation took 99.32031965255737 seconds
Evaluating model roben_2_tok on roberta on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:34<00:00,  5.30it/s]


              precision    recall  f1-score   support

           0     0.7188    0.8280    0.7695       250
           1     0.7972    0.6760    0.7316       250

    accuracy                         0.7520       500
   macro avg     0.7580    0.7520    0.7506       500
weighted avg     0.7580    0.7520    0.7506       500

Evaluation took 94.44889426231384 seconds
Failed loading model 64k_lstm_clean_vanilla on roberta for task yelp_bin, skipping
Failed loading model 64k_lstm_no_whitespace_pert_vanilla on roberta for task yelp_bin, skipping
Evaluating model 64k_lstm_all_pert_vanilla on roberta on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:22<00:00,  1.43s/it]


              precision    recall  f1-score   support

           0     0.9600    0.9600    0.9600       250
           1     0.9600    0.9600    0.9600       250

    accuracy                         0.9600       500
   macro avg     0.9600    0.9600    0.9600       500
weighted avg     0.9600    0.9600    0.9600       500

Evaluation took 22.8534677028656 seconds
Evaluating model 64k_lstm_all_pert_vanilla on roberta on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:38<00:00, 21.89s/it]


              precision    recall  f1-score   support

           0     0.9370    0.9520    0.9444       250
           1     0.9512    0.9360    0.9435       250

    accuracy                         0.9440       500
   macro avg     0.9441    0.9440    0.9440       500
weighted avg     0.9441    0.9440    0.9440       500

Evaluation took 218.8963544368744 seconds
Evaluating model 64k_lstm_all_pert_vanilla on roberta on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:39<00:00, 21.96s/it]


              precision    recall  f1-score   support

           0     0.9219    0.9440    0.9328       250
           1     0.9426    0.9200    0.9312       250

    accuracy                         0.9320       500
   macro avg     0.9322    0.9320    0.9320       500
weighted avg     0.9322    0.9320    0.9320       500

Evaluation took 219.58509254455566 seconds
Evaluating model 64k_lstm_all_pert_vanilla on roberta on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:40<00:00,  1.64s/it]


              precision    recall  f1-score   support

           0     0.8519    0.9200    0.8846       250
           1     0.9130    0.8400    0.8750       250

    accuracy                         0.8800       500
   macro avg     0.8824    0.8800    0.8798       500
weighted avg     0.8824    0.8800    0.8798       500

Evaluation took 821.0247361660004 seconds
Evaluating model 64k_lstm_all_pert_vanilla on roberta on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:35<00:00,  1.63s/it]


              precision    recall  f1-score   support

           0     0.8498    0.9280    0.8872       250
           1     0.9207    0.8360    0.8763       250

    accuracy                         0.8820       500
   macro avg     0.8853    0.8820    0.8817       500
weighted avg     0.8853    0.8820    0.8817       500

Evaluation took 815.7625555992126 seconds
Failed loading model 64k_lstm_clean_finetuned on roberta for task yelp_bin, skipping
Failed loading model 64k_lstm_no_whitespace_pert_finetuned on roberta for task yelp_bin, skipping
Evaluating model 64k_lstm_all_pert_finetuned on roberta on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:21<00:00,  1.36s/it]


              precision    recall  f1-score   support

           0     0.9641    0.9680    0.9661       250
           1     0.9679    0.9640    0.9659       250

    accuracy                         0.9660       500
   macro avg     0.9660    0.9660    0.9660       500
weighted avg     0.9660    0.9660    0.9660       500

Evaluation took 21.871656894683838 seconds
Evaluating model 64k_lstm_all_pert_finetuned on roberta on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:39<00:00, 21.91s/it]


              precision    recall  f1-score   support

           0     0.9444    0.9520    0.9482       250
           1     0.9516    0.9440    0.9478       250

    accuracy                         0.9480       500
   macro avg     0.9480    0.9480    0.9480       500
weighted avg     0.9480    0.9480    0.9480       500

Evaluation took 219.15234351158142 seconds
Evaluating model 64k_lstm_all_pert_finetuned on roberta on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:40<00:00, 22.02s/it]


              precision    recall  f1-score   support

           0     0.9258    0.9480    0.9368       250
           1     0.9467    0.9240    0.9352       250

    accuracy                         0.9360       500
   macro avg     0.9363    0.9360    0.9360       500
weighted avg     0.9363    0.9360    0.9360       500

Evaluation took 220.16953778266907 seconds
Evaluating model 64k_lstm_all_pert_finetuned on roberta on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:58<00:00,  1.68s/it]


              precision    recall  f1-score   support

           0     0.8425    0.9200    0.8795       250
           1     0.9119    0.8280    0.8679       250

    accuracy                         0.8740       500
   macro avg     0.8772    0.8740    0.8737       500
weighted avg     0.8772    0.8740    0.8737       500

Evaluation took 838.2970867156982 seconds
Evaluating model 64k_lstm_all_pert_finetuned on roberta on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [14:13<00:00,  1.71s/it]

              precision    recall  f1-score   support

           0     0.8498    0.9280    0.8872       250
           1     0.9207    0.8360    0.8763       250

    accuracy                         0.8820       500
   macro avg     0.8853    0.8820    0.8817       500
weighted avg     0.8853    0.8820    0.8817       500

Evaluation took 854.0374884605408 seconds
Failed loading model 64k_cnn_no_whitespace_pert_finetuned on roberta for task yelp_bin, skipping
Failed loading model 2m_lstm_all_pert_finetuned on roberta for task yelp_bin, skipping
Failed loading model 32k_lstm_all_pert_finetuned_100ep on roberta for task yelp_bin, skipping





In [41]:
# Show the accuracy grid as this cell's output. In the rendered table the
# rows are "<model>_<model_type>_<task>" labels and the columns are the five
# evaluation settings (clean, stochastic_*, word_score_*).
accuracy_df

Unnamed: 0,clean,stochastic_no_ws,stochastic_incl_ws,word_score_no_ws,word_score_incl_ws
baseline_bert_yelp_bin,0.952,0.88,0.874,0.712,0.704
roben_1_bert_yelp_bin,0.884,0.818,0.768,0.78,0.662
roben_2_bert_yelp_bin,0.92,0.878,0.838,0.838,0.706
roben_1_tok_bert_yelp_bin,0.866,0.804,0.75,0.806,0.648
roben_2_tok_bert_yelp_bin,0.904,0.872,0.812,0.864,0.71
64k_lstm_clean_vanilla_bert_yelp_bin,0.95,0.898,0.88,0.752,0.716
64k_lstm_no_whitespace_pert_vanilla_bert_yelp_bin,0.958,0.932,0.904,0.892,0.782
64k_lstm_all_pert_vanilla_bert_yelp_bin,0.952,0.932,0.936,0.886,0.898
64k_lstm_all_pert_finetuned_bert_yelp_bin,0.954,0.94,0.938,0.884,0.89
baseline_roberta_yelp_bin,0.964,0.896,0.9,0.76,0.78


In [42]:
# Show the F1 grid as this cell's output; the rendered table has the same
# row labels and column layout as the accuracy grid.
# NOTE(review): the F1 averaging mode is set where f1_df is populated, which
# is not visible here — presumably macro/weighted; confirm.
f1_df

Unnamed: 0,clean,stochastic_no_ws,stochastic_incl_ws,word_score_no_ws,word_score_incl_ws
baseline_bert_yelp_bin,0.951981,0.879931,0.873987,0.711834,0.703697
roben_1_bert_yelp_bin,0.883635,0.817104,0.767371,0.778854,0.658722
roben_2_bert_yelp_bin,0.919896,0.877824,0.837453,0.837594,0.704866
roben_1_tok_bert_yelp_bin,0.865548,0.802979,0.748769,0.804416,0.643199
roben_2_tok_bert_yelp_bin,0.903778,0.871597,0.81102,0.863509,0.708403
64k_lstm_clean_vanilla_bert_yelp_bin,0.949995,0.898,0.879998,0.751678,0.715959
64k_lstm_no_whitespace_pert_vanilla_bert_yelp_bin,0.957998,0.931999,0.903975,0.891993,0.781992
64k_lstm_all_pert_vanilla_bert_yelp_bin,0.951993,0.931996,0.935996,0.885923,0.897951
64k_lstm_all_pert_finetuned_bert_yelp_bin,0.953998,0.939996,0.937994,0.883881,0.889947
baseline_roberta_yelp_bin,0.964,0.895998,0.899986,0.759615,0.779647


In [43]:
# Persist the accuracy grid. DataFrame.to_csv does not create missing parent
# directories, so make sure ../output exists first; otherwise a fresh checkout
# would fail here after the (long) evaluation run has already completed.
import os  # local import: only this cell needs it

os.makedirs("../output", exist_ok=True)
accuracy_df.to_csv("../output/grid_accuracy.csv")

In [44]:
# Persist the F1 grid. DataFrame.to_csv does not create missing parent
# directories, so make sure ../output exists first; this keeps the cell
# runnable on its own from a fresh checkout.
import os  # local import: only this cell needs it

os.makedirs("../output", exist_ok=True)
f1_df.to_csv("../output/grid_f1.csv")

-----