# Multi Model Evaluation

In [1]:
import sys
sys.path.append("..")

import copy
import cProfile
from datasets import load_dataset
import itertools
import json
import math
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
import pandas as pd
import random
from sklearn.metrics import classification_report, accuracy_score, f1_score
import time
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, DataCollatorWithPadding, \
                         AutoModelForSequenceClassification, BertForSequenceClassification, \
                         BertModel, RobertaForSequenceClassification, RobertaModel

from resilient_nlp.mini_roben import Clustering, ClusterRepRecoverer, ClusterRecovererWithPassthrough
from resilient_nlp.models import BertClassifier
from resilient_nlp.perturbers import ToyPerturber, WordScramblerPerturber
from runner import ExperimentRunner
from word_score_attack import BertWordScoreAttack

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
#tasks = ('imdb', 'sst', 'sst_bin', 'yelp_bin', 'yelp_full')
#tasks = ('sst_bin', 'yelp_bin', 'yelp_full')
tasks = ('yelp_bin',)

In [3]:
model_types = ('bert', 'roberta')

Config for final evaluation on test set

In [4]:
eval_set_size = 500
use_dev_set = False

Config for evaluation on dev set

In [5]:
#eval_set_size = 113
#use_dev_set = True

In [6]:
max_raw_length = 826
preprocess = lambda row: { 'text': row['text'].lower()[:max_raw_length]}

## IMDb Dataset

In [7]:
sampled_test_set = {}
sampled_test_set_dict = {}
sampled_test_set_adv_no_ws = {}
sampled_test_set_adv_incl_ws = {}

In [8]:
imdb = load_dataset('artemis13fowl/imdb')

Using custom data configuration artemis13fowl--imdb-f63738dec0d5e230
Reusing dataset parquet (/home/jasko/.cache/huggingface/datasets/parquet/artemis13fowl--imdb-f63738dec0d5e230/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
random.seed(11)
if use_dev_set:
    sampled_test_set['imdb'] = imdb['dev'].select(random.choices(range(len(imdb['dev'])), k=eval_set_size)).map(preprocess)
else:
    sampled_test_set['imdb'] = imdb['attack_eval_truncated'].select(range(eval_set_size)).map(preprocess)


# This is silly but apparently huggingface datasets are immutable?
# Representing it as something a bit more sane
sampled_test_set_dict['imdb'] = [
    {
        'text': row['text'],
        'label': row['label'],
    }
    for row in sampled_test_set['imdb']
]

Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/parquet/artemis13fowl--imdb-f63738dec0d5e230/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-b45d493b37ab1dc7.arrow


## SST-5 Dataset

In [10]:
treebank_detok = TreebankWordDetokenizer()

sst = load_dataset('sst').map(
    lambda row: {
        "text": treebank_detok.detokenize(row["sentence"].split()),
        "label": min(math.floor(row["label"] / 0.2), 4.0),
    }, remove_columns=['sentence', 'tokens', 'tree']
)

random.seed(11)
if use_dev_set:
    sampled_test_set['sst'] = sst['validation'].select(random.choices(range(len(sst['validation'])), k=eval_set_size)).map(preprocess)
else:
    sampled_test_set['sst'] = sst['test'].select(random.choices(range(len(sst['test'])), k=eval_set_size)).map(preprocess)

sampled_test_set_dict['sst'] = [
    {
        'text': row['text'],
        'label': row['label'],
    }
    for row in sampled_test_set['sst']
]

No config specified, defaulting to: sst/default
Reusing dataset sst (/home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-3c142acdab53f98c.arrow
Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-0bf56ce0086915ee.arrow
Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-19fdf8d124be4ba7.arrow
Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-fc3e75429c3f7637.arrow


## SST-2 Dataset

In [11]:
treebank_detok = TreebankWordDetokenizer()

sst_bin = load_dataset('sst').filter(
        lambda row: row["label"] < 0.4 or row["label"] >= 0.6
    ).map(
    lambda row: {
        "text": treebank_detok.detokenize(row["sentence"].split()),
        "label": min(math.floor(row["label"] / 0.5), 1.0),
    }
)

random.seed(11)
if use_dev_set:
    sampled_test_set['sst_bin'] = sst_bin['validation'].select(random.choices(range(len(sst_bin['validation'])), k=eval_set_size)).map(preprocess)
else:
    sampled_test_set['sst_bin'] = sst_bin['test'].select(random.choices(range(len(sst_bin['test'])), k=eval_set_size)).map(preprocess)

sampled_test_set_dict['sst_bin'] = [
    {
        'text': row['text'],
        'label': row['label'],
    }
    for row in sampled_test_set['sst_bin']
]

No config specified, defaulting to: sst/default
Reusing dataset sst (/home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-f4f1ada73617d193.arrow
Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-6279b6f0f8a08f9a.arrow
Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-6c5f77e5aefdd0e2.arrow
Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-c46f07c913633b4d.arrow
Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-87779081ead23eae.arrow
Loading cached processed dataset at /home/jasko/.cache/huggi

## Yelp-2

In [12]:
yelp_bin = load_dataset('yelp_polarity')

random.seed(11)
sampled_test_set['yelp_bin'] = yelp_bin['test'].select(random.choices(range(len(sst['test'])), k=eval_set_size)).map(preprocess)

sampled_test_set_dict['yelp_bin'] = [
    {
        'text': row['text'],
        'label': row['label'],
    }
    for row in sampled_test_set['yelp_bin']
]

Reusing dataset yelp_polarity (/home/jasko/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544/cache-103b3f679d53323c.arrow


## Yelp-5

In [13]:
yelp_full = load_dataset('yelp_polarity')

random.seed(11)
sampled_test_set['yelp_full'] = yelp_bin['test'].select(random.choices(range(len(sst['test'])), k=eval_set_size)).map(preprocess)

sampled_test_set_dict['yelp_full'] = [
    {
        'text': row['text'],
        'label': row['label'],
    }
    for row in sampled_test_set['yelp_full']
]

Reusing dataset yelp_polarity (/home/jasko/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/jasko/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544/cache-103b3f679d53323c.arrow


### Perturbations

In [14]:
def generate_perturbed_multiset(input, wsp):
    random.seed(11)
    result = []

    for i in range(10):
        test_item = copy.deepcopy(input)

        for row in test_item:
            row['text'] = wsp.perturb([row['text']])[0][0]
        result.append(test_item)
    
    return result

Perturbed set with no whitespace modifications

In [15]:
wsp = WordScramblerPerturber(perturb_prob=0.1, weight_add=1, weight_drop=1, weight_swap=1,
                             weight_split_word=0, weight_merge_words=0)

for task in tasks:
    sampled_test_set_adv_no_ws[task] = generate_perturbed_multiset(sampled_test_set_dict[task], wsp)

Perturbed set with whitespace modifications

In [16]:
wsp = WordScramblerPerturber(perturb_prob=0.1, weight_add=1, weight_drop=1, weight_swap=1,
                             weight_split_word=1, weight_merge_words=1)

for task in tasks:
    sampled_test_set_adv_incl_ws[task] = generate_perturbed_multiset(sampled_test_set_dict[task], wsp)

## Models

### BERT, including finetuned variants

In [17]:
tokenizer = {}
model_base = {}
model_finetuned = { type: {} for type in model_types }
model_finetuned_all_pert = { type: {} for type in model_types }

In [18]:
bert_checkpoint = "bert-base-uncased"
tokenizer['bert'] = AutoTokenizer.from_pretrained(bert_checkpoint)
model_base['bert'] = BertModel.from_pretrained(bert_checkpoint).to(device)

In [19]:
bert_checkpoint_finetuned_imdb = "artemis13fowl/bert-base-uncased-imdb"
if 'bert' in model_types and 'imdb' in tasks:
    model_finetuned['bert']['imdb'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_imdb).to(device)

In [20]:
bert_checkpoint_finetuned_imdb_all_pert = "jjezabek/bert-base-uncased-imdb-all-pert"
if 'bert' in model_types and 'imdb' in tasks:
    model_finetuned_all_pert['bert']['imdb'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_imdb_all_pert).to(device)

In [21]:
bert_checkpoint_finetuned_sst = "jjezabek/bert-base-uncased-sst"
if 'bert' in model_types and 'sst' in tasks:
    model_finetuned['bert']['sst'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_sst).to(device)

In [22]:
bert_checkpoint_finetuned_sst_bin = '/home/jasko/resilient_nlp/output/bert-base-uncased-sst_bin/checkpoint-800'
if 'bert' in model_types and 'sst_bin' in tasks:
    model_finetuned['bert']['sst_bin'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_sst_bin).to(device)

In [23]:
bert_checkpoint_finetuned_yelp_bin = '/home/jasko/resilient_nlp/output/bert-base-uncased-yelp_bin/checkpoint-3500'
if 'bert' in model_types and 'yelp_bin' in tasks:
    model_finetuned['bert']['yelp_bin'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_yelp_bin).to(device)

In [24]:
bert_checkpoint_finetuned_yelp_full = '/home/jasko/resilient_nlp/output/bert-base-uncased-yelp_full/checkpoint-1500'
if 'bert' in model_types and 'yelp_full' in tasks:
    model_finetuned['bert']['yelp_full'] = BertForSequenceClassification.from_pretrained(bert_checkpoint_finetuned_yelp_full).to(device)

In [25]:
roberta_checkpoint = "roberta-base"
tokenizer['roberta'] = AutoTokenizer.from_pretrained(roberta_checkpoint)
model_base['roberta'] = RobertaModel.from_pretrained(roberta_checkpoint).to(device)

In [26]:
roberta_checkpoint_finetuned_imdb = "jjezabek/roberta-base-imdb"
if 'roberta' in model_types and 'imdb' in tasks:
    model_finetuned['roberta']['imdb'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_imdb).to(device)

In [27]:
roberta_checkpoint_finetuned_sst = '/home/jasko/resilient_nlp/output/roberta-base-sst/checkpoint-900'
if 'roberta' in model_types and 'sst' in tasks:
    model_finetuned['roberta']['sst'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_sst).to(device)

In [28]:
roberta_checkpoint_finetuned_sst_bin = '/home/jasko/resilient_nlp/output/roberta-base-sst_bin/checkpoint-700'
if 'roberta' in model_types and 'sst_bin' in tasks:
    model_finetuned['roberta']['sst_bin'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_sst_bin).to(device)

In [29]:
roberta_checkpoint_finetuned_yelp_bin = '/home/jasko/resilient_nlp/output/roberta-base-yelp_bin/checkpoint-2200'
if 'roberta' in model_types and 'yelp_bin' in tasks:
    model_finetuned['roberta']['yelp_bin'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_yelp_bin).to(device)

In [30]:
roberta_checkpoint_finetuned_yelp_full = '/home/jasko/resilient_nlp/output/roberta-base-yelp_full/checkpoint-7500'
if 'roberta' in model_types and 'yelp_full' in tasks:
    model_finetuned['roberta']['yelp_full'] = RobertaForSequenceClassification.from_pretrained(roberta_checkpoint_finetuned_yelp_full).to(device)

### RobEn clusterings (as baseline)

The first clustering is ConnComp (which very aggressively merges clusters). The second is AggClust, which uses a cost function to better preserve fidelity. The second one should generally be better.

In [31]:
roben_clustering = Clustering.from_pickle("../vocab100000_ed1.pkl")
roben_recoverer = ClusterRecovererWithPassthrough("cache", roben_clustering)
roben_clustering2 = Clustering.from_pickle("../vocab100000_ed1_gamma0.3.pkl")
roben_recoverer2 = ClusterRecovererWithPassthrough("cache", roben_clustering2)

## Model Prediction Helpers

In [32]:
max_sequence_length = 128
batch_size = 32

These are wrappers for standard (possibly finetuned) Huggingface models, using their normal tokenizers.

In [33]:
def standard_model_predict(tokenizer, model, sentences, recoverer, return_pred_tensor, recoverer_tokenize):
    if recoverer is not None:
        if recoverer_tokenize:
            tok = nltk.tokenize.treebank.TreebankWordTokenizer()
            sentences = [ " ".join(tok_list) for tok_list in tok.tokenize_sents(sentences) ]
        sentences = [ recoverer.recover(s.lower()) for s in sentences ]
        if recoverer_tokenize:
            detok = nltk.tokenize.treebank.TreebankWordDetokenizer()
            sentences = [ detok.detokenize(s.split(" ")) for s in sentences]
    tokenized = tokenizer(sentences, truncation=True, padding='max_length', max_length=max_sequence_length,
                          return_tensors='pt')
    tokenized = { k: v.to(device) for k, v in tokenized.items() }
    preds = model(**tokenized)
    if return_pred_tensor:
        return preds
    else:
        return torch.argmax(preds.logits, dim=1)

def wrap_standard_model(tokenizer, model, recoverer=None, return_pred_tensor=True, recoverer_tokenize=False):
    return lambda sentences: standard_model_predict(tokenizer, model, sentences, recoverer, return_pred_tensor,
                                                    recoverer_tokenize)

This is a wrapper for the machine trained tokenizer+embedder (aka MockingBERT)

In [34]:
def mltokenizer_model_predict(runner, model, cls_embedding, sep_embedding, pad_embedding, sentences, return_pred_tensor):
    # Truncate and lower case. Truncation is for performance only
    sentences = [ s.lower()[:8*max_sequence_length] for s in sentences]
    embedding = runner.embed(sentences=sentences,
        start_token=cls_embedding, end_token=sep_embedding, pad_token=pad_embedding,
        max_tokens=max_sequence_length)
    preds = model(inputs_embeds=embedding['inputs_embeds'], attention_mask=embedding['attention_mask'])
    if return_pred_tensor:
        return preds
    else:
        return torch.argmax(preds.logits, dim=1)

def wrap_mltokenizer_model(mltokenizer_prefix, tokenizer, model, cf_embedding, type, return_pred_tensor=True):
    filename = "../{}.pth".format(mltokenizer_prefix)
    runner = ExperimentRunner(device, model_filename=filename)
    if type == 'bert':
        cls_token_id = tokenizer.vocab['[CLS]']
        sep_token_id = tokenizer.vocab['[SEP]']
        pad_token_id = tokenizer.vocab['[PAD]']
    elif type == 'roberta':
        cls_token_id = tokenizer.vocab['<s>']
        sep_token_id = tokenizer.vocab['</s>']
        pad_token_id = tokenizer.vocab['<pad>']
    cls_embedding = cf_embedding(torch.tensor([cls_token_id], device=device)).view(-1)
    sep_embedding = cf_embedding(torch.tensor([sep_token_id], device=device)).view(-1)
    pad_embedding = cf_embedding(torch.tensor([pad_token_id], device=device)).view(-1)
    
    return lambda sentences: mltokenizer_model_predict(runner, model, cls_embedding, sep_embedding,
                                                      pad_embedding, sentences, return_pred_tensor)

## Evaluation Helpers

Evaluates a wrapped model on a test set

In [35]:
@torch.no_grad()
def evaluate_model(model, test_set):
    num_batches = math.ceil(len(test_set) / batch_size)
    
    sentences = [ x['text'] for x in test_set ]
    labels = [ x['label'] for x in test_set ]
    pred_batches = []
    
    for i in tqdm(range(num_batches)):
        bs = i * batch_size
        be = bs + batch_size
        
        output = model(sentences[bs:be])
        
        pred_batches.append(torch.argmax(output.logits, dim=1).detach().cpu())
    preds = torch.cat(pred_batches)
    
    print(classification_report(labels, preds, digits=4))
    
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='macro')
    
    return accuracy, f1

Evaluates a wrapped model on a stochastic, pseudo-adversarial test set. This means that each input sentence is replicated x times (typically 10) with randomized perturbations, and an attack is considered successful if *any* of the predictions is incorrect.

In [36]:
@torch.no_grad()
def evaluate_model_adv(model, test_sets):
    labels = [ x['label'] for x in test_sets[0] ]
    adv_preds = copy.copy(labels)
    accuracy_list = []
    f1_list = []
    
    for idx in tqdm(range(len(test_sets))):
        test_set = test_sets[idx]
        num_batches = math.ceil(len(test_set) / batch_size)
    
        sentences = [ x['text'] for x in test_set ]
        pred_batches = []
    
        for i in range(num_batches):
            bs = i * batch_size
            be = bs + batch_size
        
            output = model(sentences[bs:be])
        
            pred_batches.append(torch.argmax(output.logits, dim=1).detach().cpu())
        preds = torch.cat(pred_batches)
        
        for i in range(len(adv_preds)):
            if labels[i] != preds[i]:
                adv_preds[i] = preds[i]

        accuracy_list.append(accuracy_score(labels, adv_preds))
        f1_list.append(f1_score(labels, adv_preds, average='macro'))
    
    print(classification_report(labels, adv_preds, digits=4))    
    
    return accuracy_list, f1_list

Evaluates a model using WordScoreAttack

In [37]:
@torch.no_grad()
def evaluate_model_word_score(model, test_set, allow_whitespace_pert=True, report_prefix=None, word_scores_file=None):
    attacker = BertWordScoreAttack(
        WordScramblerPerturber(perturb_prob=1, weight_add=1, weight_drop=1, weight_swap=1,
                               weight_split_word=int(allow_whitespace_pert),
                               weight_merge_words=0),
        word_scores_file, model, tokenizer=None, max_sequence_length=max_sequence_length,
        attack_whitespace=allow_whitespace_pert,
    )

    res = attacker.attack(test_set, max_tokens_to_perturb=10, max_tries_per_token=4, mode=0, print_summary=False)

    if report_prefix is not None:
        res.to_csv(f"{report_prefix}_df.csv")
        with open(f"{report_prefix}_stats.json", "w") as f:
            json.dump(attacker.compute_attack_stats(), fp=f)            
    
    print(classification_report(res['ground_truth'], res['perturbed_preds'], digits=4))    
    
    accuracy = accuracy_score(res['ground_truth'], res['perturbed_preds'])
    f1 = f1_score(res['ground_truth'], res['perturbed_preds'], average='macro')
    
    return accuracy, f1

In [38]:
all_models = {
    'baseline': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task]),
    'baseline_all_pert': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned_all_pert[type][task]),
    'roben_1': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task], roben_recoverer),
    'roben_2': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task], roben_recoverer2),
    'roben_1_tok': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task], roben_recoverer, recoverer_tokenize=True),
    'roben_2_tok': lambda task, type: wrap_standard_model(tokenizer[type], model_finetuned[type][task], roben_recoverer2, recoverer_tokenize=True),
}

mltok_model_names = [
    '64k_lstm_clean_vanilla',
    '64k_lstm_no_whitespace_pert_vanilla',
    '64k_lstm_all_pert_vanilla',
    '64k_lstm_clean_finetuned',
    '64k_lstm_no_whitespace_pert_finetuned',
    '64k_lstm_all_pert_finetuned',
    '64k_cnn_no_whitespace_pert_finetuned',
    '2m_lstm_all_pert_finetuned',
    '32k_lstm_all_pert_finetuned_100ep',
]

for name in mltok_model_names:
    if name.endswith('_vanilla'):
        cf_embedding = lambda task, type: model_base[type].embeddings.word_embeddings
        filename = lambda task, type, name: f'output/{type}_{name}'
    else:
        cf_embedding = lambda task, type: model_finetuned[type][task].base_model.embeddings.word_embeddings
        filename = lambda task, type, name: f'output/{type}_{name}_{task}'
    # name=name is a hack to avoid Python late binding
    all_models[name] = lambda task, type, name=name, filename=filename, cf_embedding=cf_embedding: wrap_mltokenizer_model(filename(task, type, name), tokenizer[type], model_finetuned[type][task], cf_embedding(task, type), type)

In [39]:
evaluations = [
    'clean',
    'stochastic_no_ws',
    'stochastic_incl_ws',
    'word_score_no_ws',
    'word_score_incl_ws',
]

In [40]:
model_task_ids = [ f"{model}_{type}_{task}" for task, type, model in itertools.product(tasks, model_types, all_models.keys()) ]

accuracy_df = pd.DataFrame(columns=evaluations, index=model_task_ids)
f1_df = pd.DataFrame(columns=evaluations, index=model_task_ids)

for task, type in itertools.product(tasks, model_types):
    for cur_model_name, cur_model_factory in all_models.items():
        try:
            cur_model = cur_model_factory(task, type)
        except:
            print(f'Failed loading model {cur_model_name} on {type} for task {task}, skipping')
            accuracy_df.drop(f"{cur_model_name}_{type}_{task}", inplace=True)
            f1_df.drop(f"{cur_model_name}_{type}_{task}", inplace=True)
            continue
        for cur_evaluation in evaluations:
            print(f'Evaluating model {cur_model_name} on {type} on {cur_evaluation} for task {task}')
            start_time = time.time()
            random.seed(11)
            if cur_evaluation == 'clean':
                acc, f1 = evaluate_model(cur_model, sampled_test_set[task])
            elif cur_evaluation.startswith('stochastic_'):
                if cur_evaluation == 'stochastic_no_ws':
                    acc_list, f1_list = evaluate_model_adv(cur_model, sampled_test_set_adv_no_ws[task])
                elif cur_evaluation == 'stochastic_incl_ws':
                    acc_list, f1_list = evaluate_model_adv(cur_model, sampled_test_set_adv_incl_ws[task])
                acc = acc_list[-1]
                f1 = f1_list[-1]
                with open(f"../output/eval/{cur_model_name}_{type}_{task}_{cur_evaluation}_acc_list.json", "w") as f:
                    json.dump(acc_list, fp=f)
                with open(f"../output/eval/{cur_model_name}_{type}_{task}_{cur_evaluation}_f1_list.json", "w") as f:
                    json.dump(f1_list, fp=f)
            elif cur_evaluation.startswith('word_score_'):
                if cur_evaluation == 'word_score_no_ws':
                    acc, f1 = evaluate_model_word_score(cur_model, sampled_test_set[task], allow_whitespace_pert=False,
                                                        report_prefix=f"../output/eval/{cur_model_name}_{type}_{task}_{cur_evaluation}",
                                                        word_scores_file=f"../output/{task}_word_scores.json")
                elif cur_evaluation == 'word_score_incl_ws':
                    acc, f1 = evaluate_model_word_score(cur_model, sampled_test_set[task], allow_whitespace_pert=True,
                                                        report_prefix=f"../output/eval/{cur_model_name}_{type}_{task}_{cur_evaluation}",
                                                        word_scores_file=f"../output/{task}_word_scores.json")

            accuracy_df[cur_evaluation][f"{cur_model_name}_{type}_{task}"] = acc
            f1_df[cur_evaluation][f"{cur_model_name}_{type}_{task}"] = f1
            end_time = time.time()
            print(f"Evaluation took {end_time-start_time} seconds")
        del cur_model

Evaluating model baseline on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.26it/s]


              precision    recall  f1-score   support

           0     0.9511    0.9620    0.9565       263
           1     0.9573    0.9451    0.9512       237

    accuracy                         0.9540       500
   macro avg     0.9542    0.9536    0.9538       500
weighted avg     0.9540    0.9540    0.9540       500

Evaluation took 2.617372512817383 seconds
Evaluating model baseline on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.58s/it]


              precision    recall  f1-score   support

           0     0.8783    0.8783    0.8783       263
           1     0.8650    0.8650    0.8650       237

    accuracy                         0.8720       500
   macro avg     0.8717    0.8717    0.8717       500
weighted avg     0.8720    0.8720    0.8720       500

Evaluation took 25.811915159225464 seconds
Evaluating model baseline on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.59s/it]


              precision    recall  f1-score   support

           0     0.8556    0.8783    0.8668       263
           1     0.8609    0.8354    0.8480       237

    accuracy                         0.8580       500
   macro avg     0.8582    0.8569    0.8574       500
weighted avg     0.8581    0.8580    0.8579       500

Evaluation took 25.952208995819092 seconds
Evaluating model baseline on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:34<00:00,  5.29it/s]


              precision    recall  f1-score   support

           0     0.7567    0.7567    0.7567       263
           1     0.7300    0.7300    0.7300       237

    accuracy                         0.7440       500
   macro avg     0.7433    0.7433    0.7433       500
weighted avg     0.7440    0.7440    0.7440       500

Evaluation took 94.53224587440491 seconds
Evaluating model baseline on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:34<00:00,  5.31it/s]


              precision    recall  f1-score   support

           0     0.7101    0.7452    0.7273       263
           1     0.7009    0.6624    0.6811       237

    accuracy                         0.7060       500
   macro avg     0.7055    0.7038    0.7042       500
weighted avg     0.7058    0.7060    0.7054       500

Evaluation took 94.23304080963135 seconds
Failed loading model baseline_all_pert on bert for task yelp_bin, skipping
Evaluating model roben_1 on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.59it/s]


              precision    recall  f1-score   support

           0     0.8493    0.9430    0.8937       263
           1     0.9279    0.8143    0.8674       237

    accuracy                         0.8820       500
   macro avg     0.8886    0.8787    0.8806       500
weighted avg     0.8866    0.8820    0.8812       500

Evaluation took 2.9076454639434814 seconds
Evaluating model roben_1 on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.85s/it]


              precision    recall  f1-score   support

           0     0.7833    0.8935    0.8348       263
           1     0.8600    0.7257    0.7872       237

    accuracy                         0.8140       500
   macro avg     0.8217    0.8096    0.8110       500
weighted avg     0.8197    0.8140    0.8122       500

Evaluation took 28.50887155532837 seconds
Evaluating model roben_1 on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.86s/it]


              precision    recall  f1-score   support

           0     0.7320    0.8517    0.7873       263
           1     0.7990    0.6540    0.7193       237

    accuracy                         0.7580       500
   macro avg     0.7655    0.7529    0.7533       500
weighted avg     0.7638    0.7580    0.7551       500

Evaluation took 28.603731155395508 seconds
Evaluating model roben_1 on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:35<00:00,  5.26it/s]


              precision    recall  f1-score   support

           0     0.7372    0.8745    0.8000       263
           1     0.8245    0.6540    0.7294       237

    accuracy                         0.7700       500
   macro avg     0.7808    0.7643    0.7647       500
weighted avg     0.7786    0.7700    0.7665       500

Evaluation took 95.15999341011047 seconds
Evaluating model roben_1 on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:27<00:00,  5.72it/s]


              precision    recall  f1-score   support

           0     0.6301    0.7643    0.6907       263
           1     0.6575    0.5021    0.5694       237

    accuracy                         0.6400       500
   macro avg     0.6438    0.6332    0.6300       500
weighted avg     0.6431    0.6400    0.6332       500

Evaluation took 87.36890649795532 seconds
Evaluating model roben_2 on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.68it/s]


              precision    recall  f1-score   support

           0     0.8881    0.9354    0.9111       263
           1     0.9238    0.8692    0.8957       237

    accuracy                         0.9040       500
   macro avg     0.9059    0.9023    0.9034       500
weighted avg     0.9050    0.9040    0.9038       500

Evaluation took 2.8601675033569336 seconds
Evaluating model roben_2 on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.81s/it]


              precision    recall  f1-score   support

           0     0.8380    0.9049    0.8702       263
           1     0.8843    0.8059    0.8433       237

    accuracy                         0.8580       500
   macro avg     0.8611    0.8554    0.8567       500
weighted avg     0.8599    0.8580    0.8574       500

Evaluation took 28.1508629322052 seconds
Evaluating model roben_2 on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.82s/it]


              precision    recall  f1-score   support

           0     0.7700    0.8403    0.8036       263
           1     0.8028    0.7215    0.7600       237

    accuracy                         0.7840       500
   macro avg     0.7864    0.7809    0.7818       500
weighted avg     0.7856    0.7840    0.7830       500

Evaluation took 28.1719970703125 seconds
Evaluating model roben_2 on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:37<00:00,  5.11it/s]


              precision    recall  f1-score   support

           0     0.8097    0.8897    0.8478       263
           1     0.8626    0.7679    0.8125       237

    accuracy                         0.8320       500
   macro avg     0.8361    0.8288    0.8302       500
weighted avg     0.8347    0.8320    0.8311       500

Evaluation took 97.92685461044312 seconds
Evaluating model roben_2 on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:30<00:00,  5.50it/s]


              precision    recall  f1-score   support

           0     0.6788    0.7795    0.7257       263
           1     0.7071    0.5907    0.6437       237

    accuracy                         0.6900       500
   macro avg     0.6929    0.6851    0.6847       500
weighted avg     0.6922    0.6900    0.6868       500

Evaluation took 90.94003105163574 seconds
Evaluating model roben_1_tok on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:03<00:00,  5.32it/s]


              precision    recall  f1-score   support

           0     0.8237    0.9240    0.8710       263
           1     0.9024    0.7806    0.8371       237

    accuracy                         0.8560       500
   macro avg     0.8631    0.8523    0.8540       500
weighted avg     0.8610    0.8560    0.8549       500

Evaluation took 3.0526790618896484 seconds
Evaluating model roben_1_tok on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:30<00:00,  3.08s/it]


              precision    recall  f1-score   support

           0     0.7855    0.9049    0.8410       263
           1     0.8731    0.7257    0.7926       237

    accuracy                         0.8200       500
   macro avg     0.8293    0.8153    0.8168       500
weighted avg     0.8270    0.8200    0.8181       500

Evaluation took 30.849148273468018 seconds
Evaluating model roben_1_tok on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:30<00:00,  3.04s/it]


              precision    recall  f1-score   support

           0     0.6854    0.8365    0.7534       263
           1     0.7598    0.5738    0.6538       237

    accuracy                         0.7120       500
   macro avg     0.7226    0.7052    0.7036       500
weighted avg     0.7206    0.7120    0.7062       500

Evaluation took 30.45531463623047 seconds
Evaluating model roben_1_tok on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:43<00:00,  4.85it/s]


              precision    recall  f1-score   support

           0     0.7508    0.8935    0.8160       263
           1     0.8503    0.6709    0.7500       237

    accuracy                         0.7880       500
   macro avg     0.8005    0.7822    0.7830       500
weighted avg     0.7979    0.7880    0.7847       500

Evaluation took 103.16852831840515 seconds
Evaluating model roben_1_tok on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:33<00:00,  5.37it/s]


              precision    recall  f1-score   support

           0     0.6204    0.7643    0.6848       263
           1     0.6477    0.4810    0.5521       237

    accuracy                         0.6300       500
   macro avg     0.6340    0.6226    0.6184       500
weighted avg     0.6333    0.6300    0.6219       500

Evaluation took 93.12048006057739 seconds
Evaluating model roben_2_tok on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:03<00:00,  5.31it/s]


              precision    recall  f1-score   support

           0     0.8732    0.9163    0.8942       263
           1     0.9018    0.8523    0.8764       237

    accuracy                         0.8860       500
   macro avg     0.8875    0.8843    0.8853       500
weighted avg     0.8867    0.8860    0.8858       500

Evaluation took 3.05865740776062 seconds
Evaluating model roben_2_tok on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:30<00:00,  3.01s/it]


              precision    recall  f1-score   support

           0     0.8429    0.8973    0.8692       263
           1     0.8773    0.8143    0.8446       237

    accuracy                         0.8580       500
   macro avg     0.8601    0.8558    0.8569       500
weighted avg     0.8592    0.8580    0.8576       500

Evaluation took 30.11723017692566 seconds
Evaluating model roben_2_tok on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:30<00:00,  3.01s/it]


              precision    recall  f1-score   support

           0     0.7568    0.8517    0.8014       263
           1     0.8088    0.6962    0.7483       237

    accuracy                         0.7780       500
   macro avg     0.7828    0.7740    0.7749       500
weighted avg     0.7814    0.7780    0.7762       500

Evaluation took 30.10861039161682 seconds
Evaluating model roben_2_tok on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:45<00:00,  4.76it/s]


              precision    recall  f1-score   support

           0     0.8127    0.8745    0.8425       263
           1     0.8479    0.7764    0.8106       237

    accuracy                         0.8280       500
   macro avg     0.8303    0.8254    0.8265       500
weighted avg     0.8294    0.8280    0.8274       500

Evaluation took 105.05119395256042 seconds
Evaluating model roben_2_tok on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:36<00:00,  5.18it/s]


              precision    recall  f1-score   support

           0     0.6645    0.7757    0.7158       263
           1     0.6943    0.5654    0.6233       237

    accuracy                         0.6760       500
   macro avg     0.6794    0.6705    0.6695       500
weighted avg     0.6786    0.6760    0.6719       500

Evaluation took 96.61831998825073 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:25<00:00,  1.59s/it]


              precision    recall  f1-score   support

           0     0.9513    0.9658    0.9585       263
           1     0.9614    0.9451    0.9532       237

    accuracy                         0.9560       500
   macro avg     0.9563    0.9555    0.9558       500
weighted avg     0.9561    0.9560    0.9560       500

Evaluation took 25.528133630752563 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [04:13<00:00, 25.37s/it]


              precision    recall  f1-score   support

           0     0.9102    0.8859    0.8979       263
           1     0.8770    0.9030    0.8898       237

    accuracy                         0.8940       500
   macro avg     0.8936    0.8944    0.8938       500
weighted avg     0.8945    0.8940    0.8941       500

Evaluation took 253.67984700202942 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [04:05<00:00, 24.59s/it]


              precision    recall  f1-score   support

           0     0.8933    0.8593    0.8760       263
           1     0.8502    0.8861    0.8678       237

    accuracy                         0.8720       500
   macro avg     0.8717    0.8727    0.8719       500
weighted avg     0.8729    0.8720    0.8721       500

Evaluation took 245.9354429244995 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [15:34<00:00,  1.87s/it]


              precision    recall  f1-score   support

           0     0.7967    0.7452    0.7701       263
           1     0.7362    0.7890    0.7617       237

    accuracy                         0.7660       500
   macro avg     0.7665    0.7671    0.7659       500
weighted avg     0.7681    0.7660    0.7661       500

Evaluation took 934.6562955379486 seconds
Evaluating model 64k_lstm_clean_vanilla on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [12:37<00:00,  1.52s/it]


              precision    recall  f1-score   support

           0     0.7410    0.7072    0.7237       263
           1     0.6908    0.7257    0.7078       237

    accuracy                         0.7160       500
   macro avg     0.7159    0.7165    0.7158       500
weighted avg     0.7172    0.7160    0.7162       500

Evaluation took 757.9176194667816 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:21<00:00,  1.36s/it]


              precision    recall  f1-score   support

           0     0.9620    0.9620    0.9620       263
           1     0.9578    0.9578    0.9578       237

    accuracy                         0.9600       500
   macro avg     0.9599    0.9599    0.9599       500
weighted avg     0.9600    0.9600    0.9600       500

Evaluation took 21.861830711364746 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:40<00:00, 22.09s/it]


              precision    recall  f1-score   support

           0     0.9280    0.9316    0.9298       263
           1     0.9237    0.9198    0.9218       237

    accuracy                         0.9260       500
   macro avg     0.9259    0.9257    0.9258       500
weighted avg     0.9260    0.9260    0.9260       500

Evaluation took 220.85825777053833 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:38<00:00, 21.82s/it]


              precision    recall  f1-score   support

           0     0.9008    0.8973    0.8990       263
           1     0.8866    0.8903    0.8884       237

    accuracy                         0.8940       500
   macro avg     0.8937    0.8938    0.8937       500
weighted avg     0.8940    0.8940    0.8940       500

Evaluation took 218.22774028778076 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:47<00:00,  1.66s/it]


              precision    recall  f1-score   support

           0     0.8959    0.9163    0.9060       263
           1     0.9048    0.8819    0.8932       237

    accuracy                         0.9000       500
   macro avg     0.9003    0.8991    0.8996       500
weighted avg     0.9001    0.9000    0.8999       500

Evaluation took 827.7421262264252 seconds
Evaluating model 64k_lstm_no_whitespace_pert_vanilla on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:08<00:00,  1.58s/it]


              precision    recall  f1-score   support

           0     0.8016    0.7833    0.7923       263
           1     0.7654    0.7848    0.7750       237

    accuracy                         0.7840       500
   macro avg     0.7835    0.7840    0.7837       500
weighted avg     0.7844    0.7840    0.7841       500

Evaluation took 788.0917520523071 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:21<00:00,  1.37s/it]


              precision    recall  f1-score   support

           0     0.9656    0.9620    0.9638       263
           1     0.9580    0.9620    0.9600       237

    accuracy                         0.9620       500
   macro avg     0.9618    0.9620    0.9619       500
weighted avg     0.9620    0.9620    0.9620       500

Evaluation took 21.981257915496826 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:39<00:00, 21.99s/it]


              precision    recall  f1-score   support

           0     0.9462    0.9354    0.9407       263
           1     0.9292    0.9409    0.9350       237

    accuracy                         0.9380       500
   macro avg     0.9377    0.9381    0.9379       500
weighted avg     0.9381    0.9380    0.9380       500

Evaluation took 219.8661594390869 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:38<00:00, 21.89s/it]


              precision    recall  f1-score   support

           0     0.9421    0.9278    0.9349       263
           1     0.9212    0.9367    0.9289       237

    accuracy                         0.9320       500
   macro avg     0.9316    0.9322    0.9319       500
weighted avg     0.9322    0.9320    0.9320       500

Evaluation took 218.93701577186584 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:27<00:00,  1.61s/it]


              precision    recall  f1-score   support

           0     0.8755    0.8289    0.8516       263
           1     0.8207    0.8692    0.8443       237

    accuracy                         0.8480       500
   macro avg     0.8481    0.8490    0.8479       500
weighted avg     0.8495    0.8480    0.8481       500

Evaluation took 807.3517551422119 seconds
Evaluating model 64k_lstm_all_pert_vanilla on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:38<00:00,  1.64s/it]


              precision    recall  f1-score   support

           0     0.8833    0.8631    0.8731       263
           1     0.8519    0.8734    0.8625       237

    accuracy                         0.8680       500
   macro avg     0.8676    0.8683    0.8678       500
weighted avg     0.8684    0.8680    0.8681       500

Evaluation took 818.8644361495972 seconds
Failed loading model 64k_lstm_clean_finetuned on bert for task yelp_bin, skipping
Failed loading model 64k_lstm_no_whitespace_pert_finetuned on bert for task yelp_bin, skipping
Evaluating model 64k_lstm_all_pert_finetuned on bert on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:21<00:00,  1.36s/it]


              precision    recall  f1-score   support

           0     0.9620    0.9620    0.9620       263
           1     0.9578    0.9578    0.9578       237

    accuracy                         0.9600       500
   macro avg     0.9599    0.9599    0.9599       500
weighted avg     0.9600    0.9600    0.9600       500

Evaluation took 21.827037811279297 seconds
Evaluating model 64k_lstm_all_pert_finetuned on bert on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:40<00:00, 22.10s/it]


              precision    recall  f1-score   support

           0     0.9466    0.9430    0.9448       263
           1     0.9370    0.9409    0.9389       237

    accuracy                         0.9420       500
   macro avg     0.9418    0.9419    0.9419       500
weighted avg     0.9420    0.9420    0.9420       500

Evaluation took 220.98148798942566 seconds
Evaluating model 64k_lstm_all_pert_finetuned on bert on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:39<00:00, 21.99s/it]


              precision    recall  f1-score   support

           0     0.9356    0.9392    0.9374       263
           1     0.9322    0.9283    0.9302       237

    accuracy                         0.9340       500
   macro avg     0.9339    0.9337    0.9338       500
weighted avg     0.9340    0.9340    0.9340       500

Evaluation took 219.90367007255554 seconds
Evaluating model 64k_lstm_all_pert_finetuned on bert on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:46<00:00,  1.65s/it]


              precision    recall  f1-score   support

           0     0.8919    0.8783    0.8851       263
           1     0.8672    0.8819    0.8745       237

    accuracy                         0.8800       500
   macro avg     0.8796    0.8801    0.8798       500
weighted avg     0.8802    0.8800    0.8800       500

Evaluation took 826.4246244430542 seconds
Evaluating model 64k_lstm_all_pert_finetuned on bert on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:49<00:00,  1.66s/it]


              precision    recall  f1-score   support

           0     0.8902    0.8631    0.8764       263
           1     0.8531    0.8819    0.8672       237

    accuracy                         0.8720       500
   macro avg     0.8716    0.8725    0.8718       500
weighted avg     0.8726    0.8720    0.8721       500

Evaluation took 829.1796503067017 seconds
Failed loading model 64k_cnn_no_whitespace_pert_finetuned on bert for task yelp_bin, skipping
Failed loading model 2m_lstm_all_pert_finetuned on bert for task yelp_bin, skipping
Failed loading model 32k_lstm_all_pert_finetuned_100ep on bert for task yelp_bin, skipping
Evaluating model baseline on roberta on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.21it/s]


              precision    recall  f1-score   support

           0     0.9583    0.9620    0.9602       263
           1     0.9576    0.9536    0.9556       237

    accuracy                         0.9580       500
   macro avg     0.9580    0.9578    0.9579       500
weighted avg     0.9580    0.9580    0.9580       500

Evaluation took 2.6168811321258545 seconds
Evaluating model baseline on roberta on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.53s/it]


              precision    recall  f1-score   support

           0     0.9109    0.8935    0.9021       263
           1     0.8843    0.9030    0.8935       237

    accuracy                         0.8980       500
   macro avg     0.8976    0.8982    0.8978       500
weighted avg     0.8983    0.8980    0.8980       500

Evaluation took 25.30278491973877 seconds
Evaluating model baseline on roberta on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.53s/it]


              precision    recall  f1-score   support

           0     0.8876    0.9011    0.8943       263
           1     0.8884    0.8734    0.8809       237

    accuracy                         0.8880       500
   macro avg     0.8880    0.8873    0.8876       500
weighted avg     0.8880    0.8880    0.8879       500

Evaluation took 25.35331630706787 seconds
Evaluating model baseline on roberta on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:28<00:00,  5.66it/s]


              precision    recall  f1-score   support

           0     0.7553    0.8099    0.7817       263
           1     0.7706    0.7089    0.7385       237

    accuracy                         0.7620       500
   macro avg     0.7630    0.7594    0.7601       500
weighted avg     0.7626    0.7620    0.7612       500

Evaluation took 88.31887221336365 seconds
Evaluating model baseline on roberta on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:28<00:00,  5.65it/s]


              precision    recall  f1-score   support

           0     0.7571    0.8061    0.7808       263
           1     0.7682    0.7131    0.7396       237

    accuracy                         0.7620       500
   macro avg     0.7627    0.7596    0.7602       500
weighted avg     0.7624    0.7620    0.7613       500

Evaluation took 88.58341717720032 seconds
Failed loading model baseline_all_pert on roberta for task yelp_bin, skipping
Evaluating model roben_1 on roberta on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.27it/s]


              precision    recall  f1-score   support

           0     0.8961    0.9506    0.9225       263
           1     0.9412    0.8776    0.9083       237

    accuracy                         0.9160       500
   macro avg     0.9186    0.9141    0.9154       500
weighted avg     0.9174    0.9160    0.9158       500

Evaluation took 2.593519449234009 seconds
Evaluating model roben_1 on roberta on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.54s/it]


              precision    recall  f1-score   support

           0     0.8380    0.9049    0.8702       263
           1     0.8843    0.8059    0.8433       237

    accuracy                         0.8580       500
   macro avg     0.8611    0.8554    0.8567       500
weighted avg     0.8599    0.8580    0.8574       500

Evaluation took 25.367953300476074 seconds
Evaluating model roben_1 on roberta on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.54s/it]


              precision    recall  f1-score   support

           0     0.7945    0.8821    0.8360       263
           1     0.8510    0.7468    0.7955       237

    accuracy                         0.8180       500
   macro avg     0.8227    0.8145    0.8158       500
weighted avg     0.8213    0.8180    0.8168       500

Evaluation took 25.379795789718628 seconds
Evaluating model roben_1 on roberta on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:29<00:00,  5.56it/s]


              precision    recall  f1-score   support

           0     0.7793    0.8859    0.8292       263
           1     0.8507    0.7215    0.7808       237

    accuracy                         0.8080       500
   macro avg     0.8150    0.8037    0.8050       500
weighted avg     0.8131    0.8080    0.8063       500

Evaluation took 90.01756143569946 seconds
Evaluating model roben_1 on roberta on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:23<00:00,  6.01it/s]


              precision    recall  f1-score   support

           0     0.6562    0.7985    0.7204       263
           1     0.7056    0.5359    0.6091       237

    accuracy                         0.6740       500
   macro avg     0.6809    0.6672    0.6648       500
weighted avg     0.6796    0.6740    0.6677       500

Evaluation took 83.2600610256195 seconds
Evaluating model roben_2 on roberta on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  6.26it/s]


              precision    recall  f1-score   support

           0     0.9000    0.9240    0.9118       263
           1     0.9130    0.8861    0.8994       237

    accuracy                         0.9060       500
   macro avg     0.9065    0.9050    0.9056       500
weighted avg     0.9062    0.9060    0.9059       500

Evaluation took 2.5954816341400146 seconds
Evaluating model roben_2 on roberta on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.54s/it]


              precision    recall  f1-score   support

           0     0.8727    0.8859    0.8792       263
           1     0.8712    0.8565    0.8638       237

    accuracy                         0.8720       500
   macro avg     0.8720    0.8712    0.8715       500
weighted avg     0.8720    0.8720    0.8719       500

Evaluation took 25.37122106552124 seconds
Evaluating model roben_2 on roberta on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:25<00:00,  2.54s/it]


              precision    recall  f1-score   support

           0     0.8233    0.8859    0.8535       263
           1     0.8618    0.7890    0.8238       237

    accuracy                         0.8400       500
   macro avg     0.8425    0.8375    0.8386       500
weighted avg     0.8415    0.8400    0.8394       500

Evaluation took 25.391633987426758 seconds
Evaluating model roben_2 on roberta on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:30<00:00,  5.53it/s]


              precision    recall  f1-score   support

           0     0.8185    0.8745    0.8456       263
           1     0.8493    0.7848    0.8158       237

    accuracy                         0.8320       500
   macro avg     0.8339    0.8297    0.8307       500
weighted avg     0.8331    0.8320    0.8315       500

Evaluation took 90.37223887443542 seconds
Evaluating model roben_2 on roberta on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:25<00:00,  5.87it/s]


              precision    recall  f1-score   support

           0     0.7003    0.8175    0.7544       263
           1     0.7513    0.6118    0.6744       237

    accuracy                         0.7200       500
   macro avg     0.7258    0.7147    0.7144       500
weighted avg     0.7245    0.7200    0.7165       500

Evaluation took 85.21417427062988 seconds
Evaluating model roben_1_tok on roberta on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.83it/s]


              precision    recall  f1-score   support

           0     0.8781    0.9316    0.9041       263
           1     0.9186    0.8565    0.8865       237

    accuracy                         0.8960       500
   macro avg     0.8983    0.8940    0.8953       500
weighted avg     0.8973    0.8960    0.8957       500

Evaluation took 2.7857654094696045 seconds
Evaluating model roben_1_tok on roberta on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.73s/it]


              precision    recall  f1-score   support

           0     0.8194    0.8973    0.8566       263
           1     0.8726    0.7806    0.8241       237

    accuracy                         0.8420       500
   macro avg     0.8460    0.8390    0.8403       500
weighted avg     0.8447    0.8420    0.8412       500

Evaluation took 27.25647521018982 seconds
Evaluating model roben_1_tok on roberta on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.73s/it]


              precision    recall  f1-score   support

           0     0.7721    0.8631    0.8151       263
           1     0.8252    0.7173    0.7675       237

    accuracy                         0.7940       500
   macro avg     0.7987    0.7902    0.7913       500
weighted avg     0.7973    0.7940    0.7925       500

Evaluation took 27.258527994155884 seconds
Evaluating model roben_1_tok on roberta on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:35<00:00,  5.23it/s]


              precision    recall  f1-score   support

           0     0.8034    0.8859    0.8427       263
           1     0.8571    0.7595    0.8054       237

    accuracy                         0.8260       500
   macro avg     0.8303    0.8227    0.8240       500
weighted avg     0.8289    0.8260    0.8250       500

Evaluation took 95.7079222202301 seconds
Evaluating model roben_1_tok on roberta on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:29<00:00,  5.62it/s]


              precision    recall  f1-score   support

           0     0.6543    0.8061    0.7223       263
           1     0.7102    0.5274    0.6053       237

    accuracy                         0.6740       500
   macro avg     0.6823    0.6668    0.6638       500
weighted avg     0.6808    0.6740    0.6669       500

Evaluation took 89.03521203994751 seconds
Evaluating model roben_2_tok on roberta on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:02<00:00,  5.83it/s]


              precision    recall  f1-score   support

           0     0.9139    0.9278    0.9208       263
           1     0.9185    0.9030    0.9106       237

    accuracy                         0.9160       500
   macro avg     0.9162    0.9154    0.9157       500
weighted avg     0.9160    0.9160    0.9160       500

Evaluation took 2.7871503829956055 seconds
Evaluating model roben_2_tok on roberta on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.72s/it]


              precision    recall  f1-score   support

           0     0.8745    0.9011    0.8876       263
           1     0.8865    0.8565    0.8712       237

    accuracy                         0.8800       500
   macro avg     0.8805    0.8788    0.8794       500
weighted avg     0.8802    0.8800    0.8799       500

Evaluation took 27.24778127670288 seconds
Evaluating model roben_2_tok on roberta on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.73s/it]


              precision    recall  f1-score   support

           0     0.8229    0.9011    0.8603       263
           1     0.8774    0.7848    0.8285       237

    accuracy                         0.8460       500
   macro avg     0.8501    0.8430    0.8444       500
weighted avg     0.8487    0.8460    0.8452       500

Evaluation took 27.256755113601685 seconds
Evaluating model roben_2_tok on roberta on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:38<00:00,  5.07it/s]


              precision    recall  f1-score   support

           0     0.8566    0.8859    0.8710       263
           1     0.8684    0.8354    0.8516       237

    accuracy                         0.8620       500
   macro avg     0.8625    0.8607    0.8613       500
weighted avg     0.8622    0.8620    0.8618       500

Evaluation took 98.68831968307495 seconds
Evaluating model roben_2_tok on roberta on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:31<00:00,  5.46it/s]


              precision    recall  f1-score   support

           0     0.7010    0.8289    0.7596       263
           1     0.7619    0.6076    0.6761       237

    accuracy                         0.7240       500
   macro avg     0.7314    0.7182    0.7178       500
weighted avg     0.7299    0.7240    0.7200       500

Evaluation took 91.6050672531128 seconds
Failed loading model 64k_lstm_clean_vanilla on roberta for task yelp_bin, skipping
Failed loading model 64k_lstm_no_whitespace_pert_vanilla on roberta for task yelp_bin, skipping
Evaluating model 64k_lstm_all_pert_vanilla on roberta on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:21<00:00,  1.36s/it]


              precision    recall  f1-score   support

           0     0.9405    0.9620    0.9511       263
           1     0.9567    0.9325    0.9444       237

    accuracy                         0.9480       500
   macro avg     0.9486    0.9472    0.9478       500
weighted avg     0.9482    0.9480    0.9480       500

Evaluation took 21.743806838989258 seconds
Evaluating model 64k_lstm_all_pert_vanilla on roberta on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:39<00:00, 21.96s/it]


              precision    recall  f1-score   support

           0     0.9127    0.9544    0.9331       263
           1     0.9467    0.8987    0.9221       237

    accuracy                         0.9280       500
   macro avg     0.9297    0.9266    0.9276       500
weighted avg     0.9288    0.9280    0.9279       500

Evaluation took 219.63570499420166 seconds
Evaluating model 64k_lstm_all_pert_vanilla on roberta on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:39<00:00, 21.98s/it]


              precision    recall  f1-score   support

           0     0.9022    0.9468    0.9239       263
           1     0.9375    0.8861    0.9111       237

    accuracy                         0.9180       500
   macro avg     0.9198    0.9164    0.9175       500
weighted avg     0.9189    0.9180    0.9178       500

Evaluation took 219.77753233909607 seconds
Evaluating model 64k_lstm_all_pert_vanilla on roberta on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:33<00:00,  1.63s/it]


              precision    recall  f1-score   support

           0     0.8432    0.9202    0.8800       263
           1     0.9014    0.8101    0.8533       237

    accuracy                         0.8680       500
   macro avg     0.8723    0.8651    0.8667       500
weighted avg     0.8708    0.8680    0.8674       500

Evaluation took 813.4523239135742 seconds
Evaluating model 64k_lstm_all_pert_vanilla on roberta on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:42<00:00,  1.65s/it]


              precision    recall  f1-score   support

           0     0.8592    0.9049    0.8815       263
           1     0.8879    0.8354    0.8609       237

    accuracy                         0.8720       500
   macro avg     0.8735    0.8702    0.8712       500
weighted avg     0.8728    0.8720    0.8717       500

Evaluation took 822.9476583003998 seconds
Failed loading model 64k_lstm_clean_finetuned on roberta for task yelp_bin, skipping
Failed loading model 64k_lstm_no_whitespace_pert_finetuned on roberta for task yelp_bin, skipping
Evaluating model 64k_lstm_all_pert_finetuned on roberta on clean for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:21<00:00,  1.36s/it]


              precision    recall  f1-score   support

           0     0.9478    0.9658    0.9567       263
           1     0.9612    0.9409    0.9510       237

    accuracy                         0.9540       500
   macro avg     0.9545    0.9534    0.9538       500
weighted avg     0.9541    0.9540    0.9540       500

Evaluation took 21.79786252975464 seconds
Evaluating model 64k_lstm_all_pert_finetuned on roberta on stochastic_no_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:40<00:00, 22.08s/it]


              precision    recall  f1-score   support

           0     0.9259    0.9506    0.9381       263
           1     0.9435    0.9156    0.9293       237

    accuracy                         0.9340       500
   macro avg     0.9347    0.9331    0.9337       500
weighted avg     0.9342    0.9340    0.9339       500

Evaluation took 220.82987427711487 seconds
Evaluating model 64k_lstm_all_pert_finetuned on roberta on stochastic_incl_ws for task yelp_bin


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:39<00:00, 21.93s/it]


              precision    recall  f1-score   support

           0     0.9191    0.9506    0.9346       263
           1     0.9430    0.9072    0.9247       237

    accuracy                         0.9300       500
   macro avg     0.9311    0.9289    0.9297       500
weighted avg     0.9304    0.9300    0.9299       500

Evaluation took 219.33505630493164 seconds
Evaluating model 64k_lstm_all_pert_finetuned on roberta on word_score_no_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:25<00:00,  1.61s/it]


              precision    recall  f1-score   support

           0     0.8521    0.9202    0.8848       263
           1     0.9028    0.8228    0.8609       237

    accuracy                         0.8740       500
   macro avg     0.8774    0.8715    0.8729       500
weighted avg     0.8761    0.8740    0.8735       500

Evaluation took 805.4878494739532 seconds
Evaluating model 64k_lstm_all_pert_finetuned on roberta on word_score_incl_ws for task yelp_bin


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [13:37<00:00,  1.64s/it]

              precision    recall  f1-score   support

           0     0.8659    0.9087    0.8868       263
           1     0.8929    0.8439    0.8677       237

    accuracy                         0.8780       500
   macro avg     0.8794    0.8763    0.8773       500
weighted avg     0.8787    0.8780    0.8778       500

Evaluation took 817.88609790802 seconds
Failed loading model 64k_cnn_no_whitespace_pert_finetuned on roberta for task yelp_bin, skipping
Failed loading model 2m_lstm_all_pert_finetuned on roberta for task yelp_bin, skipping
Failed loading model 32k_lstm_all_pert_finetuned_100ep on roberta for task yelp_bin, skipping





In [41]:
accuracy_df

Unnamed: 0,clean,stochastic_no_ws,stochastic_incl_ws,word_score_no_ws,word_score_incl_ws
baseline_bert_yelp_bin,0.954,0.872,0.858,0.744,0.706
roben_1_bert_yelp_bin,0.882,0.814,0.758,0.77,0.64
roben_2_bert_yelp_bin,0.904,0.858,0.784,0.832,0.69
roben_1_tok_bert_yelp_bin,0.856,0.82,0.712,0.788,0.63
roben_2_tok_bert_yelp_bin,0.886,0.858,0.778,0.828,0.676
64k_lstm_clean_vanilla_bert_yelp_bin,0.956,0.894,0.872,0.766,0.716
64k_lstm_no_whitespace_pert_vanilla_bert_yelp_bin,0.96,0.926,0.894,0.9,0.784
64k_lstm_all_pert_vanilla_bert_yelp_bin,0.962,0.938,0.932,0.848,0.868
64k_lstm_all_pert_finetuned_bert_yelp_bin,0.96,0.942,0.934,0.88,0.872
baseline_roberta_yelp_bin,0.958,0.898,0.888,0.762,0.762


In [42]:
f1_df

Unnamed: 0,clean,stochastic_no_ws,stochastic_incl_ws,word_score_no_ws,word_score_incl_ws
baseline_bert_yelp_bin,0.953845,0.871653,0.857379,0.743306,0.7042
roben_1_bert_yelp_bin,0.880555,0.810999,0.753302,0.764706,0.63005
roben_2_bert_yelp_bin,0.903382,0.856734,0.781818,0.830163,0.684671
roben_1_tok_bert_yelp_bin,0.854036,0.816808,0.703635,0.782986,0.618448
roben_2_tok_bert_yelp_bin,0.885302,0.856942,0.774865,0.826532,0.669523
64k_lstm_clean_vanilla_bert_yelp_bin,0.955841,0.893847,0.871869,0.765924,0.715777
64k_lstm_no_whitespace_pert_vanilla_bert_yelp_bin,0.959892,0.925784,0.893734,0.899589,0.783654
64k_lstm_all_pert_vanilla_bert_yelp_bin,0.961905,0.937869,0.931868,0.847912,0.867788
64k_lstm_all_pert_finetuned_bert_yelp_bin,0.959892,0.941855,0.933807,0.879767,0.871834
baseline_roberta_yelp_bin,0.957877,0.89782,0.887595,0.760056,0.760227


In [43]:
accuracy_df.to_csv("../output/grid_accuracy.csv")

In [44]:
f1_df.to_csv("../output/grid_f1.csv")

-----