In [85]:
import sys
sys.path.append("..")

import copy
import cProfile
from datasets import load_dataset
import math
import pandas as pd
import random
from sklearn.metrics import classification_report, accuracy_score, f1_score
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, DataCollatorWithPadding, \
                         AutoModelForSequenceClassification, BertForSequenceClassification

from resilient_nlp.mini_roben import Clustering, ClusterRepRecoverer, ClusterRecovererWithPassthrough
from resilient_nlp.models import BertClassifier
from resilient_nlp.perturbers import ToyPerturber, WordScramblerPerturber
from runner import ExperimentRunner
from word_score_attack import BertWordScoreAttack

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Load the project's own IMDB dataset via `datasets.load_dataset`; a later cell reads
# its 'attack_eval_truncated' split. The stock dataset call is kept for reference:
# imdb = load_dataset('imdb')
imdb = load_dataset('artemis13fowl/imdb')

Using custom data configuration artemis13fowl--imdb-f63738dec0d5e230
Reusing dataset parquet (/home/scpdxcs/.cache/huggingface/datasets/parquet/artemis13fowl--imdb-f63738dec0d5e230/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/4 [00:00<?, ?it/s]

In [96]:
# Seed Python's RNG so re-running this cell always starts from the same state.
random.seed(11)
# Despite the name, this is the first 31 rows of the split, not a random sample.
sampled_test_set = imdb['attack_eval_truncated'].select(range(31))

# Plain list-of-dicts view (text + label only) of the selected rows. The original
# author notes that Hugging Face datasets appear to be immutable, so later cells
# copy and overwrite these dicts instead of the dataset object.
sampled_test_set_dict = [
    {field: row[field] for field in ('text', 'label')}
    for row in sampled_test_set
]

In [4]:
# Tokenizer loaded from the base checkpoint name; it is passed to every model
# wrapper built in the cells below.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
# Sequence classifier loaded from the fine-tuned checkpoint and moved to `device`.
# Every wrapper built below is given this same model instance.
checkpoint_finetuned = "artemis13fowl/bert-base-uncased-imdb"
model_finetuned = BertForSequenceClassification.from_pretrained(checkpoint_finetuned).to(device)

In [7]:
# Two recoverers, one per clustering pickle. They are later passed as the optional
# `recoverer` argument of `wrap_standard_model`.
# NOTE(review): `from_pickle` presumably unpickles these files — only load pickles
# that come from a trusted source.
# NOTE(review): the meaning of the "cache" argument is not visible in this notebook.
roben_clustering = Clustering.from_pickle("../vocab100000_ed1.pkl")
roben_recoverer = ClusterRecovererWithPassthrough("cache", roben_clustering)
# Second clustering; the file name suggests a gamma=0.3 variant — TODO confirm.
roben_clustering2 = Clustering.from_pickle("../vocab100000_ed1_gamma0.3.pkl")
roben_recoverer2 = ClusterRecovererWithPassthrough("cache", roben_clustering2)

In [97]:
# Fixed seed so the perturbed copies are identical on every run of this cell.
random.seed(11)
# Perturber with add/drop/swap enabled and the word split/merge weights set to 0.
wsp = WordScramblerPerturber(perturb_prob=0.1, weight_add=1, weight_drop=1, weight_swap=1,
                             weight_split_word=0, weight_merge_words=0)

# Ten independently perturbed copies of the test rows. Each copy keeps every field
# of the source row and replaces only 'text' with its perturbed version.
sampled_test_set_adv_no_ws = [
    [
        {**row, 'text': wsp.perturb([row['text']])[0][0]}
        for row in sampled_test_set_dict
    ]
    for _ in range(10)
]

In [98]:
# Fixed seed so the perturbed copies are identical on every run of this cell.
random.seed(11)
# Perturber with add/drop/swap enabled and the word split/merge weights set to 1.
wsp = WordScramblerPerturber(perturb_prob=0.1, weight_add=1, weight_drop=1, weight_swap=1,
                             weight_split_word=1, weight_merge_words=1)

# Ten independently perturbed copies of the test rows. Each copy keeps every field
# of the source row and replaces only 'text' with its perturbed version.
sampled_test_set_adv_incl_ws = [
    [
        {**row, 'text': wsp.perturb([row['text']])[0][0]}
        for row in sampled_test_set_dict
    ]
    for _ in range(10)
]

In [10]:
# Token budget used by both predict helpers (tokenizer `max_length` / `embed` `max_tokens`)
# and passed to the word-score attack.
max_sequence_length = 128
# Number of sentences sent to the model per call in the evaluate_* loops.
batch_size = 32
# NOTE(review): not referenced by any cell visible in this part of the notebook.
eval_steps = 100

In [11]:
def standard_model_predict(tokenizer, model, sentences, recoverer, return_pred_tensor):
    """Tokenize `sentences` and run them through `model`.

    Args:
        tokenizer: callable tokenizer; invoked with truncation and fixed-length
            padding to `max_sequence_length`, returning PyTorch tensors.
        model: model called with the tokenizer output as keyword arguments; its
            result must expose `.logits`.
        sentences: list of input strings.
        recoverer: optional object with a `recover(str)` method. When given, each
            sentence is lower-cased and passed through it before tokenization.
        return_pred_tensor: if true, return the raw model output; otherwise return
            the argmax of the logits along dim 1.

    Returns:
        The model output object, or a tensor of predicted class indices.
    """
    if recoverer is not None:
        sentences = [recoverer.recover(text.lower()) for text in sentences]

    encoded = tokenizer(
        sentences,
        truncation=True,
        padding='max_length',
        max_length=max_sequence_length,
        return_tensors='pt',
    )
    # Move every tensor produced by the tokenizer onto the device in use.
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
    outputs = model(**model_inputs)

    if not return_pred_tensor:
        return torch.argmax(outputs.logits, dim=1)
    return outputs

def wrap_standard_model(tokenizer, model, recoverer=None, return_pred_tensor=True):
    """Bind tokenizer, model and options into a one-argument predictor.

    Returns:
        A callable taking a list of sentences and forwarding it, together with the
        bound arguments, to `standard_model_predict`.
    """
    def predict(sentences):
        return standard_model_predict(tokenizer, model, sentences, recoverer, return_pred_tensor)

    return predict

In [12]:
def mltokenizer_model_predict(runner, model, cls_embedding, sep_embedding, pad_embedding, sentences, return_pred_tensor):
    """Embed `sentences` with `runner` and classify the embeddings with `model`.

    Args:
        runner: object whose `embed(...)` returns a mapping with 'inputs_embeds'
            and 'attention_mask'.
        model: model called with `inputs_embeds` and `attention_mask`; its result
            must expose `.logits`.
        cls_embedding, sep_embedding, pad_embedding: embeddings handed to
            `runner.embed` as start, end and pad tokens respectively.
        sentences: list of input strings.
        return_pred_tensor: if true, return the raw model output; otherwise return
            the argmax of the logits along dim 1.

    Returns:
        The model output object, or a tensor of predicted class indices.
    """
    # Lower-case, then cut each sentence to 5 characters per allowed token.
    # The cut is a performance measure only.
    char_limit = 5 * max_sequence_length
    prepared = [text.lower()[:char_limit] for text in sentences]

    embedded = runner.embed(
        sentences=prepared,
        start_token=cls_embedding,
        end_token=sep_embedding,
        pad_token=pad_embedding,
        max_tokens=max_sequence_length,
    )
    outputs = model(
        inputs_embeds=embedded['inputs_embeds'],
        attention_mask=embedded['attention_mask'],
    )

    if not return_pred_tensor:
        return torch.argmax(outputs.logits, dim=1)
    return outputs

def wrap_mltokenizer_model(mltokenizer_prefix, tokenizer, model, return_pred_tensor=True):
    """Build a one-argument predictor backed by an `ExperimentRunner` checkpoint.

    Args:
        mltokenizer_prefix: checkpoint path without extension, relative to the
            parent directory; "../{prefix}.pth" is loaded.
        tokenizer: tokenizer whose `vocab` supplies the ids of '[CLS]', '[SEP]'
            and '[PAD]'.
        model: classifier; its word-embedding table provides the embeddings of
            those three special tokens.
        return_pred_tensor: forwarded to `mltokenizer_model_predict`.

    Returns:
        A callable taking a list of sentences and returning what
        `mltokenizer_model_predict` returns for them.
    """
    runner = ExperimentRunner(device, model_filename="../{}.pth".format(mltokenizer_prefix))
    word_embeddings = model.base_model.embeddings.word_embeddings

    cls_id, sep_id, pad_id = (tokenizer.vocab[token] for token in ('[CLS]', '[SEP]', '[PAD]'))

    def embed_token(token_id):
        # Look up a single id and flatten the result to a 1-D vector.
        return word_embeddings(torch.tensor([token_id], device=device)).view(-1)

    cls_embedding = embed_token(cls_id)
    sep_embedding = embed_token(sep_id)
    pad_embedding = embed_token(pad_id)

    def predict(sentences):
        return mltokenizer_model_predict(runner, model, cls_embedding, sep_embedding,
                                         pad_embedding, sentences, return_pred_tensor)

    return predict

In [87]:
@torch.no_grad()
def evaluate_model(model, test_set):
    """Score `model` on `test_set` and print a classification report.

    Args:
        model: callable taking a list of sentences and returning an object with
            `.logits`.
        test_set: sized iterable of rows with 'text' and 'label' entries.

    Returns:
        Tuple `(accuracy, macro_f1)`.
    """
    texts, gold = [], []
    for example in test_set:
        texts.append(example['text'])
        gold.append(example['label'])

    # Walk the sentences in strides of `batch_size`; the final batch may be shorter.
    batch_starts = range(0, len(test_set), batch_size)
    predicted = torch.cat([
        torch.argmax(model(texts[start:start + batch_size]).logits, dim=1).detach().cpu()
        for start in tqdm(batch_starts)
    ])

    print(classification_report(gold, predicted, digits=4))

    return accuracy_score(gold, predicted), f1_score(gold, predicted, average='macro')

In [88]:
@torch.no_grad()
def evaluate_model_adv(model, test_sets):
    """Worst-case evaluation over several perturbed copies of one test set.

    A row counts as misclassified as soon as `model` gets it wrong in any of the
    copies processed so far. The running worst-case predictions start out equal to
    the labels and are overwritten with a wrong prediction whenever one occurs, so
    they work for any label set, not only 0/1.

    Args:
        model: callable taking a list of sentences and returning an object with
            `.logits`.
        test_sets: non-empty sequence of test sets. Every set must contain the
            same rows in the same order; labels are read from the first set only.

    Returns:
        Tuple `(accuracy_list, f1_list)`. Entry k holds the worst-case accuracy and
        macro F1 after the first k + 1 test sets, so the last entry covers all sets.

    Raises:
        ValueError: if a test set does not have as many rows as the first one.
    """
    labels = [x['label'] for x in test_sets[0]]
    # Starts as "all correct"; rows flip to a wrong prediction as attacks succeed.
    adv_preds = list(labels)
    accuracy_list = []
    f1_list = []

    for test_set in tqdm(test_sets):
        sentences = [x['text'] for x in test_set]
        if len(sentences) != len(labels):
            raise ValueError(
                f'Every test set must have {len(labels)} rows, got {len(sentences)}'
            )

        pred_batches = [
            torch.argmax(model(sentences[start:start + batch_size]).logits, dim=1).detach().cpu()
            for start in range(0, len(sentences), batch_size)
        ]
        # Plain Python ints keep `adv_preds` the same type as `labels`.
        preds = torch.cat(pred_batches).tolist()

        for row_idx, (label, pred) in enumerate(zip(labels, preds)):
            if pred != label:
                adv_preds[row_idx] = pred

        accuracy_list.append(accuracy_score(labels, adv_preds))
        f1_list.append(f1_score(labels, adv_preds, average='macro'))

    print(classification_report(labels, adv_preds, digits=4))

    return accuracy_list, f1_list

In [89]:
@torch.no_grad()
def evaluate_model_word_score(model, test_set, allow_whitespace_pert=True):
    """Run the word-score attack against `model` and score the attacked predictions.

    Args:
        model: predictor handed to `BertWordScoreAttack`.
        test_set: rows handed to `attacker.attack`.
        allow_whitespace_pert: when true, the perturber's word split/merge weights
            are 1, otherwise 0.

    Returns:
        Tuple `(accuracy, macro_f1)` computed from the attack result's
        'ground_truth' and 'perturbed_preds' entries.
    """
    whitespace_weight = int(allow_whitespace_pert)
    perturber = WordScramblerPerturber(
        perturb_prob=1, weight_add=1, weight_drop=1, weight_swap=1,
        weight_split_word=whitespace_weight,
        weight_merge_words=whitespace_weight,
    )
    attacker = BertWordScoreAttack(
        perturber,
        "../output/imdb_word_scores.json",
        model,
        tokenizer=None,
        max_sequence_length=max_sequence_length,
    )

    res = attacker.attack(test_set, max_tokens_to_query=10, max_tries_per_token=2, mode=0, print_summary=False)
    ground_truth = res['ground_truth']
    attacked_preds = res['perturbed_preds']

    print(classification_report(ground_truth, attacked_preds, digits=4))

    return accuracy_score(ground_truth, attacked_preds), f1_score(ground_truth, attacked_preds, average='macro')

In [16]:
# Baseline: standard tokenizer + fine-tuned model, no recoverer (the wrapper's default).
baseline_model = wrap_standard_model(tokenizer, model_finetuned)

In [17]:
# Stand-alone wrapper for one ExperimentRunner checkpoint.
# NOTE(review): the evaluation loop below does not reference this variable; the
# `all_models` cell builds its own wrapper for the same checkpoint name.
mltok_model = wrap_mltokenizer_model('output/64k_lstm_all_pert_finetuned', tokenizer, model_finetuned)

In [24]:
# Baseline model with inputs first passed through `roben_recoverer`.
baseline_roben_model = wrap_standard_model(tokenizer, model_finetuned, roben_recoverer)

In [25]:
# Baseline model with inputs first passed through `roben_recoverer2`.
baseline_roben_model2 = wrap_standard_model(tokenizer, model_finetuned, roben_recoverer2)

In [27]:
# Registry of every predictor under evaluation, keyed by display name.
# Insertion order determines the row order of the result tables.
all_models = {
    'baseline': baseline_model,
    'roben_1': baseline_roben_model,
    'roben_2': baseline_roben_model2,
}

# Checkpoint names under ../output/; each one is wrapped around the shared
# fine-tuned classifier and registered under its own name.
mltok_model_names = [
    '64k_lstm_clean_vanilla',
    '64k_lstm_no_whitespace_pert_vanilla',
    '64k_lstm_all_pert_vanilla',
    '64k_lstm_clean_finetuned',
    '64k_lstm_no_whitespace_pert_finetuned',
    '64k_lstm_all_pert_finetuned',
    '64k_cnn_no_whitespace_pert_finetuned',
]

all_models.update(
    (name, wrap_mltokenizer_model(f'output/{name}', tokenizer, model_finetuned))
    for name in mltok_model_names
)

In [40]:
# Evaluation scenarios. The names become the result-table columns and select the
# branch taken in the evaluation loop.
evaluations = [
    'clean',               # unperturbed test set (evaluate_model)
    'stochastic_no_ws',    # pre-generated perturbed sets, split/merge weights 0 (evaluate_model_adv)
    'stochastic_incl_ws',  # pre-generated perturbed sets, split/merge weights 1 (evaluate_model_adv)
    'word_score_no_ws',    # word-score attack with allow_whitespace_pert=False
    'word_score_incl_ws',  # word-score attack with allow_whitespace_pert=True
]

In [99]:
# Result tables: one row per model, one column per evaluation scenario.
accuracy_df = pd.DataFrame(columns=evaluations, index=all_models.keys())
f1_df = pd.DataFrame(columns=evaluations, index=all_models.keys())

for cur_model_name, cur_model in all_models.items():
    for cur_evaluation in evaluations:
        print(f'Evaluating model {cur_model_name} on {cur_evaluation}')
        # Re-seed before every run so each (model, evaluation) pair starts from the
        # same RNG state.
        random.seed(11)
        if cur_evaluation == 'clean':
            acc, f1 = evaluate_model(cur_model, sampled_test_set)
        elif cur_evaluation == 'stochastic_no_ws':
            acc_list, f1_list = evaluate_model_adv(cur_model, sampled_test_set_adv_no_ws)
            # The last entry is the worst case accumulated over all perturbed sets.
            acc, f1 = acc_list[-1], f1_list[-1]
        elif cur_evaluation == 'stochastic_incl_ws':
            acc_list, f1_list = evaluate_model_adv(cur_model, sampled_test_set_adv_incl_ws)
            acc, f1 = acc_list[-1], f1_list[-1]
        elif cur_evaluation == 'word_score_no_ws':
            acc, f1 = evaluate_model_word_score(cur_model, sampled_test_set, allow_whitespace_pert=False)
        elif cur_evaluation == 'word_score_incl_ws':
            acc, f1 = evaluate_model_word_score(cur_model, sampled_test_set, allow_whitespace_pert=True)
        else:
            # Fail loudly instead of silently recording the previous iteration's scores.
            raise ValueError(f'Unknown evaluation: {cur_evaluation!r}')

        # Single .loc assignment: chained indexing (df[col][row] = value) may write
        # to a temporary copy and is a no-op under pandas Copy-on-Write.
        accuracy_df.loc[cur_model_name, cur_evaluation] = acc
        f1_df.loc[cur_model_name, cur_evaluation] = f1

Evaluating model baseline on clean


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.97it/s]


              precision    recall  f1-score   support

           0     1.0000    0.9231    0.9600        13
           1     0.9474    1.0000    0.9730        18

    accuracy                         0.9677        31
   macro avg     0.9737    0.9615    0.9665        31
weighted avg     0.9694    0.9677    0.9675        31

Evaluating model baseline on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.28it/s]


              precision    recall  f1-score   support

           0     0.8000    0.6154    0.6957        13
           1     0.7619    0.8889    0.8205        18

    accuracy                         0.7742        31
   macro avg     0.7810    0.7521    0.7581        31
weighted avg     0.7779    0.7742    0.7682        31

Evaluating model baseline on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.28it/s]


              precision    recall  f1-score   support

           0     0.9000    0.6923    0.7826        13
           1     0.8095    0.9444    0.8718        18

    accuracy                         0.8387        31
   macro avg     0.8548    0.8184    0.8272        31
weighted avg     0.8475    0.8387    0.8344        31

Evaluating model baseline on word_score_no_ws


31it [00:14,  2.14it/s]


              precision    recall  f1-score   support

           0     0.6923    0.6923    0.6923        13
           1     0.7778    0.7778    0.7778        18

    accuracy                         0.7419        31
   macro avg     0.7350    0.7350    0.7350        31
weighted avg     0.7419    0.7419    0.7419        31

Evaluating model baseline on word_score_incl_ws


31it [00:14,  2.20it/s]


              precision    recall  f1-score   support

           0     0.6154    0.6154    0.6154        13
           1     0.7222    0.7222    0.7222        18

    accuracy                         0.6774        31
   macro avg     0.6688    0.6688    0.6688        31
weighted avg     0.6774    0.6774    0.6774        31

Evaluating model roben_1 on clean


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.26it/s]


              precision    recall  f1-score   support

           0     0.8182    0.6923    0.7500        13
           1     0.8000    0.8889    0.8421        18

    accuracy                         0.8065        31
   macro avg     0.8091    0.7906    0.7961        31
weighted avg     0.8076    0.8065    0.8035        31

Evaluating model roben_1 on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.20it/s]


              precision    recall  f1-score   support

           0     0.7778    0.5385    0.6364        13
           1     0.7273    0.8889    0.8000        18

    accuracy                         0.7419        31
   macro avg     0.7525    0.7137    0.7182        31
weighted avg     0.7485    0.7419    0.7314        31

Evaluating model roben_1 on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.21it/s]


              precision    recall  f1-score   support

           0     0.4444    0.3077    0.3636        13
           1     0.5909    0.7222    0.6500        18

    accuracy                         0.5484        31
   macro avg     0.5177    0.5150    0.5068        31
weighted avg     0.5295    0.5484    0.5299        31

Evaluating model roben_1 on word_score_no_ws


31it [00:12,  2.50it/s]


              precision    recall  f1-score   support

           0     0.6667    0.4615    0.5455        13
           1     0.6818    0.8333    0.7500        18

    accuracy                         0.6774        31
   macro avg     0.6742    0.6474    0.6477        31
weighted avg     0.6755    0.6774    0.6642        31

Evaluating model roben_1 on word_score_incl_ws


31it [00:12,  2.53it/s]


              precision    recall  f1-score   support

           0     0.6250    0.3846    0.4762        13
           1     0.6522    0.8333    0.7317        18

    accuracy                         0.6452        31
   macro avg     0.6386    0.6090    0.6039        31
weighted avg     0.6408    0.6452    0.6246        31

Evaluating model roben_2 on clean


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.25it/s]


              precision    recall  f1-score   support

           0     0.8333    0.7692    0.8000        13
           1     0.8421    0.8889    0.8649        18

    accuracy                         0.8387        31
   macro avg     0.8377    0.8291    0.8324        31
weighted avg     0.8384    0.8387    0.8377        31

Evaluating model roben_2 on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.23it/s]


              precision    recall  f1-score   support

           0     0.8182    0.6923    0.7500        13
           1     0.8000    0.8889    0.8421        18

    accuracy                         0.8065        31
   macro avg     0.8091    0.7906    0.7961        31
weighted avg     0.8076    0.8065    0.8035        31

Evaluating model roben_2 on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.23it/s]


              precision    recall  f1-score   support

           0     0.5455    0.4615    0.5000        13
           1     0.6500    0.7222    0.6842        18

    accuracy                         0.6129        31
   macro avg     0.5977    0.5919    0.5921        31
weighted avg     0.6062    0.6129    0.6070        31

Evaluating model roben_2 on word_score_no_ws


31it [00:12,  2.42it/s]


              precision    recall  f1-score   support

           0     0.7000    0.5385    0.6087        13
           1     0.7143    0.8333    0.7692        18

    accuracy                         0.7097        31
   macro avg     0.7071    0.6859    0.6890        31
weighted avg     0.7083    0.7097    0.7019        31

Evaluating model roben_2 on word_score_incl_ws


31it [00:12,  2.43it/s]


              precision    recall  f1-score   support

           0     0.6667    0.6154    0.6400        13
           1     0.7368    0.7778    0.7568        18

    accuracy                         0.7097        31
   macro avg     0.7018    0.6966    0.6984        31
weighted avg     0.7074    0.7097    0.7078        31

Evaluating model 64k_lstm_clean_vanilla on clean


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.81s/it]


              precision    recall  f1-score   support

           0     0.8462    0.8462    0.8462        13
           1     0.8889    0.8889    0.8889        18

    accuracy                         0.8710        31
   macro avg     0.8675    0.8675    0.8675        31
weighted avg     0.8710    0.8710    0.8710        31

Evaluating model 64k_lstm_clean_vanilla on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.81s/it]


              precision    recall  f1-score   support

           0     0.6429    0.6923    0.6667        13
           1     0.7647    0.7222    0.7429        18

    accuracy                         0.7097        31
   macro avg     0.7038    0.7073    0.7048        31
weighted avg     0.7136    0.7097    0.7109        31

Evaluating model 64k_lstm_clean_vanilla on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.80s/it]


              precision    recall  f1-score   support

           0     0.6000    0.6923    0.6429        13
           1     0.7500    0.6667    0.7059        18

    accuracy                         0.6774        31
   macro avg     0.6750    0.6795    0.6744        31
weighted avg     0.6871    0.6774    0.6795        31

Evaluating model 64k_lstm_clean_vanilla on word_score_no_ws


31it [02:35,  5.02s/it]


              precision    recall  f1-score   support

           0     0.4667    0.5385    0.5000        13
           1     0.6250    0.5556    0.5882        18

    accuracy                         0.5484        31
   macro avg     0.5458    0.5470    0.5441        31
weighted avg     0.5586    0.5484    0.5512        31

Evaluating model 64k_lstm_clean_vanilla on word_score_incl_ws


31it [02:40,  5.18s/it]


              precision    recall  f1-score   support

           0     0.4667    0.5385    0.5000        13
           1     0.6250    0.5556    0.5882        18

    accuracy                         0.5484        31
   macro avg     0.5458    0.5470    0.5441        31
weighted avg     0.5586    0.5484    0.5512        31

Evaluating model 64k_lstm_no_whitespace_pert_vanilla on clean


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.83s/it]


              precision    recall  f1-score   support

           0     0.9091    0.7692    0.8333        13
           1     0.8500    0.9444    0.8947        18

    accuracy                         0.8710        31
   macro avg     0.8795    0.8568    0.8640        31
weighted avg     0.8748    0.8710    0.8690        31

Evaluating model 64k_lstm_no_whitespace_pert_vanilla on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.85s/it]


              precision    recall  f1-score   support

           0     0.8182    0.6923    0.7500        13
           1     0.8000    0.8889    0.8421        18

    accuracy                         0.8065        31
   macro avg     0.8091    0.7906    0.7961        31
weighted avg     0.8076    0.8065    0.8035        31

Evaluating model 64k_lstm_no_whitespace_pert_vanilla on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.84s/it]


              precision    recall  f1-score   support

           0     0.6667    0.6154    0.6400        13
           1     0.7368    0.7778    0.7568        18

    accuracy                         0.7097        31
   macro avg     0.7018    0.6966    0.6984        31
weighted avg     0.7074    0.7097    0.7078        31

Evaluating model 64k_lstm_no_whitespace_pert_vanilla on word_score_no_ws


31it [02:59,  5.78s/it]


              precision    recall  f1-score   support

           0     0.8000    0.6154    0.6957        13
           1     0.7619    0.8889    0.8205        18

    accuracy                         0.7742        31
   macro avg     0.7810    0.7521    0.7581        31
weighted avg     0.7779    0.7742    0.7682        31

Evaluating model 64k_lstm_no_whitespace_pert_vanilla on word_score_incl_ws


31it [03:03,  5.93s/it]


              precision    recall  f1-score   support

           0     0.7500    0.6923    0.7200        13
           1     0.7895    0.8333    0.8108        18

    accuracy                         0.7742        31
   macro avg     0.7697    0.7628    0.7654        31
weighted avg     0.7729    0.7742    0.7727        31

Evaluating model 64k_lstm_all_pert_vanilla on clean


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.82s/it]


              precision    recall  f1-score   support

           0     0.8333    0.7692    0.8000        13
           1     0.8421    0.8889    0.8649        18

    accuracy                         0.8387        31
   macro avg     0.8377    0.8291    0.8324        31
weighted avg     0.8384    0.8387    0.8377        31

Evaluating model 64k_lstm_all_pert_vanilla on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.84s/it]


              precision    recall  f1-score   support

           0     0.8182    0.6923    0.7500        13
           1     0.8000    0.8889    0.8421        18

    accuracy                         0.8065        31
   macro avg     0.8091    0.7906    0.7961        31
weighted avg     0.8076    0.8065    0.8035        31

Evaluating model 64k_lstm_all_pert_vanilla on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.84s/it]


              precision    recall  f1-score   support

           0     0.8182    0.6923    0.7500        13
           1     0.8000    0.8889    0.8421        18

    accuracy                         0.8065        31
   macro avg     0.8091    0.7906    0.7961        31
weighted avg     0.8076    0.8065    0.8035        31

Evaluating model 64k_lstm_all_pert_vanilla on word_score_no_ws


31it [02:59,  5.78s/it]


              precision    recall  f1-score   support

           0     0.8182    0.6923    0.7500        13
           1     0.8000    0.8889    0.8421        18

    accuracy                         0.8065        31
   macro avg     0.8091    0.7906    0.7961        31
weighted avg     0.8076    0.8065    0.8035        31

Evaluating model 64k_lstm_all_pert_vanilla on word_score_incl_ws


31it [02:53,  5.61s/it]


              precision    recall  f1-score   support

           0     0.7500    0.6923    0.7200        13
           1     0.7895    0.8333    0.8108        18

    accuracy                         0.7742        31
   macro avg     0.7697    0.7628    0.7654        31
weighted avg     0.7729    0.7742    0.7727        31

Evaluating model 64k_lstm_clean_finetuned on clean


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.83s/it]


              precision    recall  f1-score   support

           0     0.9167    0.8462    0.8800        13
           1     0.8947    0.9444    0.9189        18

    accuracy                         0.9032        31
   macro avg     0.9057    0.8953    0.8995        31
weighted avg     0.9039    0.9032    0.9026        31

Evaluating model 64k_lstm_clean_finetuned on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.85s/it]


              precision    recall  f1-score   support

           0     0.7273    0.6154    0.6667        13
           1     0.7500    0.8333    0.7895        18

    accuracy                         0.7419        31
   macro avg     0.7386    0.7244    0.7281        31
weighted avg     0.7405    0.7419    0.7380        31

Evaluating model 64k_lstm_clean_finetuned on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.84s/it]


              precision    recall  f1-score   support

           0     0.6923    0.6923    0.6923        13
           1     0.7778    0.7778    0.7778        18

    accuracy                         0.7419        31
   macro avg     0.7350    0.7350    0.7350        31
weighted avg     0.7419    0.7419    0.7419        31

Evaluating model 64k_lstm_clean_finetuned on word_score_no_ws


31it [02:55,  5.67s/it]


              precision    recall  f1-score   support

           0     0.6923    0.6923    0.6923        13
           1     0.7778    0.7778    0.7778        18

    accuracy                         0.7419        31
   macro avg     0.7350    0.7350    0.7350        31
weighted avg     0.7419    0.7419    0.7419        31

Evaluating model 64k_lstm_clean_finetuned on word_score_incl_ws


31it [02:51,  5.52s/it]


              precision    recall  f1-score   support

           0     0.6154    0.6154    0.6154        13
           1     0.7222    0.7222    0.7222        18

    accuracy                         0.6774        31
   macro avg     0.6688    0.6688    0.6688        31
weighted avg     0.6774    0.6774    0.6774        31

Evaluating model 64k_lstm_no_whitespace_pert_finetuned on clean


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.79s/it]


              precision    recall  f1-score   support

           0     0.8571    0.9231    0.8889        13
           1     0.9412    0.8889    0.9143        18

    accuracy                         0.9032        31
   macro avg     0.8992    0.9060    0.9016        31
weighted avg     0.9059    0.9032    0.9036        31

Evaluating model 64k_lstm_no_whitespace_pert_finetuned on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.83s/it]


              precision    recall  f1-score   support

           0     0.8333    0.7692    0.8000        13
           1     0.8421    0.8889    0.8649        18

    accuracy                         0.8387        31
   macro avg     0.8377    0.8291    0.8324        31
weighted avg     0.8384    0.8387    0.8377        31

Evaluating model 64k_lstm_no_whitespace_pert_finetuned on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.83s/it]


              precision    recall  f1-score   support

           0     0.6667    0.6154    0.6400        13
           1     0.7368    0.7778    0.7568        18

    accuracy                         0.7097        31
   macro avg     0.7018    0.6966    0.6984        31
weighted avg     0.7074    0.7097    0.7078        31

Evaluating model 64k_lstm_no_whitespace_pert_finetuned on word_score_no_ws


31it [03:03,  5.93s/it]


              precision    recall  f1-score   support

           0     0.8000    0.6154    0.6957        13
           1     0.7619    0.8889    0.8205        18

    accuracy                         0.7742        31
   macro avg     0.7810    0.7521    0.7581        31
weighted avg     0.7779    0.7742    0.7682        31

Evaluating model 64k_lstm_no_whitespace_pert_finetuned on word_score_incl_ws


31it [03:00,  5.81s/it]


              precision    recall  f1-score   support

           0     0.7500    0.6923    0.7200        13
           1     0.7895    0.8333    0.8108        18

    accuracy                         0.7742        31
   macro avg     0.7697    0.7628    0.7654        31
weighted avg     0.7729    0.7742    0.7727        31

Evaluating model 64k_lstm_all_pert_finetuned on clean


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.84s/it]


              precision    recall  f1-score   support

           0     0.8333    0.7692    0.8000        13
           1     0.8421    0.8889    0.8649        18

    accuracy                         0.8387        31
   macro avg     0.8377    0.8291    0.8324        31
weighted avg     0.8384    0.8387    0.8377        31

Evaluating model 64k_lstm_all_pert_finetuned on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.85s/it]


              precision    recall  f1-score   support

           0     0.8182    0.6923    0.7500        13
           1     0.8000    0.8889    0.8421        18

    accuracy                         0.8065        31
   macro avg     0.8091    0.7906    0.7961        31
weighted avg     0.8076    0.8065    0.8035        31

Evaluating model 64k_lstm_all_pert_finetuned on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.83s/it]


              precision    recall  f1-score   support

           0     0.7500    0.6923    0.7200        13
           1     0.7895    0.8333    0.8108        18

    accuracy                         0.7742        31
   macro avg     0.7697    0.7628    0.7654        31
weighted avg     0.7729    0.7742    0.7727        31

Evaluating model 64k_lstm_all_pert_finetuned on word_score_no_ws


31it [03:01,  5.85s/it]


              precision    recall  f1-score   support

           0     0.8000    0.6154    0.6957        13
           1     0.7619    0.8889    0.8205        18

    accuracy                         0.7742        31
   macro avg     0.7810    0.7521    0.7581        31
weighted avg     0.7779    0.7742    0.7682        31

Evaluating model 64k_lstm_all_pert_finetuned on word_score_incl_ws


31it [03:03,  5.92s/it]


              precision    recall  f1-score   support

           0     0.7500    0.6923    0.7200        13
           1     0.7895    0.8333    0.8108        18

    accuracy                         0.7742        31
   macro avg     0.7697    0.7628    0.7654        31
weighted avg     0.7729    0.7742    0.7727        31

Evaluating model 64k_cnn_no_whitespace_pert_finetuned on clean


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.79s/it]


              precision    recall  f1-score   support

           0     1.0000    0.0769    0.1429        13
           1     0.6000    1.0000    0.7500        18

    accuracy                         0.6129        31
   macro avg     0.8000    0.5385    0.4464        31
weighted avg     0.7677    0.6129    0.4954        31

Evaluating model 64k_cnn_no_whitespace_pert_finetuned on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:17<00:00,  1.72s/it]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000        13
           1     0.5806    1.0000    0.7347        18

    accuracy                         0.5806        31
   macro avg     0.2903    0.5000    0.3673        31
weighted avg     0.3371    0.5806    0.4266        31

Evaluating model 64k_cnn_no_whitespace_pert_finetuned on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:17<00:00,  1.72s/it]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000        13
           1     0.5806    1.0000    0.7347        18

    accuracy                         0.5806        31
   macro avg     0.2903    0.5000    0.3673        31
weighted avg     0.3371    0.5806    0.4266        31

Evaluating model 64k_cnn_no_whitespace_pert_finetuned on word_score_no_ws


31it [00:34,  1.12s/it]


              precision    recall  f1-score   support

           0     1.0000    0.0769    0.1429        13
           1     0.6000    1.0000    0.7500        18

    accuracy                         0.6129        31
   macro avg     0.8000    0.5385    0.4464        31
weighted avg     0.7677    0.6129    0.4954        31

Evaluating model 64k_cnn_no_whitespace_pert_finetuned on word_score_incl_ws


31it [00:35,  1.15s/it]

              precision    recall  f1-score   support

           0     1.0000    0.0769    0.1429        13
           1     0.6000    1.0000    0.7500        18

    accuracy                         0.6129        31
   macro avg     0.8000    0.5385    0.4464        31
weighted avg     0.7677    0.6129    0.4954        31






In [100]:
# Display the accuracy grid: one row per evaluated model, one column per
# evaluation setting (clean, stochastic_* and word_score_* attacks).
accuracy_df

Unnamed: 0,clean,stochastic_no_ws,stochastic_incl_ws,word_score_no_ws,word_score_incl_ws
baseline,0.967742,0.774194,0.83871,0.741935,0.677419
roben_1,0.806452,0.741935,0.548387,0.677419,0.645161
roben_2,0.83871,0.806452,0.612903,0.709677,0.709677
64k_lstm_clean_vanilla,0.870968,0.709677,0.677419,0.548387,0.548387
64k_lstm_no_whitespace_pert_vanilla,0.870968,0.806452,0.709677,0.774194,0.774194
64k_lstm_all_pert_vanilla,0.83871,0.806452,0.806452,0.806452,0.774194
64k_lstm_clean_finetuned,0.903226,0.741935,0.741935,0.741935,0.677419
64k_lstm_no_whitespace_pert_finetuned,0.903226,0.83871,0.709677,0.774194,0.774194
64k_lstm_all_pert_finetuned,0.83871,0.806452,0.774194,0.774194,0.774194
64k_cnn_no_whitespace_pert_finetuned,0.612903,0.580645,0.580645,0.612903,0.612903


In [101]:
# Display the F1 grid with the same layout as the accuracy grid: one row per
# evaluated model, one column per evaluation setting. The values line up with
# the "macro avg" f1-score rows of the classification reports printed above.
f1_df

Unnamed: 0,clean,stochastic_no_ws,stochastic_incl_ws,word_score_no_ws,word_score_incl_ws
baseline,0.966486,0.758082,0.827202,0.735043,0.668803
roben_1,0.796053,0.718182,0.506818,0.647727,0.603949
roben_2,0.832432,0.796053,0.592105,0.688963,0.698378
64k_lstm_clean_vanilla,0.867521,0.704762,0.67437,0.544118,0.544118
64k_lstm_no_whitespace_pert_vanilla,0.864035,0.796053,0.698378,0.758082,0.765405
64k_lstm_all_pert_vanilla,0.832432,0.796053,0.796053,0.796053,0.765405
64k_lstm_clean_finetuned,0.899459,0.72807,0.735043,0.735043,0.668803
64k_lstm_no_whitespace_pert_finetuned,0.901587,0.832432,0.698378,0.758082,0.765405
64k_lstm_all_pert_finetuned,0.832432,0.796053,0.765405,0.758082,0.765405
64k_cnn_no_whitespace_pert_finetuned,0.446429,0.367347,0.367347,0.446429,0.446429


In [102]:
# Persist the accuracy grid as CSV.
# Create the target directory first: DataFrame.to_csv does not create missing
# parent directories, and failing here would throw away a long evaluation run.
import os

os.makedirs("../output", exist_ok=True)
accuracy_df.to_csv("../output/grid_accuracy.csv")

In [103]:
# Persist the F1 grid as CSV.
# Create the target directory first: DataFrame.to_csv does not create missing
# parent directories, and failing here would throw away a long evaluation run.
import os

os.makedirs("../output", exist_ok=True)
f1_df.to_csv("../output/grid_f1.csv")