In [1]:
import sys
sys.path.append("..")

import copy
import cProfile
from datasets import load_dataset
import math
import nltk
import pandas as pd
import random
from sklearn.metrics import classification_report, accuracy_score, f1_score
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, DataCollatorWithPadding, \
                         AutoModelForSequenceClassification, BertForSequenceClassification

from resilient_nlp.mini_roben import Clustering, ClusterRepRecoverer, ClusterRecovererWithPassthrough
from resilient_nlp.models import BertClassifier
from resilient_nlp.perturbers import ToyPerturber, WordScramblerPerturber
from runner import ExperimentRunner
from word_score_attack import BertWordScoreAttack

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# imdb = load_dataset('imdb')
imdb = load_dataset('artemis13fowl/imdb')

Using custom data configuration artemis13fowl--imdb-f63738dec0d5e230
Reusing dataset parquet (/home/scpdxcs/.cache/huggingface/datasets/parquet/artemis13fowl--imdb-f63738dec0d5e230/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
random.seed(11)
sampled_test_set = imdb['attack_eval_truncated']

# This is silly but apparently huggingface datasets are immutable?
# Representing it as something a bit more sane
sampled_test_set_dict = [
    {
        'text': row['text'],
        'label': row['label'],
    }
    for row in sampled_test_set
]

In [4]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
checkpoint_finetuned = "artemis13fowl/bert-base-uncased-imdb"
model_finetuned = BertForSequenceClassification.from_pretrained(checkpoint_finetuned).to(device)

In [6]:
roben_clustering = Clustering.from_pickle("../vocab100000_ed1.pkl")
roben_recoverer = ClusterRecovererWithPassthrough("cache", roben_clustering)
roben_clustering2 = Clustering.from_pickle("../vocab100000_ed1_gamma0.3.pkl")
roben_recoverer2 = ClusterRecovererWithPassthrough("cache", roben_clustering2)

In [7]:
random.seed(11)
sampled_test_set_adv_no_ws = []
wsp = WordScramblerPerturber(perturb_prob=0.1, weight_add=1, weight_drop=1, weight_swap=1,
                             weight_split_word=0, weight_merge_words=0)

for i in range(10):
    test_item = copy.deepcopy(sampled_test_set_dict)

    for row in test_item:
        row['text'] = wsp.perturb([row['text']])[0][0]
    sampled_test_set_adv_no_ws.append(test_item)

In [8]:
random.seed(11)
sampled_test_set_adv_incl_ws = []
wsp = WordScramblerPerturber(perturb_prob=0.1, weight_add=1, weight_drop=1, weight_swap=1,
                             weight_split_word=1, weight_merge_words=1)

for i in range(10):
    test_item = copy.deepcopy(sampled_test_set_dict)

    for row in test_item:
        row['text'] = wsp.perturb([row['text']])[0][0]
    sampled_test_set_adv_incl_ws.append(test_item)

In [9]:
max_sequence_length = 128
batch_size = 32
eval_steps = 100

In [10]:
def standard_model_predict(tokenizer, model, sentences, recoverer, return_pred_tensor, recoverer_tokenize):
    if recoverer is not None:
        if recoverer_tokenize:
            tok = nltk.tokenize.treebank.TreebankWordTokenizer()
            sentences = [ " ".join(tok_list) for tok_list in tok.tokenize_sents(sentences) ]
        sentences = [ recoverer.recover(s.lower()) for s in sentences ]
        if recoverer_tokenize:
            detok = nltk.tokenize.treebank.TreebankWordDetokenizer()
            sentences = [ detok.detokenize(s.split(" ")) for s in sentences]
    tokenized = tokenizer(sentences, truncation=True, padding='max_length', max_length=max_sequence_length,
                          return_tensors='pt')
    tokenized = { k: v.to(device) for k, v in tokenized.items() }
    preds = model(**tokenized)
    if return_pred_tensor:
        return preds
    else:
        return torch.argmax(preds.logits, dim=1)

def wrap_standard_model(tokenizer, model, recoverer=None, return_pred_tensor=True, recoverer_tokenize=False):
    return lambda sentences: standard_model_predict(tokenizer, model, sentences, recoverer, return_pred_tensor,
                                                    recoverer_tokenize)

In [11]:
def mltokenizer_model_predict(runner, model, cls_embedding, sep_embedding, pad_embedding, sentences, return_pred_tensor):
    # Truncate and lower case. Truncation is for performance only
    sentences = [ s.lower()[:5*max_sequence_length] for s in sentences]
    embedding = runner.embed(sentences=sentences,
        start_token=cls_embedding, end_token=sep_embedding, pad_token=pad_embedding,
        max_tokens=max_sequence_length)
    preds = model(inputs_embeds=embedding['inputs_embeds'], attention_mask=embedding['attention_mask'])
    if return_pred_tensor:
        return preds
    else:
        return torch.argmax(preds.logits, dim=1)

def wrap_mltokenizer_model(mltokenizer_prefix, tokenizer, model, return_pred_tensor=True):
    filename = "../{}.pth".format(mltokenizer_prefix)
    runner = ExperimentRunner(device, model_filename=filename)
    cf_embedding = model.base_model.embeddings.word_embeddings
    cls_token_id = tokenizer.vocab['[CLS]']
    sep_token_id = tokenizer.vocab['[SEP]']
    pad_token_id = tokenizer.vocab['[PAD]']
    cls_embedding = cf_embedding(torch.tensor([cls_token_id], device=device)).view(-1)
    sep_embedding = cf_embedding(torch.tensor([sep_token_id], device=device)).view(-1)
    pad_embedding = cf_embedding(torch.tensor([pad_token_id], device=device)).view(-1)
    
    return lambda sentences: mltokenizer_model_predict(runner, model, cls_embedding, sep_embedding,
                                                      pad_embedding, sentences, return_pred_tensor)

In [12]:
@torch.no_grad()
def evaluate_model(model, test_set):
    num_batches = math.ceil(len(test_set) / batch_size)
    
    sentences = [ x['text'] for x in test_set ]
    labels = [ x['label'] for x in test_set ]
    pred_batches = []
    
    for i in tqdm(range(num_batches)):
        bs = i * batch_size
        be = bs + batch_size
        
        output = model(sentences[bs:be])
        
        pred_batches.append(torch.argmax(output.logits, dim=1).detach().cpu())
    preds = torch.cat(pred_batches)
    
    print(classification_report(labels, preds, digits=4))
    
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='macro')
    
    return accuracy, f1

In [13]:
@torch.no_grad()
def evaluate_model_adv(model, test_sets):
    labels = [ x['label'] for x in test_sets[0] ]
    adv_preds = copy.copy(labels)
    accuracy_list = []
    f1_list = []
    
    for idx in tqdm(range(len(test_sets))):
        test_set = test_sets[idx]
        num_batches = math.ceil(len(test_set) / batch_size)
    
        sentences = [ x['text'] for x in test_set ]
        pred_batches = []
    
        for i in range(num_batches):
            bs = i * batch_size
            be = bs + batch_size
        
            output = model(sentences[bs:be])
        
            pred_batches.append(torch.argmax(output.logits, dim=1).detach().cpu())
        preds = torch.cat(pred_batches)
        
        for i in range(len(adv_preds)):
            if labels[i] == 1.0 and preds[i] == 0.0:
                adv_preds[i] = 0.0
            elif labels[i] == 0.0 and preds[i] == 1.0:
                adv_preds[i] = 1.0

        accuracy_list.append(accuracy_score(labels, adv_preds))
        f1_list.append(f1_score(labels, adv_preds, average='macro'))
    
    print(classification_report(labels, adv_preds, digits=4))    
    
    return accuracy_list, f1_list

In [14]:
@torch.no_grad()
def evaluate_model_word_score(model, test_set, allow_whitespace_pert=True):
    attacker = BertWordScoreAttack(
        WordScramblerPerturber(perturb_prob=1, weight_add=1, weight_drop=1, weight_swap=1,
                               weight_split_word=int(allow_whitespace_pert),
                               weight_merge_words=int(allow_whitespace_pert)),
        "../output/imdb_word_scores.json", model, tokenizer=None, max_sequence_length=max_sequence_length
    )

    res = attacker.attack(test_set, max_tokens_to_query=10, max_tries_per_token=2, mode=0, print_summary=False)
    
    print(classification_report(res['ground_truth'], res['perturbed_preds'], digits=4))    
    
    accuracy = accuracy_score(res['ground_truth'], res['perturbed_preds'])
    f1 = f1_score(res['ground_truth'], res['perturbed_preds'], average='macro')
    
    return accuracy, f1

In [15]:
baseline_model = wrap_standard_model(tokenizer, model_finetuned)

In [16]:
mltok_model = wrap_mltokenizer_model('output/64k_lstm_all_pert_finetuned', tokenizer, model_finetuned)

In [17]:
baseline_roben_model = wrap_standard_model(tokenizer, model_finetuned, roben_recoverer)

In [18]:
baseline_roben_model_tok = wrap_standard_model(tokenizer, model_finetuned, roben_recoverer, recoverer_tokenize=True)

In [19]:
baseline_roben_model2 = wrap_standard_model(tokenizer, model_finetuned, roben_recoverer2)

In [20]:
baseline_roben_model2_tok = wrap_standard_model(tokenizer, model_finetuned, roben_recoverer2, recoverer_tokenize=True)

In [21]:
all_models = {
    'baseline': baseline_model,
    'roben_1': baseline_roben_model,
    'roben_2': baseline_roben_model2,
    'roben_1_tok': baseline_roben_model,
    'roben_2_tok': baseline_roben_model2_tok,
}

mltok_model_names = [
    '64k_lstm_clean_vanilla',
    '64k_lstm_no_whitespace_pert_vanilla',
    '64k_lstm_all_pert_vanilla',
    '64k_lstm_clean_finetuned',
    '64k_lstm_no_whitespace_pert_finetuned',
    '64k_lstm_all_pert_finetuned',
    '64k_cnn_no_whitespace_pert_finetuned',
]

for name in mltok_model_names:
    all_models[name] = wrap_mltokenizer_model(f'output/{name}', tokenizer, model_finetuned)

In [22]:
evaluations = [
    'clean',
    'stochastic_no_ws',
    'stochastic_incl_ws',
    'word_score_no_ws',
    'word_score_incl_ws',
]

In [23]:
accuracy_df = pd.DataFrame(columns=evaluations, index=all_models.keys())
f1_df = pd.DataFrame(columns=evaluations, index=all_models.keys())

for cur_model_name, cur_model in all_models.items():
    for cur_evaluation in evaluations:
        print(f'Evaluating model {cur_model_name} on {cur_evaluation}')
        random.seed(11)
        if cur_evaluation == 'clean':
            acc, f1 = evaluate_model(cur_model, sampled_test_set)
        elif cur_evaluation.startswith('stochastic_'):
            if cur_evaluation == 'stochastic_no_ws':
                acc_list, f1_list = evaluate_model_adv(cur_model, sampled_test_set_adv_no_ws)
            elif cur_evaluation == 'stochastic_incl_ws':
                acc_list, f1_list = evaluate_model_adv(cur_model, sampled_test_set_adv_incl_ws)
            acc = acc_list[-1]
            f1 = f1_list[-1]
        elif cur_evaluation.startswith('word_score_'):
            if cur_evaluation == 'word_score_no_ws':
                acc, f1 = evaluate_model_word_score(cur_model, sampled_test_set, allow_whitespace_pert=False)
            elif cur_evaluation == 'word_score_incl_ws':
                acc, f1 = evaluate_model_word_score(cur_model, sampled_test_set, allow_whitespace_pert=True)

        accuracy_df[cur_evaluation][cur_model_name] = acc
        f1_df[cur_evaluation][cur_model_name] = f1

Evaluating model baseline on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:07<00:00,  2.04it/s]


              precision    recall  f1-score   support

           0     0.8765    0.8765    0.8765       243
           1     0.8833    0.8833    0.8833       257

    accuracy                         0.8800       500
   macro avg     0.8799    0.8799    0.8799       500
weighted avg     0.8800    0.8800    0.8800       500

Evaluating model baseline on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:16<00:00,  7.67s/it]


              precision    recall  f1-score   support

           0     0.6970    0.7572    0.7258       243
           1     0.7500    0.6887    0.7181       257

    accuracy                         0.7220       500
   macro avg     0.7235    0.7230    0.7219       500
weighted avg     0.7242    0.7220    0.7218       500

Evaluating model baseline on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:16<00:00,  7.68s/it]


              precision    recall  f1-score   support

           0     0.6692    0.7160    0.6918       243
           1     0.7125    0.6654    0.6881       257

    accuracy                         0.6900       500
   macro avg     0.6909    0.6907    0.6900       500
weighted avg     0.6915    0.6900    0.6899       500

Evaluating model baseline on word_score_no_ws


500it [03:39,  2.27it/s]


              precision    recall  f1-score   support

           0     0.6182    0.6996    0.6564       243
           1     0.6756    0.5914    0.6307       257

    accuracy                         0.6440       500
   macro avg     0.6469    0.6455    0.6435       500
weighted avg     0.6477    0.6440    0.6432       500

Evaluating model baseline on word_score_incl_ws


500it [03:46,  2.21it/s]


              precision    recall  f1-score   support

           0     0.6322    0.6790    0.6548       243
           1     0.6736    0.6265    0.6492       257

    accuracy                         0.6520       500
   macro avg     0.6529    0.6527    0.6520       500
weighted avg     0.6535    0.6520    0.6519       500

Evaluating model roben_1 on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:08<00:00,  1.95it/s]


              precision    recall  f1-score   support

           0     0.8365    0.7160    0.7716       243
           1     0.7637    0.8677    0.8124       257

    accuracy                         0.7940       500
   macro avg     0.8001    0.7919    0.7920       500
weighted avg     0.7991    0.7940    0.7926       500

Evaluating model roben_1 on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:22<00:00,  8.21s/it]


              precision    recall  f1-score   support

           0     0.7048    0.6091    0.6534       243
           1     0.6724    0.7588    0.7130       257

    accuracy                         0.6860       500
   macro avg     0.6886    0.6839    0.6832       500
weighted avg     0.6881    0.6860    0.6840       500

Evaluating model roben_1 on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:21<00:00,  8.19s/it]


              precision    recall  f1-score   support

           0     0.5674    0.5021    0.5328       243
           1     0.5754    0.6381    0.6052       257

    accuracy                         0.5720       500
   macro avg     0.5714    0.5701    0.5690       500
weighted avg     0.5716    0.5720    0.5700       500

Evaluating model roben_1 on word_score_no_ws


500it [03:39,  2.28it/s]


              precision    recall  f1-score   support

           0     0.7196    0.5597    0.6296       243
           1     0.6559    0.7938    0.7183       257

    accuracy                         0.6800       500
   macro avg     0.6878    0.6767    0.6740       500
weighted avg     0.6869    0.6800    0.6752       500

Evaluating model roben_1 on word_score_incl_ws


500it [03:31,  2.37it/s]


              precision    recall  f1-score   support

           0     0.6413    0.4856    0.5527       243
           1     0.6044    0.7432    0.6667       257

    accuracy                         0.6180       500
   macro avg     0.6229    0.6144    0.6097       500
weighted avg     0.6224    0.6180    0.6113       500

Evaluating model roben_2 on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:08<00:00,  1.98it/s]


              precision    recall  f1-score   support

           0     0.8423    0.7695    0.8043       243
           1     0.7986    0.8638    0.8299       257

    accuracy                         0.8180       500
   macro avg     0.8205    0.8167    0.8171       500
weighted avg     0.8198    0.8180    0.8175       500

Evaluating model roben_2 on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:20<00:00,  8.09s/it]


              precision    recall  f1-score   support

           0     0.7168    0.6667    0.6908       243
           1     0.7044    0.7510    0.7269       257

    accuracy                         0.7100       500
   macro avg     0.7106    0.7088    0.7089       500
weighted avg     0.7104    0.7100    0.7094       500

Evaluating model roben_2 on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:21<00:00,  8.10s/it]


              precision    recall  f1-score   support

           0     0.5778    0.5350    0.5556       243
           1     0.5891    0.6304    0.6090       257

    accuracy                         0.5840       500
   macro avg     0.5834    0.5827    0.5823       500
weighted avg     0.5836    0.5840    0.5830       500

Evaluating model roben_2 on word_score_no_ws


500it [03:50,  2.17it/s]


              precision    recall  f1-score   support

           0     0.7536    0.6420    0.6933       243
           1     0.7031    0.8016    0.7491       257

    accuracy                         0.7240       500
   macro avg     0.7283    0.7218    0.7212       500
weighted avg     0.7276    0.7240    0.7220       500

Evaluating model roben_2 on word_score_incl_ws


500it [03:40,  2.27it/s]


              precision    recall  f1-score   support

           0     0.6774    0.6049    0.6391       243
           1     0.6608    0.7276    0.6926       257

    accuracy                         0.6680       500
   macro avg     0.6691    0.6663    0.6659       500
weighted avg     0.6689    0.6680    0.6666       500

Evaluating model roben_1_tok on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:07<00:00,  2.00it/s]


              precision    recall  f1-score   support

           0     0.8365    0.7160    0.7716       243
           1     0.7637    0.8677    0.8124       257

    accuracy                         0.7940       500
   macro avg     0.8001    0.7919    0.7920       500
weighted avg     0.7991    0.7940    0.7926       500

Evaluating model roben_1_tok on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:19<00:00,  7.99s/it]


              precision    recall  f1-score   support

           0     0.7048    0.6091    0.6534       243
           1     0.6724    0.7588    0.7130       257

    accuracy                         0.6860       500
   macro avg     0.6886    0.6839    0.6832       500
weighted avg     0.6881    0.6860    0.6840       500

Evaluating model roben_1_tok on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:20<00:00,  8.00s/it]


              precision    recall  f1-score   support

           0     0.5674    0.5021    0.5328       243
           1     0.5754    0.6381    0.6052       257

    accuracy                         0.5720       500
   macro avg     0.5714    0.5701    0.5690       500
weighted avg     0.5716    0.5720    0.5700       500

Evaluating model roben_1_tok on word_score_no_ws


500it [03:39,  2.28it/s]


              precision    recall  f1-score   support

           0     0.7196    0.5597    0.6296       243
           1     0.6559    0.7938    0.7183       257

    accuracy                         0.6800       500
   macro avg     0.6878    0.6767    0.6740       500
weighted avg     0.6869    0.6800    0.6752       500

Evaluating model roben_1_tok on word_score_incl_ws


500it [03:29,  2.39it/s]


              precision    recall  f1-score   support

           0     0.6413    0.4856    0.5527       243
           1     0.6044    0.7432    0.6667       257

    accuracy                         0.6180       500
   macro avg     0.6229    0.6144    0.6097       500
weighted avg     0.6224    0.6180    0.6113       500

Evaluating model roben_2_tok on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:08<00:00,  1.88it/s]


              precision    recall  f1-score   support

           0     0.8238    0.7119    0.7638       243
           1     0.7586    0.8560    0.8044       257

    accuracy                         0.7860       500
   macro avg     0.7912    0.7840    0.7841       500
weighted avg     0.7903    0.7860    0.7847       500

Evaluating model roben_2_tok on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:24<00:00,  8.50s/it]


              precision    recall  f1-score   support

           0     0.7053    0.6008    0.6489       243
           1     0.6689    0.7626    0.7127       257

    accuracy                         0.6840       500
   macro avg     0.6871    0.6817    0.6808       500
weighted avg     0.6866    0.6840    0.6817       500

Evaluating model roben_2_tok on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:24<00:00,  8.47s/it]


              precision    recall  f1-score   support

           0     0.5880    0.5226    0.5534       243
           1     0.5915    0.6537    0.6211       257

    accuracy                         0.5900       500
   macro avg     0.5898    0.5882    0.5872       500
weighted avg     0.5898    0.5900    0.5882       500

Evaluating model roben_2_tok on word_score_no_ws


500it [03:52,  2.15it/s]


              precision    recall  f1-score   support

           0     0.7778    0.6337    0.6984       243
           1     0.7053    0.8288    0.7621       257

    accuracy                         0.7340       500
   macro avg     0.7415    0.7313    0.7302       500
weighted avg     0.7405    0.7340    0.7311       500

Evaluating model roben_2_tok on word_score_incl_ws


500it [03:40,  2.27it/s]


              precision    recall  f1-score   support

           0     0.6753    0.5391    0.5995       243
           1     0.6340    0.7549    0.6892       257

    accuracy                         0.6500       500
   macro avg     0.6546    0.6470    0.6444       500
weighted avg     0.6540    0.6500    0.6456       500

Evaluating model 64k_lstm_clean_vanilla on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:49<00:00,  3.11s/it]


              precision    recall  f1-score   support

           0     0.8571    0.8642    0.8607       243
           1     0.8706    0.8638    0.8672       257

    accuracy                         0.8640       500
   macro avg     0.8639    0.8640    0.8639       500
weighted avg     0.8641    0.8640    0.8640       500

Evaluating model 64k_lstm_clean_vanilla on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [08:10<00:00, 49.02s/it]


              precision    recall  f1-score   support

           0     0.6858    0.7366    0.7103       243
           1     0.7322    0.6809    0.7056       257

    accuracy                         0.7080       500
   macro avg     0.7090    0.7088    0.7080       500
weighted avg     0.7097    0.7080    0.7079       500

Evaluating model 64k_lstm_clean_vanilla on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [08:09<00:00, 48.92s/it]


              precision    recall  f1-score   support

           0     0.6370    0.7078    0.6706       243
           1     0.6913    0.6187    0.6530       257

    accuracy                         0.6620       500
   macro avg     0.6642    0.6632    0.6618       500
weighted avg     0.6649    0.6620    0.6615       500

Evaluating model 64k_lstm_clean_vanilla on word_score_no_ws


500it [45:36,  5.47s/it]


              precision    recall  f1-score   support

           0     0.6640    0.6914    0.6774       243
           1     0.6964    0.6693    0.6825       257

    accuracy                         0.6800       500
   macro avg     0.6802    0.6803    0.6800       500
weighted avg     0.6806    0.6800    0.6801       500

Evaluating model 64k_lstm_clean_vanilla on word_score_incl_ws


500it [44:14,  5.31s/it]


              precision    recall  f1-score   support

           0     0.6070    0.6420    0.6240       243
           1     0.6420    0.6070    0.6240       257

    accuracy                         0.6240       500
   macro avg     0.6245    0.6245    0.6240       500
weighted avg     0.6250    0.6240    0.6240       500

Evaluating model 64k_lstm_no_whitespace_pert_vanilla on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:49<00:00,  3.07s/it]


              precision    recall  f1-score   support

           0     0.8653    0.8724    0.8689       243
           1     0.8784    0.8716    0.8750       257

    accuracy                         0.8720       500
   macro avg     0.8719    0.8720    0.8719       500
weighted avg     0.8721    0.8720    0.8720       500

Evaluating model 64k_lstm_no_whitespace_pert_vanilla on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [08:13<00:00, 49.39s/it]


              precision    recall  f1-score   support

           0     0.8041    0.8107    0.8074       243
           1     0.8196    0.8132    0.8164       257

    accuracy                         0.8120       500
   macro avg     0.8118    0.8120    0.8119       500
weighted avg     0.8121    0.8120    0.8120       500

Evaluating model 64k_lstm_no_whitespace_pert_vanilla on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [08:14<00:00, 49.45s/it]


              precision    recall  f1-score   support

           0     0.6972    0.7202    0.7085       243
           1     0.7269    0.7043    0.7154       257

    accuracy                         0.7120       500
   macro avg     0.7121    0.7122    0.7120       500
weighted avg     0.7125    0.7120    0.7121       500

Evaluating model 64k_lstm_no_whitespace_pert_vanilla on word_score_no_ws


500it [48:28,  5.82s/it]


              precision    recall  f1-score   support

           0     0.7787    0.7531    0.7657       243
           1     0.7736    0.7977    0.7854       257

    accuracy                         0.7760       500
   macro avg     0.7762    0.7754    0.7756       500
weighted avg     0.7761    0.7760    0.7758       500

Evaluating model 64k_lstm_no_whitespace_pert_vanilla on word_score_incl_ws


500it [48:02,  5.77s/it]


              precision    recall  f1-score   support

           0     0.7460    0.7613    0.7536       243
           1     0.7698    0.7549    0.7623       257

    accuracy                         0.7580       500
   macro avg     0.7579    0.7581    0.7579       500
weighted avg     0.7582    0.7580    0.7580       500

Evaluating model 64k_lstm_all_pert_vanilla on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:49<00:00,  3.10s/it]


              precision    recall  f1-score   support

           0     0.8734    0.8519    0.8625       243
           1     0.8631    0.8833    0.8731       257

    accuracy                         0.8680       500
   macro avg     0.8683    0.8676    0.8678       500
weighted avg     0.8681    0.8680    0.8679       500

Evaluating model 64k_lstm_all_pert_vanilla on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [08:12<00:00, 49.25s/it]


              precision    recall  f1-score   support

           0     0.8162    0.7860    0.8008       243
           1     0.8045    0.8327    0.8184       257

    accuracy                         0.8100       500
   macro avg     0.8104    0.8093    0.8096       500
weighted avg     0.8102    0.8100    0.8098       500

Evaluating model 64k_lstm_all_pert_vanilla on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [08:13<00:00, 49.30s/it]


              precision    recall  f1-score   support

           0     0.8103    0.7737    0.7916       243
           1     0.7948    0.8288    0.8114       257

    accuracy                         0.8020       500
   macro avg     0.8026    0.8012    0.8015       500
weighted avg     0.8023    0.8020    0.8018       500

Evaluating model 64k_lstm_all_pert_vanilla on word_score_no_ws


500it [48:30,  5.82s/it]


              precision    recall  f1-score   support

           0     0.7888    0.7531    0.7705       243
           1     0.7761    0.8093    0.7924       257

    accuracy                         0.7820       500
   macro avg     0.7825    0.7812    0.7815       500
weighted avg     0.7823    0.7820    0.7818       500

Evaluating model 64k_lstm_all_pert_vanilla on word_score_incl_ws


500it [48:37,  5.84s/it]


              precision    recall  f1-score   support

           0     0.7759    0.7407    0.7579       243
           1     0.7649    0.7977    0.7810       257

    accuracy                         0.7700       500
   macro avg     0.7704    0.7692    0.7694       500
weighted avg     0.7702    0.7700    0.7697       500

Evaluating model 64k_lstm_clean_finetuned on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:49<00:00,  3.10s/it]


              precision    recall  f1-score   support

           0     0.8543    0.8683    0.8612       243
           1     0.8735    0.8599    0.8667       257

    accuracy                         0.8640       500
   macro avg     0.8639    0.8641    0.8639       500
weighted avg     0.8642    0.8640    0.8640       500

Evaluating model 64k_lstm_clean_finetuned on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [08:14<00:00, 49.43s/it]


              precision    recall  f1-score   support

           0     0.7092    0.7325    0.7206       243
           1     0.7390    0.7160    0.7273       257

    accuracy                         0.7240       500
   macro avg     0.7241    0.7242    0.7240       500
weighted avg     0.7245    0.7240    0.7241       500

Evaluating model 64k_lstm_clean_finetuned on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [08:11<00:00, 49.10s/it]


              precision    recall  f1-score   support

           0     0.6718    0.7243    0.6970       243
           1     0.7185    0.6654    0.6909       257

    accuracy                         0.6940       500
   macro avg     0.6951    0.6948    0.6940       500
weighted avg     0.6958    0.6940    0.6939       500

Evaluating model 64k_lstm_clean_finetuned on word_score_no_ws


500it [46:33,  5.59s/it]


              precision    recall  f1-score   support

           0     0.6829    0.6914    0.6871       243
           1     0.7047    0.6965    0.7006       257

    accuracy                         0.6940       500
   macro avg     0.6938    0.6939    0.6939       500
weighted avg     0.6941    0.6940    0.6940       500

Evaluating model 64k_lstm_clean_finetuned on word_score_incl_ws


500it [45:34,  5.47s/it]


              precision    recall  f1-score   support

           0     0.6454    0.6667    0.6559       243
           1     0.6747    0.6537    0.6640       257

    accuracy                         0.6600       500
   macro avg     0.6601    0.6602    0.6600       500
weighted avg     0.6605    0.6600    0.6601       500

Evaluating model 64k_lstm_no_whitespace_pert_finetuned on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:49<00:00,  3.10s/it]


              precision    recall  f1-score   support

           0     0.8514    0.8724    0.8618       243
           1     0.8765    0.8560    0.8661       257

    accuracy                         0.8640       500
   macro avg     0.8639    0.8642    0.8640       500
weighted avg     0.8643    0.8640    0.8640       500

Evaluating model 64k_lstm_no_whitespace_pert_finetuned on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [08:12<00:00, 49.24s/it]


              precision    recall  f1-score   support

           0     0.7984    0.8313    0.8145       243
           1     0.8340    0.8016    0.8175       257

    accuracy                         0.8160       500
   macro avg     0.8162    0.8164    0.8160       500
weighted avg     0.8167    0.8160    0.8160       500

Evaluating model 64k_lstm_no_whitespace_pert_finetuned on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [08:10<00:00, 49.04s/it]


              precision    recall  f1-score   support

           0     0.7016    0.7449    0.7226       243
           1     0.7438    0.7004    0.7214       257

    accuracy                         0.7220       500
   macro avg     0.7227    0.7226    0.7220       500
weighted avg     0.7233    0.7220    0.7220       500

Evaluating model 64k_lstm_no_whitespace_pert_finetuned on word_score_no_ws


500it [48:12,  5.78s/it]


              precision    recall  f1-score   support

           0     0.7842    0.7778    0.7810       243
           1     0.7915    0.7977    0.7946       257

    accuracy                         0.7880       500
   macro avg     0.7879    0.7877    0.7878       500
weighted avg     0.7880    0.7880    0.7880       500

Evaluating model 64k_lstm_no_whitespace_pert_finetuned on word_score_incl_ws


500it [47:00,  5.64s/it]


              precision    recall  f1-score   support

           0     0.7261    0.7202    0.7231       243
           1     0.7375    0.7432    0.7403       257

    accuracy                         0.7320       500
   macro avg     0.7318    0.7317    0.7317       500
weighted avg     0.7320    0.7320    0.7320       500

Evaluating model 64k_lstm_all_pert_finetuned on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:49<00:00,  3.07s/it]


              precision    recall  f1-score   support

           0     0.8601    0.8601    0.8601       243
           1     0.8677    0.8677    0.8677       257

    accuracy                         0.8640       500
   macro avg     0.8639    0.8639    0.8639       500
weighted avg     0.8640    0.8640    0.8640       500

Evaluating model 64k_lstm_all_pert_finetuned on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [08:10<00:00, 49.02s/it]


              precision    recall  f1-score   support

           0     0.7912    0.8107    0.8008       243
           1     0.8167    0.7977    0.8071       257

    accuracy                         0.8040       500
   macro avg     0.8039    0.8042    0.8039       500
weighted avg     0.8043    0.8040    0.8040       500

Evaluating model 64k_lstm_all_pert_finetuned on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [08:08<00:00, 48.86s/it]


              precision    recall  f1-score   support

           0     0.7765    0.8148    0.7952       243
           1     0.8163    0.7782    0.7968       257

    accuracy                         0.7960       500
   macro avg     0.7964    0.7965    0.7960       500
weighted avg     0.7970    0.7960    0.7960       500

Evaluating model 64k_lstm_all_pert_finetuned on word_score_no_ws


500it [48:20,  5.80s/it]


              precision    recall  f1-score   support

           0     0.7676    0.7613    0.7645       243
           1     0.7761    0.7821    0.7791       257

    accuracy                         0.7720       500
   macro avg     0.7718    0.7717    0.7718       500
weighted avg     0.7720    0.7720    0.7720       500

Evaluating model 64k_lstm_all_pert_finetuned on word_score_incl_ws


500it [47:54,  5.75s/it]


              precision    recall  f1-score   support

           0     0.7412    0.7778    0.7590       243
           1     0.7796    0.7432    0.7610       257

    accuracy                         0.7600       500
   macro avg     0.7604    0.7605    0.7600       500
weighted avg     0.7609    0.7600    0.7600       500

Evaluating model 64k_cnn_no_whitespace_pert_finetuned on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:30<00:00,  1.93s/it]


              precision    recall  f1-score   support

           0     0.8333    0.0206    0.0402       243
           1     0.5182    0.9961    0.6818       257

    accuracy                         0.5220       500
   macro avg     0.6758    0.5083    0.3610       500
weighted avg     0.6714    0.5220    0.3699       500

Evaluating model 64k_cnn_no_whitespace_pert_finetuned on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [05:05<00:00, 30.50s/it]


              precision    recall  f1-score   support

           0     0.3333    0.0123    0.0238       243
           1     0.5112    0.9767    0.6711       257

    accuracy                         0.5080       500
   macro avg     0.4223    0.4945    0.3475       500
weighted avg     0.4248    0.5080    0.3565       500

Evaluating model 64k_cnn_no_whitespace_pert_finetuned on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [05:03<00:00, 30.37s/it]


              precision    recall  f1-score   support

           0     0.1333    0.0082    0.0155       243
           1     0.5031    0.9494    0.6577       257

    accuracy                         0.4920       500
   macro avg     0.3182    0.4788    0.3366       500
weighted avg     0.3234    0.4920    0.3456       500

Evaluating model 64k_cnn_no_whitespace_pert_finetuned on word_score_no_ws


500it [08:26,  1.01s/it]


              precision    recall  f1-score   support

           0     0.3333    0.0041    0.0081       243
           1     0.5131    0.9922    0.6764       257

    accuracy                         0.5120       500
   macro avg     0.4232    0.4982    0.3423       500
weighted avg     0.4257    0.5120    0.3516       500

Evaluating model 64k_cnn_no_whitespace_pert_finetuned on word_score_incl_ws


500it [08:24,  1.01s/it]

              precision    recall  f1-score   support

           0     0.2000    0.0082    0.0158       243
           1     0.5082    0.9689    0.6667       257

    accuracy                         0.5020       500
   macro avg     0.3541    0.4886    0.3412       500
weighted avg     0.3584    0.5020    0.3504       500






In [24]:
accuracy_df

Unnamed: 0,clean,stochastic_no_ws,stochastic_incl_ws,word_score_no_ws,word_score_incl_ws
baseline,0.88,0.722,0.69,0.644,0.652
roben_1,0.794,0.686,0.572,0.68,0.618
roben_2,0.818,0.71,0.584,0.724,0.668
roben_1_tok,0.794,0.686,0.572,0.68,0.618
roben_2_tok,0.786,0.684,0.59,0.734,0.65
64k_lstm_clean_vanilla,0.864,0.708,0.662,0.68,0.624
64k_lstm_no_whitespace_pert_vanilla,0.872,0.812,0.712,0.776,0.758
64k_lstm_all_pert_vanilla,0.868,0.81,0.802,0.782,0.77
64k_lstm_clean_finetuned,0.864,0.724,0.694,0.694,0.66
64k_lstm_no_whitespace_pert_finetuned,0.864,0.816,0.722,0.788,0.732


In [25]:
f1_df

Unnamed: 0,clean,stochastic_no_ws,stochastic_incl_ws,word_score_no_ws,word_score_incl_ws
baseline,0.879906,0.721946,0.689989,0.643538,0.651978
roben_1,0.792002,0.683201,0.568959,0.67397,0.60968
roben_2,0.817104,0.708881,0.582289,0.721212,0.665862
roben_1_tok,0.792002,0.683201,0.568959,0.67397,0.60968
roben_2_tok,0.784092,0.680808,0.587224,0.730244,0.644354
64k_lstm_clean_vanilla,0.863922,0.707981,0.661771,0.67998,0.624
64k_lstm_no_whitespace_pert_vanilla,0.871926,0.811892,0.711959,0.775565,0.757922
64k_lstm_all_pert_vanilla,0.867788,0.809597,0.801504,0.781454,0.769424
64k_lstm_clean_finetuned,0.863946,0.72396,0.693969,0.693852,0.659951
64k_lstm_no_whitespace_pert_finetuned,0.863965,0.815988,0.721999,0.787783,0.731725


In [26]:
accuracy_df.to_csv("../output/grid_accuracy.csv")

In [27]:
f1_df.to_csv("../output/grid_f1.csv")