In [1]:
import sys
sys.path.append("..")

import copy
import cProfile
from datasets import load_dataset
import json
import math
import nltk
import pandas as pd
import random
from sklearn.metrics import classification_report, accuracy_score, f1_score
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, DataCollatorWithPadding, \
                         AutoModelForSequenceClassification, BertForSequenceClassification

from resilient_nlp.mini_roben import Clustering, ClusterRepRecoverer, ClusterRecovererWithPassthrough
from resilient_nlp.models import BertClassifier
from resilient_nlp.perturbers import ToyPerturber, WordScramblerPerturber
from runner import ExperimentRunner
from word_score_attack import BertWordScoreAttack

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# imdb = load_dataset('imdb')
imdb = load_dataset('artemis13fowl/imdb')

Using custom data configuration artemis13fowl--imdb-f63738dec0d5e230
Reusing dataset parquet (/home/ec2-user/.cache/huggingface/datasets/parquet/artemis13fowl--imdb-f63738dec0d5e230/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
random.seed(11)
sampled_test_set = imdb['attack_eval_truncated']

# This is silly but apparently huggingface datasets are immutable?
# Representing it as something a bit more sane
sampled_test_set_dict = [
    {
        'text': row['text'],
        'label': row['label'],
    }
    for row in sampled_test_set
]

In [4]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
checkpoint_finetuned = "artemis13fowl/bert-base-uncased-imdb"
model_finetuned = BertForSequenceClassification.from_pretrained(checkpoint_finetuned).to(device)

In [6]:
roben_clustering = Clustering.from_pickle("../vocab100000_ed1.pkl")
roben_recoverer = ClusterRecovererWithPassthrough("cache", roben_clustering)
roben_clustering2 = Clustering.from_pickle("../vocab100000_ed1_gamma0.3.pkl")
roben_recoverer2 = ClusterRecovererWithPassthrough("cache", roben_clustering2)

In [7]:
random.seed(11)
sampled_test_set_adv_no_ws = []
wsp = WordScramblerPerturber(perturb_prob=0.1, weight_add=1, weight_drop=1, weight_swap=1,
                             weight_split_word=0, weight_merge_words=0)

for i in range(10):
    test_item = copy.deepcopy(sampled_test_set_dict)

    for row in test_item:
        row['text'] = wsp.perturb([row['text']])[0][0]
    sampled_test_set_adv_no_ws.append(test_item)

In [8]:
random.seed(11)
sampled_test_set_adv_incl_ws = []
wsp = WordScramblerPerturber(perturb_prob=0.1, weight_add=1, weight_drop=1, weight_swap=1,
                             weight_split_word=1, weight_merge_words=1)

for i in range(10):
    test_item = copy.deepcopy(sampled_test_set_dict)

    for row in test_item:
        row['text'] = wsp.perturb([row['text']])[0][0]
    sampled_test_set_adv_incl_ws.append(test_item)

In [9]:
max_sequence_length = 128
batch_size = 32
eval_steps = 100

In [10]:
def standard_model_predict(tokenizer, model, sentences, recoverer, return_pred_tensor, recoverer_tokenize):
    if recoverer is not None:
        if recoverer_tokenize:
            tok = nltk.tokenize.treebank.TreebankWordTokenizer()
            sentences = [ " ".join(tok_list) for tok_list in tok.tokenize_sents(sentences) ]
        sentences = [ recoverer.recover(s.lower()) for s in sentences ]
        if recoverer_tokenize:
            detok = nltk.tokenize.treebank.TreebankWordDetokenizer()
            sentences = [ detok.detokenize(s.split(" ")) for s in sentences]
    tokenized = tokenizer(sentences, truncation=True, padding='max_length', max_length=max_sequence_length,
                          return_tensors='pt')
    tokenized = { k: v.to(device) for k, v in tokenized.items() }
    preds = model(**tokenized)
    if return_pred_tensor:
        return preds
    else:
        return torch.argmax(preds.logits, dim=1)

def wrap_standard_model(tokenizer, model, recoverer=None, return_pred_tensor=True, recoverer_tokenize=False):
    return lambda sentences: standard_model_predict(tokenizer, model, sentences, recoverer, return_pred_tensor,
                                                    recoverer_tokenize)

In [11]:
def mltokenizer_model_predict(runner, model, cls_embedding, sep_embedding, pad_embedding, sentences, return_pred_tensor):
    # Truncate and lower case. Truncation is for performance only
    sentences = [ s.lower()[:5*max_sequence_length] for s in sentences]
    embedding = runner.embed(sentences=sentences,
        start_token=cls_embedding, end_token=sep_embedding, pad_token=pad_embedding,
        max_tokens=max_sequence_length)
    preds = model(inputs_embeds=embedding['inputs_embeds'], attention_mask=embedding['attention_mask'])
    if return_pred_tensor:
        return preds
    else:
        return torch.argmax(preds.logits, dim=1)

def wrap_mltokenizer_model(mltokenizer_prefix, tokenizer, model, return_pred_tensor=True):
    filename = "../{}.pth".format(mltokenizer_prefix)
    runner = ExperimentRunner(device, model_filename=filename)
    cf_embedding = model.base_model.embeddings.word_embeddings
    cls_token_id = tokenizer.vocab['[CLS]']
    sep_token_id = tokenizer.vocab['[SEP]']
    pad_token_id = tokenizer.vocab['[PAD]']
    cls_embedding = cf_embedding(torch.tensor([cls_token_id], device=device)).view(-1)
    sep_embedding = cf_embedding(torch.tensor([sep_token_id], device=device)).view(-1)
    pad_embedding = cf_embedding(torch.tensor([pad_token_id], device=device)).view(-1)
    
    return lambda sentences: mltokenizer_model_predict(runner, model, cls_embedding, sep_embedding,
                                                      pad_embedding, sentences, return_pred_tensor)

In [12]:
@torch.no_grad()
def evaluate_model(model, test_set):
    num_batches = math.ceil(len(test_set) / batch_size)
    
    sentences = [ x['text'] for x in test_set ]
    labels = [ x['label'] for x in test_set ]
    pred_batches = []
    
    for i in tqdm(range(num_batches)):
        bs = i * batch_size
        be = bs + batch_size
        
        output = model(sentences[bs:be])
        
        pred_batches.append(torch.argmax(output.logits, dim=1).detach().cpu())
    preds = torch.cat(pred_batches)
    
    print(classification_report(labels, preds, digits=4))
    
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='macro')
    
    return accuracy, f1

In [13]:
@torch.no_grad()
def evaluate_model_adv(model, test_sets):
    labels = [ x['label'] for x in test_sets[0] ]
    adv_preds = copy.copy(labels)
    accuracy_list = []
    f1_list = []
    
    for idx in tqdm(range(len(test_sets))):
        test_set = test_sets[idx]
        num_batches = math.ceil(len(test_set) / batch_size)
    
        sentences = [ x['text'] for x in test_set ]
        pred_batches = []
    
        for i in range(num_batches):
            bs = i * batch_size
            be = bs + batch_size
        
            output = model(sentences[bs:be])
        
            pred_batches.append(torch.argmax(output.logits, dim=1).detach().cpu())
        preds = torch.cat(pred_batches)
        
        for i in range(len(adv_preds)):
            if labels[i] == 1.0 and preds[i] == 0.0:
                adv_preds[i] = 0.0
            elif labels[i] == 0.0 and preds[i] == 1.0:
                adv_preds[i] = 1.0

        accuracy_list.append(accuracy_score(labels, adv_preds))
        f1_list.append(f1_score(labels, adv_preds, average='macro'))
    
    print(classification_report(labels, adv_preds, digits=4))    
    
    return accuracy_list, f1_list

In [14]:
@torch.no_grad()
def evaluate_model_word_score(model, test_set, allow_whitespace_pert=True, report_prefix=None):
    attacker = BertWordScoreAttack(
        WordScramblerPerturber(perturb_prob=1, weight_add=1, weight_drop=1, weight_swap=1,
                               weight_split_word=int(allow_whitespace_pert),
                               weight_merge_words=int(allow_whitespace_pert)),
        "../output/imdb_word_scores.json", model, tokenizer=None, max_sequence_length=max_sequence_length
    )

    res = attacker.attack(test_set, max_tokens_to_query=10, max_tries_per_token=4, mode=0, print_summary=False)

    if report_prefix is not None:
        res.to_csv(f"{report_prefix}_df.csv")
        with open(f"{report_prefix}_stats.json", "w") as f:
            json.dump(attacker.compute_attack_stats(), fp=f)            
    
    print(classification_report(res['ground_truth'], res['perturbed_preds'], digits=4))    
    
    accuracy = accuracy_score(res['ground_truth'], res['perturbed_preds'])
    f1 = f1_score(res['ground_truth'], res['perturbed_preds'], average='macro')
    
    return accuracy, f1

In [15]:
baseline_model = wrap_standard_model(tokenizer, model_finetuned)

In [16]:
mltok_model = wrap_mltokenizer_model('output/64k_lstm_all_pert_finetuned', tokenizer, model_finetuned)

In [17]:
baseline_roben_model = wrap_standard_model(tokenizer, model_finetuned, roben_recoverer)

In [18]:
baseline_roben_model_tok = wrap_standard_model(tokenizer, model_finetuned, roben_recoverer, recoverer_tokenize=True)

In [19]:
baseline_roben_model2 = wrap_standard_model(tokenizer, model_finetuned, roben_recoverer2)

In [20]:
baseline_roben_model2_tok = wrap_standard_model(tokenizer, model_finetuned, roben_recoverer2, recoverer_tokenize=True)

In [21]:
all_models = {
    'baseline': baseline_model,
    'roben_1': baseline_roben_model,
    'roben_2': baseline_roben_model2,
    'roben_1_tok': baseline_roben_model,
    'roben_2_tok': baseline_roben_model2_tok,
}

mltok_model_names = [
    '64k_lstm_clean_vanilla',
    '64k_lstm_no_whitespace_pert_vanilla',
    '64k_lstm_all_pert_vanilla',
    '64k_lstm_clean_finetuned',
    '64k_lstm_no_whitespace_pert_finetuned',
    '64k_lstm_all_pert_finetuned',
    '64k_cnn_no_whitespace_pert_finetuned',
]

for name in mltok_model_names:
    all_models[name] = wrap_mltokenizer_model(f'output/{name}', tokenizer, model_finetuned)

In [22]:
evaluations = [
    'clean',
    'stochastic_no_ws',
    'stochastic_incl_ws',
    'word_score_no_ws',
    'word_score_incl_ws',
]

In [23]:
accuracy_df = pd.DataFrame(columns=evaluations, index=all_models.keys())
f1_df = pd.DataFrame(columns=evaluations, index=all_models.keys())

for cur_model_name, cur_model in all_models.items():
    for cur_evaluation in evaluations:
        print(f'Evaluating model {cur_model_name} on {cur_evaluation}')
        random.seed(11)
        if cur_evaluation == 'clean':
            acc, f1 = evaluate_model(cur_model, sampled_test_set)
        elif cur_evaluation.startswith('stochastic_'):
            if cur_evaluation == 'stochastic_no_ws':
                acc_list, f1_list = evaluate_model_adv(cur_model, sampled_test_set_adv_no_ws)
            elif cur_evaluation == 'stochastic_incl_ws':
                acc_list, f1_list = evaluate_model_adv(cur_model, sampled_test_set_adv_incl_ws)
            acc = acc_list[-1]
            f1 = f1_list[-1]
            with open(f"../output/eval/{cur_model_name}_{cur_evaluation}_acc_list.json", "w") as f:
                json.dump(acc_list, fp=f)
            with open(f"../output/eval/{cur_model_name}_{cur_evaluation}_f1_list.json", "w") as f:
                json.dump(f1_list, fp=f)
        elif cur_evaluation.startswith('word_score_'):
            if cur_evaluation == 'word_score_no_ws':
                acc, f1 = evaluate_model_word_score(cur_model, sampled_test_set, allow_whitespace_pert=False,
                                                    report_prefix=f"../output/eval/{cur_model_name}_{cur_evaluation}")
            elif cur_evaluation == 'word_score_incl_ws':
                acc, f1 = evaluate_model_word_score(cur_model, sampled_test_set, allow_whitespace_pert=True,
                                                    report_prefix=f"../output/eval/{cur_model_name}_{cur_evaluation}")

        accuracy_df[cur_evaluation][cur_model_name] = acc
        f1_df[cur_evaluation][cur_model_name] = f1

Evaluating model baseline on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:01<00:00, 13.89it/s]


              precision    recall  f1-score   support

           0     0.8765    0.8765    0.8765       243
           1     0.8833    0.8833    0.8833       257

    accuracy                         0.8800       500
   macro avg     0.8799    0.8799    0.8799       500
weighted avg     0.8800    0.8800    0.8800       500

Evaluating model baseline on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:11<00:00,  1.14s/it]


              precision    recall  f1-score   support

           0     0.6970    0.7572    0.7258       243
           1     0.7500    0.6887    0.7181       257

    accuracy                         0.7220       500
   macro avg     0.7235    0.7230    0.7219       500
weighted avg     0.7242    0.7220    0.7218       500

Evaluating model baseline on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:11<00:00,  1.14s/it]


              precision    recall  f1-score   support

           0     0.6692    0.7160    0.6918       243
           1     0.7125    0.6654    0.6881       257

    accuracy                         0.6900       500
   macro avg     0.6909    0.6907    0.6900       500
weighted avg     0.6915    0.6900    0.6899       500

Evaluating model baseline on word_score_no_ws


500it [04:04,  2.04it/s]


              precision    recall  f1-score   support

           0     0.5878    0.6337    0.6099       243
           1     0.6261    0.5798    0.6020       257

    accuracy                         0.6060       500
   macro avg     0.6069    0.6068    0.6060       500
weighted avg     0.6075    0.6060    0.6059       500

Evaluating model baseline on word_score_incl_ws


500it [04:03,  2.05it/s]


              precision    recall  f1-score   support

           0     0.5843    0.6420    0.6118       243
           1     0.6266    0.5681    0.5959       257

    accuracy                         0.6040       500
   macro avg     0.6054    0.6050    0.6038       500
weighted avg     0.6060    0.6040    0.6036       500

Evaluating model roben_1 on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:01<00:00, 11.75it/s]


              precision    recall  f1-score   support

           0     0.8365    0.7160    0.7716       243
           1     0.7637    0.8677    0.8124       257

    accuracy                         0.7940       500
   macro avg     0.8001    0.7919    0.7920       500
weighted avg     0.7991    0.7940    0.7926       500

Evaluating model roben_1 on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:13<00:00,  1.35s/it]


              precision    recall  f1-score   support

           0     0.7048    0.6091    0.6534       243
           1     0.6724    0.7588    0.7130       257

    accuracy                         0.6860       500
   macro avg     0.6886    0.6839    0.6832       500
weighted avg     0.6881    0.6860    0.6840       500

Evaluating model roben_1 on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:13<00:00,  1.34s/it]


              precision    recall  f1-score   support

           0     0.5674    0.5021    0.5328       243
           1     0.5754    0.6381    0.6052       257

    accuracy                         0.5720       500
   macro avg     0.5714    0.5701    0.5690       500
weighted avg     0.5716    0.5720    0.5700       500

Evaluating model roben_1 on word_score_no_ws


500it [04:02,  2.06it/s]


              precision    recall  f1-score   support

           0     0.6898    0.5309    0.6000       243
           1     0.6358    0.7743    0.6982       257

    accuracy                         0.6560       500
   macro avg     0.6628    0.6526    0.6491       500
weighted avg     0.6621    0.6560    0.6505       500

Evaluating model roben_1 on word_score_incl_ws


500it [03:38,  2.29it/s]


              precision    recall  f1-score   support

           0     0.5538    0.4239    0.4802       243
           1     0.5541    0.6770    0.6095       257

    accuracy                         0.5540       500
   macro avg     0.5540    0.5505    0.5448       500
weighted avg     0.5540    0.5540    0.5466       500

Evaluating model roben_2 on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:01<00:00, 12.88it/s]


              precision    recall  f1-score   support

           0     0.8423    0.7695    0.8043       243
           1     0.7986    0.8638    0.8299       257

    accuracy                         0.8180       500
   macro avg     0.8205    0.8167    0.8171       500
weighted avg     0.8198    0.8180    0.8175       500

Evaluating model roben_2 on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:12<00:00,  1.23s/it]


              precision    recall  f1-score   support

           0     0.7168    0.6667    0.6908       243
           1     0.7044    0.7510    0.7269       257

    accuracy                         0.7100       500
   macro avg     0.7106    0.7088    0.7089       500
weighted avg     0.7104    0.7100    0.7094       500

Evaluating model roben_2 on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:12<00:00,  1.23s/it]


              precision    recall  f1-score   support

           0     0.5778    0.5350    0.5556       243
           1     0.5891    0.6304    0.6090       257

    accuracy                         0.5840       500
   macro avg     0.5834    0.5827    0.5823       500
weighted avg     0.5836    0.5840    0.5830       500

Evaluating model roben_2 on word_score_no_ws


500it [04:15,  1.96it/s]


              precision    recall  f1-score   support

           0     0.7429    0.6420    0.6887       243
           1     0.7000    0.7899    0.7422       257

    accuracy                         0.7180       500
   macro avg     0.7214    0.7159    0.7155       500
weighted avg     0.7208    0.7180    0.7162       500

Evaluating model roben_2 on word_score_incl_ws


500it [03:53,  2.14it/s]


              precision    recall  f1-score   support

           0     0.5936    0.5350    0.5628       243
           1     0.5979    0.6537    0.6245       257

    accuracy                         0.5960       500
   macro avg     0.5957    0.5943    0.5937       500
weighted avg     0.5958    0.5960    0.5945       500

Evaluating model roben_1_tok on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:01<00:00, 13.97it/s]


              precision    recall  f1-score   support

           0     0.8365    0.7160    0.7716       243
           1     0.7637    0.8677    0.8124       257

    accuracy                         0.7940       500
   macro avg     0.8001    0.7919    0.7920       500
weighted avg     0.7991    0.7940    0.7926       500

Evaluating model roben_1_tok on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:11<00:00,  1.14s/it]


              precision    recall  f1-score   support

           0     0.7048    0.6091    0.6534       243
           1     0.6724    0.7588    0.7130       257

    accuracy                         0.6860       500
   macro avg     0.6886    0.6839    0.6832       500
weighted avg     0.6881    0.6860    0.6840       500

Evaluating model roben_1_tok on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:11<00:00,  1.14s/it]


              precision    recall  f1-score   support

           0     0.5674    0.5021    0.5328       243
           1     0.5754    0.6381    0.6052       257

    accuracy                         0.5720       500
   macro avg     0.5714    0.5701    0.5690       500
weighted avg     0.5716    0.5720    0.5700       500

Evaluating model roben_1_tok on word_score_no_ws


500it [03:53,  2.14it/s]


              precision    recall  f1-score   support

           0     0.6898    0.5309    0.6000       243
           1     0.6358    0.7743    0.6982       257

    accuracy                         0.6560       500
   macro avg     0.6628    0.6526    0.6491       500
weighted avg     0.6621    0.6560    0.6505       500

Evaluating model roben_1_tok on word_score_incl_ws


500it [03:34,  2.33it/s]


              precision    recall  f1-score   support

           0     0.5538    0.4239    0.4802       243
           1     0.5541    0.6770    0.6095       257

    accuracy                         0.5540       500
   macro avg     0.5540    0.5505    0.5448       500
weighted avg     0.5540    0.5540    0.5466       500

Evaluating model roben_2_tok on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:01<00:00,  9.78it/s]


              precision    recall  f1-score   support

           0     0.8238    0.7119    0.7638       243
           1     0.7586    0.8560    0.8044       257

    accuracy                         0.7860       500
   macro avg     0.7912    0.7840    0.7841       500
weighted avg     0.7903    0.7860    0.7847       500

Evaluating model roben_2_tok on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:16<00:00,  1.60s/it]


              precision    recall  f1-score   support

           0     0.7053    0.6008    0.6489       243
           1     0.6689    0.7626    0.7127       257

    accuracy                         0.6840       500
   macro avg     0.6871    0.6817    0.6808       500
weighted avg     0.6866    0.6840    0.6817       500

Evaluating model roben_2_tok on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:15<00:00,  1.60s/it]


              precision    recall  f1-score   support

           0     0.5880    0.5226    0.5534       243
           1     0.5915    0.6537    0.6211       257

    accuracy                         0.5900       500
   macro avg     0.5898    0.5882    0.5872       500
weighted avg     0.5898    0.5900    0.5882       500

Evaluating model roben_2_tok on word_score_no_ws


500it [04:22,  1.90it/s]


              precision    recall  f1-score   support

           0     0.7610    0.6420    0.6964       243
           1     0.7051    0.8093    0.7536       257

    accuracy                         0.7280       500
   macro avg     0.7330    0.7257    0.7250       500
weighted avg     0.7322    0.7280    0.7258       500

Evaluating model roben_2_tok on word_score_incl_ws


500it [03:56,  2.11it/s]


              precision    recall  f1-score   support

           0     0.6098    0.5144    0.5580       243
           1     0.6000    0.6887    0.6413       257

    accuracy                         0.6040       500
   macro avg     0.6049    0.6016    0.5997       500
weighted avg     0.6047    0.6040    0.6008       500

Evaluating model 64k_lstm_clean_vanilla on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:20<00:00,  1.26s/it]


              precision    recall  f1-score   support

           0     0.8571    0.8642    0.8607       243
           1     0.8706    0.8638    0.8672       257

    accuracy                         0.8640       500
   macro avg     0.8639    0.8640    0.8639       500
weighted avg     0.8641    0.8640    0.8640       500

Evaluating model 64k_lstm_clean_vanilla on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:20<00:00, 20.07s/it]


              precision    recall  f1-score   support

           0     0.6858    0.7366    0.7103       243
           1     0.7322    0.6809    0.7056       257

    accuracy                         0.7080       500
   macro avg     0.7090    0.7088    0.7080       500
weighted avg     0.7097    0.7080    0.7079       500

Evaluating model 64k_lstm_clean_vanilla on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:26<00:00, 20.62s/it]


              precision    recall  f1-score   support

           0     0.6370    0.7078    0.6706       243
           1     0.6913    0.6187    0.6530       257

    accuracy                         0.6620       500
   macro avg     0.6642    0.6632    0.6618       500
weighted avg     0.6649    0.6620    0.6615       500

Evaluating model 64k_lstm_clean_vanilla on word_score_no_ws


500it [45:24,  5.45s/it]


              precision    recall  f1-score   support

           0     0.5804    0.6091    0.5944       243
           1     0.6122    0.5837    0.5976       257

    accuracy                         0.5960       500
   macro avg     0.5963    0.5964    0.5960       500
weighted avg     0.5968    0.5960    0.5960       500

Evaluating model 64k_lstm_clean_vanilla on word_score_incl_ws


500it [44:15,  5.31s/it]


              precision    recall  f1-score   support

           0     0.5396    0.5885    0.5630       243
           1     0.5745    0.5253    0.5488       257

    accuracy                         0.5560       500
   macro avg     0.5570    0.5569    0.5559       500
weighted avg     0.5575    0.5560    0.5557       500

Evaluating model 64k_lstm_no_whitespace_pert_vanilla on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:20<00:00,  1.26s/it]


              precision    recall  f1-score   support

           0     0.8653    0.8724    0.8689       243
           1     0.8784    0.8716    0.8750       257

    accuracy                         0.8720       500
   macro avg     0.8719    0.8720    0.8719       500
weighted avg     0.8721    0.8720    0.8720       500

Evaluating model 64k_lstm_no_whitespace_pert_vanilla on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:21<00:00, 20.15s/it]


              precision    recall  f1-score   support

           0     0.8041    0.8107    0.8074       243
           1     0.8196    0.8132    0.8164       257

    accuracy                         0.8120       500
   macro avg     0.8118    0.8120    0.8119       500
weighted avg     0.8121    0.8120    0.8120       500

Evaluating model 64k_lstm_no_whitespace_pert_vanilla on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:22<00:00, 20.22s/it]


              precision    recall  f1-score   support

           0     0.6972    0.7202    0.7085       243
           1     0.7269    0.7043    0.7154       257

    accuracy                         0.7120       500
   macro avg     0.7121    0.7122    0.7120       500
weighted avg     0.7125    0.7120    0.7121       500

Evaluating model 64k_lstm_no_whitespace_pert_vanilla on word_score_no_ws


500it [50:50,  6.10s/it]


              precision    recall  f1-score   support

           0     0.7288    0.7078    0.7182       243
           1     0.7311    0.7510    0.7409       257

    accuracy                         0.7300       500
   macro avg     0.7299    0.7294    0.7295       500
weighted avg     0.7300    0.7300    0.7298       500

Evaluating model 64k_lstm_no_whitespace_pert_vanilla on word_score_incl_ws


500it [48:58,  5.88s/it]


              precision    recall  f1-score   support

           0     0.6638    0.6337    0.6484       243
           1     0.6679    0.6965    0.6819       257

    accuracy                         0.6660       500
   macro avg     0.6659    0.6651    0.6652       500
weighted avg     0.6659    0.6660    0.6656       500

Evaluating model 64k_lstm_all_pert_vanilla on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:20<00:00,  1.26s/it]


              precision    recall  f1-score   support

           0     0.8734    0.8519    0.8625       243
           1     0.8631    0.8833    0.8731       257

    accuracy                         0.8680       500
   macro avg     0.8683    0.8676    0.8678       500
weighted avg     0.8681    0.8680    0.8679       500

Evaluating model 64k_lstm_all_pert_vanilla on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:23<00:00, 20.32s/it]


              precision    recall  f1-score   support

           0     0.8162    0.7860    0.8008       243
           1     0.8045    0.8327    0.8184       257

    accuracy                         0.8100       500
   macro avg     0.8104    0.8093    0.8096       500
weighted avg     0.8102    0.8100    0.8098       500

Evaluating model 64k_lstm_all_pert_vanilla on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:23<00:00, 20.39s/it]


              precision    recall  f1-score   support

           0     0.8103    0.7737    0.7916       243
           1     0.7948    0.8288    0.8114       257

    accuracy                         0.8020       500
   macro avg     0.8026    0.8012    0.8015       500
weighted avg     0.8023    0.8020    0.8018       500

Evaluating model 64k_lstm_all_pert_vanilla on word_score_no_ws


500it [50:40,  6.08s/it]


              precision    recall  f1-score   support

           0     0.7156    0.6420    0.6768       243
           1     0.6915    0.7588    0.7236       257

    accuracy                         0.7020       500
   macro avg     0.7035    0.7004    0.7002       500
weighted avg     0.7032    0.7020    0.7008       500

Evaluating model 64k_lstm_all_pert_vanilla on word_score_incl_ws


500it [51:17,  6.15s/it]


              precision    recall  f1-score   support

           0     0.6870    0.6502    0.6681       243
           1     0.6852    0.7198    0.7021       257

    accuracy                         0.6860       500
   macro avg     0.6861    0.6850    0.6851       500
weighted avg     0.6860    0.6860    0.6856       500

Evaluating model 64k_lstm_clean_finetuned on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:20<00:00,  1.27s/it]


              precision    recall  f1-score   support

           0     0.8543    0.8683    0.8612       243
           1     0.8735    0.8599    0.8667       257

    accuracy                         0.8640       500
   macro avg     0.8639    0.8641    0.8639       500
weighted avg     0.8642    0.8640    0.8640       500

Evaluating model 64k_lstm_clean_finetuned on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:22<00:00, 20.29s/it]


              precision    recall  f1-score   support

           0     0.7092    0.7325    0.7206       243
           1     0.7390    0.7160    0.7273       257

    accuracy                         0.7240       500
   macro avg     0.7241    0.7242    0.7240       500
weighted avg     0.7245    0.7240    0.7241       500

Evaluating model 64k_lstm_clean_finetuned on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:22<00:00, 20.29s/it]


              precision    recall  f1-score   support

           0     0.6718    0.7243    0.6970       243
           1     0.7185    0.6654    0.6909       257

    accuracy                         0.6940       500
   macro avg     0.6951    0.6948    0.6940       500
weighted avg     0.6958    0.6940    0.6939       500

Evaluating model 64k_lstm_clean_finetuned on word_score_no_ws


500it [47:36,  5.71s/it]


              precision    recall  f1-score   support

           0     0.5697    0.5885    0.5789       243
           1     0.5984    0.5798    0.5889       257

    accuracy                         0.5840       500
   macro avg     0.5841    0.5841    0.5839       500
weighted avg     0.5845    0.5840    0.5841       500

Evaluating model 64k_lstm_clean_finetuned on word_score_incl_ws


500it [46:00,  5.52s/it]


              precision    recall  f1-score   support

           0     0.5512    0.5761    0.5634       243
           1     0.5813    0.5564    0.5686       257

    accuracy                         0.5660       500
   macro avg     0.5662    0.5663    0.5660       500
weighted avg     0.5667    0.5660    0.5661       500

Evaluating model 64k_lstm_no_whitespace_pert_finetuned on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:20<00:00,  1.28s/it]


              precision    recall  f1-score   support

           0     0.8514    0.8724    0.8618       243
           1     0.8765    0.8560    0.8661       257

    accuracy                         0.8640       500
   macro avg     0.8639    0.8642    0.8640       500
weighted avg     0.8643    0.8640    0.8640       500

Evaluating model 64k_lstm_no_whitespace_pert_finetuned on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:20<00:00, 20.05s/it]


              precision    recall  f1-score   support

           0     0.7984    0.8313    0.8145       243
           1     0.8340    0.8016    0.8175       257

    accuracy                         0.8160       500
   macro avg     0.8162    0.8164    0.8160       500
weighted avg     0.8167    0.8160    0.8160       500

Evaluating model 64k_lstm_no_whitespace_pert_finetuned on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:20<00:00, 20.05s/it]


              precision    recall  f1-score   support

           0     0.7016    0.7449    0.7226       243
           1     0.7438    0.7004    0.7214       257

    accuracy                         0.7220       500
   macro avg     0.7227    0.7226    0.7220       500
weighted avg     0.7233    0.7220    0.7220       500

Evaluating model 64k_lstm_no_whitespace_pert_finetuned on word_score_no_ws


500it [52:02,  6.25s/it]


              precision    recall  f1-score   support

           0     0.7447    0.7202    0.7322       243
           1     0.7434    0.7665    0.7548       257

    accuracy                         0.7440       500
   macro avg     0.7440    0.7434    0.7435       500
weighted avg     0.7440    0.7440    0.7438       500

Evaluating model 64k_lstm_no_whitespace_pert_finetuned on word_score_incl_ws


500it [50:17,  6.04s/it]


              precision    recall  f1-score   support

           0     0.6694    0.6667    0.6680       243
           1     0.6860    0.6887    0.6874       257

    accuracy                         0.6780       500
   macro avg     0.6777    0.6777    0.6777       500
weighted avg     0.6780    0.6780    0.6780       500

Evaluating model 64k_lstm_all_pert_finetuned on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:20<00:00,  1.25s/it]


              precision    recall  f1-score   support

           0     0.8601    0.8601    0.8601       243
           1     0.8677    0.8677    0.8677       257

    accuracy                         0.8640       500
   macro avg     0.8639    0.8639    0.8639       500
weighted avg     0.8640    0.8640    0.8640       500

Evaluating model 64k_lstm_all_pert_finetuned on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:21<00:00, 20.12s/it]


              precision    recall  f1-score   support

           0     0.7912    0.8107    0.8008       243
           1     0.8167    0.7977    0.8071       257

    accuracy                         0.8040       500
   macro avg     0.8039    0.8042    0.8039       500
weighted avg     0.8043    0.8040    0.8040       500

Evaluating model 64k_lstm_all_pert_finetuned on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:23<00:00, 20.37s/it]


              precision    recall  f1-score   support

           0     0.7765    0.8148    0.7952       243
           1     0.8163    0.7782    0.7968       257

    accuracy                         0.7960       500
   macro avg     0.7964    0.7965    0.7960       500
weighted avg     0.7970    0.7960    0.7960       500

Evaluating model 64k_lstm_all_pert_finetuned on word_score_no_ws


500it [50:25,  6.05s/it]


              precision    recall  f1-score   support

           0     0.6815    0.6955    0.6884       243
           1     0.7063    0.6926    0.6994       257

    accuracy                         0.6940       500
   macro avg     0.6939    0.6940    0.6939       500
weighted avg     0.6942    0.6940    0.6941       500

Evaluating model 64k_lstm_all_pert_finetuned on word_score_incl_ws


500it [50:29,  6.06s/it]


              precision    recall  f1-score   support

           0     0.6875    0.6790    0.6832       243
           1     0.7000    0.7082    0.7041       257

    accuracy                         0.6940       500
   macro avg     0.6937    0.6936    0.6936       500
weighted avg     0.6939    0.6940    0.6939       500

Evaluating model 64k_cnn_no_whitespace_pert_finetuned on clean


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:17<00:00,  1.10s/it]


              precision    recall  f1-score   support

           0     0.8333    0.0206    0.0402       243
           1     0.5182    0.9961    0.6818       257

    accuracy                         0.5220       500
   macro avg     0.6758    0.5083    0.3610       500
weighted avg     0.6714    0.5220    0.3699       500

Evaluating model 64k_cnn_no_whitespace_pert_finetuned on stochastic_no_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [02:55<00:00, 17.60s/it]


              precision    recall  f1-score   support

           0     0.3333    0.0123    0.0238       243
           1     0.5112    0.9767    0.6711       257

    accuracy                         0.5080       500
   macro avg     0.4223    0.4945    0.3475       500
weighted avg     0.4248    0.5080    0.3565       500

Evaluating model 64k_cnn_no_whitespace_pert_finetuned on stochastic_incl_ws


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [02:56<00:00, 17.63s/it]


              precision    recall  f1-score   support

           0     0.1333    0.0082    0.0155       243
           1     0.5031    0.9494    0.6577       257

    accuracy                         0.4920       500
   macro avg     0.3182    0.4788    0.3366       500
weighted avg     0.3234    0.4920    0.3456       500

Evaluating model 64k_cnn_no_whitespace_pert_finetuned on word_score_no_ws


500it [09:23,  1.13s/it]


              precision    recall  f1-score   support

           0     0.1667    0.0041    0.0080       243
           1     0.5101    0.9805    0.6711       257

    accuracy                         0.5060       500
   macro avg     0.3384    0.4923    0.3396       500
weighted avg     0.3432    0.5060    0.3489       500

Evaluating model 64k_cnn_no_whitespace_pert_finetuned on word_score_incl_ws


500it [08:58,  1.08s/it]

              precision    recall  f1-score   support

           0     0.1000    0.0041    0.0079       243
           1     0.5061    0.9650    0.6640       257

    accuracy                         0.4980       500
   macro avg     0.3031    0.4845    0.3359       500
weighted avg     0.3087    0.4980    0.3451       500






In [24]:
accuracy_df

Unnamed: 0,clean,stochastic_no_ws,stochastic_incl_ws,word_score_no_ws,word_score_incl_ws
baseline,0.88,0.722,0.69,0.606,0.604
roben_1,0.794,0.686,0.572,0.656,0.554
roben_2,0.818,0.71,0.584,0.718,0.596
roben_1_tok,0.794,0.686,0.572,0.656,0.554
roben_2_tok,0.786,0.684,0.59,0.728,0.604
64k_lstm_clean_vanilla,0.864,0.708,0.662,0.596,0.556
64k_lstm_no_whitespace_pert_vanilla,0.872,0.812,0.712,0.73,0.666
64k_lstm_all_pert_vanilla,0.868,0.81,0.802,0.702,0.686
64k_lstm_clean_finetuned,0.864,0.724,0.694,0.584,0.566
64k_lstm_no_whitespace_pert_finetuned,0.864,0.816,0.722,0.744,0.678


In [25]:
f1_df

Unnamed: 0,clean,stochastic_no_ws,stochastic_incl_ws,word_score_no_ws,word_score_incl_ws
baseline,0.879906,0.721946,0.689989,0.605961,0.603842
roben_1,0.792002,0.683201,0.568959,0.649123,0.544822
roben_2,0.817104,0.708881,0.582289,0.715486,0.593653
roben_1_tok,0.792002,0.683201,0.568959,0.649123,0.544822
roben_2_tok,0.784092,0.680808,0.587224,0.725026,0.59967
64k_lstm_clean_vanilla,0.863922,0.707981,0.661771,0.595994,0.555886
64k_lstm_no_whitespace_pert_vanilla,0.871926,0.811892,0.711959,0.729523,0.665163
64k_lstm_all_pert_vanilla,0.867788,0.809597,0.801504,0.700176,0.685082
64k_lstm_clean_finetuned,0.863946,0.72396,0.693969,0.58394,0.565984
64k_lstm_no_whitespace_pert_finetuned,0.863965,0.815988,0.721999,0.743503,0.67771


In [26]:
accuracy_df.to_csv("../output/grid_accuracy.csv")

In [27]:
f1_df.to_csv("../output/grid_f1.csv")