# Adversarial attacks against Legal-BERT Model (BertForSequenceClassification)

In [1]:
# Global variables

BATCH_SIZE = 8
MODEL_NAME = 'nlpaueb/legal-bert-small-uncased'#'bert-base-uncased'
EPOCHS = 3
EMBEDDING_SIZE = 512
NUM_CLASSES = 2
VOCABULARY_SIZE = 30522
NUM_TOKENS = 5
ATTACK_LABEL = 1
import warnings 
warnings.filterwarnings("ignore")

### Installation of packages

### Imports

In [2]:
import torch
import os
import sys
import json
import argparse
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, random_split
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import numpy as np
import time
import datetime
import random
import gc
from torch.autograd import Variable
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
from copy import deepcopy
from tqdm import tqdm_notebook

### Device

In [3]:
# If there's a GPU available...
if torch.cuda.is_available():     
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 2 GPU(s) available.
We will use the GPU: NVIDIA A40


### Reading dataset

In [4]:
from ARAE_utils import Seq2Seq, MLP_D, MLP_G, generate
from attack_util import project_noise, one_hot_prob, GPT2_LM_loss, select_fluent_trigger, get_perplexity

In [5]:
# Funtion to read all sentences
def get_sentences(path):
    sentences= []
    for filename in sorted(os.listdir(path)):
        with open(path+filename, 'r') as f:
            for sentence in f :
                sentences.append(sentence)
    return sentences

In [6]:
# Function to read get all labels
def get_labels(path):
    all_labels = []
    for filename in sorted(os.listdir(path)):
        file_labels = []
        with open(path+filename, 'r') as f:
            for label in f :
                all_labels.append(int(label))
    return all_labels

In [7]:
# Reading sentences and labels
all_sentences = get_sentences("Sentences/")
all_labels = get_labels("Labels/")

In [8]:
# Since unfair sentences are marked as "-1", we change them to "0" for simplicity. Zero means fair, One means unfair
all_labels =  [0 if label ==-1 else label for label in all_labels]

### Bert Tokenizer

In [9]:
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True) # the model 'bert-base-uncased' only contains lower case sentences

Loading BERT tokenizer...


### Model BertForSequenceClassification (Load model)

In [10]:
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels = NUM_CLASSES,
    output_attentions = False,
    output_hidden_states = False,
)

model.to(device);

Some weights of the model checkpoint at nlpaueb/legal-bert-small-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [11]:
model.load_state_dict(torch.load('Bert4SeqClassif_202207072015.pt'))

<All keys matched successfully>

In [12]:
def load_ARAE_models(load_path, args):
    # function to load ARAE model.
    if not os.path.exists(load_path):
        print('Please download the pretrained ARAE model first')
        
    ARAE_args = json.load(open(os.path.join(load_path, 'options.json'), 'r'))
    vars(args).update(ARAE_args)
    autoencoder = Seq2Seq(emsize=args.emsize,
                          nhidden=args.nhidden,
                          ntokens=args.ntokens,
                          nlayers=args.nlayers,
                          noise_r=args.noise_r,
                          hidden_init=args.hidden_init,
                          dropout=args.dropout,
                          gpu=True)
    gan_gen = MLP_G(ninput=args.z_size, noutput=args.nhidden, layers=args.arch_g)
    gan_disc = MLP_D(ninput=args.nhidden, noutput=1, layers=args.arch_d)

    autoencoder = autoencoder.cuda()
    gan_gen = gan_gen.cuda()
    gan_disc = gan_disc.cuda()

    ARAE_word2idx = json.load(open(os.path.join(args.load_path, 'vocab.json'), 'r'))
    ARAE_idx2word = {v: k for k, v in ARAE_word2idx.items()}

    print('Loading models from {}'.format(args.load_path))
    loaded = torch.load(os.path.join(args.load_path, "model.pt"))
    autoencoder.load_state_dict(loaded.get('ae'))
    gan_gen.load_state_dict(loaded.get('gan_g'))
    gan_disc.load_state_dict(loaded.get('gan_d'))
    return ARAE_args, ARAE_idx2word, ARAE_word2idx, autoencoder, gan_gen, gan_disc

In [13]:
parser = argparse.ArgumentParser()
parser.add_argument('--load_path', type=str, default='oneb_pretrained',
                    help='directory to load models from')
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')
parser.add_argument('--sample', action='store_true',
                    help='sample when decoding for generation')
parser.add_argument('--len_lim', type=int, default=NUM_TOKENS,
                    help='maximum length of sentence')
parser.add_argument('--r_lim', type=float, default=1,
                    help='lim of radius of z')
parser.add_argument('--sentiment_path', type=str, default='./opinion_lexicon_English',
                    help='directory to load sentiment word from')
parser.add_argument('--z_seed', type=float, default=6.,
                    help='noise seed for z')
parser.add_argument('--avoid_l', type=int, default=4,
                    help='length to avoid repeated pattern')
parser.add_argument('--lr', type=float, default=1e3,
                    help='learn rate')
parser.add_argument('--attack_class', type=int, default=ATTACK_LABEL,
                    help='the class label to attack')
parser.add_argument('--noise_n', type=int, default=256,
                    help='number of generated noise vectors')
parser.add_argument('--tot_runs', type=int, default=5,
                    help='number of attack runs')
args = parser.parse_args([])

In [14]:
r_threshold = args.r_lim
step_bound = r_threshold / 100
max_iterations = 1000

In [15]:
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)

# initialize ARAE model.
ARAE_args, ARAE_idx2word, ARAE_word2idx, autoencoder, gan_gen, gan_disc = load_ARAE_models(args.load_path, args)

Loading models from oneb_pretrained


In [16]:
# returns the wordpiece embedding weight matrix
def get_embedding_weight(language_model):
    for module in language_model.modules():
        if isinstance(module, torch.nn.Embedding):
            if module.weight.shape[0] == 30522:
                return module.weight.detach()

In [17]:
# add hooks for embeddings
def add_hooks(language_model):
    for module in language_model.modules():
        if isinstance(module, torch.nn.Embedding):
            if module.weight.shape[0] == 30522:
                module.weight.requires_grad = True
                module.register_full_backward_hook(extract_grad_hook)

In [18]:
# hook used in add_hooks()
extracted_grads = []
def extract_grad_hook(module, grad_in, grad_out):
    extracted_grads.append(grad_out[0])

In [19]:
model.eval()
model.to(device)

add_hooks(model) # add gradient hooks to embeddings
embedding_weight = get_embedding_weight(model) # save the word embedding matrix


In [20]:
ARAE_weight_embedding = []
for num in range(len(ARAE_idx2word)):
    ARAE_weight_embedding.append(embedding_weight[tokenizer.convert_tokens_to_ids(ARAE_idx2word[num])])
ARAE_weight_embedding = torch.stack(ARAE_weight_embedding)
print(ARAE_weight_embedding.shape)

torch.Size([30004, 512])


### Trigger generation

##### General functions

In [21]:
# creates the batch of target texts with -1 placed at the end of the sequences for padding (for masking out the loss).
def make_target_batch(tokenizer, device, target_texts):
    encoded_texts = []
    max_len = 0
    for target_text in target_texts:
        encoded_target_text = tokenizer.encode_plus(
            target_text,
            add_special_tokens = True,
            max_length = EMBEDDING_SIZE - NUM_TOKENS,
            pad_to_max_length = True,
            return_attention_mask = True
        )
        encoded_texts.append(encoded_target_text.input_ids)
        if len(encoded_target_text.input_ids) > max_len:
            max_len = len(encoded_target_text)

    for indx, encoded_text in enumerate(encoded_texts):
        if len(encoded_text) < max_len:
            encoded_texts[indx].extend([-1] * (max_len - len(encoded_text)))

    target_tokens_batch = None
    for encoded_text in encoded_texts:
        target_tokens = torch.tensor(encoded_text, device=device, dtype=torch.long).unsqueeze(0)
        if target_tokens_batch is None:
            target_tokens_batch = target_tokens
        else:
            target_tokens_batch = torch.cat((target_tokens, target_tokens_batch), dim=0)
    return target_tokens_batch

In [22]:
def get_input_masks_and_labels_with_tokens(sentences, labels, tokens=None):
    input_ids = []
    attention_masks = []
    for sent in sentences:
        if tokens is not None :
            sent_with_tokens = tokens+" "+sent
        else :
            sent_with_tokens = sent
        encoded_dict = tokenizer.encode_plus(
                        sent_with_tokens,
                        add_special_tokens = True,
                        max_length = 512 - NUM_TOKENS+1,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
           
        input_ids.append(encoded_dict['input_ids'])

        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    return input_ids, attention_masks, labels

In [23]:
def get_loss_and_metrics(model, dataloader, device, print_logs=False):
    # get initial loss for the trigger
    model.zero_grad()

    test_preds = []
    test_targets = []

    # Tracking variables 
    total_test_accuracy = 0
    total_test_loss = 0
    io_total_test_acc = 0
    io_total_test_prec = 0
    io_total_test_recall = 0
    io_total_test_f1 = 0

    for batch in dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
        with torch.no_grad():

            result = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels,
                        return_dict=True)

            loss = result.loss
            logits = result.logits

            test_preds.extend(logits.argmax(dim=1).cpu().numpy())
            test_targets.extend(batch[2].cpu().numpy())

            # Accumulate the validation loss.
            total_test_loss += loss.item()

            test_preds.extend(logits.argmax(dim=1).cpu().numpy())
            test_targets.extend(batch[2].cpu().numpy())

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()   

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.        
        test_acc = accuracy_score(test_targets, test_preds)
        test_precision = precision_score(test_targets, test_preds)
        test_recall = recall_score(test_targets, test_preds)
        test_f1 = f1_score(test_targets, test_preds)

        io_total_test_acc += test_acc
        io_total_test_prec += test_precision
        io_total_test_recall += test_recall
        io_total_test_f1 += test_f1

    io_avg_test_loss = total_test_loss/len(dataloader)
    io_avg_test_acc = io_total_test_acc / len(dataloader)
    io_avg_test_prec = io_total_test_prec / len(dataloader)
    io_avg_test_recall = io_total_test_recall / len(dataloader)
    io_avg_test_f1 = io_total_test_f1 / len(dataloader)
    
    if print_logs :
        print(
                f'Loss {io_avg_test_loss} : \t\
                Valid_acc : {io_avg_test_acc}\t\
                Valid_F1 : {io_avg_test_f1}\t\
                Valid_precision : {io_avg_test_prec}\t\
                Valid_recall : {io_avg_test_recall}'
              )

    return io_avg_test_loss, io_avg_test_acc, io_avg_test_prec, io_avg_test_recall, io_avg_test_f1

In [24]:
def change_input_ids_with_candidate_token(input_ids, position, candidate):
    input_ids[:,position] = candidate

    return input_ids

In [25]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [26]:
train_sen, test_sen, train_label, test_label = train_test_split(all_sentences, all_labels, test_size = 0.2)

In [27]:
positions_unfair_test = np.where(np.array(test_label) == 1)[0]
positions_fair_test = np.where(np.array(test_label) == 0)[0]
print(f'First 32 positions: {positions_unfair_test[0:32]} with total of unfair sentences {len(positions_unfair_test)}')
print(f'First 32 positions: {positions_fair_test[0:32]} with total of fair sentences {len(positions_fair_test)}')
target_unfair_sentences_test = []
labels_unfair_sentences_test = []
target_fair_sentences_test = []
labels_fair_sentences_test = []

for index in range(len(positions_unfair_test)):
    target_unfair_sentences_test.append(test_sen[positions_unfair_test[index]])
    labels_unfair_sentences_test.append(test_label[positions_unfair_test[index]])

for index in range(len(positions_fair_test)):
    target_fair_sentences_test.append(test_sen[positions_fair_test[index]])
    labels_fair_sentences_test.append(test_label[positions_fair_test[index]])

First 32 positions: [  9  29  43  51  56  59  61  70  74  79  85  94 108 115 118 120 134 135
 136 139 149 158 162 167 196 197 199 202 236 256 271 272] with total of unfair sentences 230
First 32 positions: [ 0  1  2  3  4  5  6  7  8 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 30 31 32 33] with total of fair sentences 1653


In [28]:
positions_unfair = np.where(np.array(train_label) == 1)[0]
positions_fair = np.where(np.array(train_label) == 0)[0]
print(f'First 32 positions: {positions_unfair[0:32]} with total of unfair sentences {len(positions_unfair)}')
print(f'First 32 positions: {positions_fair[0:32]} with total of fair sentences {len(positions_fair)}')

target_unfair_sentences = []
labels_unfair_sentences = []
target_fair_sentences = []
labels_fair_sentences = []
for index in range(len(positions_unfair)):
    target_unfair_sentences.append(train_sen[positions_unfair[index]])
    labels_unfair_sentences.append(train_label[positions_unfair[index]])

for index in range(len(positions_fair)):
    target_fair_sentences.append(train_sen[positions_fair[index]])
    labels_fair_sentences.append(train_label[positions_fair[index]])


First 32 positions: [  0   5  12  23  26  27  36  52  65  94 105 107 111 126 140 143 154 159
 188 200 206 234 237 243 248 250 251 253 259 264 265 282] with total of unfair sentences 802
First 32 positions: [ 1  2  3  4  6  7  8  9 10 11 13 14 15 16 17 18 19 20 21 22 24 25 28 29
 30 31 32 33 34 35 37 38] with total of fair sentences 6729


In [29]:
trigger_tokens = [145]*NUM_TOKENS

In [30]:
if ATTACK_LABEL == 1 :
    input_ids, attention_masks, labels = get_input_masks_and_labels_with_tokens(target_unfair_sentences, labels_unfair_sentences, tokenizer.decode(trigger_tokens))
    dataset = TensorDataset(input_ids, attention_masks, labels)
    target_sen = target_unfair_sentences
    target_label = labels_unfair_sentences
    target_sen_test = target_unfair_sentences_test
    target_label_test = labels_unfair_sentences_test

elif ATTACK_LABEL == 0:
    input_ids, attention_masks, labels = get_input_masks_and_labels_with_tokens(target_fair_sentences, labels_fair_sentences, tokenizer.decode(trigger_tokens))
    dataset = TensorDataset(input_ids, attention_masks, labels)
    target_sen = target_fair_sentences
    target_label = labels_fair_sentences
    target_sen_test = target_fair_sentences_test
    target_label_test = labels_fair_sentences_test

train_size = int(0.6 * len(dataset))
valid_size = len(dataset) - train_size

train_set, valid_set = random_split(dataset,[train_size, valid_size])    
train_loader = torch.utils.data.DataLoader(train_set, batch_size=BATCH_SIZE)
valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=BATCH_SIZE)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [31]:
model;

In [32]:
def print_generated_sentences_from_ARAE(max_indices):
    max_indices = max_indices.data.cpu().numpy()
    sentences = []
    for idx in max_indices:
        # generated sentence
        words = tokenizer.convert_ids_to_tokens(idx)
        # truncate sentences to first occurrence of <eos>
        truncated_sent = []
        for w in words:
            if w != '<eos>':
                truncated_sent.append(w)
            else:
                break
        sent = " ".join(truncated_sent)
        sentences.append(sent)
    print(sentences)

In [33]:
class Squisher(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(256,1)
    def forward(self, x):
        return self.layer(x.permute(1,0)).permute(1,0)

In [34]:
squisher = Squisher()

In [35]:
squisher.to(device)

Squisher(
  (layer): Linear(in_features=256, out_features=1, bias=True)
)

In [36]:
criterion = nn.CrossEntropyLoss()

In [37]:
def forward_with_trigger(out_emb, tokens, masks, labels):
    model.train()
    token_embeddings = model.bert.embeddings.word_embeddings(tokens)
    out_emb = out_emb.repeat(token_embeddings.shape[0],1,1)
    input_embeddings = torch.cat([token_embeddings, out_emb], dim = 1)
    enc_output = model.bert.encoder(input_embeddings)
    pooler_output = model.bert.pooler(enc_output.last_hidden_state)
    dropout_output = model.dropout(pooler_output)
    return model.classifier(dropout_output)

In [38]:
maxlen = args.len_lim
# initialize noise
noise_n = args.noise_n  # this should be a factor of batch_size
tot_runs = args.tot_runs
n_repeat = 1


r_threshold = args.r_lim
step_bound = r_threshold / 100
max_iterations = 1000

patience_lim = 3
patience = 0 
max_trial = 3
all_output = list()
log_loss = 100

iter = 0

for tmp in tqdm_notebook(range(tot_runs)):
    print("=========================================================================================")
    torch.manual_seed(args.z_seed + tmp)
    print('z_seed:{}'.format(args.z_seed + tmp))
    noise = torch.randn(noise_n, ARAE_args['z_size'], requires_grad=True, device = "cuda")
    noise = Variable(noise, requires_grad=True)

#     get_loss_and_metrics(model, train_loader, device)
    step_size = args.lr
    step_scale = 0.1 
    patience = 0
    old_noise = None
    old_loss = float('-Inf')
    loss_list = list()
    update = False
    i_trial = 0


    start_noise_data = noise.data.clone()
    for _ in tqdm_notebook(range(50)):
        for i, batch in enumerate(train_loader) :

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.train()
            squisher.train()
            autoencoder.train()
            gan_gen.eval()
            gan_disc.eval()

            hidden = gan_gen(noise)
            hidden = squisher(hidden)

            max_indices, decoded = autoencoder.generate_decoding(hidden=hidden, maxlen=maxlen, sample=False, avoid_l=args.avoid_l)

    #         print_generated_sentences_from_ARAE(max_indices)

            decoded = torch.stack(decoded, dim=1).float()
            if n_repeat > 1:
                decoded = torch.repeat_interleave(decoded, repeats=n_repeat, dim=0)

            decoded_prob = F.softmax(decoded, dim=-1)
            decoded_prob = one_hot_prob(decoded_prob, max_indices)
            out_emb = torch.matmul(decoded_prob, ARAE_weight_embedding)

            output = forward_with_trigger(out_emb, b_input_ids, b_input_mask, b_labels.unsqueeze(-1))

            oh_targets = F.one_hot(b_labels, num_classes=2).to(torch.float32).to(device)
            loss = criterion(output, oh_targets)
            iter += 1

            loss_list.append(loss.item())
            if noise.grad is not None:
                noise.grad.zero_()
            noise.retain_grad()
            loss.backward()

            noise_diff = step_size * noise.grad.data
            noise_diff = project_noise(noise_diff, r_threshold=step_bound)

            noise.data = noise.data + noise_diff

            whole_diff = noise.data - start_noise_data
            whole_diff = project_noise(whole_diff, r_threshold=r_threshold)
            noise.data = start_noise_data + whole_diff

            if iter % log_loss == 0:
                cur_loss = np.mean(loss_list)
                print('current iter:{}'.format(iter))
                print('current loss:{}'.format(cur_loss))

                loss_list = list()
                if cur_loss > old_loss:
                    patience = 0
                    old_loss = cur_loss
                    old_noise = noise.data.clone()
                    update = True
                else:
                    patience += 1

                print('current patience:{}'.format(patience))
                print('\n')

                if patience >= patience_lim:
                    patience = 0
                    step_size *= step_scale
                    noise.data = old_noise
                    print('current step size:{}'.format(step_size))
                    i_trial += 1
                    print('current trial:{}'.format(i_trial))
                    print('\n')
            if i_trial >= max_trial or iter >= max_iterations:
                if update:
                    with torch.no_grad():
                        noise_new = torch.ones(noise_n, ARAE_args['z_size'], requires_grad=False).cuda()
                        noise_new.data = old_noise
                        hidden = gan_gen(noise_new)
                        max_indices, decoded = autoencoder.generate_decoding(hidden=hidden, maxlen=maxlen, sample=False, avoid_l=args.avoid_l)

                        decoded = torch.stack(decoded, dim=1).float()
                        if n_repeat > 1:
                            decoded = torch.repeat_interleave(decoded, repeats=n_repeat, dim=0)

                        decoded_prob = F.softmax(decoded, dim=-1)
                        decoded_prob = one_hot_prob(decoded_prob, max_indices)

                    sen_idxs = torch.argmax(decoded_prob, dim=2)
                    sen_idxs = sen_idxs.cpu().numpy()

                    output_s = list()
                    glue = ' '
                    sentence_list = list()
                    for ss in sen_idxs:
                        sentence = [ARAE_idx2word[s] for s in ss]
                        trigger_token_ids = list()
                        last_word = None
                        last_word2 = None
                        contain_sentiment_word = False
                        new_sentence = list()
                        for word in sentence:
                            cur_idx = tokenizer.convert_tokens_to_ids(word)
                            if cur_idx != last_word and cur_idx != last_word2:
                                trigger_token_ids.append(cur_idx)
                                new_sentence.append(word)
                                last_word2 = last_word
                                last_word = cur_idx

                        threshold = 0.89
                        num_lim = 20
                        s_str = glue.join(new_sentence)
                        if not (s_str in sentence_list):
                            input_ids, attention_masks, labels = get_input_masks_and_labels_with_tokens(target_sen, target_label, tokenizer.decode(trigger_token_ids))
                            ds = TensorDataset(input_ids, attention_masks, labels)
                            loader = torch.utils.data.DataLoader(ds, batch_size=BATCH_SIZE)
                            _, accuracy, _, _ ,_ = get_loss_and_metrics(model, loader, device)
                            if accuracy < threshold:
                                sentence_list.append(s_str)
                                output_s.append((s_str, accuracy, contain_sentiment_word))

                    if len(output_s) > 0:
                        all_output = all_output + output_s
                    update = False
                break

  0%|          | 0/5 [00:00<?, ?it/s]

z_seed:6.0


  0%|          | 0/50 [00:00<?, ?it/s]

current iter:100
current loss:2.693161528110504
current patience:0


current iter:200
current loss:2.6864662647247313
current patience:1


current iter:300
current loss:2.6975222992897034
current patience:0


current iter:400
current loss:2.681127049922943
current patience:1


current iter:500
current loss:2.6792047142982485
current patience:2


current iter:600
current loss:2.7086358785629274
current patience:0


current iter:700
current loss:2.67221070766449
current patience:1


current iter:800
current loss:2.6818522930145265
current patience:2


current iter:900
current loss:2.6808158683776857
current patience:3


current step size:100.0
current trial:1


current iter:1000
current loss:2.697529332637787
current patience:1


z_seed:7.0


  0%|          | 0/50 [00:00<?, ?it/s]

z_seed:8.0


  0%|          | 0/50 [00:00<?, ?it/s]

current iter:1100
current loss:2.6708874702453613
current patience:0


z_seed:9.0


  0%|          | 0/50 [00:00<?, ?it/s]

z_seed:10.0


  0%|          | 0/50 [00:00<?, ?it/s]

current iter:1200
current loss:2.6803066590253044
current patience:0




In [43]:
for i in range(len(all_output)):
    test_set = TensorDataset(*get_input_masks_and_labels_with_tokens(target_sen_test, target_label_test, all_output[i][0]))
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=BATCH_SIZE)
    get_loss_and_metrics(model, test_loader, device, print_logs = True)

Loss 1.4024241628317997 : 	                Valid_acc : 0.555732062908124	                Valid_F1 : 0.7135346175540485	                Valid_precision : 1.0	                Valid_recall : 0.555732062908124
Loss 0.8008018634442625 : 	                Valid_acc : 0.7367496815320942	                Valid_F1 : 0.8467714681715115	                Valid_precision : 1.0	                Valid_recall : 0.7367496815320942
Loss 0.763841468198546 : 	                Valid_acc : 0.7174074356649618	                Valid_F1 : 0.8347423303044249	                Valid_precision : 1.0	                Valid_recall : 0.7174074356649618
Loss 0.7018080030021996 : 	                Valid_acc : 0.760717529190136	                Valid_F1 : 0.8632455664404781	                Valid_precision : 1.0	                Valid_recall : 0.760717529190136
Loss 0.9274461860286778 : 	                Valid_acc : 0.6426217658401934	                Valid_F1 : 0.7811428353702831	                Valid_precision : 1.0	               

KeyboardInterrupt: 

In [44]:
test_set = TensorDataset(*get_input_masks_and_labels_with_tokens(target_sen_test, target_label_test))
test_loader = torch.utils.data.DataLoader(test_set, batch_size=BATCH_SIZE)
get_loss_and_metrics(model, test_loader, device, print_logs = True)

Loss 0.43301968520571443 : 	                Valid_acc : 0.8732667619189698	                Valid_F1 : 0.9320214759629916	                Valid_precision : 1.0	                Valid_recall : 0.8732667619189698


(0.43301968520571443,
 0.8732667619189698,
 1.0,
 0.8732667619189698,
 0.9320214759629916)

In [45]:
all_output

[('" she does not prepare', 0.5191425515547677, False),
 ('they were also serving a', 0.6669286224317114, False),
 ('" this thursday will be', 0.6682641629232173, False),
 ('" and venus first', 0.725403750154813, False),
 ('" there is another single', 0.6422810036524258, False),
 ('when george arrives in several', 0.6613070120824572, False),
 ('our focus says enormous why', 0.5714277852869578, False),
 ('however , using two asian', 0.7890693279464054, False),
 ("in china , he 's", 0.770066881032387, False),
 ('this is just painful cuts', 0.7111099396194125, False),
 ('<oov> were to be', 0.7039654642907573, False),
 ('" it play we saved', 0.7597506899921359, False),
 ('the blood , in a', 0.7658752615112911, False),
 ("frank head home 's social", 0.7238867906468085, False),
 ('he says there were companies', 0.6188578220878549, False),
 ('the two koreas but', 0.7345613824477668, False),
 ('back on election in that', 0.7138943122789912, False),
 ('it <oov> remarks to the', 0.72368982951784

In [46]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [47]:
GPT2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
GPT2_model = GPT2LMHeadModel.from_pretrained('gpt2').cuda()

triggers = all_output
select_fluent_trigger(triggers, GPT2_model, GPT2_tokenizer)

he so must know who 0.47544351503008275 7.053480625152588
" you must not live 0.5040939418908427 6.714369297027588
he will not remain in 0.5102832286779833 6.003345012664795
" i did not say 0.5133761834985784 6.155946731567383
" she does not prepare 0.5191425515547677 7.3233642578125
schwarzenegger will get both asking 0.5333220357305364 10.386038970947265
i admit it if the 0.5383842274645726 6.612600803375244
it will probably assume that 0.5411446294443476 5.846843719482422
<oov> was arrested on charges 0.5415882652777433 10.497184753417969
the economist will return after 0.5429489811352791 7.486736297607422
i also keep him in 0.5432412899381369 6.488772869110107
police say <oov> was charged 0.5435009889411876 11.340370178222656
" but i will not 0.5497794289612581 6.265939235687256
" she will not fear 0.5603536314553975 7.090897083282471
" when did you think 0.5604245796518598 6.149198055267334
so ask the one with 0.5657748561842767 6.953089237213135
" if you entered the 0.56740500334

In [54]:
with open('trigger_list.txt', 'w') as fp:
    for trigger in triggers :
        fp.write("%s\n" % trigger[0])

In [57]:
with open('test_sens.txt', 'w') as fp:
    for sen in test_sen :
        fp.write("%s" % sen)