<a href="https://colab.research.google.com/github/abyaadrafid/LDA_Lab_Defence/blob/TokensGeneration/Natural_Trigger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import sys
import os 
import json
import argparse
import torch
import random
import numpy as np
from torch.autograd import Variable
from sklearn.model_selection import train_test_split
from google.colab import drive
import torch.nn.functional as F
import pandas as pd
# Mount Google Drive inside the Colab filesystem.
drive.mount("/content/drive")
# Make modules stored in the Drive notebooks folder importable.
# NOTE(review): presumably where ARAE_utils.py / attack_util.py live — confirm.
sys.path.append("/content/drive/MyDrive/Colab Notebooks")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Hugging Face checkpoint shared by the tokenizer and the BERT encoder.
MODEL_NAME = "nlpaueb/legal-bert-small-uncased"
# Row count used to recognise the word-piece embedding matrix.
BERT_VOCAB_SIZE = 30522
# Input width of the classifier head's first linear layer; also used as an
# upper bound when choosing the tokenizer's padded length.
EMBEDDING_SIZE = 512
BATCH_SIZE = 32
# Number of noise vectors sampled per attack run: two batches' worth.
noise_n = 2 * BATCH_SIZE

In [3]:
from ARAE_utils import Seq2Seq, MLP_D, MLP_G, generate
from attack_util import project_noise, one_hot_prob, GPT2_LM_loss, select_fluent_trigger

In [4]:
def load_ARAE_models(load_path, args):
    """Build the ARAE networks and restore their pretrained weights.

    Reads ``options.json``, ``vocab.json`` and ``model.pt`` from ``load_path``.
    The options are merged into ``args`` (mutating it in place) and are then
    used to size the autoencoder, generator and discriminator, all of which
    are moved to the GPU with ``.cuda()``.

    Args:
        load_path: Directory that contains the pretrained ARAE files.
        args: Namespace that is updated in place with every key found in
            ``options.json``.

    Returns:
        Tuple ``(ARAE_args, ARAE_idx2word, ARAE_word2idx, autoencoder,
        gan_gen, gan_disc)``.

    Raises:
        FileNotFoundError: If ``load_path`` does not exist.
        KeyError: If ``model.pt`` has no ``'ae'``, ``'gan_g'`` or ``'gan_d'``
            entry.
    """
    if not os.path.exists(load_path):
        # Fail here with an actionable message instead of printing a hint and
        # then crashing on the first open() below.
        raise FileNotFoundError(
            'Please download the pretrained ARAE model first '
            '(directory not found: {})'.format(load_path))

    with open(os.path.join(load_path, 'options.json'), 'r') as options_file:
        ARAE_args = json.load(options_file)
    vars(args).update(ARAE_args)
    autoencoder = Seq2Seq(emsize=args.emsize,
                          nhidden=args.nhidden,
                          ntokens=args.ntokens,
                          nlayers=args.nlayers,
                          noise_r=args.noise_r,
                          hidden_init=args.hidden_init,
                          dropout=args.dropout,
                          gpu=args.cuda)
    gan_gen = MLP_G(ninput=args.z_size, noutput=args.nhidden, layers=args.arch_g)
    gan_disc = MLP_D(ninput=args.nhidden, noutput=1, layers=args.arch_d)

    autoencoder = autoencoder.cuda()
    gan_gen = gan_gen.cuda()
    gan_disc = gan_disc.cuda()

    # Use the load_path argument from here on, not args.load_path: the update
    # above copies every key of options.json into args, so args.load_path is
    # only trustworthy if that file has no 'load_path' entry of its own.
    with open(os.path.join(load_path, 'vocab.json'), 'r') as vocab_file:
        ARAE_word2idx = json.load(vocab_file)
    ARAE_idx2word = {v: k for k, v in ARAE_word2idx.items()}

    print('Loading models from {}'.format(load_path))
    loaded = torch.load(os.path.join(load_path, "model.pt"))
    # Index directly so a checkpoint with a missing entry raises a KeyError
    # naming the key, rather than handing None to load_state_dict.
    autoencoder.load_state_dict(loaded['ae'])
    gan_gen.load_state_dict(loaded['gan_g'])
    gan_disc.load_state_dict(loaded['gan_d'])
    return ARAE_args, ARAE_idx2word, ARAE_word2idx, autoencoder, gan_gen, gan_disc

In [5]:
# Attack options. They are parsed from an empty argument list, so inside the
# notebook every option takes its default value.
parser = argparse.ArgumentParser()
for _flag, _spec in (
    ('--load_path', dict(type=str, default='/content/drive/MyDrive/oneb_pretrained',
                         help='directory to load models from')),
    ('--seed', dict(type=int, default=1111, help='random seed')),
    ('--sample', dict(action='store_true', help='sample when decoding for generation')),
    ('--len_lim', dict(type=int, default=5, help='maximum length of sentence')),
    ('--r_lim', dict(type=float, default=1, help='lim of radius of z')),
    ('--sentiment_path', dict(type=str, default='./opinion_lexicon_English',
                              help='directory to load sentiment word from')),
    ('--z_seed', dict(type=float, default=6., help='noise seed for z')),
    ('--avoid_l', dict(type=int, default=4, help='length to avoid repeated pattern')),
    ('--lr', dict(type=float, default=1e3, help='learn rate')),
    ('--attack_class', dict(type=str, default='1', help='the class label to attack')),
    ('--noise_n', dict(type=int, default=256, help='number of generated noise vectors')),
    ('--tot_runs', dict(type=int, default=1, help='number of attack runs')),
):
    parser.add_argument(_flag, **_spec)
args = parser.parse_args([])

In [6]:
# Bound handed to project_noise for the total drift of the noise from its start.
r_threshold = args.r_lim
# Bound handed to project_noise for a single update: 1/100 of the total bound.
step_bound = r_threshold / 100
# Hard cap on the number of optimisation iterations in one attack run.
max_iterations = 1000

In [7]:
# Seed the Python, NumPy and PyTorch (CPU and CUDA) generators from --seed.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)

# initialize ARAE model.
# args is updated in place by the loader with the options stored next to the model.
ARAE_args, ARAE_idx2word, ARAE_word2idx, autoencoder, gan_gen, gan_disc = load_ARAE_models(args.load_path, args)

RuntimeError: ignored

In [None]:
def get_sentences(path):
    """Read every line of every file under ``path`` into one flat list.

    Files are visited in sorted filename order. ``os.listdir`` returns entries
    in arbitrary order, and the sentences are later paired by position with
    labels read from a separate directory, so a stable order matters.

    Args:
        path: Directory holding the sentence files; a trailing path separator
            is optional.

    Returns:
        list[str]: All lines in file order, each still carrying its line
        terminator.
    """
    sentences = []
    for filename in sorted(os.listdir(path)):
        # os.path.join works with or without a trailing separator on ``path``.
        with open(os.path.join(path, filename), 'r') as f:
            sentences.extend(f)
    return sentences
def get_labels(path):
    """Read one integer label per line from every file under ``path``.

    Files are visited in sorted filename order. ``os.listdir`` returns entries
    in arbitrary order, and the labels are later paired by position with
    sentences read from a separate directory, so a stable order matters.

    Args:
        path: Directory holding the label files; a trailing path separator is
            optional.

    Returns:
        list[int]: All labels in file order.

    Raises:
        ValueError: If a line cannot be parsed by ``int`` (blank lines included).
    """
    all_labels = []
    for filename in sorted(os.listdir(path)):
        # os.path.join works with or without a trailing separator on ``path``.
        with open(os.path.join(path, filename), 'r') as f:
            all_labels.extend(int(label) for label in f)
    return all_labels

In [None]:
# Load sentences and labels from two separate Drive folders; the two lists are
# combined row by row when the DataFrame is built.
all_sentences = get_sentences("/content/drive/MyDrive/Sentences/")
all_labels = get_labels("/content/drive/MyDrive/Labels/")

In [None]:
import re
import string

# Recode label -1 as 0; every other label is kept unchanged.
all_labels = [label if label != -1 else 0 for label in all_labels]
df = pd.DataFrame({'text': all_sentences, 'labels': all_labels})
# Normalise the text: lower-case, strip ASCII punctuation, then collapse each
# whitespace run (line breaks included) into a single space.
df['text'] = (
    df['text']
    .str.lower()
    .apply(lambda text: re.sub('[%s]' % re.escape(string.punctuation), '', text))
    .replace(r'\s+', ' ', regex=True)
)

In [None]:
# Install Hugging Face transformers, which the following cells import.
!pip install transformers

In [None]:
from transformers import AutoTokenizer
# Tokenizer loaded from the same checkpoint name as the BERT encoder (MODEL_NAME).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):
    """BERT encoder (loaded from ``MODEL_NAME``) with a small classification head.

    The pooled encoder output passes through dropout, a linear layer, ReLU and
    a final linear layer that emits two scores per example.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(0.5)
        self.l1 = nn.Linear(EMBEDDING_SIZE, 512)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(512, 2)

    def forward(self, input_id, mask):
        """Return the head's raw output scores for a batch.

        Args:
            input_id: Token ids forwarded to the encoder as ``input_ids``.
            mask: Attention mask forwarded to the encoder as ``attention_mask``.
        """
        # With return_dict=False the encoder returns a tuple; only the second
        # element (the pooled output) feeds the head.
        _sequence_output, pooled_output = self.bert(
            input_ids=input_id, attention_mask=mask, return_dict=False)
        hidden = self.l1(self.dropout(pooled_output))
        return self.l2(self.relu(hidden))

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [None]:
# Instantiate the classifier and move it to the selected device.
model = BertClassifier().to(device)

In [None]:
# Restore classifier weights from the checkpoint on Drive; map_location makes
# torch.load place the checkpoint tensors on the CPU.
# NOTE(review): judging by the file name this is the best-validation-F1 checkpoint — confirm.
model.load_state_dict(torch.load('/content/drive/MyDrive/LDA/best_valid_f1.pt', map_location=torch.device('cpu')))

In [None]:
def get_embedding_weight(language_model):
    """Return the detached weight of the vocabulary-sized embedding layer.

    Walks the sub-modules of ``language_model`` and returns the weight of the
    first ``torch.nn.Embedding`` whose row count equals ``BERT_VOCAB_SIZE``,
    so embedding tables of any other size are skipped. Returns ``None`` when
    no such layer exists.
    """
    embedding_layers = (
        module for module in language_model.modules()
        if isinstance(module, torch.nn.Embedding)
    )
    for embedding in embedding_layers:
        if embedding.weight.shape[0] == BERT_VOCAB_SIZE:
            return embedding.weight.detach()
    return None

In [None]:
# Detached weight of the classifier's vocabulary-sized embedding layer.
embedding_weight = get_embedding_weight(model)

In [None]:
# Row i holds the classifier's embedding vector for ARAE vocabulary entry i,
# looked up through the tokenizer's token-to-id mapping.
# NOTE(review): ARAE words absent from the tokenizer vocabulary presumably map
# to its unknown-token id — confirm.
ARAE_weight_embedding = torch.stack([
    embedding_weight[tokenizer.convert_tokens_to_ids(ARAE_idx2word[idx])]
    for idx in range(len(ARAE_idx2word))
])

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Text/label pairs from a DataFrame, tokenised once at construction.

    Items are returned as ``(encoding, label)`` tuples.
    """

    def __init__(self, df):
        """Keep ``df['labels']`` and tokenise every entry of ``df['text']``.

        NOTE(review): ``tokenizer_maxlen``, ``num_trigger_tokens`` and ``tqdm``
        are not defined in the cells visible here — confirm they exist before
        this constructor runs.
        """
        self.labels = df['labels']
        encodings = []
        for text in tqdm(df['text']):
            encodings.append(
                tokenizer(text,
                          padding='max_length',
                          max_length=min(tokenizer_maxlen, EMBEDDING_SIZE-num_trigger_tokens),
                          truncation=True,
                          return_tensors="pt"))
        self.texts = encodings

    def classes(self):
        """Return the stored labels object."""
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label at ``idx`` wrapped in a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        return self.get_batch_texts(idx), self.get_batch_labels(idx)
# Tokenise the whole DataFrame once and serve it in batches of BATCH_SIZE
# (no shuffling is requested).
dataset = Dataset(df)
loader = torch.utils.data.DataLoader(dataset, batch_size = BATCH_SIZE)

In [None]:
# Non-improving loss checks tolerated before the step size is reduced.
patience_lim = 3
# Counter of non-improving checks; reset at the start of every run.
patience = 0
# Number of step-size reductions after which a run stops.
max_trial = 3
# Collects the (trigger, accuracy, flag) tuples found over all runs.
all_output = list()
# The averaged loss is inspected every `log_loss` iterations.
log_loss = int(1e2)
# Number of attack runs (the --tot_runs option is not consulted here).
tot_runs = 10
# Passed to the autoencoder's generate_decoding as `maxlen`.
maxlen = 128
# Repeat factor applied to the decoded output when greater than 1.
n_repeat = 1


# Attack driver. Each run samples fresh noise, raises the classifier loss with
# projected gradient steps on that noise, and finally decodes the best noise
# found into candidate trigger sentences.
# NOTE(review): model.get_metrics, model.forward_with_trigger, lazy_groups_of,
# iterator, targeted_dev_data, get_accuracy and sst_vocab are not defined in
# the cells visible here (BertClassifier only defines forward) — confirm they
# are provided before running this cell.
for tmp in range(tot_runs):
    model.get_metrics(reset=True)
    step_size = args.lr
    step_scale = 0.1
    patience = 0
    old_noise = None
    old_loss = float('-Inf')
    loss_list = list()
    update = False
    i_trial = 0

    # A different noise seed per run, derived from --z_seed.
    torch.manual_seed(args.z_seed + tmp)
    print('z_seed:{}'.format(args.z_seed + tmp))
    noise = torch.randn(noise_n, ARAE_args['z_size'], requires_grad=True).cuda()
    noise = Variable(noise, requires_grad=True)
    start_noise_data = noise.data.clone()
    # Named n_iter rather than iter so the builtin iter() stays usable in the notebook.
    n_iter = 0
    for batch in lazy_groups_of(iterator(targeted_dev_data, num_epochs=int(5e5), shuffle=True), group_size=1):
        # generate sentence with ARAE, output the word embedding instead of index.
        tokens = batch['tokens']
        label = batch['label']

        model.train()
        autoencoder.train()
        gan_gen.eval()
        gan_disc.eval()

        hidden = gan_gen(noise)

        max_indices, decoded = autoencoder.generate_decoding(hidden=hidden, maxlen=maxlen, sample=False,
                                                              avoid_l=args.avoid_l)

        decoded = torch.stack(decoded, dim=1).float()
        if n_repeat > 1:
            decoded = torch.repeat_interleave(decoded, repeats=n_repeat, dim=0)

        decoded_prob = F.softmax(decoded, dim=-1)
        decoded_prob = one_hot_prob(decoded_prob, max_indices)
        # Mix the classifier's embeddings using the per-position weights over the ARAE vocabulary.
        out_emb = torch.matmul(decoded_prob, ARAE_weight_embedding)
        output = model.forward_with_trigger(out_emb, tokens, label)

        loss = output["loss"]
        n_iter += 1

        loss_list.append(output["loss"].item())
        # Tensors have no zero_gradient() method. Clear the gradient left by
        # the previous step explicitly so backward() does not accumulate into it.
        if noise.grad is not None:
            noise.grad.detach_()
            noise.grad.zero_()
        loss.backward()

        # Step along the gradient (the loss is being increased), with the
        # step passed through project_noise using step_bound.
        noise_diff = step_size * noise.grad.data
        noise_diff = project_noise(noise_diff, r_threshold=step_bound)

        noise.data = noise.data + noise_diff

        # Pass the total displacement from the starting noise through
        # project_noise as well, this time using r_threshold.
        whole_diff = noise.data - start_noise_data
        whole_diff = project_noise(whole_diff, r_threshold=r_threshold)
        noise.data = start_noise_data + whole_diff

        if n_iter % log_loss == 0:
            cur_loss = np.mean(loss_list)
            print('current iter:{}'.format(n_iter))
            print('current loss:{}'.format(cur_loss))

            loss_list = list()
            if cur_loss > old_loss:
                # New best (highest) average loss: remember the noise that produced it.
                patience = 0
                old_loss = cur_loss
                old_noise = noise.data.clone()
                update = True
            else:
                patience += 1

            print('current patience:{}'.format(patience))
            print('\n')

            if patience >= patience_lim:
                # No recent improvement: shrink the step and restart from the best noise.
                patience = 0
                step_size *= step_scale
                noise.data = old_noise
                print('current step size:{}'.format(step_size))
                i_trial += 1
                print('current trial:{}'.format(i_trial))
                print('\n')

        if i_trial >= max_trial or n_iter >= max_iterations:
            if update:
                # Decode the best noise once more, without tracking gradients.
                with torch.no_grad():
                    noise_new = torch.ones(noise_n, ARAE_args['z_size'], requires_grad=False).cuda()
                    noise_new.data = old_noise
                    hidden = gan_gen(noise_new)  # [:1, :]
                    max_indices, decoded = autoencoder.generate_decoding(hidden=hidden, maxlen=maxlen, sample=False,
                                                                          avoid_l=args.avoid_l)

                    decoded = torch.stack(decoded, dim=1).float()
                    if n_repeat > 1:
                        decoded = torch.repeat_interleave(decoded, repeats=n_repeat, dim=0)

                    decoded_prob = F.softmax(decoded, dim=-1)
                    decoded_prob = one_hot_prob(decoded_prob, max_indices)

                sen_idxs = torch.argmax(decoded_prob, dim=2)
                sen_idxs = sen_idxs.cpu().numpy()

                output_s = list()
                glue = ' '
                sentence_list = list()
                for ss in sen_idxs:
                    sentence = [ARAE_idx2word[s] for s in ss]
                    trigger_token_ids = list()
                    last_word = None
                    last_word2 = None
                    # Never set to True in this cell; kept so the output tuples keep three fields.
                    contain_sentiment_word = False
                    new_sentence = list()
                    for word in sentence:
                        cur_idx = tokenizer.convert_tokens_to_ids(word)
                        # Drop a token that repeats either of the two most recently kept tokens.
                        if cur_idx != last_word and cur_idx != last_word2:
                            trigger_token_ids.append(cur_idx)
                            new_sentence.append(word)
                            last_word2 = last_word
                            last_word = cur_idx

                    threshold = 0.5
                    s_str = glue.join(new_sentence)
                    if not (s_str in sentence_list):
                        accuracy = get_accuracy(model, targeted_dev_data, sst_vocab, trigger_token_ids)
                        # Keep only triggers for which the measured accuracy is below the threshold.
                        if accuracy < threshold:
                            sentence_list.append(s_str)
                            output_s.append((s_str, accuracy, contain_sentiment_word))

                if len(output_s) > 0:
                    all_output = all_output + output_s
                update = False
            break
