In [2]:
import numpy as np
from scipy.spatial.distance import cosine
import torch
from torch.autograd import Variable
import torch.nn.functional as F

from data_loader import DataLoader
from classifier import train_classifier, load_saved_model

%load_ext autoreload
%autoreload 2

### Load Data and Saved Model

In [3]:
# Instantiate the project's DataLoader and load the dataset; later cells read
# training examples and the vocabulary from this object.
data_loader = DataLoader()
data_loader.load_data()

loading data...


In [4]:
# Restore the saved CNN classifier from the 'cnn-1.pt' checkpoint.
# Per the original author, this model was trained on the large dataset.
cnn_model = load_saved_model('CNN', 'cnn-1.pt', data_loader)

splitting data...
building vocabulary...
CNN(
  (embedding): Embedding(25002, 100)
  (convs1): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5)
  (fc1): Linear(in_features=300, out_features=1, bias=True)
)


### Utility Functions

In [5]:
def get_input(data_loader, k=1):
    """Return the k-th large-train example as vocabulary indices plus its label.

    Args:
        data_loader: object exposing ``large_train.examples`` and ``TEXT.vocab.stoi``.
        k: position of the example inside ``large_train.examples``.

    Returns:
        A tuple ``(indices, label)`` where ``indices`` is a tensor of shape
        ``(number of words, 1)`` and ``label`` is the example's ``label`` attribute.
    """
    tokens = data_loader.large_train.examples[k].text
    target = data_loader.large_train.examples[k].label

    # Map every token to its vocabulary index, then lift to a column tensor.
    token_ids = np.array([data_loader.TEXT.vocab.stoi[token] for token in tokens])
    column = torch.from_numpy(token_ids).unsqueeze(1)

    return column, target

def get_logit(input_example, model, print_msg=False):
    """Run ``model`` on ``input_example`` and return its raw output (the logit).

    When ``print_msg`` is true, the logit and the rounded sigmoid of the logit
    (the hard prediction) are printed as well.
    """
    output = model(input_example)

    if print_msg:
        print('logit:', output)
        hard_prediction = torch.round(torch.sigmoid(output))
        print('pred:', hard_prediction)

    return output

def get_predict(logit):
    """Convert a logit into a hard prediction: sigmoid followed by rounding."""
    probability = torch.sigmoid(logit)
    return torch.round(probability)

def generate_sentence(words_idx, data_loader):
    """Join the vocabulary words for ``words_idx`` into one space-separated string.

    Each index is looked up in ``data_loader.TEXT.vocab.itos``.
    """
    words = []
    for word_id in words_idx:
        words.append(data_loader.TEXT.vocab.itos[word_id])

    return ' '.join(words)

### Custom Loss Function and Attack

In [6]:
def custom_loss(new_logit, old_logit, new_word_vecs=None, initial_word_vecs=None, data_grad=torch.Tensor([0])):
    """Attack objective: move the new logit away from the old one, with optional penalties.

    loss = -MSE(new_logit, old_logit)
           + sum(data_grad ** 2)
           + sum_i cosine_distance(new_word_vecs[i], initial_word_vecs[i]) ** 2   (optional)

    Args:
        new_logit: logit of the perturbed input.
        old_logit: logit of the original input.
        new_word_vecs: per-word vectors of the perturbed input (one row per word).
            The similarity penalty is only added when this and
            ``initial_word_vecs`` are both given.
        initial_word_vecs: per-word vectors of the original input, same shape
            as ``new_word_vecs``.
        data_grad: perturbation whose squared magnitude is penalised; the default
            is a single zero, which contributes nothing.

    Returns:
        A scalar tensor holding the loss.
    """
    loss = -F.mse_loss(new_logit, old_logit) + torch.sum(data_grad ** 2)

    if new_word_vecs is not None and initial_word_vecs is not None:
        # Compute the cosine distance in torch rather than via scipy: it is
        # vectorised, works for tensors that require grad (scipy needs a NumPy
        # conversion, which raises for those) and keeps the term in the autograd graph.
        new_vecs = torch.as_tensor(new_word_vecs)
        initial_vecs = torch.as_tensor(initial_word_vecs, dtype=new_vecs.dtype, device=new_vecs.device)
        cosine_distance = 1 - F.cosine_similarity(new_vecs, initial_vecs, dim=-1)
        # Out-of-place add so no tensor that autograd may still need is modified.
        loss = loss + torch.sum(cosine_distance ** 2)

    return loss

In [7]:
def attack(input_example, model, epsilon=1, similarity_reg=False, perturb_reg=False, max_iter=100):
    """Search for replacement words that flip the model's prediction.

    Each iteration takes one gradient step of size ``epsilon`` from the original
    word embeddings, snaps every perturbed embedding to the nearest vocabulary
    embedding other than the original word, and re-evaluates the model on the
    resulting word indices. The loop stops once the rounded prediction differs
    from the initial one, or after ``max_iter`` iterations.

    Note: the first step uses the gradient of the initial loss, which is
    -MSE(logit, logit) == 0 with a zero gradient, so the first candidate is the
    nearest *other* vocabulary vector for every word.

    Args:
        input_example: 2D tensor of word indices, shape ``(1, number of words)``.
        model: classifier exposing an ``embedding`` layer; called on index tensors.
        epsilon: step size applied to the embedding gradient.
        similarity_reg: add the cosine-distance penalty between perturbed and
            original embeddings to the loss.
        perturb_reg: add the squared gradient magnitude to the loss. This is now
            honoured on its own; previously it was ignored unless
            ``similarity_reg`` was also set.
        max_iter: maximum number of attack iterations; ``None`` means unbounded
            (the previous behaviour, which never returns if the label never flips).

    Returns:
        ``(new_words_idx, data_grad, new_logit)`` from the last iteration:
        the 1D tensor of replacement word indices, the embedding gradient used
        for that step (one row per word), and the logit of the replaced input.

    Raises:
        ValueError: if ``max_iter`` is given but smaller than 1.
    """
    if max_iter is not None and max_iter < 1:
        raise ValueError('max_iter must be a positive integer or None')

    print('--- Initial ---')
    initial_logit = get_logit(input_example, model, print_msg=True)
    initial_label = get_predict(initial_logit)
    new_logit = initial_logit.clone()

    # initial loss and backpropagation (populates model.embedding.weight.grad)
    loss = custom_loss(new_logit, initial_logit)
    model.zero_grad()
    loss.backward(retain_graph=True)
    print('initial loss:', loss)

    word_ids = input_example.squeeze(0)      # 1D, tensor([number of words])
    embedding_weight = model.embedding.weight

    print('\n--- Attack ---')
    iteration = 0
    while max_iter is None or iteration < max_iter:
        iteration += 1

        # get gradient and compute new embedding
        data_grad = embedding_weight.grad[word_ids].clone()
        input_embedding = embedding_weight.data[word_ids].clone()
        perturbed_embedding = input_embedding - epsilon * data_grad

        # nearest vocabulary entry for every perturbed embedding
        nearest_ids = []
        for position, one_embedding in enumerate(perturbed_embedding):
            embedding_distance = torch.sum((one_embedding - embedding_weight.data) ** 2, dim=1)
            # never pick the original word itself
            embedding_distance[word_ids[position]] = float('inf')
            nearest_ids.append(torch.argmin(embedding_distance))

        # stack directly in torch (no NumPy round-trip); 1D, tensor([number of words])
        new_words_idx = torch.stack(nearest_ids) if nearest_ids else word_ids.clone()

        # compute new logit and check if attack successfully
        new_logit = get_logit(new_words_idx.unsqueeze(0), model, print_msg=True)
        new_label = get_predict(new_logit)

        # compute loss; each regulariser is switched on independently
        loss_kwargs = {}
        if similarity_reg:
            loss_kwargs['new_word_vecs'] = perturbed_embedding
            loss_kwargs['initial_word_vecs'] = input_embedding
        if perturb_reg:
            loss_kwargs['data_grad'] = data_grad
        loss = custom_loss(new_logit, initial_logit, **loss_kwargs)

        # retain_graph: the graph behind initial_logit is reused every iteration
        model.zero_grad()
        loss.backward(retain_graph=True)
        print('loss:', loss, '\n')

        if new_label != initial_label:
            break
    else:
        print('attack stopped after', max_iter, 'iterations without changing the prediction')

    return new_words_idx, data_grad, new_logit

In [8]:
def generate_adversarial(model, original_input, new_words_idx, data_grad, max_limit=5, print_msg=False, itos=None):
    """Replace as few words as possible, largest gradient first, until the prediction flips.

    Positions are ranked by the magnitude of their gradient row in ``data_grad``.
    Words are then substituted one position at a time (cumulatively) with the
    corresponding entry of ``new_words_idx``; after every substitution the model
    is re-evaluated and the search stops as soon as the rounded prediction
    differs from the prediction on ``original_input``.

    Args:
        model: classifier called on index tensors of shape ``(1, number of words)``.
        original_input: 2D tensor of word indices, shape ``(1, number of words)``.
        new_words_idx: 1D tensor of candidate replacement indices, one per word.
        data_grad: gradient per word, one row per position.
        max_limit: maximum number of words that may be replaced.
        print_msg: print the replaced words and the resulting logit at every step.
        itos: optional index-to-word list used for the printed messages. When
            omitted and ``print_msg`` is true, the notebook-global
            ``data_loader.TEXT.vocab.itos`` is used.

    Returns:
        ``(success, new_input)``: whether the prediction flipped, and the 1D
        tensor of word indices of the last candidate tried (a copy of the
        original indices if no substitution was attempted).
    """
    initial_logit = get_logit(original_input, model)
    initial_label = get_predict(initial_logit)

    original_ids = original_input.squeeze(0)

    # compute the magnitude of the perturb and change from the largest;
    # torch.sort keeps everything in torch instead of relying on how NumPy
    # wraps a tensor passed to np.argsort
    grad_magnitude = torch.sqrt(torch.sum(torch.abs(data_grad), dim=1))
    _, position_to_change = torch.sort(grad_magnitude, descending=True)

    # the vocabulary is only needed for the progress messages
    if print_msg and itos is None:
        itos = data_loader.TEXT.vocab.itos

    success = False
    # defined up front so it is always bound, even if no position is tried
    new_input = original_ids.clone()
    old_words, new_words = [], []

    print('--- Generate Adversary ---')
    # never replace more than max_limit words (or more words than exist)
    budget = max(0, min(max_limit, len(position_to_change)))
    # changing words from the largest perturb; substitutions accumulate, so each
    # step extends the previous candidate instead of rebuilding it
    for position in position_to_change[:budget]:
        position = int(position)
        new_input[position] = new_words_idx[position]

        if print_msg:
            old_words.append(itos[int(original_ids[position])])
            new_words.append(itos[int(new_words_idx[position])])
            print('\nold words:', old_words)
            print('new words:', new_words)

        new_logit = get_logit(new_input.unsqueeze(0), model, print_msg=print_msg)
        new_label = get_predict(new_logit)
        if new_label != initial_label:
            success = True
            break

    return success, new_input

### Test on examples

- 4: 92 words. change all words to `not`
- 9: 66 words. change `the` to `excellently` and `is` to `celebrate`
- 10: 52 words. change all words to `not`
- 14: 81 words. only change one word
- 21: 62 words. change `<br />` tag
- 22: 93 words. change `<br />` tag
- 37: 44 words. replace `.` with `resilience`
- 45: 38 words. change all words to `not`
- 52: 61 words. change two words
- 61: 59 words. change one word
- 72: 92 words. fail. change all words to `not`
- 87: 77 words. fail. change all words to `not`
- 100: 83 words. change one word

In [1232]:
# find examples that are at most 100 words long
# for i in range(33, 101):
#     one_input, _ = get_input(data_loader, k=i)
#     one_input = torch.t(one_input)
#     if one_input.shape[1] <= 100:
#         print(i, one_input.shape[1])

In [17]:
# Get one example (index 61 of the large training set). get_input returns the
# indices as a column of shape (number of words, 1); transpose it to
# (1, number of words), the layout the attack functions expect.
one_input, one_label = get_input(data_loader, k=61)
one_input = torch.t(one_input)

# Show the model's logit and rounded prediction next to the true label.
logit = get_logit(one_input, cnn_model, print_msg=True)
print('true label:', one_label)

logit: tensor([[1.0576]], grad_fn=<ThAddmmBackward>)
pred: tensor([[1.]], grad_fn=<RoundBackward>)
true label: ['pos']


In [18]:
# Step 1: find a replacement word for every position, with both regularisers enabled.
new_words_idx, data_grad, new_logit = attack(one_input, cnn_model, epsilon=1e-0, similarity_reg=True, perturb_reg=True)

# Step 2: apply the replacements one position at a time, ranked by gradient
# magnitude, until the prediction flips.
success, new_input = generate_adversarial(cnn_model, one_input, new_words_idx, data_grad, max_limit=10, print_msg=True)
print('\nattack success:', success)

--- Initial ---
logit: tensor([[1.0576]], grad_fn=<ThAddmmBackward>)
pred: tensor([[1.]], grad_fn=<RoundBackward>)
initial loss: tensor(0., grad_fn=<ThAddBackward>)

--- Attack ---
logit: tensor([[-6.7423]], grad_fn=<ThAddmmBackward>)
pred: tensor([[0.]], grad_fn=<RoundBackward>)
loss: tensor(-60.8378, grad_fn=<AddBackward>) 

--- Generate Adversary ---

old words: ['!']
new words: ['yes']
logit: tensor([[0.7841]], grad_fn=<ThAddmmBackward>)
pred: tensor([[1.]], grad_fn=<RoundBackward>)

old words: ['!', 'bad']
new words: ['yes', 'worse']
logit: tensor([[-2.5924]], grad_fn=<ThAddmmBackward>)
pred: tensor([[0.]], grad_fn=<RoundBackward>)

attack success: True


### Generate Sentence

In [19]:
# original sentence, with '<br />' tokens stripped from the printed text
print(generate_sentence(one_input.squeeze(0), data_loader).replace('<br />', ''))

as long as you go into this movie knowing that it 's terrible : bad acting , bad " effects , " bad story , bad ... everything , then you 'll love it . this is one of my favorite " goof on " movies ; watch it as a comedy and have a dozen good laughs !


In [20]:
# adversarial sentence (new_input from generate_adversarial), with '<br />' tokens stripped
print(generate_sentence(new_input, data_loader).replace('<br />', ''))

as long as you go into this movie knowing that it 's terrible : worse acting , bad " effects , " bad story , bad ... everything , then you 'll love it . this is one of my favorite " goof on " movies ; watch it as a comedy and have a dozen good laughs yes
