# This attack inserts high-confidence 'positive' sentences, selected by querying the model on the test set, around sentences that are negative.

- This is a black-box attack: it assumes access only to the model's confidence predictions and to some preexisting dataset of sentences, which may be benign.

## STEPS
1. Load all test sentences
2. Score them individually
3. Get the top N (10?) benign sentences, i.e. those scored with the highest confidence
4. Generate attack tests (vary combinations of `a=0,1,2` and `b=0,1,2`):
 - Add `a` sentences before and `b` sentences after, chosen randomly from step 3

In [1]:
import pickle
import os 
import pandas as pd 
from collections import Counter
import sys
import tensorflow as tf
from experiments import params
from data_utils_v2.data_helpers import genFeatures, loadVocabEmb,genPOSFeatures
from model.abuse_classifier import AbuseClassifier
import numpy as np


In [2]:
# Which trained model variant is being attacked, and the attention loss it was
# trained with. The other variant handled below is "model_noatt_checkpoints".
MODEL_TYPE = "model_att=encoded_checkpoints"
ATTENTION_LOSS_TYPE = "encoded"
comm_or_sent = "sent"

# Attention models use lambda 0.2; the no-attention model has no attention
# loss at all, so its lambda is 0 and its loss type is "none".
if MODEL_TYPE != "model_noatt_checkpoints":
    attention_lambda = 0.2
else:
    attention_lambda = 0.0
    ATTENTION_LOSS_TYPE = "none"

In [3]:
# Decision threshold picked as the value giving the best F1 score on the
# training data. The attention and no-attention models ended up with the same
# best threshold, so a single constant is used.
BEST_THRESHOLD = 0.3

In [4]:
# Model / run configuration exposed through TF1 flags. All definitions go
# through the single `flags` alias, and each help string states the default
# that is actually configured here.
flags = tf.app.flags

# Model hyperparameters
flags.DEFINE_integer("embedding_dim", 300, "Dimensionality of character embedding (default: 300)")
flags.DEFINE_integer("pos_vocab_size", 26, "Vocab size of POS tags")
flags.DEFINE_integer("pos_embedding_dim", 25, "Dimensionality of pos tag embedding (default: 25)")
flags.DEFINE_float("dropout_keep_prob", 1.0, "Dropout keep probability (default: 1.0)")
# The default follows the `attention_lambda` value chosen from MODEL_TYPE.
flags.DEFINE_float("attention_lambda", attention_lambda, "Supervised attention lambda (default: set from MODEL_TYPE)")
flags.DEFINE_string("attention_loss_type", ATTENTION_LOSS_TYPE, "loss function of attention")
flags.DEFINE_float("l2_reg_lambda", 0.02, "L2 regularization lambda (default: 0.02)")
flags.DEFINE_integer("hidden_size", 300, "Dimensionality of RNN cell (default: 300)")
flags.DEFINE_integer("pos_hidden_size", 25, "Dimensionality of POS-RNN cell")
flags.DEFINE_integer("attention_size", 20, "Dimensionality of attention scheme (default: 20)")
flags.DEFINE_boolean("use_pos_flag", True, "use the sequence of POS tags")
# Training parameters -- evaluate_every should be 100
flags.DEFINE_integer("batch_size", 32, "Batch Size (default: 32)")
flags.DEFINE_integer("num_epochs", 60, "Number of training epochs (default: 60)")
flags.DEFINE_integer("evaluate_every", 50, "Evaluate model on dev set after this many steps (default: 50)")
flags.DEFINE_integer("checkpoint_every", 500000, "Save model after this many steps (default: 500000)")
# tf.flags.DEFINE_float("train_ratio", 1.0, "Ratio of training data")
# Misc Parameters
flags.DEFINE_string("checkpoint", '', "model")
flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement")
flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

# NOTE(review): presumably absorbs the `-f <connection file>` argument that a
# Jupyter kernel is launched with, so flag parsing does not reject it.
flags.DEFINE_string('f', '', 'kernel')

FLAGS = flags.FLAGS

In [5]:
# Locate the checkpoint for the selected model variant: the checkpoint folder
# is named after MODEL_TYPE and lives under `model_new/`.
model_folder_path = 'model_new'
model_save_folder_name = MODEL_TYPE
print(f"Running model from {model_save_folder_name}")

checkpoint_dir = os.path.abspath(
    os.path.join(model_folder_path, model_save_folder_name)
)
# Path handed to the saver when restoring the model.
model_path = os.path.join(checkpoint_dir, "best_model")

# Fail early if the checkpoint folder is missing.
assert os.path.isdir(checkpoint_dir)

Running model from model_att=encoded_checkpoints


In [6]:
# The pickled data frames and the vocabulary/embedding dumps are read from the
# same folder in this notebook.
data_path = dump_path = "preprocessing/dump/"
print(f"Using data from {data_path} and dump from {dump_path}")

# Both folders must exist before anything is loaded from them.
assert os.path.isdir(data_path)
assert os.path.isdir(dump_path)

Using data from preprocessing/dump/ and dump from preprocessing/dump/


In [7]:
def load_vocab(dump_folder):
    """Return the three objects produced by ``loadVocabEmb(dump_folder)``.

    Judging by the names used in this notebook these are the word vocabulary,
    the POS-tag vocabulary and the initial embedding matrix, in that order.
    """
    word_vocab, pos_vocab, embeddings = loadVocabEmb(dump_folder)
    return (word_vocab, pos_vocab, embeddings)

In [8]:
# Preprocessing hyperparameters.
# `max_sent_len` is the length passed to genFeatures / genPOSFeatures below.
max_sent_len, emb_dim = 100, 300
# NOTE(review): presumably the unknown-word and padding tokens of the
# vocabulary; they are not referenced by the code visible in this notebook.
unk, pad = "<UNK>", "<PAD/>"

def load_data_sents_only_side(dump_folder, data_folder, data_type, only_negative: bool, verbose=True, type="sentence"):
    """Load one side (abusive or non-abusive) of a pickled data frame as model features.

    Args:
        dump_folder: folder containing ``vocab.pkl`` and ``pos_vocab.pkl``.
        data_folder: folder containing the ``<data_type>_<type>_df`` pickle.
        data_type: ``"train"`` or ``"test"``.
        only_negative: keep rows whose ``Abusive`` column is ``"Yes"`` when
            true, otherwise rows where it is ``"No"``.
        verbose: print the shapes of the generated arrays.
        type: suffix of the data frame to read; ``"comments"`` takes labels
            from ``merged_label``, anything else from ``binarized_label``.

    Returns:
        ``(x, length, attention, pos, pos_length, y, sentences)`` where the
        first five come from ``genFeatures`` / ``genPOSFeatures``, ``y`` is the
        label array and ``sentences`` the raw ``tokenized`` column.
    """
    assert data_type in ["train", "test"]

    def _unpickle(file_name):
        # Read one pickled object out of the dump folder.
        with open(os.path.join(dump_folder, file_name), "rb") as handle:
            return pickle.load(handle)

    vocabulary = _unpickle("vocab.pkl")
    pos_vocabulary = _unpickle("pos_vocab.pkl")

    frame = pd.read_pickle(os.path.join(data_folder, data_type + f"_{type}_df"))
    wanted_flag = "Yes" if only_negative else "No"
    frame = frame[frame["Abusive"] == wanted_flag]

    label_column = "merged_label" if type == "comments" else "binarized_label"
    sentences = frame["tokenized"].to_list()
    labels = frame[label_column].to_list()
    pos_sentences = frame["pos_tags"].to_list()
    raw_attention = frame["attention"].to_list()

    # Turn the raw columns into model features and a label array.
    x, length, attention = genFeatures(sentences, raw_attention, max_sent_len, vocabulary)
    pos, pos_length = genPOSFeatures(pos_sentences, max_sent_len, pos_vocabulary)
    y = np.array(labels)

    if verbose:
        shapes = (np.array(x).shape, np.array(pos).shape, np.array(y).shape)
        print("load {} data, input sent size: {}, input POS size: {}, label size: {}".format(
            data_type, *shapes))
    return x, length, attention, pos, pos_length, y, sentences

In [10]:
def get_predictions_sent(model_path, dump_folder_path, data_folder_path, only_neg_sent, data_type="test"):
    """Restore the trained classifier and score one side of the sentence data.

    Args:
        model_path: checkpoint path given to ``Saver.restore``.
        dump_folder_path: folder holding ``norm_init_embed.pkl`` and the
            vocabulary pickles.
        data_folder_path: folder holding the pickled data frames.
        only_neg_sent: forwarded to ``load_data_sents_only_side`` as
            ``only_negative`` (True keeps ``Abusive == "Yes"`` rows).
        data_type: ``"train"`` or ``"test"``.

    Returns:
        A pair of tuples:
        ``(dev_confidences, dev_scores, y_test, alphas)`` where
        ``dev_confidences`` holds ``[prob[0], prob[1]]`` per sentence,
        ``dev_scores`` holds ``prob[0]`` per sentence and ``alphas`` the
        per-sentence ``model.alphas`` output; and
        ``(x_test, length_test, attention_test, pos_test, pos_length_test,
        y_test, sentences)`` exactly as returned by the loader.
    """
    with open(os.path.join(dump_folder_path, "norm_init_embed.pkl"), "rb") as handle:
        init_embed = pickle.load(handle)

    x_test, length_test, attention_test, pos_test, pos_length_test, y_test, sentences = load_data_sents_only_side(
        dump_folder_path, data_folder_path, data_type, only_neg_sent, verbose=False)

    len_data = len(x_test)
    print(f"Running model on {len_data} {data_type} samples")

    dev_confidences = []
    dev_scores = []
    alphas = []
    batch_size = 50  # number of sentences scored per sess.run call

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement
        )
        # Using the session itself as the context manager installs it as the
        # default session AND closes it on exit; `sess.as_default()` alone
        # would leave the session (and its resources) open.
        with tf.Session(config=session_conf) as sess:
            # NOTE(review): max_sequence_length comes from `params.max_sent_len`
            # while the features were padded with the notebook-level
            # `max_sent_len`; confirm the two agree.
            # NOTE(review): l2_reg_lambda is hard-coded to 0.1 here rather than
            # taken from FLAGS.l2_reg_lambda; kept as-is.
            model = AbuseClassifier(
                max_sequence_length=params.max_sent_len,
                num_classes=2,
                pos_vocab_size=FLAGS.pos_vocab_size,
                init_embed=init_embed,
                hidden_size=FLAGS.hidden_size,
                attention_size=FLAGS.attention_size,
                keep_prob=FLAGS.dropout_keep_prob,
                attention_lambda=FLAGS.attention_lambda,
                attention_loss_type=FLAGS.attention_loss_type,
                l2_reg_lambda=0.1,
                use_pos_flag=FLAGS.use_pos_flag)

            # Kept in the graph so the saver's variable list is unchanged.
            tf.Variable(0, name="global_step", trainable=False)
            saver = tf.train.Saver(tf.global_variables())
            # Initialize all variables, then overwrite them from the checkpoint
            sess.run(tf.global_variables_initializer())
            saver.restore(sess, model_path)

            for start in range(0, len_data, batch_size):
                stop = start + batch_size
                # score sentences
                feed_dict = {
                    model.input_word: x_test[start:stop],
                    model.input_pos: pos_test[start:stop],
                    model.input_y: y_test[start:stop],
                    model.sequence_length: length_test[start:stop],
                    model.dropout_keep_prob: 1.0
                }
                scores, alpha = sess.run([model.prob, model.alphas], feed_dict)
                dev_confidences.extend([s[0], s[1]] for s in scores)
                dev_scores.extend(s[0] for s in scores)
                alphas.extend(alpha)

    return (dev_confidences, dev_scores, y_test, alphas), (x_test, length_test, attention_test, pos_test, pos_length_test, y_test, sentences)

def get_predictions_sent_only_pos(model_path, dump_folder_path, data_folder_path, data_type="test"):
    """Score only the non-abusive ("positive") sentences.

    Thin wrapper around ``get_predictions_sent`` with ``only_neg_sent=False``.
    ``data_type`` is forwarded to it instead of being silently replaced by
    ``"test"``.
    """
    return get_predictions_sent(model_path, dump_folder_path, data_folder_path, False, data_type=data_type)
        
def get_predictions_sent_only_neg(model_path, dump_folder_path, data_folder_path, data_type="test"):
    """Score only the abusive ("negative") sentences.

    Thin wrapper around ``get_predictions_sent`` with ``only_neg_sent=True``.
    ``data_type`` is forwarded to it instead of being silently replaced by
    ``"test"``.
    """
    return get_predictions_sent(model_path, dump_folder_path, data_folder_path, True, data_type=data_type)

In [11]:
# Score every non-abusive test sentence; the call returns the model outputs
# and the loaded features/sentences as two separate tuples.
predictions_only_pos, data_only_pos = get_predictions_sent_only_pos(
    model_path,
    dump_path,
    data_path,
    data_type="test",
)

padded sent: (4461, 100)
feature shape: (4461, 100)
padded pos sentences: (4461, 100)
debug padded_pos_sentences: ['V', 'A', 'N', 'O', 'V', 'E', 'D', 'N', 'V', 'O']
pos feature shape: (4461, 100)
Running model on 4461 test samples
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for upd

In [12]:
# Split the prediction tuple into its parts: per-sentence confidence pairs,
# per-sentence scores, labels and attention weights.
(
    dev_confidences_only_pos,
    dev_scores_only_pos,
    y_test_only_pos,
    alphas_only_pos,
) = predictions_only_pos

# Convert to arrays so the confidence pairs can be sliced by column.
dev_scores_only_pos = np.array(dev_scores_only_pos)
dev_confidences_only_pos = np.array(dev_confidences_only_pos)

In [13]:
# Split the loaded data tuple into features, lengths, labels and the raw
# tokenized sentences.
(
    x_test_only_pos,
    length_test_only_pos,
    attention_test_only_pos,
    pos_test_only_pos,
    pos_length_test_only_pos,
    y_test_only_pos,
    sentences_only_pos,
) = data_only_pos

In [14]:
# Second entry of each confidence pair, used here as the "benign" confidence.
# NOTE(review): assumes index 1 of the model's probabilities is the benign
# class -- confirm against the label encoding.
dev_conf_benign = dev_confidences_only_pos[:, 1]
dev_conf_benign

array([0.6519251 , 0.90579915, 0.90477836, ..., 0.90324706, 0.90417606,
       0.9002203 ], dtype=float32)

In [15]:
# Bundle the parallel per-sentence sequences into one record per sentence,
# with the benign confidence first so it can serve as the sort key.
zipped_lists = zip(
    list(dev_conf_benign),
    list(x_test_only_pos),
    list(length_test_only_pos),
    list(attention_test_only_pos),
    list(pos_test_only_pos),
    list(pos_length_test_only_pos),
    list(y_test_only_pos),
    list(sentences_only_pos),
)

# Sort records by ascending confidence (highest-confidence sentences end up
# last), then transpose back into one tuple per field.
sorted_lists = tuple(zip(*sorted(zipped_lists, key=lambda record: record[0])))

(
    dev_conf_benign_sorted,
    x_test_only_pos_sorted,
    length_test_only_pos_sorted,
    attention_test_only_pos_sorted,
    pos_test_only_pos_sorted,
    pos_length_test_only_pos_sorted,
    y_test_only_pos_sorted,
    sentences_only_pos_sorted,
) = sorted_lists

# Attack

### Make new Comment DF with modified comments

#### STEPS:
1. Load the original comments DF
2. Get top N positive sentences 
3. Loop over each comment ID
4. Add `a` sentences before the original comment
5. Add `b` sentences after the original comment
6. Save new DF

In [16]:
def flatten(t):
    """Concatenate the items of every sub-iterable of ``t`` into one flat list."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

In [17]:
def make_adv_examples_df(data_path, select_top_n, a_sentences_before, b_sentences_after, sorted_lists, save_path):
    """Build and pickle adversarial versions of the abusive test comments.

    Every comment in ``<data_path>/test_comments_df`` whose ``merged_label``
    starts with 1 gets ``a_sentences_before`` sentences prepended and
    ``b_sentences_after`` sentences appended. The inserted sentences are drawn
    at random (with replacement) from the last ``select_top_n`` entries of
    ``sorted_lists``, which is expected to be ordered by ascending benign
    confidence so that its tail holds the most confidently benign sentences.

    The modified columns are written back as whole columns. Editing the
    Series returned by ``DataFrame.iloc[i]`` is not guaranteed to write
    through to the frame, so the saved pickle could otherwise contain the
    unmodified comments.

    Args:
        data_path: folder containing the ``test_comments_df`` pickle.
        select_top_n: size of the benign pool taken from the tail of
            ``sorted_lists``.
        a_sentences_before: number of benign sentences to prepend.
        b_sentences_after: number of benign sentences to append.
        sorted_lists: tuple of parallel sequences; index 4 is used as the
            per-sentence POS sequence and index 7 as the tokenized sentences.
        save_path: folder the resulting frame is pickled into (created if
            missing).

    Returns:
        None. The frame is written to
        ``<save_path>/adv_sent_mimicry_n<N>_a<A>_b<B>_ENCATT_df``.
    """
    # STEP 1: Load the original comments DF and keep only the abusive comments.
    data_df = pd.read_pickle(os.path.join(data_path, "test_comments_df"))
    is_abusive = data_df["merged_label"].apply(lambda lbl: lbl[0] == 1)
    # Explicit copy: the column assignments below must not alias the loaded frame.
    data_df_only_neg = data_df[is_abusive].copy()

    # STEP 2: Get top N positive sentences (tail of the confidence-sorted lists).
    # NOTE(review): index 4 holds the POS *features* produced by genPOSFeatures,
    # not the raw tags stored in the "pos_tags" column -- confirm that mixing
    # the two below is intended.
    benign_pos = sorted_lists[4][-select_top_n:]
    benign_tokenized = sorted_lists[7][-select_top_n:]
    benign_sentences = [" ".join(sent) for sent in benign_tokenized]
    # Sample from what is actually available, in case fewer than select_top_n
    # sentences exist.
    pool_size = len(benign_sentences)

    adv_comments, adv_labels, adv_merged = [], [], []
    adv_tokenized, adv_pos_tags = [], []

    # STEP 3: Loop over each abusive comment.
    rows = zip(
        data_df_only_neg["Comment"],
        data_df_only_neg["labels"],
        data_df_only_neg["tokenized"],
        data_df_only_neg["pos_tags"],
    )
    for comment, labels, tokenized, pos_tags in rows:
        # STEP 4, 5: pick the pool indices to insert before / after the comment.
        before = [np.random.choice(pool_size) for _ in range(a_sentences_before)]
        after = [np.random.choice(pool_size) for _ in range(b_sentences_after)]

        new_comment = (
            [benign_sentences[i] for i in before]
            + comment
            + [benign_sentences[i] for i in after]
        )
        adv_comments.append(new_comment)
        adv_merged.append(". ".join(new_comment))

        # Inserted sentences are labelled 0 at the sentence level.
        adv_labels.append(([0] * a_sentences_before) + labels + ([0] * b_sentences_after))

        adv_tokenized.append(
            [tok for i in before for tok in benign_tokenized[i]]
            + tokenized
            + [tok for i in after for tok in benign_tokenized[i]]
        )
        adv_pos_tags.append(
            [tag for i in before for tag in benign_pos[i]]
            + pos_tags
            + [tag for i in after for tag in benign_pos[i]]
        )

    def _column(values):
        # dtype=object keeps each list as a single cell value.
        return pd.Series(values, index=data_df_only_neg.index, dtype=object)

    row_count = len(data_df_only_neg)
    data_df_only_neg["Comment"] = _column(adv_comments)
    data_df_only_neg["labels"] = _column(adv_labels)
    data_df_only_neg["merged_comment"] = _column(adv_merged)
    # Labelled as toxic, this would be the "human label", we want the model to get this wrong
    data_df_only_neg["merged_label"] = _column([[1, 0] for _ in range(row_count)])
    data_df_only_neg["tokenized"] = _column(adv_tokenized)
    data_df_only_neg["attention"] = _column([[] for _ in range(row_count)])  # ignore this
    data_df_only_neg["pos_tags"] = _column(adv_pos_tags)

    # STEP 6: Save new DF
    if save_path:
        os.makedirs(save_path, exist_ok=True)
    save_file_name = f"adv_sent_mimicry_n{select_top_n}_a{a_sentences_before}_b{b_sentences_after}_ENCATT_df"
    data_df_only_neg.to_pickle(f"{save_path}/{save_file_name}")

In [18]:
# Sweep over the benign pool size and over the number of sentences inserted
# before (a) and after (b) each comment; every combination is written to its
# own pickle.
top_n_options = [10, 50, 100]
ab_sent_options = [0, 1, 2, 3, 5]
for n in top_n_options:
    for a in ab_sent_options:
        for b in ab_sent_options:
            make_adv_examples_df(
                data_path='preprocessing/dump/',
                select_top_n=n,
                a_sentences_before=a,
                b_sentences_after=b,
                sorted_lists=sorted_lists,
                save_path='adv_sentence_mimicry',
            )