# This attack simply inserts high-confidence 'positive' (benign) words, found by querying the model with sentences from the test set, around sentences that are negative (abusive).

- This is a black-box attack: it assumes only access to confidence predictions and some pre-existing dataset of sentences, which may be benign.

## STEPS
1. Load all test sentences
2. Score them individually
3. Get the top N (10?) benign sentences, i.e. those predicted benign with high confidence
4. Generate attack tests (vary combinations of `a=0,1,2` and `b=0,1,2`):
 - Adding `a` sentences before and `b` sentences after, chosen randomly from step 3

In [8]:
import pickle
import os 
import pandas as pd 
from collections import Counter

In [25]:
# Tokenized test data, split at sentence level.
# NOTE: despite its name, `path_to_training` points at the *test* dump.
path_to_training = r'../preprocessing/dump/test_sent.data'

# Folder holding the data we want to attack
exp_folder = r"../experiments/data/exp_sentence_mimicry/"

target_path = os.path.join(exp_folder, "test_comments.csv") # TODO: get actual test comments

# Folder where the adversarial examples are saved
adv_folder = os.path.join(exp_folder, "adv")

# Create the output folder together with any missing parent folders.
# The previous isdir()/mkdir() pair raised FileNotFoundError whenever
# `exp_folder` itself did not exist yet; makedirs(exist_ok=True) handles
# both the "already there" and the "parents missing" cases.
os.makedirs(adv_folder, exist_ok=True)

adv_path = os.path.join(adv_folder, "test_comments.csv")

In [26]:
# Load the pickled (sentences, labels) pair.
# NOTE: pickle can execute arbitrary code while loading; only open dumps
# produced by our own preprocessing step.
with open(path_to_training, "rb") as dump_file:
    loaded_pair = pickle.load(dump_file)
sent_list, label_list = loaded_pair

In [27]:
# Peek at the first few tokenized sentences and the first entry of their labels.
# Slicing (instead of indexing 0..4) keeps this cell working when fewer than
# five sentences were loaded.
for tokens, labels in zip(sent_list[:5], label_list[:5]):
    print(tokens, labels[0])

['nice', 'try'] 0
["i'm", 'a', 'conservative', 'libertarian'] 0
['but', 'i', 'can', 'spot', 'a', 'racist', 'from', 'a', 'mile', 'away'] 0
['because', 'they', 'make', 'comments', 'like', 'that'] 0
['stupid', ',', 'senseless', ',', 'idiotic', ',', 'mind', 'numbing', ',', 'low', 'i', 'q', 'comments'] 1


In [28]:
# Start from an empty frame that only declares the two columns;
# the following cell fills them in.
test_df = pd.DataFrame(
    columns=["sentence", "label"],
)

In [29]:
# One row per tokenized sentence; only the first entry of each label
# record is kept as the class label.
test_df["sentence"] = sent_list
test_df["label"] = [labels[0] for labels in label_list]

In [30]:
# Display the assembled sentence/label frame
test_df

Unnamed: 0,sentence,label
0,"[nice, try]",0
1,"[i'm, a, conservative, libertarian]",0
2,"[but, i, can, spot, a, racist, from, a, mile, ...",0
3,"[because, they, make, comments, like, that]",0
4,"[stupid, ,, senseless, ,, idiotic, ,, mind, nu...",1
...,...,...
5012,"[it, 's, not, them, dying, if, they, vote, for...",0
5013,"[they, got, all, of, the, privileges, with, no...",1
5014,"[omg, that, bitch, is, going, to, have, a, hea...",1
5015,"[that, was, the, most, well, spoken, video, u,...",0


In [31]:
# Share of each label value among all rows of test_df
test_df['label'].value_counts() / test_df.shape[0]

0    0.827387
1    0.172613
Name: label, dtype: float64

In [35]:
#load data we will be attacking
# NOTE(review): some sentences in the displayed output contain 'ï»¿', which looks
# like a mis-decoded UTF-8 byte-order mark -- confirm the encoding of the CSV.
target = pd.read_csv(target_path)
target

Unnamed: 0,Comment #,Assigned to:,Sentence,Prosocial,Abusive
0,1,Lauren,@user06652 You can't be serious?,No,No
1,1,Lauren,"Just google ""United Nations"".",No,No
2,1,Lauren,You may learn something.,No,No
3,2,Lauren,Feminist = Misandrist,No,Yes
4,3,Lauren,Where the fuck do these fat cunts learn this s...,No,Yes
...,...,...,...,...,...
26367,13038,Tiana,I will donate and I would encourage others to ...,No,No
26368,13038,Tiana,Do anything it takes to destroy these stupid c...,No,Yes
26369,13039,Tiana,"""100 lkies"". Try 6k :D",No,No
26370,13040,Tiana,You bet your faggot ass im sending you money r...,No,Yes


In [52]:
# "Positive" here means benign: keep only the rows whose label is 0
pos_sentences = test_df.loc[test_df['label'].eq(0)]
pos_sentences

Unnamed: 0,sentence,label
0,"[nice, try]",0
1,"[i'm, a, conservative, libertarian]",0
2,"[but, i, can, spot, a, racist, from, a, mile, ...",0
3,"[because, they, make, comments, like, that]",0
5,"[designed, to, not, only, insult, one, person,...",0
...,...,...
5009,"[$mention$, i, 've, caught, crabs, twice, ,, b...",0
5010,"[try, saying, that, with, your, mouth, full]",0
5012,"[it, 's, not, them, dying, if, they, vote, for...",0
5015,"[that, was, the, most, well, spoken, video, u,...",0


In [55]:
# Re-shape the benign sentences into 5-field rows (the next cell labels them
# with the target frame's columns): running position, "N/A" placeholder,
# the tokens joined back into text, and "No" for the two flag fields.
only_positive_test_sentences = [
    (position, "N/A", " ".join(tokens), "No", "No")
    for position, tokens in enumerate(pos_sentences["sentence"])
]

In [59]:
# Give the benign rows the same column layout as the target data, then
# write them to a CSV in the current working directory and display them.
only_positive_test_sentences_df = pd.DataFrame(
    data=only_positive_test_sentences,
    columns=target.columns.copy(),
)
only_positive_test_sentences_df.to_csv(
    "only_positive_test_sentences.csv",
    index=False,
)
only_positive_test_sentences_df

Unnamed: 0,Comment #,Assigned to:,Sentence,Prosocial,Abusive
0,0,,nice try,No,No
1,1,,i'm a conservative libertarian,No,No
2,2,,but i can spot a racist from a mile away,No,No
3,3,,because they make comments like that,No,No
4,4,,"designed to not only insult one person , but a...",No,No
...,...,...,...,...,...
4146,4146,,"$mention$ i 've caught crabs twice , but i'm a...",No,No
4147,4147,,try saying that with your mouth full,No,No
4148,4148,,it 's not them dying if they vote for a warmonger,No,No
4149,4149,,that was the most well spoken video u have don...,No,No


In [36]:
# Subset of the target data annotated as abusive
target_toxic_sentences = target.loc[target["Abusive"].eq('Yes')]
target_toxic_sentences

Unnamed: 0,Comment #,Assigned to:,Sentence,Prosocial,Abusive
3,2,Lauren,Feminist = Misandrist,No,Yes
4,3,Lauren,Where the fuck do these fat cunts learn this s...,No,Yes
13,8,Lauren,I feel sorry for the birds being around such a...,No,Yes
14,9,Lauren,"Wow, I actually thought she was decent looking...",No,Yes
15,9,Lauren,Dat's one dumb cunt.ï»¿,No,Yes
...,...,...,...,...,...
26356,13031,Tiana,Illegal immigrants do need to get the fuck out...,No,Yes
26357,13032,Tiana,@user11016 bec he's a retard who profits of th...,No,Yes
26364,13037,Tiana,wait woah I'm no fag but I love your vids,No,Yes
26368,13038,Tiana,Do anything it takes to destroy these stupid c...,No,Yes


In [313]:
import random
import math
def attack_row(row, words=None, perc_ins=0.5):
    """
    Attack a single row of the dataframe to evade the toxicity detector.

    Rows marked abusive get randomly sampled attack words added before and
    after the sentence; the number inserted on each side is a fixed fraction
    of the sentence's word count. All other rows are returned unchanged.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a "Sentence" entry
            and an "Abusive" entry that equals "Yes" for rows to attack.
        words: Pool of words to sample from. When omitted, the notebook-level
            ``attack_words`` collection is used and must already be defined.
        perc_ins: Fraction of the sentence length (in words) inserted on each
            side; the count is rounded up and capped at the pool size.

    Returns:
        The modified sentence for abusive rows, otherwise the original value.
    """
    sentence = row["Sentence"]

    # Only attack abusive points. Non-string cells (e.g. NaN from an empty
    # CSV field) cannot be split into words, so they pass through untouched.
    if row["Abusive"] != "Yes" or not isinstance(sentence, str):
        return sentence

    pool = attack_words if words is None else words
    # random.sample() requires a sequence (sets are rejected on Python >= 3.11)
    if not isinstance(pool, (list, tuple)):
        pool = list(pool)

    # split() without an argument ignores repeated/leading/trailing whitespace,
    # so the count reflects actual words rather than the number of spaces.
    len_sent = len(sentence.split())
    num_ins = min(len(pool), math.ceil(perc_ins * len_sent))

    insert_first = random.sample(pool, num_ins)
    insert_last = random.sample(pool, num_ins)

    # Join everything once so an empty sample leaves no stray spaces
    return " ".join([*insert_first, sentence, *insert_last])
        
        
    
    
    

In [314]:
# Work on a copy: a plain assignment only aliases `target`, so writing the
# attacked sentences into `attacked_target` would overwrite the original data
# too, and re-running the attack would modify already-attacked sentences.
attacked_target = target.copy()
    
    

In [315]:
# Run the attack row by row (axis=1) and store the resulting sentences
attacked_sentences = target.apply(attack_row, axis=1)
attacked_target["Sentence"] = attacked_sentences

In [316]:
# Display the frame after the attack has been applied
attacked_target

Unnamed: 0,Comment #,Assigned to:,Sentence,Prosocial,Abusive
0,1,Lauren,@user06652 You can't be serious?,No,No
1,1,Lauren,"Just google ""United Nations"".",No,No
2,1,Lauren,You may learn something.,No,No
3,2,Lauren,credit growing Feminist = Misandrist communist...,No,Yes
4,3,Lauren,benefit familiar private communists activism W...,No,Yes
...,...,...,...,...,...
26367,13038,Tiana,I will donate and I would encourage others to ...,No,No
26368,13038,Tiana,belief field shall sources tend Do anything it...,No,Yes
26369,13039,Tiana,"""100 lkies"". Try 6k :D",No,No
26370,13040,Tiana,janice throughout accusing wise list forgotten...,No,Yes


In [317]:
# Write the attacked data to the adversarial path (without the index column)
# so it can be used for testing.
attacked_target.to_csv(path_or_buf=adv_path, index=False)

In [318]:
# Inspect the full sentence text of the row at position 4
attacked_target.iloc[4]["Sentence"]

'benefit familiar private communists activism Where the fuck do these fat cunts learn this shit?ï»¿ student posting wives games de'

In [284]:
# Show the path the adversarial CSV is written to
adv_path

'../experiments/data/exp_3/adv\\test_comments.csv'