In [2]:
import json
import requests
import time
import sys
sys.path.append('../../')
from config import HUGGINGFACE_API_TOKEN, PERSPECTIVE_API_KEY

# Load previously fetched Hugging Face scores so that sentences already scored
# are not sent to the inference API again.
# The file is decoded as UTF-8 explicitly: open()'s default encoding depends on
# the platform locale and could mis-decode non-ASCII sentences stored in the cache.
with open('../../data/huggingface_cache.json', encoding='utf-8') as f:
    huggingface_cache_list = json.load(f)

# Each stored entry is a [key, value] pair. The key is turned into a tuple so it
# is hashable and matches the (model_name, sentence) lookups in huggingface_predict().
huggingface_cache = {}
for item in huggingface_cache_list:
    huggingface_cache[tuple(item[0])] = item[1]

def _huggingface_query(api_url, headers, payload):
    """POST ``payload`` to ``api_url`` and return the JSON-decoded response body."""
    response = requests.request("POST", api_url, headers=headers, data=payload)
    return json.loads(response.content.decode("utf-8"))


def huggingface_predict(row, col_name, model_name, label_set, positive_label, max_retries=20):
    """Return the score of ``positive_label`` for the sentence in ``row[col_name]``.

    Results are memoised in the module-level ``huggingface_cache`` under the key
    ``(model_name, sentence)``; the inference API is only called on a cache miss.

    Parameters:
        row: mapping-like object (e.g. a DataFrame row) indexed by ``col_name``.
        col_name: key of the sentence to score.
        model_name: model identifier appended to the inference API URL.
        label_set: labels a usable response may carry; a response whose second
            entry has a label outside this set is treated as not ready.
        positive_label: label whose score is returned.
        max_retries: maximum number of 30-second retries after an unusable
            response. ``None`` retries forever (the previous behaviour).

    Returns:
        The score of the first returned entry if its label equals
        ``positive_label``, otherwise the score of the second entry.

    Raises:
        RuntimeError: if the API still returns an unusable response after
            ``max_retries`` retries (for example a permanent error such as an
            invalid token), instead of looping forever.
    """
    sentence = row[col_name]
    cache_key = (model_name, sentence)
    if cache_key not in huggingface_cache:
        # Small pause before every uncached request.
        time.sleep(2)
        api_url = "https://api-inference.huggingface.co/models/" + model_name
        headers = {"Authorization": f"Bearer {HUGGINGFACE_API_TOKEN}"}
        payload = json.dumps(sentence)

        res = _huggingface_query(api_url, headers, payload)
        print(res)
        retries = 0
        # An error payload (e.g. "model is currently loading") or a result without
        # two recognised labels is not usable yet: wait and ask again.
        while 'error' in res or len(res[0]) < 2 or res[0][1]['label'] not in label_set:
            print(sentence, res)
            if max_retries is not None and retries >= max_retries:
                raise RuntimeError(
                    f"No usable prediction from {model_name} after {retries} retries; "
                    f"last response: {res!r}"
                )
            retries += 1
            time.sleep(30)
            res = _huggingface_query(api_url, headers, payload)

        first, second = res[0][0], res[0][1]
        huggingface_cache[cache_key] = first['score'] if first['label'] == positive_label else second['score']
    return huggingface_cache[cache_key]

In [31]:
from googleapiclient import discovery

# Load previously fetched Perspective scores so that sentences already scored
# are not sent to the API again.
# The file is decoded as UTF-8 explicitly: open()'s default encoding depends on
# the platform locale and could mis-decode non-ASCII sentences stored in the cache.
with open('../../data/perspective_cache.json', encoding='utf-8') as f:
    perspective_cache_f = f.read()
perspective_cache_list = json.loads(perspective_cache_f)

# Each stored entry is a [key, value] pair; perspective_api() looks entries up
# by the sentence text.
perspective_cache = {}
for item in perspective_cache_list:
    perspective_cache[item[0]] = item[1]


API_KEY = PERSPECTIVE_API_KEY

# Client for the Comment Analyzer (Perspective) API, described by the discovery
# document at discoveryServiceUrl.
client = discovery.build(
    "commentanalyzer",
    "v1alpha1",
    developerKey=API_KEY,
    discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
    static_discovery=False,
)
def perspective_api(row, t=0.7, col_name='Answer.perturbed'):
    """Return the Perspective TOXICITY score for the sentence in ``row[col_name]``.

    Scores are memoised in the module-level ``perspective_cache`` keyed by the
    sentence text; the API is only called on a cache miss, after a 2-second pause.
    The value stored is the score of the first entry in the response's
    ``spanScores`` list for ``TOXICITY``.

    Parameters:
        row: mapping-like object (e.g. a DataFrame row) indexed by ``col_name``.
        t: accepted for call compatibility; not read by this function.
        col_name: key of the sentence to score.
    """
    text = row[col_name]
    if text in perspective_cache:
        return perspective_cache[text]

    time.sleep(2)
    payload = {
        'comment': {'text': text},
        'requestedAttributes': {'TOXICITY': {}},
        'languages': ["en"],
        'doNotStore': True,
    }
    result = client.comments().analyze(body=payload).execute()
    first_span = result["attributeScores"]["TOXICITY"]["spanScores"][0]
    perspective_cache[text] = first_span["score"]["value"]
    print(text, perspective_cache[text])
    return perspective_cache[text]


In [14]:
import pandas as pd

# Load the test set; the cells below read its `clean_version` and
# `perturbed_version` columns and add feature/score columns to this frame.
grouped_multiple = pd.read_csv("../../toxic detection model test set with perturbations.csv") 

In [24]:
from nltk.corpus import words
import Levenshtein

# Characters treated as placeholder substitutions: placeholder() looks for them
# and special_character() does not count them as special characters.
place_holder_list = ['_','-','*','~','/']


# Vocabulary from the NLTK `words` corpus; interesting_lowercase_uppercase()
# uses it to test whether the capitalised letters of a token spell a word.
word_set = set(words.words())

def classify_perturbation_type(row, func):
    """Apply a perturbation predicate to one row and return 1 or 0.

    ``func`` is called as ``func(clean_word, pert_word)`` where ``clean_word``
    is the token of ``row['clean_version']`` at index ``row['location']`` and
    ``pert_word`` is ``row['pert_word']``. A truthy result yields 1, else 0.
    """
    clean_tokens = row['clean_version'].split()
    clean_word = clean_tokens[row['location']]
    pert_word = row['pert_word']
    if func(clean_word, pert_word):
        return 1
    return 0



def locate_pert(row):
    """Return the index of the first token that differs between the two versions.

    ``row['clean_version']`` and ``row['perturbed_version']`` are split on
    whitespace and compared token by token.

    Returns:
        The index of the first differing token, or -1 when no aligned pair of
        tokens differs.
    """
    clean_tokens = row['clean_version'].split()
    pert_tokens = row['perturbed_version'].split()
    # zip() stops at the shorter list, so a perturbed sentence with fewer tokens
    # than the clean one yields -1 instead of raising IndexError.
    for index, (clean_tok, pert_tok) in enumerate(zip(clean_tokens, pert_tokens)):
        if clean_tok != pert_tok:
            return index
    return -1


def distance(row):
    """Return the case-insensitive Levenshtein distance between the clean and
    perturbed tokens found at index ``row['location']``."""
    idx = row['location']
    clean_word = row['clean_version'].split()[idx]
    pert_word = row['perturbed_version'].split()[idx]
    return Levenshtein.distance(clean_word.lower(), pert_word.lower())


def show_pert(row):
    """Return the token of ``row['perturbed_version']`` at index ``row['location']``."""
    pert_tokens = row['perturbed_version'].split()
    return pert_tokens[row['location']]


def simplify_word(word):
    """Lower-case ``word`` and collapse every run of identical consecutive
    characters into a single character (e.g. ``'Heeello'`` -> ``'helo'``)."""
    collapsed = []
    for ch in word.lower():
        # Keep a character only when it differs from the one kept just before it.
        if not collapsed or collapsed[-1] != ch:
            collapsed.append(ch)
    return ''.join(collapsed)


def lowercase_uppercase(clean_word, pert_word):
    """Return True when ``pert_word`` has an upper-case character after its first
    position and is not entirely upper-case. ``clean_word`` is not used."""
    tail = pert_word[1:]
    has_inner_upper = tail != tail.lower()
    fully_upper = pert_word == pert_word.upper()
    return has_inner_upper and not fully_upper


def interesting_lowercase_uppercase(clean_word, pert_word):
    """Return True when the upper-case characters of ``pert_word`` spell a word.

    The characters of ``pert_word`` that change under ``lower()`` are gathered
    in order. The result is True only if ``pert_word`` is not already
    lower-case, the gathered letters (lower-cased) are in ``word_set``, there
    is more than one of them, and they do not make up the whole of
    ``pert_word``. ``clean_word`` is not used.
    """
    capitals = ''.join(ch for ch in pert_word if ch != ch.lower())
    if pert_word == pert_word.lower():
        return False
    if capitals.lower() not in word_set:
        return False
    return len(capitals) != len(pert_word) and len(capitals) > 1


def repeat_char(clean_word, pert_word):
    """Return True when ``pert_word`` is ``clean_word`` with characters repeated.

    Both words are compared case-insensitively: the perturbed word must be
    longer than the clean one, both must reduce to the same string under
    simplify_word(), and the perturbed word must differ from its reduced form.
    """
    clean_lower = clean_word.lower()
    pert_lower = pert_word.lower()
    if len(pert_lower) <= len(clean_lower):
        return False
    squeezed_clean = simplify_word(clean_lower)
    squeezed_pert = simplify_word(pert_lower)
    return squeezed_clean == squeezed_pert and squeezed_pert != pert_lower


def abbr(clean_word, pert_word):
    """Return True when ``pert_word`` is a shortened form of ``clean_word``.

    Case-insensitively, ``pert_word`` must be strictly shorter than
    ``clean_word`` and its characters must appear in ``clean_word`` in the same
    order (not necessarily adjacent). An empty ``pert_word`` is never an
    abbreviation; this case previously raised IndexError.
    """
    c, p = clean_word.lower(), pert_word.lower()
    if not p or len(p) >= len(c):
        return False
    # `ch in remaining` advances the iterator up to and including the match, so
    # each character of p must be found after the previous one: a subsequence test.
    remaining = iter(c)
    return all(ch in remaining for ch in p)


def placeholder(self, clean_word, pert_word):
    """Return True when ``pert_word`` has at least 4 characters and contains a
    placeholder character.

    The placeholder characters are taken from ``self.place_holder_list`` when
    ``self`` has that attribute; otherwise the module-level
    ``place_holder_list`` is used, so the function can be called as
    ``placeholder(None, clean_word, pert_word)``. Previously any ``self``
    without the attribute raised AttributeError.

    NOTE(review): the leading ``self`` parameter means this function cannot be
    passed to classify_perturbation_type(), which calls ``func(clean, pert)``.
    ``clean_word`` is not used.
    """
    if len(pert_word) < 4:
        return False
    markers = getattr(self, 'place_holder_list', None)
    if markers is None:
        markers = place_holder_list
    return any(ch in markers for ch in pert_word)


def special_character(clean_word, pert_word):
    """Return True when the lower-cased ``pert_word`` contains a character that
    is outside ``'a'``-``'z'`` and is not listed in ``place_holder_list``.
    ``clean_word`` is not used."""
    return any(
        not ('a' <= ch <= 'z') and ch not in place_holder_list
        for ch in pert_word.lower()
    )
# Locate the perturbed token in every row and keep a copy of it.
grouped_multiple['location'] = grouped_multiple.apply(locate_pert, axis=1)
# grouped_multiple['distance'] = grouped_multiple.apply(distance,axis=1)
grouped_multiple['pert_word'] = grouped_multiple.apply(show_pert, axis=1)

# One 0/1 indicator column per perturbation type, added in this order.
perturbation_checks = [
    ('is_special', special_character),
    ('is_repeat', repeat_char),
    ('is_interesting_up', interesting_lowercase_uppercase),
    ('is_up', lowercase_uppercase),
    ('is_abbr', abbr),
]
for flag_column, check in perturbation_checks:
    grouped_multiple[flag_column] = grouped_multiple.apply(
        classify_perturbation_type, args=(check,), axis=1
    )

# Shape of the subset of rows flagged for each perturbation type.
for flag_column, _ in perturbation_checks:
    print(grouped_multiple[grouped_multiple[flag_column] == 1].shape)

(174, 51)
(471, 51)
(207, 51)
(523, 51)
(210, 51)


# Perspective API

In [33]:
# Score the perturbed and the clean version of every row with perspective_api()
# (which caches results per sentence) and store the scores as new columns.
# NOTE(review): the 0.5 is bound to perspective_api's `t` parameter, which that
# function never reads, so it has no effect on the stored scores.
grouped_multiple['perspective_result_on_pert'] = grouped_multiple.apply(perspective_api, args=(0.5,'perturbed_version'), axis=1)
grouped_multiple['perspective_result_on_clean'] = grouped_multiple.apply(perspective_api, args=(0.5,'clean_version'), axis=1)

thats an Assinine comment 0.32128486
we just got sCrEWeD 0.47473195
thats an asinine comment 0.45921504
we just got screwed 0.50503236


# BERT

In [27]:
model_name = 'cardiffnlp/bertweet-base-offensive'
label_set = {'LABEL_0', 'LABEL_1'}
pos_label = 'LABEL_1'

# Score the clean sentences first, then the perturbed ones. After each pass,
# print the fraction of rows whose positive-label score exceeds 0.5.
for version_column, column_suffix, report_tag in (
    ('clean_version', '_on_clean', "clean acc: "),
    ('perturbed_version', '_on_perturb', "perturb acc: "),
):
    score_column = model_name + column_suffix
    grouped_multiple[score_column] = grouped_multiple.apply(
        huggingface_predict,
        args=(version_column, model_name, label_set, pos_label),
        axis=1,
    )
    flagged_rows = grouped_multiple[grouped_multiple[score_column] > 0.5]
    print(report_tag, flagged_rows.shape[0] / grouped_multiple.shape[0])



{'error': 'Model cardiffnlp/bertweet-base-offensive is currently loading', 'estimated_time': 20.0}
thats an asinine comment {'error': 'Model cardiffnlp/bertweet-base-offensive is currently loading', 'estimated_time': 20.0}
thats an asinine comment {'error': 'Service Unavailable'}
[[{'label': 'LABEL_1', 'score': 0.7726978063583374}, {'label': 'LABEL_0', 'score': 0.22730223834514618}]]
clean acc:  0.8976848394324123
[[{'label': 'LABEL_0', 'score': 0.7121782302856445}, {'label': 'LABEL_1', 'score': 0.2878217101097107}]]
[[{'label': 'LABEL_0', 'score': 0.729638397693634}, {'label': 'LABEL_1', 'score': 0.27036160230636597}]]
perturb acc:  0.5571321882001493


# RoBERTa

In [18]:
model_name = 'cardiffnlp/roberta-base-offensive'
label_set = {'offensive', 'non-offensive'}
pos_label = 'offensive'

# Score the clean sentences first, then the perturbed ones. After each pass,
# print the fraction of rows whose positive-label score exceeds 0.5.
for version_column, column_suffix, report_tag in (
    ('clean_version', '_on_clean', "clean acc: "),
    ('perturbed_version', '_on_perturb', "perturb acc: "),
):
    score_column = model_name + column_suffix
    grouped_multiple[score_column] = grouped_multiple.apply(
        huggingface_predict,
        args=(version_column, model_name, label_set, pos_label),
        axis=1,
    )
    flagged_rows = grouped_multiple[grouped_multiple[score_column] > 0.5]
    print(report_tag, flagged_rows.shape[0] / grouped_multiple.shape[0])



{'error': 'Model cardiffnlp/roberta-base-offensive is currently loading', 'estimated_time': 20.0}
thats an asinine comment {'error': 'Model cardiffnlp/roberta-base-offensive is currently loading', 'estimated_time': 20.0}
[[{'label': 'offensive', 'score': 0.9631127715110779}, {'label': 'non-offensive', 'score': 0.03688724339008331}]]
clean acc:  0.9156086631814787
[[{'label': 'non-offensive', 'score': 0.8302836418151855}, {'label': 'offensive', 'score': 0.16971634328365326}]]
[[{'label': 'non-offensive', 'score': 0.9694981575012207}, {'label': 'offensive', 'score': 0.030501825734972954}]]
perturb acc:  0.5212845407020165


In [34]:
# For every perturbation type, sweep the decision threshold from 0.0 to 1.0 in
# steps of 0.01 and print, for the rows flagged with that type, the fraction
# whose Perspective score exceeds the threshold on the perturbed sentence and
# on the clean sentence.
type_list = ["is_special", "is_repeat", "is_interesting_up", "is_up", "is_abbr"]
for pert_type in type_list:
    print(pert_type)
    grouped_multiple_with_t = grouped_multiple[grouped_multiple[pert_type] == 1]
    n_rows = grouped_multiple_with_t.shape[0]
    # The threshold gets its own name: the original reused `t` for both loops,
    # so the inner loop overwrote the perturbation-type variable.
    for step in range(0, 101):
        threshold = round(step * 0.01, 2)
        n_pert_above = grouped_multiple_with_t[grouped_multiple_with_t["perspective_result_on_pert"] > threshold].shape[0]
        n_clean_above = grouped_multiple_with_t[grouped_multiple_with_t["perspective_result_on_clean"] > threshold].shape[0]
        print(threshold, round(n_pert_above / n_rows, 3), round(n_clean_above / n_rows, 3))



is_special
0.0 1.0 1.0
0.01 1.0 1.0
0.02 1.0 1.0
0.03 0.994 1.0
0.04 0.994 1.0
0.05 0.994 1.0
0.06 0.994 1.0
0.07 0.989 1.0
0.08 0.989 1.0
0.09 0.989 1.0
0.1 0.989 1.0
0.11 0.983 1.0
0.12 0.977 1.0
0.13 0.977 1.0
0.14 0.971 1.0
0.15 0.971 1.0
0.16 0.971 1.0
0.17 0.971 1.0
0.18 0.96 1.0
0.19 0.96 1.0
0.2 0.96 1.0
0.21 0.96 1.0
0.22 0.96 1.0
0.23 0.96 1.0
0.24 0.96 1.0
0.25 0.96 1.0
0.26 0.948 1.0
0.27 0.948 1.0
0.28 0.943 1.0
0.29 0.925 1.0
0.3 0.925 1.0
0.31 0.902 1.0
0.32 0.902 1.0
0.33 0.897 1.0
0.34 0.874 0.994
0.35 0.874 0.994
0.36 0.868 0.994
0.37 0.851 0.983
0.38 0.816 0.983
0.39 0.81 0.977
0.4 0.81 0.96
0.41 0.782 0.954
0.42 0.77 0.954
0.43 0.753 0.943
0.44 0.724 0.931
0.45 0.695 0.92
0.46 0.649 0.902
0.47 0.649 0.897
0.48 0.609 0.856
0.49 0.575 0.833
0.5 0.563 0.805
0.51 0.506 0.782
0.52 0.477 0.753
0.53 0.454 0.713
0.54 0.454 0.707
0.55 0.431 0.695
0.56 0.402 0.667
0.57 0.397 0.649
0.58 0.351 0.615
0.59 0.322 0.58
0.6 0.31 0.58
0.61 0.259 0.506
0.62 0.247 0.506
0.63 0.241 0.48