In [None]:
from datetime import datetime
from pathlib import Path
import re

from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics import classification_report
from tqdm.notebook import tnrange, tqdm
import pandas as pd
pd.set_option('display.max_colwidth', None)

import  torch
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
    
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:
# Tokens removed (case-insensitively) when the F14/F15 cells reconstruct the un-negated statement.
negation_words = {"no", "not", "never", "neither"}

In [None]:
nli_model_checkpoint = "facebook/bart-large-mnli"
# Load the MNLI classifier and move it to the selected device: test_single_hypothesis
# sends its inputs to a device too, and the forward pass fails if the two differ.
model = AutoModelForSequenceClassification.from_pretrained(nli_model_checkpoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(nli_model_checkpoint)

In [None]:
def test_single_hypothesis(premise, hypothesis, target_group=None, mask_token="[X]"):
    """Return the probability that `premise` entails `hypothesis`.

    Uses the notebook-level `tokenizer` and `model` (an NLI classifier).

    Args:
        premise: Text to be judged.
        hypothesis: Statement tested against the premise. May contain `mask_token`.
        target_group: If truthy, every `mask_token` in `hypothesis` is replaced by it.
        mask_token: Placeholder substituted by `target_group`.

    Returns:
        A float: the softmax weight of "entailment" against "contradiction",
        with the "neutral" logit discarded.
    """
    if target_group:  # target_group not none
        hypothesis = hypothesis.replace(mask_token, target_group)
    # Encode the (premise, hypothesis) pair; if too long, only the premise is truncated.
    x = tokenizer.encode(premise, hypothesis, return_tensors='pt', truncation='only_first')
    with torch.no_grad():
        # Send the inputs to wherever the model's weights live, so the call works
        # whether the model was left on the CPU or moved to the GPU.
        logits = model(x.to(model.device))[0]
    # we throw away "neutral" (dim 1) and take the probability of
    # "entailment" (2) as the probability of the label being true
    entail_contradiction_logits = logits[:, [0, 2]]
    probs = entail_contradiction_logits.softmax(dim=1)
    return probs[:, 1].item()

## 0. Loading the dataset

In [None]:
# Folder holding the input CSVs and folder receiving the annotated copies.
data_folder = Path("../datasets") / "gpt3.5-generated"
output_folder = Path("output")

# TODO: changeme
# options: "black people", "Muslims", "disabled people", "gays", "immigrants", "trans", "women"
target_group = "women"
data_file = "dataset_{}.csv".format(target_group)

In [None]:
# Resume from a previously saved output file when one exists; otherwise read the raw dataset.
resume_path = output_folder / data_file
df = pd.read_csv(resume_path if resume_path.is_file() else data_folder / data_file)
# Keep only the first row for each distinct message and renumber the index from 0.
df = df.drop_duplicates(subset=["message"], keep='first', ignore_index=True)

print(df.columns)

## 1. General hypothesis tests
### A. Identity hypothesis test

In [None]:
hypothesis = "That example is about [X]."
# Every row is scored; 1 when the entailment probability exceeds 0.5, else 0.
# row[1] is the first dataframe column (row[0] is the index).
result = [
    int(test_single_hypothesis(row[1], hypothesis, target_group=target_group) > 0.5)
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_identity'] = result

In [None]:
# First checkpoint of the run: make sure the output directory exists, because
# to_csv does not create missing parent folders.
output_folder.mkdir(parents=True, exist_ok=True)
df.to_csv(output_folder/data_file, index=False)

### B. Hate hypothesis test

In [None]:
hypothesis = "That contains hate speech."
# Every row is scored; 1 when the entailment probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5)
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_hate'] = result

In [None]:
# A message is labelled hate speech only when it both mentions the identity and is hateful.
df['nli_hate_label'] = (
    df['nli_identity'].astype(bool) & df['nli_hate'].astype(bool)
).astype("int64")

In [None]:
# Checkpoint df (now including nli_hate and nli_hate_label) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

## 2. Functionality-Level Hypothesis Tests
### F1: Expression of strong negative emotions (explicit)

In [None]:
hypothesis = "That text contains expression of strong negative emotions."
func = "F1: Expression of strong negative emotions (explicit)"
# Rows of other functionalities get -1; F1 rows get 1 if the probability exceeds 0.5, else 0.
# row[1] / row[2] are the first two dataframe columns (message, functionality).
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f1_neg_emotion'] = result

In [None]:
# Checkpoint df (now including nli_f1_neg_emotion) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F2: Description using very negative attributes (explicit)

In [None]:
# Alternative wording (currently unused): "That text mentions a negative attribute of [X]."
hypothesis = "That text describes [X] using very negative attributes."
func = "F2: Description using very negative attributes (explicit)"

# Rows of other functionalities get -1; F2 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis, target_group=target_group) > 0.5)
    if row[2] == func
    else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f2_neg_attr'] = result

In [None]:
# Checkpoint df (now including nli_f2_neg_attr) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F3: Dehumanisation (explicit)

In [None]:
hypothesis = "That text contains dehumanization."
func = "F3: Dehumanisation (explicit)"

# Rows of other functionalities get -1; F3 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f3_dehum'] = result

In [None]:
# Checkpoint df (now including nli_f3_dehum) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F4: Implicit derogation

In [None]:
hypothesis_explicit = "That text contains explicit obscene language."
hypothesis_derogation = "That text contains derogation"
func = "F4: Implicit derogation"

result = list()
derogation_result = list()
for row in tqdm(df.itertuples(), total=df.shape[0]):
    text, label = row[1], row[2]
    if label != func:  # not an F4 row: mark both checks as "not applicable"
        result.append(-1)
        derogation_result.append(-1)
        continue
    # "Implicit" passes when the explicit-obscenity hypothesis is NOT supported (< 0.5).
    explicit_prob = test_single_hypothesis(text, hypothesis_explicit)
    result.append(int(explicit_prob < 0.5))
    # "Derogation" passes when the derogation hypothesis is supported (> 0.5).
    derogation_prob = test_single_hypothesis(text, hypothesis_derogation)
    derogation_result.append(int(derogation_prob > 0.5))
df['nli_f4_implicit'] = result
df['nli_f4_derogation'] = derogation_result

In [None]:
# Checkpoint df (now including nli_f4_implicit and nli_f4_derogation) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F5: Direct threat

In [None]:
hypothesis = "That text contains a direct threat."
func = "F5: Direct threat"

# Rows of other functionalities get -1; F5 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f5_dir_threat'] = result

In [None]:
# Checkpoint df (now including nli_f5_dir_threat) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F6: Threat as normative statement

In [None]:
hypothesis = "That text contains a threat as normative statement."
func = "F6: Threat as normative statement"

# Rows of other functionalities get -1; F6 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f6_norm_threat'] = result

In [None]:
# Checkpoint df (now including nli_f6_norm_threat) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F7: Hate expressed using slur

In [None]:
hypothesis = "That text contains a slur referring to [X]."
func = "F7: Hate expressed using slur"

# Rows of other functionalities get -1; F7 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis, target_group=target_group) > 0.5)
    if row[2] == func
    else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f7_slur'] = result

In [None]:
# Checkpoint df (now including nli_f7_slur) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F9: Reclaimed slurs

The three checks below use the reclaimed-slur prompts from the hypothesis engineering paper:

```
  "rec_slur": {
    "myself": "This text is about myself.",
    "us": "This text is about us.",
    "neg-senti": "This text has a negative sentiment."
  }
```

In [None]:
hypothesis = "This text is about myself."
func = "F9: Reclaimed slurs"

# Rows of other functionalities get -1; F9 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f9_myself'] = result

In [None]:
hypothesis = "This text is about us."
func = "F9: Reclaimed slurs"

# Rows of other functionalities get -1; F9 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f9_us'] = result

In [None]:
hypothesis = "This text has a negative sentiment."
func = "F9: Reclaimed slurs"

# Rows of other functionalities get -1; F9 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f9_negative'] = result

In [None]:
# A message counts as a reclaimed slur if it is not negative or it refers to the speaker's own group.
def aggregate_f9_label(x):
    """Combine the three F9 checks of one row into a single label.

    Returns -1 when the row was not scored for F9 (`nli_f9_negative` is -1).
    Otherwise returns 1 if the text is not negative, or is about "myself",
    or is about "us"; returns 0 when none of these hold.
    """
    negative = x['nli_f9_negative']
    if negative == -1:
        return -1
    return int(not negative or x['nli_f9_myself'] or x['nli_f9_us'])
    
# Apply the F9 aggregation rule row by row.
df['nli_f9_aggregate'] = df.apply(aggregate_f9_label, axis=1)

In [None]:
# Checkpoint df (now including the nli_f9_* columns) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F10: Hate expressed using profanity

In [None]:
hypothesis = "That text contains obscene language."
func = "F10: Hate expressed using profanity"

# Rows of other functionalities get -1; F10 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f10_profane'] = result

In [None]:
# Checkpoint df (now including nli_f10_profane) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F11: Non-hateful use of profanity

In [None]:
hypothesis = "That text contains obscene language."
func = "F11: Non-hateful use of profanity"

# Rows of other functionalities get -1; F11 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f11_profane'] = result

In [None]:
# Checkpoint df (now including nli_f11_profane) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F12: Hate expressed through reference in subsequent clauses

In [None]:
hypothesis = "That contains hate speech."
func = "F12: Hate expressed through reference in subsequent clauses"

result = list()
for row in tqdm(df.itertuples(), total=df.shape[0]):
    text, label = row[1], row[2]
    if label != func:  # not an F12 row
        result.append(-1)
        continue
    if "," not in text:  # a single clause cannot have a "subsequent" clause -> fail
        result.append(0)
        continue
    # Score only what follows the first comma.
    subsequent_clause = text.partition(",")[2]
    result.append(int(test_single_hypothesis(subsequent_clause, hypothesis) > 0.5))
df['nli_f12_subsequent_clause_hate'] = result

In [None]:
# Checkpoint df (now including nli_f12_subsequent_clause_hate) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F13: Hate expressed through reference in subsequent sentences

In [None]:
hypothesis = "That contains hate speech."
func = "F13: Hate expressed through reference in subsequent sentences"

result = list()
for row in tqdm(df.itertuples(), total=df.shape[0]):
    text, label = row[1], row[2]
    if label != func:  # not an F13 row
        result.append(-1)
        continue
    sents = sent_tokenize(text)
    if len(sents) >= 2:
        # Score everything after the first sentence as one premise.
        subsequent_sent = " ".join(sents[1:])
        result.append(int(test_single_hypothesis(subsequent_sent, hypothesis) > 0.5))
    else:  # fewer than 2 sentences -> fail
        result.append(0)
df['nli_f13_subsequent_sent_hate'] = result

In [None]:
# Checkpoint df (now including nli_f13_subsequent_sent_hate) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F14: Hate expressed using negated positive statement

In [None]:
hypothesis = "That text contains a negation."
func = "F14: Hate expressed using negated positive statement"

# Rows of other functionalities get -1; F14 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f14_negation'] = result

In [None]:
hypothesis = "That text is a positive statement towards [X]."
func = "F14: Hate expressed using negated positive statement"

result = list()
for row in tqdm(df.itertuples(), total=df.shape[0]):
    text, label = row[1], row[2]
    if label != func:  # not an F14 row
        result.append(-1)
        continue
    # Drop the negation words to recover the statement that was negated.
    kept_tokens = [tok for tok in word_tokenize(text) if tok.lower() not in negation_words]
    orig_statement = " ".join(kept_tokens)
    score = test_single_hypothesis(orig_statement, hypothesis, target_group=target_group)
    result.append(int(score > 0.5))
df['nli_f14_positive_statement'] = result

In [None]:
# Checkpoint df (now including the nli_f14_* columns) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F15: Non-hate expressed using negated hateful statement

In [None]:
hypothesis = "That text contains a negation."
func = "F15: Non-hate expressed using negated hateful statement"

# Rows of other functionalities get -1; F15 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f15_negation'] = result

In [None]:
hypothesis = "That text contains hate speech against [X]."
func = "F15: Non-hate expressed using negated hateful statement"

result = list()
for row in tqdm(df.itertuples(), total=df.shape[0]):
    text, label = row[1], row[2]
    if label != func:  # not an F15 row
        result.append(-1)
        continue
    # Drop the negation words to recover the statement that was negated.
    kept_tokens = [tok for tok in word_tokenize(text) if tok.lower() not in negation_words]
    orig_statement = " ".join(kept_tokens)
    score = test_single_hypothesis(orig_statement, hypothesis, target_group=target_group)
    result.append(int(score > 0.5))
df['nli_f15_hate_statement'] = result

In [None]:
# Checkpoint df (now including the nli_f15_* columns) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F16: Hate phrased as a question

In [None]:
hypothesis = "That text is a question."
func = "F16: Hate phrased as a question"

# Rows of other functionalities get -1; F16 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f16_question'] = result

In [None]:
# Checkpoint df (now including nli_f16_question) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F17: Hate phrased as an opinion

In [None]:
hypothesis = "That text is a subjective opinion."
func = "F17: Hate phrased as an opinion"

# Rows of other functionalities get -1; F17 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f17_opinion'] = result

In [None]:
# Checkpoint df (now including nli_f17_opinion) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F18: Neutral statements using protected group identifier

In [None]:
hypothesis = "That text is a very positive statement."
func = "F18: Neutral statements using protected group identifier"

# Rows of other functionalities get -1. F18 rows pass (1) when the "very positive"
# hypothesis is NOT supported, i.e. its probability is below 0.5; otherwise 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) < 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f18_neutral'] = result

In [None]:
# Checkpoint df (now including nli_f18_neutral) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F19: Positive statements using protected group identifier

In [None]:
hypothesis = "That text is a positive statement."
func = "F19: Positive statements using protected group identifier"

# Rows of other functionalities get -1; F19 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f19_positive'] = result

In [None]:
# Checkpoint df (now including nli_f19_positive) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F20: Denouncements of hate that quote it

In [None]:
# Matches a space followed by a quoted span (straight or curly quotes); group 1 is the quoted text.
quote_reg = re.compile(r" [‘“'\"](.*?)[”’'\"]")
hate_hypothesis = "That contains hate speech."
# "[X]" is not substituted in this hypothesis; in the premise it stands in for the quote.
denounce_hypothesis = "This text supports [X]"
func = "F20: Denouncements of hate that quote it"

hate_result = list()
denounce_result = list()
for row in tqdm(df.itertuples(), total=df.shape[0]):
    message, label = row[1], row[2]

    if label != func:  # not an F20 row
        hate_result.append(-1)
        denounce_result.append(-1)
        continue

    # A leading space lets the pattern match a quote at the very start of the message.
    padded = f" {message}"
    quotes = quote_reg.findall(padded)
    if len(quotes) != 1:  # anything other than exactly 1 quote -> both checks fail
        hate_result.append(0)
        denounce_result.append(0)
        print(f"[INFO]: Doesn't contain quote \t {message}")
        continue

    # Check 1: the quoted span itself is hate speech.
    quote_prob = test_single_hypothesis(quotes[0], hate_hypothesis)
    hate_result.append(int(quote_prob > 0.5))

    # Check 2: the surrounding text does NOT support the quote (probability below 0.5).
    surround_text = quote_reg.sub(" [X]", padded).strip()
    support_prob = test_single_hypothesis(surround_text, denounce_hypothesis)
    denounce_result.append(int(support_prob < 0.5))
df['nli_f20_hate_quote'] = hate_result
df['nli_f20_denounce'] = denounce_result

In [None]:
# Checkpoint df (now including nli_f20_hate_quote and nli_f20_denounce) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F21: Denouncements of hate that make direct reference to it

In [None]:
hypothesis = "That text is a denouncement."
func = "F21: Denouncements of hate that make direct reference to it"

# Rows of other functionalities get -1; F21 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f21_denounce'] = result

In [None]:
# Checkpoint df (now including nli_f21_denounce) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F22: Abuse targeted at objects

In [None]:
hypothesis = "That text is about a non-human material object."
func = "F22: Abuse targeted at objects"

# Rows of other functionalities get -1; F22 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f22_object'] = result

In [None]:
# Checkpoint df (now including nli_f22_object) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

### F23: Abuse targeted at individuals (not as member of a prot. group)

In [None]:
hypothesis = "That text is about you."
func = "F23: Abuse targeted at individuals (not as member of a prot. group)"

# Rows of other functionalities get -1; F23 rows get 1 if the probability exceeds 0.5, else 0.
result = [
    int(test_single_hypothesis(row[1], hypothesis) > 0.5) if row[2] == func else -1
    for row in tqdm(df.itertuples(), total=df.shape[0])
]
df['nli_f23_you'] = result

In [None]:
# Checkpoint df (now including nli_f23_you) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

## 3. Functionality-Level Label Aggregation

Aggregate the NLI label based on specific hypothesis tests using predefined rules.

In [None]:
# Maps each functionality name (the value in df's `functionality` column) to the
# list of test columns that aggregate_label checks for rows of that functionality.
# A plain column name means the column must equal 1; a "NOT " prefix means the
# column must equal 0.
functionality2tests = {
    "F1: Expression of strong negative emotions (explicit)": ['nli_identity', 'nli_f1_neg_emotion'],
    "F2: Description using very negative attributes (explicit)": ['nli_identity', 'nli_f2_neg_attr'],
    "F3: Dehumanisation (explicit)": ['nli_identity', 'nli_f3_dehum'],
    "F4: Implicit derogation": ['nli_identity', 'nli_f4_implicit', 'nli_f4_derogation'],
    "F5: Direct threat": ['nli_identity', 'nli_f5_dir_threat'],
    "F6: Threat as normative statement": ['nli_identity', 'nli_f6_norm_threat'],
    "F7: Hate expressed using slur": ['nli_identity', 'nli_hate', 'nli_f7_slur'],
    "F8: Non-hateful homonyms of slurs": ['NOT nli_identity', 'NOT nli_hate'],
    "F9: Reclaimed slurs": ['nli_identity', 'nli_f9_aggregate'],
    "F10: Hate expressed using profanity": ['nli_identity', 'nli_hate', 'nli_f10_profane'],
    "F11: Non-hateful use of profanity": ['NOT nli_identity', 'nli_f11_profane'],
    "F12: Hate expressed through reference in subsequent clauses": ['nli_identity', 'nli_hate', 'nli_f12_subsequent_clause_hate'],
    "F13: Hate expressed through reference in subsequent sentences": ['nli_identity', 'nli_hate', 'nli_f13_subsequent_sent_hate'],
    "F14: Hate expressed using negated positive statement": ['nli_identity', 'nli_f14_negation', 'nli_f14_positive_statement'],
    "F15: Non-hate expressed using negated hateful statement": ['nli_identity', 'nli_f15_negation', 'nli_f15_hate_statement'],
    "F16: Hate phrased as a question": ['nli_identity', 'nli_hate', 'nli_f16_question'],
    "F17: Hate phrased as an opinion": ['nli_identity', 'nli_hate', 'nli_f17_opinion'],
    "F18: Neutral statements using protected group identifier": ['nli_identity','NOT nli_hate', 'nli_f18_neutral'],
    "F19: Positive statements using protected group identifier": ['nli_identity', 'nli_f19_positive'],
    "F20: Denouncements of hate that quote it": ['nli_identity', 'nli_f20_hate_quote', 'nli_f20_denounce'],
    "F21: Denouncements of hate that make direct reference to it": ['nli_identity', 'nli_f21_denounce'],
    "F22: Abuse targeted at objects": ['NOT nli_identity', 'nli_hate', 'nli_f22_object'],
    "F23: Abuse targeted at individuals (not as member of a prot. group)": ['NOT nli_identity', 'nli_hate', 'nli_f23_you'],
    "F24: Abuse targeted at non-protected groups (e.g. professions)": ['NOT nli_identity', 'nli_hate'],
    "F25-29: Spelling variation": ['nli_identity', 'nli_hate']
}

In [None]:
def aggregate_label(x):
    """Return 1 if row `x` passes every test listed for its functionality, else 0.

    The tests come from `functionality2tests[x['functionality']]`. A plain
    column name must hold 1; a name prefixed with "NOT" must hold 0. A negative
    value (test not run for this row) is reported via print and does not by
    itself fail the row.
    """
    for spec in functionality2tests[x['functionality']]:
        negated = spec.startswith("NOT")
        expected = 0 if negated else 1
        # For negated specs the column name is the last space-separated token.
        column = spec.split(" ")[-1] if negated else spec
        observed = x[column]
        if observed < 0:
            print(f"Sth is wrong. {x}")
        elif observed != expected:
            return 0
    return 1

In [None]:
# Apply the functionality-level aggregation rule row by row.
df['nli_pass_test'] = df.apply(aggregate_label, axis=1)

In [None]:
# Per-functionality mean of nli_pass_test and hate_label (groups are not sorted).
(
    df[['functionality', 'nli_pass_test', 'hate_label']]
    .groupby(['functionality'], sort=False)
    .mean()
    .reset_index()
)

In [None]:
# Final save: write df (now including nli_pass_test) to the output CSV, without the index.
df.to_csv(output_folder/data_file, index=False)

## Appendix: Data analysis

In [None]:
# Per-functionality mean of every numeric column. numeric_only=True skips text
# columns such as `message`, which recent pandas versions refuse to average.
df.groupby("functionality", sort=False).mean(numeric_only=True).reset_index()

In [None]:
# Show up to 10 random rows where nli_hate_label equals hate_label. The size is
# capped because sample() raises ValueError when asked for more rows than exist.
agreeing_rows = df[df['nli_hate_label']==df['hate_label']]
agreeing_rows.sample(min(10, len(agreeing_rows)))

In [None]:
# (rows, columns) of the subset where nli_hate equals hate_label.
df.loc[df['nli_hate'].eq(df['hate_label'])].shape

In [None]:
# (rows, columns) of the subset where nli_hate_label equals hate_label.
df.loc[df['nli_hate_label'].eq(df['hate_label'])].shape