In [None]:
from tqdm.notebook import trange, tqdm
import pandas as pd
from pathlib import Path
from time import monotonic
import numpy as np 

from nltk.translate.bleu_score import sentence_bleu
from nltk import word_tokenize
import numpy as np
from scipy import stats

# HS Dataset Analysis

Compare dataset properties (ChatGPT vs. HateCheck)

- Perplexity
- Diversity
- Topic distribution
- Following the prompt

## 1. Loading the dataset

In [None]:
# cloud config
# Paths are relative to the current working directory.
data_root = Path(".")

# HateCheck: CSV location plus the columns holding the text and the functionality label.
hatecheck_config = dict(
    path=data_root / "hatecheck-data/test_suite_cases.csv",
    text_col="test_case",
    func_col="functionality",
    excluded_func_prefix="spell_",
)

# GPT-generated data: directory scanned for dataset_*.csv files by the loading cell.
gpt_hate_config = dict(
    path=Path("gpt-dataset"),
    text_col="message",
    func_col="functionality",
    excluded_func_prefix="F25-29: Spelling variation",
)

In [None]:
# local config
# NOTE: rebinds the same names as the cloud-config cell; whichever cell runs last wins.
data_root = Path("../datasets/")

# HateCheck: CSV location plus the columns holding the text and the functionality label.
hatecheck_config = dict(
    path=data_root / "hatecheck-data/test_suite_cases.csv",
    text_col="test_case",
    func_col="functionality",
    excluded_func_prefix="spell_",
)

# GPT-generated data: directory scanned for dataset_*.csv files by the loading cell.
gpt_hate_config = dict(
    path=Path("../nli_hypothesis_test/output"),
    text_col="message",
    func_col="functionality",
    excluded_func_prefix="F25-29: Spelling variation",
)

In [None]:
dataset = "gpt"  # TODO: change me. {"hatecheck", "gpt"}

# Map every supported dataset name to its configuration dict.
_dataset_configs = {
    "hatecheck": hatecheck_config,
    "gpt": gpt_hate_config,
}

if dataset not in _dataset_configs:
    # Fail fast with an explicit error instead of continuing with config = None,
    # which would only blow up later in the loading / filtering cells.
    raise ValueError(f"Unidentified dataset: {dataset}")

config = _dataset_configs[dataset]

In [None]:
# Load the selected dataset into `df`.
if dataset == "hatecheck":
    df = pd.read_csv(config['path'], encoding='utf-8', low_memory=False)
    print(f"Loaded {len(df)} examples.")
elif dataset == "gpt":
    # Sort the matches: glob yields files in a filesystem-dependent order, and the
    # row order decides which duplicate is kept later and which rows
    # df.sample(random_state=...) draws, so an unsorted load is not reproducible.
    csv_files = sorted(config["path"].glob("dataset_*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No dataset_*.csv files found in {config['path']}")
    df = pd.concat([pd.read_csv(csv_file) for csv_file in csv_files])

    # filter by pass NLI test
    print(f"Before NLI test: {len(df)} examples.")
    df = df[df["nli_pass_test"]==1]
    print(f"After NLI test: {len(df)} examples.")
else:
    # Without this branch `df` would stay undefined and the next cell would fail with a NameError.
    raise ValueError(f"Unidentified dataset: {dataset}")

In [None]:
print(f"Before dropping duplicates {len(df)} entries.")
# Keep the first occurrence of every distinct text and renumber the rows from 0.
# Rebinding (rather than inplace=True) avoids mutating a frame that may be a
# filtered slice of another frame, which pandas may flag with a
# SettingWithCopyWarning.
df = df.drop_duplicates(subset=[config['text_col']], keep='first', ignore_index=True)
print(f"After dropping duplicates {len(df)} entries.")

In [None]:
# Remove the test cases involving spelling errors since they'll influence the stats calculation.
# Rows whose functionality label is missing are kept (na=False -> not treated as a match).
spelling_mask = df[config['func_col']].str.startswith(config["excluded_func_prefix"], na=False)
df = df.loc[~spelling_mask]
print(f"Remaining {len(df)} examples after excluding spelling errors.")

## 2. Calculate the stats

### 2.1 Self-BLEU to calculate the diversity

- Use `Self-BLEU-2`, `Self-BLEU-3` and `Self-BLEU-4`
- The lower the score, the more diverse the dataset

In [None]:
# Texts of the filtered dataset, as a plain Python list.
candidates = df[config['text_col']].to_list()

In [None]:
# Tokenize every text, keeping only purely alphabetic tokens, lower-cased.
candidate_tokenized = [
    [token.lower() for token in word_tokenize(str(text)) if token.isalpha()]
    for text in candidates
]

In [None]:
# Self-BLEU: score each sentence against all remaining sentences as references.
bleu2_scores, bleu3_scores, bleu4_scores = [], [], []
for i in trange(len(candidate_tokenized)):
    # Leave-one-out reference set: everything except the i-th sentence.
    references = candidate_tokenized[:i] + candidate_tokenized[i + 1:]
    score_2, score_3, score_4 = sentence_bleu(
        references=references,
        hypothesis=candidate_tokenized[i],
        weights=[(1./2., 1./2.), (1./3., 1./3., 1./3.), (1./4., 1./4., 1./4., 1./4.)],
    )
    bleu2_scores.append(score_2)
    bleu3_scores.append(score_3)
    bleu4_scores.append(score_4)
# Cell output: average Self-BLEU-2 / -3 / -4 over all sentences.
tuple(sum(scores) / len(scores) for scores in (bleu2_scores, bleu3_scores, bleu4_scores))

In [None]:
# exp cell
# Score sentence i against all other sentences used together as references.
i = 0
references = candidate_tokenized.copy()
del references[i]
sentence_bleu(
    references=references,
    hypothesis=candidate_tokenized[i],
    weights=[(1., 0), (1./2., 1./2.)],
)

In [None]:
# exp cell
# Score sentence i against each reference individually, then average the scores.
pair_scores = [
    sentence_bleu(references=[reference], hypothesis=candidate_tokenized[i],
                  weights=[(1., 0), (1./2., 1./2.)])
    for reference in references
]
bleu1_scores = [scores[0] for scores in pair_scores]
bleu2_scores = [scores[1] for scores in pair_scores]
sum(bleu1_scores)/len(bleu1_scores), sum(bleu2_scores)/len(bleu2_scores)

In [None]:
# Repeated sub-sampling: compute Self-BLEU on several random samples of the GPT
# dataset and collect one averaged score per run.
N_RUNS = 10
# NOTE(review): 2968 is presumably the size of the comparison dataset after
# filtering — confirm. df.sample raises ValueError if df has fewer rows.
SAMPLE_SIZE = 2968

sb2s, sb3s, sb4s = [], [], []
if dataset == "gpt":
    for i in range(N_RUNS):
        print(f"Run {i}:")
        # random_state=i makes each run's sample reproducible.
        df_sample = df.sample(n=SAMPLE_SIZE, random_state=i)

        # Run-local names: the full-dataset `candidates`, `candidate_tokenized`
        # and `bleu*_scores` from the earlier cells are left untouched.
        sample_tokenized = [
            [word.lower() for word in word_tokenize(str(text)) if word.isalpha()]
            for text in df_sample[config['text_col']].tolist()
        ]

        run_bleu2, run_bleu3, run_bleu4 = [], [], []
        for j in trange(len(sample_tokenized)):
            # Leave-one-out reference set: every sampled sentence except the j-th.
            references = sample_tokenized[:j] + sample_tokenized[j + 1:]
            b2, b3, b4 = sentence_bleu(references=references, hypothesis=sample_tokenized[j],
                weights=[(1./2., 1./2.), (1./3., 1./3., 1./3.),(1./4., 1./4., 1./4., 1./4.)])
            run_bleu2.append(b2)
            run_bleu3.append(b3)
            run_bleu4.append(b4)
        bleu_2 = sum(run_bleu2)/len(run_bleu2)
        bleu_3 = sum(run_bleu3)/len(run_bleu3)
        bleu_4 = sum(run_bleu4)/len(run_bleu4)
        print(f"bleu-2={bleu_2}, bleu-3={bleu_3}, bleu-4={bleu_4}")
        sb2s.append(bleu_2)
        sb3s.append(bleu_3)
        sb4s.append(bleu_4)
        # Persist the run result; the file is only written, so plain "w" is enough.
        with open(f"run_{i}.log", "w", encoding="utf-8") as f:
            f.write(f"bleu-2={bleu_2}, bleu-3={bleu_3}, bleu-4={bleu_4}")

In [None]:
# Mean and standard deviation (NumPy default, ddof=0) of Self-BLEU-2 across runs.
tuple(stat(sb2s) for stat in (np.mean, np.std))

In [None]:
# Mean and standard deviation (NumPy default, ddof=0) of Self-BLEU-3 across runs.
tuple(stat(sb3s) for stat in (np.mean, np.std))

In [None]:
# Mean and standard deviation (NumPy default, ddof=0) of Self-BLEU-4 across runs.
tuple(stat(sb4s) for stat in (np.mean, np.std))

In [None]:
import numpy as np
from scipy import stats

# One-sample t-test of the per-run Self-BLEU-2 scores against a fixed reference value.
# NOTE(review): popmean is presumably the Self-BLEU-2 of the comparison dataset — confirm.
stats.ttest_1samp(a=sb2s, popmean=0.937088016679527)

In [None]:
# One-sample t-test of the per-run Self-BLEU-3 scores against a fixed reference value.
# NOTE(review): popmean is presumably the Self-BLEU-3 of the comparison dataset — confirm.
stats.ttest_1samp(a=sb3s, popmean=0.8627437449674054)

In [None]:
# One-sample t-test of the per-run Self-BLEU-4 scores against a fixed reference value.
# NOTE(review): popmean is presumably the Self-BLEU-4 of the comparison dataset — confirm.
stats.ttest_1samp(a=sb4s, popmean=0.7611379109893396)

### 2.2 Perplexity to measure fluency

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Language model used to score the texts.
ppl_model_name = "gpt2-large"
# Prefer the GPU but fall back to the CPU so the notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Load the tokenizer and the language model, then move the model to the target device.
ppl_tokenizer = GPT2TokenizerFast.from_pretrained(ppl_model_name)
ppl_model = GPT2LMHeadModel.from_pretrained(ppl_model_name)
ppl_model = ppl_model.to(device)

In [None]:
def ppl_score(texts, verbose=False):
    """Compute the language-model loss (negative log-likelihood) of each text.

    Each text is tokenized with ``ppl_tokenizer`` and scored by ``ppl_model``
    with the input ids used as labels; the model's ``loss`` is recorded.
    Exponentiating a loss yields a perplexity, so ranking by loss is the same
    as ranking by perplexity.

    :param texts: iterable of strings (list, pandas Series, ...). It is
        iterated directly, so no positional indexing is required.
    :param verbose: currently unused; kept for backward compatibility.
    :return: list of floats, one loss per text, in input order. Texts that
        tokenize to fewer than two tokens get ``nan`` because there is no
        next-token prediction to score.
    """
    nll_losses = []
    for text in tqdm(texts):
        encoded_input = ppl_tokenizer(text, return_tensors='pt').to(device)
        input_ids = encoded_input['input_ids']
        if input_ids.shape[-1] < 2:
            # Nothing to predict: record NaN instead of sending an empty or
            # single-token sequence through the model.
            nll_losses.append(float('nan'))
            continue
        with torch.no_grad():
            # The ids double as labels; no copy is made before passing them.
            outputs = ppl_model(input_ids, labels=input_ids)
        nll_losses.append(outputs['loss'].item())
    return nll_losses

In [None]:
# Texts of the filtered dataset, as a plain Python list.
candidates = df[config['text_col']].to_list()

In [None]:
# Per-text losses for the whole dataset.
nnls = ppl_score(texts=candidates)

In [None]:
# Dataset perplexity: exponential of the average per-text loss.
ppl = torch.Tensor(nnls).mean().exp()
print(f"Perplexity score: {ppl}")

In [None]:
# Repeated sub-sampling: compute the perplexity on several random samples of the
# GPT dataset and collect one value per run.
N_RUNS = 10
# NOTE(review): 2968 is presumably the size of the comparison dataset after
# filtering — confirm. df.sample raises ValueError if df has fewer rows.
SAMPLE_SIZE = 2968

ppls = []
if dataset == "gpt":
    for i in range(N_RUNS):
        print(f"Run {i}:")
        start_time = monotonic()

        # random_state=i makes each run's sample reproducible.
        df_sample = df.sample(n=SAMPLE_SIZE, random_state=i)

        # Run-local names: the full-dataset `candidates`, `nnls` and `ppl`
        # from the earlier cells are left untouched.
        sample_nnls = ppl_score(df_sample[config['text_col']].tolist())
        sample_ppl = torch.exp(torch.Tensor(sample_nnls).mean())
        print(f"Perplexity score: {sample_ppl}")
        # Store a plain Python float rather than a 0-dim tensor so the values
        # can be aggregated with NumPy directly.
        ppls.append(sample_ppl.item())
        print(f"Run time {monotonic() - start_time} seconds")

In [None]:
# Mean and standard deviation (NumPy default, ddof=0) of the perplexity across runs.
# float() converts 0-dim torch tensors (the result of torch.exp) into plain
# numbers, so NumPy aggregates regular floats instead of tensor objects.
ppl_values = [float(run_ppl) for run_ppl in ppls]
np.mean(ppl_values), np.std(ppl_values)