## Load Data

In [211]:
import pandas as pd

# Name of the model run. It selects the input directory under tmp/output/ here
# and is reused by later cells to build the result paths under tmp/results/.
model_name = "gpt2-small-1337-5"

# CSV files holding the model outputs for the dev and the test split.
path_dev = f"tmp/output/{model_name}/output_dev.csv"
path_test = f"tmp/output/{model_name}/output_test.csv"

df_dev = pd.read_csv(path_dev)
df_test = pd.read_csv(path_test)
# Display the first test row as a quick check of the loaded columns.
# NOTE(review): the recorded output shows an "Unnamed: 0" column in the test
# frame -- presumably a saved index column; confirm how the CSV was written.
df_test.head(1)

Unnamed: 0,prompt,post,md5_hash,lewd,off,intention,grp,ing,group_set,statement_set,...,sampling_output_4,sampling_output_5,sampling_output_6,sampling_output_7,sampling_output_8,sampling_output_9,sampling_output_10,best_sampling_output,best_sampling_output_idx,best_sampling_output_score
0,<|startoftext|> Fuc u say ? @AyyyeThatsChubb: ...,Fuc u say ? @AyyyeThatsChubb: All these hoes l...,99f1f9c8d4abca821626837048093f3b,0,0,0,0,0,[],[],...,<|startoftext|> Fuc u say ? @AyyyeThatsChubb: ...,<|startoftext|> Fuc u say ? @AyyyeThatsChubb: ...,<|startoftext|> Fuc u say ? @AyyyeThatsChubb: ...,<|startoftext|> Fuc u say ? @AyyyeThatsChubb: ...,<|startoftext|> Fuc u say ? @AyyyeThatsChubb: ...,<|startoftext|> Fuc u say ? @AyyyeThatsChubb: ...,<|startoftext|> Fuc u say ? @AyyyeThatsChubb: ...,<|startoftext|> Fuc u say ? @AyyyeThatsChubb: ...,6,-7.557428


In [212]:
# Number of rows in the dev and the test frame.
len(df_dev), len(df_test)

(4673, 4698)

In [213]:
# Column names available in the dev frame.
df_dev.columns

Index(['prompt', 'post', 'md5_hash', 'lewd', 'off', 'intention', 'grp', 'ing',
       'group_set', 'statement_set', 'group_multiple', 'statement_multiple',
       'greedy_output', 'sampling_output_1', 'sampling_output_2',
       'sampling_output_3', 'sampling_output_4', 'sampling_output_5',
       'sampling_output_6', 'sampling_output_7', 'sampling_output_8',
       'sampling_output_9', 'sampling_output_10', 'best_sampling_output',
       'best_sampling_output_idx', 'best_sampling_output_score'],
      dtype='object')

In [214]:
# Distinct values of the best-sample index column in the dev frame.
# NOTE(review): the recorded output lists 0-9 while the sampling_output_*
# columns are numbered 1-10, so the index looks zero-based -- confirm.
df_dev["best_sampling_output_idx"].unique()

array([7, 1, 0, 5, 6, 2, 3, 4, 8, 9])

## Create Lists of True Labels and Predictions

True labels:

In [215]:
from typing import List, Dict
import ast


def get_true_labels(df: pd.DataFrame) -> Dict[str, List[int]]:
    """Collect the gold annotations of ``df`` as plain Python lists.

    The columns ``lewd``, ``off``, ``intention``, ``grp`` and ``ing`` are
    converted element-wise with ``int``. The columns ``group_set``,
    ``group_multiple``, ``statement_set`` and ``statement_multiple`` hold
    Python literals serialised as strings; they are parsed back with
    ``ast.literal_eval``.

    Args:
        df: frame containing all nine columns named above.

    Returns:
        Dictionary keyed by column name with one list entry per row.

    Raises:
        AssertionError: if the first parsed entry of one of the four
            literal columns is not a list.
    """
    flag_columns = ("lewd", "off", "intention", "grp", "ing")
    literal_columns = (
        "group_set",
        "group_multiple",
        "statement_set",
        "statement_multiple",
    )

    true_labels = {}
    for column in flag_columns:
        true_labels[column] = list(map(int, df[column].tolist()))
    for column in literal_columns:
        true_labels[column] = df[column].tolist()

    # convert strings back to lists
    for column in literal_columns:
        true_labels[column] = list(map(ast.literal_eval, true_labels[column]))

    # spot-check only the first row of every parsed column
    for column in literal_columns:
        assert isinstance(true_labels[column][0], list)

    return true_labels


# Parse the gold labels of both splits once; the evaluation cells reuse them.
true_labels_dev = get_true_labels(df_dev)
true_labels_test = get_true_labels(df_test)

Predictions of the model:

In [216]:
import config
from utils.helper import remove_substrings


def get_predictions(df: pd.DataFrame, pred_col: str) -> Dict[str, List[int]]:
    """Turn the raw model outputs in ``df[pred_col]`` into predictions.

    For each of the labels ``lewd``, ``off``, ``intention``, ``grp`` and
    ``ing`` the prediction is 1 when element ``[1]`` of the matching
    ``config.*_TOKEN`` entry occurs anywhere in the raw output, else 0.

    The generated group and statement are taken from the second piece of the
    output after splitting on ``config.SEP_TOKEN``; that piece is split again
    on ``config.HELP_SEP``:

    * no ``SEP_TOKEN`` in the output: both texts are "" and the raw output is
      printed as an empty response,
    * no ``HELP_SEP``: both texts are "",
    * exactly one ``HELP_SEP``: the text after it is used as the group and
      the statement is "",
    * otherwise: the second piece is the group, the third the statement.

    Extracted texts are passed through ``remove_substrings`` (project helper;
    according to the original comment it removes classification tokens).

    Args:
        df: frame with the model outputs.
        pred_col: name of the column whose outputs are parsed.

    Returns:
        Dictionary with the five 0/1 lists plus the keys ``group`` and
        ``statement``, which hold lists of strings.
    """
    model_output = df[pred_col].tolist()

    predictions = {
        "lewd": [int(config.LEWD_TOKEN[1] in text) for text in model_output],
        "off": [int(config.OFF_TOKEN[1] in text) for text in model_output],
        "intention": [int(config.INT_TOKEN[1] in text) for text in model_output],
        "grp": [int(config.GRP_TOKEN[1] in text) for text in model_output],
        "ing": [int(config.ING_TOKEN[1] in text) for text in model_output],
    }

    group, statement = [], []
    for text in model_output:
        # everything before the first SEP_TOKEN is the original prompt
        prompt_and_rest = text.split(config.SEP_TOKEN)
        if len(prompt_and_rest) <= 1:
            # model gave empty response
            group.append("")
            statement.append("")
            print("Empty response: ", text)
            continue

        pieces = prompt_and_rest[1].split(config.HELP_SEP)

        if len(pieces) == 1:
            # no HELP_SEP found --> empty group and statement
            group.append("")
            statement.append("")
        elif len(pieces) == 2:
            # only one HELP_SEP found --> we just guess that it is the group
            group.append(remove_substrings(pieces[1]))
            statement.append("")
        else:
            clean_grp = remove_substrings(pieces[1])
            clean_stmt = remove_substrings(pieces[2])
            group.append(clean_grp)
            statement.append(clean_stmt)

    predictions["group"] = group
    predictions["statement"] = statement

    return predictions


# Names of the ten individual sample columns (sampling_output_1 .. _10).
# Only the disabled alternative at the end of this cell uses this list.
list_sampling_col = ["sampling_output_" + str(i + 1) for i in range(10)]

# Predictions parsed from the greedy decoding output.
predictions_greedy_dev = get_predictions(df_dev, "greedy_output")
predictions_greedy_test = get_predictions(df_test, "greedy_output")

# Predictions parsed from the sample stored in the best_sampling_output column.
predictions_sampling_dev = get_predictions(df_dev, "best_sampling_output")
predictions_sampling_test = get_predictions(df_test, "best_sampling_output")

# Disabled alternative: parse every individual sample column instead of only
# the best one (this would yield a list of prediction dicts per split).
# predictions_sampling_dev = [get_predictions(df_dev, col) for col in list_sampling_col]
# predictions_sampling_test = [get_predictions(df_test, col) for col in list_sampling_col]

## Run the Evaluation for the Categorical Variables

In [217]:
from evaluate import evaluate_categorical_variables


def get_categorical_variable_evaluation(
    y_true: Dict[str, List[int]], y_pred: Dict[str, List[int]]
) -> Dict[str, float]:
    """Evaluate the five categorical labels one by one.

    Both arguments are indexed by label name, so they are dictionaries (the
    previous ``List[int]`` annotations did not match how they are used).

    Args:
        y_true: gold labels; must contain the keys ``lewd``, ``off``,
            ``intention``, ``grp`` and ``ing``.
        y_pred: predicted labels with the same five keys.

    Returns:
        Dictionary keyed by label name. Each value is whatever
        ``evaluate_categorical_variables`` returns for that label; its exact
        type is not visible in this notebook.

    Raises:
        KeyError: if one of the five keys is missing from either argument.
    """
    label_names = ("lewd", "off", "intention", "grp", "ing")
    return {
        name: evaluate_categorical_variables(y_true[name], y_pred[name])
        for name in label_names
    }


# Score the five categorical labels for each decoding variant (greedy output
# and best sampled output) on each split (dev and test).
classification_result_greedy_dev = get_categorical_variable_evaluation(
    true_labels_dev, predictions_greedy_dev
)
classification_result_greedy_test = get_categorical_variable_evaluation(
    true_labels_test, predictions_greedy_test
)
classification_result_sampling_dev = get_categorical_variable_evaluation(
    true_labels_dev, predictions_sampling_dev
)
classification_result_sampling_test = get_categorical_variable_evaluation(
    true_labels_test, predictions_sampling_test
)

Save the results to JSON:

In [218]:
import json
import os


def save_results_to_json(results: Dict[str, float], filename: str) -> None:
    """Save result dictionary to json file.

    Missing parent directories of ``filename`` are created first. An existing
    file is overwritten.

    Args:
        results: JSON-serialisable dictionary to write.
        filename: target path; may be a bare file name without a directory.
    """
    dir_path = os.path.dirname(filename)
    # os.path.dirname returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)

    with open(filename, "w") as f:
        json.dump(results, f, indent=4)


# Bundle the four classification results into one JSON file for this model run.
save_results_to_json(
    {
        "greedy_dev": classification_result_greedy_dev,
        "greedy_test": classification_result_greedy_test,
        "sampling_dev": classification_result_sampling_dev,
        "sampling_test": classification_result_sampling_test,
    },
    f"tmp/results/{model_name}/classification.json",
)

## Run the Evaluation for the Generated Group and Statement

Get results for __greedy__ decoding:

In [219]:
from evaluate import evaluate_generated_text
from tqdm import tqdm


def evaluate_wrapper(
    predictions: List[str],
    references: List[List[str]],
    skip_incomplete: bool = True,
) -> Dict[str, float]:
    """Average the generation metrics over all usable prediction/reference pairs.

    Each prediction is scored against its reference list with
    ``evaluate_generated_text`` and the per-pair scores are averaged per
    metric.

    Args:
        predictions: one generated text per example.
        references: for every example the list of reference texts.
        skip_incomplete: how to treat pairs where the prediction is blank or
            the reference list is empty.

            * ``True`` (default): every such pair is skipped. This is what
              the previous code effectively did, because its zero-score
              branch repeated the skip condition and was unreachable.
            * ``False``: only pairs where *both* sides are missing are
              skipped; pairs where exactly one side is missing count as a
              zero score. This is the behaviour the former
              "change back to 'and'" TODO referred to.

    Returns:
        Mapping of metric name to its mean over the scored pairs. An empty
        dictionary is returned when no pair could be scored (previously this
        raised ``IndexError``).
    """
    results = []
    for pred, refs in tqdm(zip(predictions, references), desc="Evaluating predictions"):
        pred_missing = not pred.strip()
        refs_missing = not refs

        if pred_missing or refs_missing:
            if skip_incomplete or (pred_missing and refs_missing):
                continue
            # exactly one side is missing --> counts as a complete miss
            results.append(
                {
                    "bleu": 0.0,
                    "rouge": 0.0,
                }
            )
            continue

        # refs is non-empty here; guard against malformed reference entries
        if not isinstance(refs[0], str):
            print("Error in Refs: ", refs)
            continue
        results.append(evaluate_generated_text(pred, refs))

    # nothing was scored --> no averages to report
    if not results:
        return {}

    # get average results
    # The zero placeholder above is assumed to use the same metric names as
    # evaluate_generated_text (TODO confirm); collecting the keys of all
    # results and defaulting to 0.0 keeps the averaging safe if it does not.
    metric_names = list(dict.fromkeys(key for scores in results for key in scores))
    avg_results = {}
    for key in metric_names:
        sum_value = sum(scores.get(key, 0.0) for scores in results)
        avg_results[key] = sum_value / len(results)

    return avg_results


# Evaluate the greedy group/statement generations against both reference
# variants on both splits and store all averages in one JSON file.
save_results_to_json(
    {
        result_key: evaluate_wrapper(generated, gold)
        for result_key, generated, gold in (
            # groups
            (
                "group_set_result_greedy_dev",
                predictions_greedy_dev["group"],
                true_labels_dev["group_set"],
            ),
            (
                "group_multiple_result_greedy_dev",
                predictions_greedy_dev["group"],
                true_labels_dev["group_multiple"],
            ),
            (
                "group_set_result_greedy_test",
                predictions_greedy_test["group"],
                true_labels_test["group_set"],
            ),
            (
                "group_multiple_result_greedy_test",
                predictions_greedy_test["group"],
                true_labels_test["group_multiple"],
            ),
            # statements
            (
                "statement_set_result_greedy_dev",
                predictions_greedy_dev["statement"],
                true_labels_dev["statement_set"],
            ),
            (
                "statement_multiple_result_greedy_dev",
                predictions_greedy_dev["statement"],
                true_labels_dev["statement_multiple"],
            ),
            (
                "statement_set_result_greedy_test",
                predictions_greedy_test["statement"],
                true_labels_test["statement_set"],
            ),
            (
                "statement_multiple_result_greedy_test",
                predictions_greedy_test["statement"],
                true_labels_test["statement_multiple"],
            ),
        )
    },
    f"tmp/results/{model_name}/greedy_generation.json",
)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Evaluating predictions: 4673it [00:00, 22732.58it/s]
Evaluating predictions: 4673it [00:00, 22621.65it/s]
Evaluating predictions: 4698it [00:00, 64103.92it/s]
Evaluating predictions: 4698it [00:00, 24703.31it/s]
Evaluating predictions: 4673it [00:00, 16213.97it/s]
Evaluating predictions: 4673it [00:00, 16006.23it/s]
Evaluating predictions: 4698it [00:00, 17679.40it/s]
Evaluating predictions: 4698it [00:00, 17473.83it/s]


Get results for __sampling__-based decoding:

In [220]:
# Evaluate the best-sample group/statement generations against both reference
# variants on both splits and store all averages in one JSON file.
save_results_to_json(
    {
        result_key: evaluate_wrapper(generated, gold)
        for result_key, generated, gold in (
            # groups
            (
                "group_set_result_sampling_dev",
                predictions_sampling_dev["group"],
                true_labels_dev["group_set"],
            ),
            (
                "group_multiple_result_sampling_dev",
                predictions_sampling_dev["group"],
                true_labels_dev["group_multiple"],
            ),
            (
                "group_set_result_sampling_test",
                predictions_sampling_test["group"],
                true_labels_test["group_set"],
            ),
            (
                "group_multiple_result_sampling_test",
                predictions_sampling_test["group"],
                true_labels_test["group_multiple"],
            ),
            # statements
            (
                "statement_set_result_sampling_dev",
                predictions_sampling_dev["statement"],
                true_labels_dev["statement_set"],
            ),
            (
                "statement_multiple_result_sampling_dev",
                predictions_sampling_dev["statement"],
                true_labels_dev["statement_multiple"],
            ),
            (
                "statement_set_result_sampling_test",
                predictions_sampling_test["statement"],
                true_labels_test["statement_set"],
            ),
            (
                "statement_multiple_result_sampling_test",
                predictions_sampling_test["statement"],
                true_labels_test["statement_multiple"],
            ),
        )
    },
    f"tmp/results/{model_name}/sampling_generation.json",
)

Evaluating predictions: 0it [00:00, ?it/s]

Evaluating predictions: 4673it [00:00, 92809.98it/s]
Evaluating predictions: 4673it [00:00, 36590.75it/s]
Evaluating predictions: 4698it [00:00, 93412.66it/s]
Evaluating predictions: 4698it [00:00, 33480.37it/s]
Evaluating predictions: 4673it [00:00, 25065.81it/s]
Evaluating predictions: 4673it [00:00, 24219.46it/s]
Evaluating predictions: 4698it [00:00, 22705.40it/s]
Evaluating predictions: 4698it [00:00, 22300.18it/s]


## Interpretation

- The generated text is evaluated against two kinds of reference lists: references that may contain duplicates and references that contain only unique sentences. Open question: which of the two yields the better scores?