<h1>Evaluation of generated counterfactuals</h1>

In [1]:
import torch
import transformers
import numpy as np
import pandas as pd
import yaml
import itertools
import sklearn
import nltk

In [2]:
# Evaluation configuration. Parameters come either from a YAML settings file
# or from the literal values below, depending on READ_SETTINGS_FROM_FILE.
READ_SETTINGS_FROM_FILE = False

SETTINGS_PATH = "/home/diego/counterfactuals-generation/sentiment_task/zs_gpt2_experiments/settings/"
SETTING_NAME = "zs_prompt_1_validation.yaml"
RESULTS_NAME = "zs_prompt_1_validation-"

# The classifier and its label mapping are identical for both configuration
# sources, so they are defined once. The notebook-configured run of this
# checkpoint uses the 'NEGATIVE'/'POSITIVE' keys; the 'LABEL_0'/'LABEL_1'
# mapping previously used by the YAML branch would not match those keys and
# would fail when predictions are mapped to integer labels.
CLASSIFIER_NAME = 'distilbert-base-uncased-finetuned-sst-2-english'
CLASSIFIER_LABEL_MAP = {'NEGATIVE':0, 'POSITIVE':1}

if READ_SETTINGS_FROM_FILE:
    # Read the settings inside a context manager so the handle is closed.
    # NOTE(review): yaml.FullLoader is intended for trusted files only; prefer
    # yaml.safe_load if settings can come from an untrusted source.
    with open(f"{SETTINGS_PATH}{SETTING_NAME}") as a_yaml_file:
        parsed_yaml_file = yaml.load(a_yaml_file, Loader=yaml.FullLoader)

    # the following params will be included in a yaml file
    RESULTS_PATH = parsed_yaml_file['RESULTS_PATH']
    FOLDS = parsed_yaml_file['FOLDS']
    CUDA_DEVICE = parsed_yaml_file['CUDA_DEVICE']
    GEN_ARGS = parsed_yaml_file['GEN_ARGS']
    print("Evaluation's params read from yaml file")

else:
    RESULTS_PATH = "/home/diego/counterfactuals-generation/sentiment_task/zs_gpt2_experiments/validation/"
    FOLDS = ["0"] # FOLDS = ["0", "1", "2", "3", "4"]
    CUDA_DEVICE = 0
    # Each generation argument maps to the list of values to evaluate; the
    # cartesian product of these lists forms the evaluated grid.
    GEN_ARGS = {
        "no_repeat_ngram_size": [4],
        "num_beams": [5],
        "repetition_penalty": [1.0],
        "temperature": [0.7]
    }
    print("Evaluation's params read from notebook")

Evaluation's params read from notebook


In [3]:
def load_results(folds, results_path, results_name, params):
    """Read the generation files of every fold/configuration pair.

    @param: folds The fold identifiers (used in the ``fold_<id>`` directory name)
    @param: results_path The directory prefix that contains the fold directories
    @param: results_name The file-name prefix shared by all generation files
    @param: params The configurations; each one is formatted into the file name
    @return: A dict ``{fold: {configuration: DataFrame}}`` of tab-separated files
    """
    def _read_generation(fold_id, cfg):
        # One ".gen" file exists per (fold, configuration) pair.
        gen_file = f"{results_path}fold_{fold_id}/{results_name}{cfg}.gen"
        return pd.read_csv(gen_file, sep='\t')

    return {
        fold_id: {cfg: _read_generation(fold_id, cfg) for cfg in params}
        for fold_id in folds
    }

In [4]:
class Evaluator:
    """Scores generated counterfactuals with a sentiment classifier and BLEU.

    The dataset is read through the columns ``generated_counter``,
    ``counterfactual`` and ``label_counter``.
    """

    def __init__(self, s_dataset, s_tokenizer, s_model, s_label_dict, s_device):
        """Constructor of the evaluator
        @param: s_dataset The dataset for the sentiment classifier
        @param: s_tokenizer The tokenizer for the sentiment classifier
        @param: s_model The sentiment classifier
        @param: s_label_dict The mapping between classifier's outputs and labels
        @param: s_device The device index requested for the classifier pipeline;
                it is replaced by -1 when CUDA is not available
        """
        self.s_dataset = s_dataset
        self.s_tokenizer = s_tokenizer
        self.s_model = s_model
        self.s_label_dict = s_label_dict
        self.classifier = transformers.pipeline(
            task="sentiment-analysis",
            model=s_model,
            tokenizer=s_tokenizer,
            framework="pt",
            device=self._resolve_device(s_device))
        self.predicted_labels = []
        self.score_labels = []

        # remove some nan values in the generated counterfactuals
        # (dropna() discards every row that has a NaN in any column)
        print(f"# of nan values removed in generated counterfactuals:{self.s_dataset['generated_counter'].isna().sum()}")
        self.s_dataset = self.s_dataset.dropna()

    @staticmethod
    def _resolve_device(s_device):
        """Return the requested device, or -1 when CUDA is unavailable.

        The previous check was inverted: it forced -1 when CUDA *was*
        available and kept the CUDA index when it was not.
        """
        return s_device if torch.cuda.is_available() else -1

    def infer_predictions(self):
        """Classify every generated counterfactual.

        Fills ``predicted_labels`` (mapped through ``s_label_dict``) and
        ``score_labels`` with one entry per dataset row.
        """
        # Start from empty lists so that calling this method again does not
        # append a second copy of the predictions (which would no longer line
        # up with the dataset rows in lf_score).
        self.predicted_labels = []
        self.score_labels = []

        for text_to_classify in self.s_dataset["generated_counter"].values:
            # The cut is made on characters, not tokens: texts longer than 512
            # characters are reduced to their first 511 characters.
            # NOTE(review): presumably meant to stay within the classifier's
            # input limit - confirm whether token-level truncation is wanted.
            if len(text_to_classify) > 512:
                text_to_classify = text_to_classify[:511]

            result = self.classifier(text_to_classify)[0]
            self.predicted_labels.append(self.s_label_dict[result['label']])
            self.score_labels.append(result['score'])

    def lf_score(self):
        """Calculate the Label Flip Score (LFS)

        @return: The accuracy of ``predicted_labels`` against ``label_counter``
        """
        # Import the submodule explicitly: a bare ``import sklearn`` does not
        # by itself guarantee that ``sklearn.metrics`` is loaded.
        from sklearn.metrics import accuracy_score

        y_desired = self.s_dataset["label_counter"].values
        return accuracy_score(y_desired, self.predicted_labels)

    def get_conf_score_pred(self):
        """Return the mean of the classifier scores stored in ``score_labels``."""
        return np.mean(self.score_labels)

    def blue_score(self):
        """Calculate the BLEU score of each generated counterfactual against
           the true counterfactual.

           Returns mean and variance of the BLEU scores.
        """
        bleu_scores = []
        smoothing = nltk.translate.bleu_score.SmoothingFunction().method1
        true_counters = self.s_dataset["counterfactual"].values
        gen_counters = self.s_dataset["generated_counter"].values
        for true_counter, gen_counter in zip(true_counters, gen_counters):
            # reference and hypothesis need to be tokenized first
            try:
                # the reference is the true counterfactual
                reference = nltk.tokenize.word_tokenize(true_counter)

                # the hypothesis is the generated counterfactual
                hypothesis = nltk.tokenize.word_tokenize(gen_counter)

            except TypeError:
                # best effort: pairs that cannot be tokenized are skipped
                continue

            bleu_scores.append(nltk.translate.bleu_score.sentence_bleu([reference],
                                                                       hypothesis,
                                                                       smoothing_function=smoothing))

        if not bleu_scores:
            # No pair could be scored: report NaN explicitly instead of
            # relying on numpy's empty-input warning path.
            return float("nan"), float("nan")
        return np.mean(bleu_scores), np.var(bleu_scores)

def evaluate_fold_results(f_res, o_file, o_str):
    """Evaluate every configuration of one fold and write one row per configuration.

    @param: f_res The mapping configuration -> generated results of the fold
    @param: o_file The writable file object that receives the rows
    @param: o_str The prefix put in front of every row (the fold column)
    """
    for cfg_gen in f_res:
        print(f"Cfg:{cfg_gen}")
        cfg_res = f_res[cfg_gen]
        evaluator = Evaluator(cfg_res, tokenizer, lm, CLASSIFIER_LABEL_MAP, CUDA_DEVICE)
        blue_score, var_blue_score = evaluator.blue_score()
        evaluator.infer_predictions()
        conf_score = evaluator.get_conf_score_pred()
        lf_score = evaluator.lf_score()

        # Build each row from the unchanged prefix. Accumulating into o_str
        # and writing it on every iteration re-wrote all earlier rows and left
        # later rows without the prefix when there was more than one
        # configuration.
        o_file.write(f"{o_str}{cfg_gen}\t{blue_score}\t{var_blue_score}\t{lf_score}\t{conf_score}\n")

<h3>Load sentiment classifier</h3>

In [5]:
# Load the tokenizer and the sequence-classification model identified by
# CLASSIFIER_NAME; both are handed to the Evaluator in evaluate_fold_results.
tokenizer = transformers.AutoTokenizer.from_pretrained(CLASSIFIER_NAME)
lm = transformers.AutoModelForSequenceClassification.from_pretrained(CLASSIFIER_NAME)

In [6]:
# Sort the parameter names so that every grid tuple lists its values in a
# fixed (alphabetical) parameter order; load_results formats each tuple into
# the name of the file it reads.
all_pars = sorted(GEN_ARGS.keys())
value_lists = [GEN_ARGS[par_name] for par_name in all_pars]
gen_grid = list(itertools.product(*value_lists))

# One DataFrame per (fold, configuration) pair.
results_dict = load_results(FOLDS, RESULTS_PATH, RESULTS_NAME, gen_grid)
print("Results loaded...")

Results loaded...


<h3>Calculate metrics for the generated counterfactuals for all different folds</h3>

In [7]:
def evaluate_fold_results(f_res, o_file, o_str):
    """Evaluate every configuration of one fold and write one row per configuration.

    @param: f_res The mapping configuration -> generated results of the fold
    @param: o_file The writable file object that receives the rows
    @param: o_str The prefix put in front of every row (the fold column)
    """
    for cfg_gen in f_res:
        print(f"Cfg:{cfg_gen}")
        cfg_res = f_res[cfg_gen]
        evaluator = Evaluator(cfg_res, tokenizer, lm, CLASSIFIER_LABEL_MAP, CUDA_DEVICE)
        blue_score, var_blue_score = evaluator.blue_score()
        evaluator.infer_predictions()
        conf_score = evaluator.get_conf_score_pred()
        lf_score = evaluator.lf_score()

        # Build each row from the unchanged prefix. Accumulating into o_str
        # and writing it on every iteration re-wrote all earlier rows and left
        # later rows without the prefix when there was more than one
        # configuration.
        o_file.write(f"{o_str}{cfg_gen}\t{blue_score}\t{var_blue_score}\t{lf_score}\t{conf_score}\n")

with open(RESULTS_PATH + "val_results.csv", 'w') as outfile:
    # header row of the tab-separated results file
    outfile.write("fold\tcfg\tblue\tvar_blue\tlfs\tconf_score_pred\n")
    for foldd, fold_results in results_dict.items():
        # every row of this fold starts with the fold identifier
        out_str = f"{foldd}\t"
        evaluate_fold_results(fold_results, outfile, out_str)

Cfg:(4, 5, 1.0, 0.7)
# of nan values removed in generated counterfactuals:3


<h3>Calculate best cfg for each fold</h3>

In [8]:
# Read back the tab-separated metrics file val_results.csv from RESULTS_PATH.
df_results = pd.read_csv(f"{RESULTS_PATH}val_results.csv", sep='\t')
print("Results loaded...")

Results loaded...


In [9]:
# For every fold, locate the row label with the highest score, then select
# those rows from the results table.
best_lfs_idx = df_results.reset_index().groupby(['fold'])['lfs'].idxmax()
best_blue_idx = df_results.reset_index().groupby(['fold'])['blue'].idxmax()
max_values_lfs = df_results.loc[best_lfs_idx]
max_values_blue = df_results.loc[best_blue_idx]

In [10]:
# Persist the best rows per fold, one tab-separated file per metric.
for best_rows, file_name in ((max_values_lfs, "val_best_lfs.csv"),
                             (max_values_blue, "val_best_blue.csv")):
    best_rows.to_csv(f"{RESULTS_PATH}{file_name}", sep="\t", header=True, index=False)

In [11]:
# Display the row with the highest label-flip score of each fold.
max_values_lfs

Unnamed: 0,fold,cfg,blue,var_blue,lfs,conf_score_pred
0,0,"(4, 5, 1.0, 0.7)",0.019246,0.000396,0.503778,0.973354


In [12]:
# Display the full metrics table (one row per fold/configuration pair).
df_results

Unnamed: 0,fold,cfg,blue,var_blue,lfs,conf_score_pred
0,0,"(4, 5, 1.0, 0.7)",0.019246,0.000396,0.503778,0.973354
